Graphviz 13.0.0~dev.20241220.2304
Loading...
Searching...
No Matches
htmllex.c
Go to the documentation of this file.
1
3/*************************************************************************
4 * Copyright (c) 2011 AT&T Intellectual Property
5 * All rights reserved. This program and the accompanying materials
6 * are made available under the terms of the Eclipse Public License v1.0
7 * which accompanies this distribution, and is available at
8 * https://www.eclipse.org/legal/epl-v10.html
9 *
10 * Contributors: Details at https://graphviz.org
11 *************************************************************************/
12
13#include <assert.h>
14#include <common/render.h>
15#include <common/htmltable.h>
16#include "htmlparse.h"
17#include <common/htmllex.h>
18#include <cdt/cdt.h>
19#include <cgraph/tokenize.h>
20#include <limits.h>
21#include <stdbool.h>
22#include <stddef.h>
23#include <stdint.h>
24#include <util/alloc.h>
25#include <util/gv_ctype.h>
26#include <util/startswith.h>
27#include <util/strcasecmp.h>
28#include <util/strview.h>
29#include <util/unused.h>
30
31#ifdef HAVE_EXPAT
32#ifdef _WIN32
33// ensure that the expat functions get the correct storage class
34// declarations also on MinGW
35#define XML_USE_MSC_EXTENSIONS 1
36#endif
37#include <expat.h>
38#endif
39
40#ifndef XML_STATUS_ERROR
41#define XML_STATUS_ERROR 0
42#endif
43
44static unsigned long htmllineno_ctx(htmllexstate_t *ctx);
45
46/* error_context:
47 * Print the last 2 "token"s seen.
48 */
50{
51 agerr(AGPREV, "... %.*s%.*s ...\n", (int)ctx->prevtok.size,
52 ctx->prevtok.data, (int)ctx->currtok.size, ctx->currtok.data);
53}
54
55/* htmlerror:
56 * yyerror - called by yacc output
57 */
58void htmlerror(htmlscan_t *scanner, const char *msg)
59{
60 htmllexstate_t *ctx = &scanner->lexer;
61 if (ctx->error)
62 return;
63 ctx->error = 1;
64 agerrorf("%s in line %lu \n", msg, htmllineno(scanner));
65 error_context(&scanner->lexer);
66}
67
68#ifdef HAVE_EXPAT
69/* lexerror:
70 * called by lexer when unknown <..> is found.
71 */
72static void lexerror(htmllexstate_t *ctx, const char *name)
73{
74 ctx->tok = T_error;
75 ctx->error = 1;
76 agerrorf("Unknown HTML element <%s> on line %lu \n", name, htmllineno_ctx(ctx));
77}
78
/* Action invoked for a recognized attribute: receives the object being
 * filled in and the attribute value; the int result is OR-ed into the
 * lexer's warning flag by doAttrs.
 */
typedef int (*attrFn)(void *, char *);

/* Comparison callback with the signature bsearch expects. */
typedef int (*bcmpfn)(const void *, const void *);

/* Mechanism for automatically processing attributes */
typedef struct {
    char *name;    /* attribute name */
    attrFn action; /* action to perform if name matches */
} attr_item;

/* Size in bytes of one table entry. */
#define ISIZE (sizeof(attr_item))
89
90/* icmp:
91 * Compare an attr_item. Used in bsearch
92 */
93static int icmp(const void *name, const void *item) {
94 const attr_item *j = item;
95 return strcasecmp(name, j->name);
96}
97
98static int bgcolorfn(htmldata_t * p, char *v)
99{
100 p->bgcolor = strdup(v);
101 return 0;
102}
103
104static int pencolorfn(htmldata_t * p, char *v)
105{
106 p->pencolor = strdup(v);
107 return 0;
108}
109
110static int hreffn(htmldata_t * p, char *v)
111{
112 p->href = strdup(v);
113 return 0;
114}
115
116static int sidesfn(htmldata_t * p, char *v)
117{
118 unsigned short flags = 0;
119 char c;
120
121 while ((c = *v++)) {
122 switch (gv_tolower(c)) {
123 case 'l' :
125 break;
126 case 't' :
127 flags |= BORDER_TOP;
128 break;
129 case 'r' :
131 break;
132 case 'b' :
134 break;
135 default :
136 agwarningf("Unrecognized character '%c' (%d) in sides attribute\n", c, c);
137 break;
138 }
139 }
140 if (flags != BORDER_MASK)
141 p->flags |= flags;
142 return 0;
143}
144
145static int titlefn(htmldata_t * p, char *v)
146{
147 p->title = strdup(v);
148 return 0;
149}
150
151static int portfn(htmldata_t * p, char *v)
152{
153 p->port = strdup(v);
154 return 0;
155}
156
157#define DELIM " ,"
158
159static int stylefn(htmldata_t * p, char *v)
160{
161 int rv = 0;
162 for (tok_t t = tok(v, DELIM); !tok_end(&t); tok_next(&t)) {
163 strview_t tk = tok_get(&t);
164 if (strview_case_str_eq(tk, "ROUNDED")) p->style.rounded = true;
165 else if (strview_case_str_eq(tk, "RADIAL")) p->style.radial = true;
166 else if (strview_case_str_eq(tk,"SOLID")) {
167 p->style.dotted = false;
168 p->style.dashed = false;
169 } else if (strview_case_str_eq(tk,"INVISIBLE") ||
170 strview_case_str_eq(tk,"INVIS")) p->style.invisible = true;
171 else if (strview_case_str_eq(tk,"DOTTED")) p->style.dotted = true;
172 else if (strview_case_str_eq(tk,"DASHED")) p->style.dashed = true;
173 else {
174 agwarningf("Illegal value %.*s for STYLE - ignored\n", (int)tk.size,
175 tk.data);
176 rv = 1;
177 }
178 }
179 return rv;
180}
181
182static int targetfn(htmldata_t * p, char *v)
183{
184 p->target = strdup(v);
185 return 0;
186}
187
188static int idfn(htmldata_t * p, char *v)
189{
190 p->id = strdup(v);
191 return 0;
192}
193
194
/* doInt:
 * Scan v for integral value (base 10). Check that
 * the value is >= min and <= max. Return value in ul.
 * String s is name of value, used only in warnings.
 * Return 0 if okay; 1 otherwise. On failure *ul is left untouched and a
 * warning is issued.
 *
 * Only the leading numeric prefix of v is examined; characters after the
 * number are not rejected.
 */
static int doInt(char *v, char *s, int min, int max, long *ul)
{
    int rv = 0;
    char *ep;
    long b = strtol(v, &ep, 10);

    /* each warning ends in a newline, like every other warning in this
     * file, so that consecutive messages do not run together */
    if (ep == v) {
        /* no digits were consumed at all */
        agwarningf("Improper %s value %s - ignored\n", s, v);
        rv = 1;
    } else if (b > max) {
        agwarningf("%s value %s > %d - too large - ignored\n", s, v, max);
        rv = 1;
    } else if (b < min) {
        agwarningf("%s value %s < %d - too small - ignored\n", s, v, min);
        rv = 1;
    } else
        *ul = b;
    return rv;
}
220
221
222static int gradientanglefn(htmldata_t * p, char *v)
223{
224 long u;
225
226 if (doInt(v, "GRADIENTANGLE", 0, 360, &u))
227 return 1;
228 p->gradientangle = (unsigned short) u;
229 return 0;
230}
231
232
233static int borderfn(htmldata_t * p, char *v)
234{
235 long u;
236
237 if (doInt(v, "BORDER", 0, UCHAR_MAX, &u))
238 return 1;
239 p->border = (unsigned char) u;
240 p->flags |= BORDER_SET;
241 return 0;
242}
243
244static int cellpaddingfn(htmldata_t * p, char *v)
245{
246 long u;
247
248 if (doInt(v, "CELLPADDING", 0, UCHAR_MAX, &u))
249 return 1;
250 p->pad = (unsigned char) u;
251 p->flags |= PAD_SET;
252 return 0;
253}
254
255static int cellspacingfn(htmldata_t * p, char *v)
256{
257 long u;
258
259 if (doInt(v, "CELLSPACING", SCHAR_MIN, SCHAR_MAX, &u))
260 return 1;
261 p->space = (signed char) u;
262 p->flags |= SPACE_SET;
263 return 0;
264}
265
266static int cellborderfn(htmltbl_t * p, char *v)
267{
268 long u;
269
270 if (doInt(v, "CELLBORDER", 0, INT8_MAX, &u))
271 return 1;
272 p->cellborder = (int8_t)u;
273 return 0;
274}
275
276static int columnsfn(htmltbl_t * p, char *v)
277{
278 if (*v != '*') {
279 agwarningf("Unknown value %s for COLUMNS - ignored\n", v);
280 return 1;
281 }
282 p->vrule = true;
283 return 0;
284}
285
286static int rowsfn(htmltbl_t * p, char *v)
287{
288 if (*v != '*') {
289 agwarningf("Unknown value %s for ROWS - ignored\n", v);
290 return 1;
291 }
292 p->hrule = true;
293 return 0;
294}
295
296static int fixedsizefn(htmldata_t * p, char *v)
297{
298 int rv = 0;
299 if (!strcasecmp(v, "TRUE"))
300 p->flags |= FIXED_FLAG;
301 else if (strcasecmp(v, "FALSE")) {
302 agwarningf("Illegal value %s for FIXEDSIZE - ignored\n", v);
303 rv = 1;
304 }
305 return rv;
306}
307
308static int valignfn(htmldata_t * p, char *v)
309{
310 int rv = 0;
311 if (!strcasecmp(v, "BOTTOM"))
312 p->flags |= VALIGN_BOTTOM;
313 else if (!strcasecmp(v, "TOP"))
314 p->flags |= VALIGN_TOP;
315 else if (strcasecmp(v, "MIDDLE")) {
316 agwarningf("Illegal value %s for VALIGN - ignored\n", v);
317 rv = 1;
318 }
319 return rv;
320}
321
322static int halignfn(htmldata_t * p, char *v)
323{
324 int rv = 0;
325 if (!strcasecmp(v, "LEFT"))
326 p->flags |= HALIGN_LEFT;
327 else if (!strcasecmp(v, "RIGHT"))
328 p->flags |= HALIGN_RIGHT;
329 else if (strcasecmp(v, "CENTER")) {
330 agwarningf("Illegal value %s for ALIGN - ignored\n", v);
331 rv = 1;
332 }
333 return rv;
334}
335
336static int cell_halignfn(htmldata_t * p, char *v)
337{
338 int rv = 0;
339 if (!strcasecmp(v, "LEFT"))
340 p->flags |= HALIGN_LEFT;
341 else if (!strcasecmp(v, "RIGHT"))
342 p->flags |= HALIGN_RIGHT;
343 else if (!strcasecmp(v, "TEXT"))
344 p->flags |= HALIGN_TEXT;
345 else if (strcasecmp(v, "CENTER"))
346 rv = 1;
347 if (rv)
348 agwarningf("Illegal value %s for ALIGN in TD - ignored\n", v);
349 return rv;
350}
351
352static int balignfn(htmldata_t * p, char *v)
353{
354 int rv = 0;
355 if (!strcasecmp(v, "LEFT"))
356 p->flags |= BALIGN_LEFT;
357 else if (!strcasecmp(v, "RIGHT"))
358 p->flags |= BALIGN_RIGHT;
359 else if (strcasecmp(v, "CENTER"))
360 rv = 1;
361 if (rv)
362 agwarningf("Illegal value %s for BALIGN in TD - ignored\n", v);
363 return rv;
364}
365
366static int heightfn(htmldata_t * p, char *v)
367{
368 long u;
369
370 if (doInt(v, "HEIGHT", 0, USHRT_MAX, &u))
371 return 1;
372 p->height = (unsigned short) u;
373 return 0;
374}
375
376static int widthfn(htmldata_t * p, char *v)
377{
378 long u;
379
380 if (doInt(v, "WIDTH", 0, USHRT_MAX, &u))
381 return 1;
382 p->width = (unsigned short) u;
383 return 0;
384}
385
386static int rowspanfn(htmlcell_t * p, char *v)
387{
388 long u;
389
390 if (doInt(v, "ROWSPAN", 0, UINT16_MAX, &u))
391 return 1;
392 if (u == 0) {
393 agwarningf("ROWSPAN value cannot be 0 - ignored\n");
394 return 1;
395 }
396 p->rowspan = (uint16_t)u;
397 return 0;
398}
399
400static int colspanfn(htmlcell_t * p, char *v)
401{
402 long u;
403
404 if (doInt(v, "COLSPAN", 0, UINT16_MAX, &u))
405 return 1;
406 if (u == 0) {
407 agwarningf("COLSPAN value cannot be 0 - ignored\n");
408 return 1;
409 }
410 p->colspan = (uint16_t)u;
411 return 0;
412}
413
414static int fontcolorfn(textfont_t * p, char *v)
415{
416 p->color = v;
417 return 0;
418}
419
420static int facefn(textfont_t * p, char *v)
421{
422 p->name = v;
423 return 0;
424}
425
426static int ptsizefn(textfont_t * p, char *v)
427{
428 long u;
429
430 if (doInt(v, "POINT-SIZE", 0, UCHAR_MAX, &u))
431 return 1;
432 p->size = (double) u;
433 return 0;
434}
435
436static int srcfn(htmlimg_t * p, char *v)
437{
438 p->src = strdup(v);
439 return 0;
440}
441
442static int scalefn(htmlimg_t * p, char *v)
443{
444 p->scale = strdup(v);
445 return 0;
446}
447
/* alignfn:
 * Action for the <BR> "align" attribute (case-insensitive).
 * Writes 'r', 'l' or 'n' to *p for "RIGHT", "LEFT" and "CENTER"
 * respectively. Any other value is reported and leaves *p untouched.
 * Returns 0 for a recognized value, 1 otherwise.
 */
static int alignfn(int *p, char *v)
{
    if (strcasecmp(v, "RIGHT") == 0) {
        *p = 'r';
        return 0;
    }
    if (strcasecmp(v, "LEFT") == 0) {
        *p = 'l';
        return 0;
    }
    if (strcasecmp(v, "CENTER") == 0) {
        *p = 'n';
        return 0;
    }

    agwarningf("Illegal value %s for ALIGN - ignored\n", v);
    return 1;
}
463
464/* Tables used in binary search; MUST be alphabetized */
465static attr_item tbl_items[] = {
466 {"align", (attrFn) halignfn},
467 {"bgcolor", (attrFn) bgcolorfn},
468 {"border", (attrFn) borderfn},
469 {"cellborder", (attrFn) cellborderfn},
470 {"cellpadding", (attrFn) cellpaddingfn},
471 {"cellspacing", (attrFn) cellspacingfn},
472 {"color", (attrFn) pencolorfn},
473 {"columns", (attrFn) columnsfn},
474 {"fixedsize", (attrFn) fixedsizefn},
475 {"gradientangle", (attrFn) gradientanglefn},
476 {"height", (attrFn) heightfn},
477 {"href", (attrFn) hreffn},
478 {"id", (attrFn) idfn},
479 {"port", (attrFn) portfn},
480 {"rows", (attrFn) rowsfn},
481 {"sides", (attrFn) sidesfn},
482 {"style", (attrFn) stylefn},
483 {"target", (attrFn) targetfn},
484 {"title", (attrFn) titlefn},
485 {"tooltip", (attrFn) titlefn},
486 {"valign", (attrFn) valignfn},
487 {"width", (attrFn) widthfn},
488};
489
490static attr_item cell_items[] = {
491 {"align", (attrFn) cell_halignfn},
492 {"balign", (attrFn) balignfn},
493 {"bgcolor", (attrFn) bgcolorfn},
494 {"border", (attrFn) borderfn},
495 {"cellpadding", (attrFn) cellpaddingfn},
496 {"cellspacing", (attrFn) cellspacingfn},
497 {"color", (attrFn) pencolorfn},
498 {"colspan", (attrFn) colspanfn},
499 {"fixedsize", (attrFn) fixedsizefn},
500 {"gradientangle", (attrFn) gradientanglefn},
501 {"height", (attrFn) heightfn},
502 {"href", (attrFn) hreffn},
503 {"id", (attrFn) idfn},
504 {"port", (attrFn) portfn},
505 {"rowspan", (attrFn) rowspanfn},
506 {"sides", (attrFn) sidesfn},
507 {"style", (attrFn) stylefn},
508 {"target", (attrFn) targetfn},
509 {"title", (attrFn) titlefn},
510 {"tooltip", (attrFn) titlefn},
511 {"valign", (attrFn) valignfn},
512 {"width", (attrFn) widthfn},
513};
514
515static attr_item font_items[] = {
516 {"color", (attrFn) fontcolorfn},
517 {"face", (attrFn) facefn},
518 {"point-size", (attrFn) ptsizefn},
519};
520
521static attr_item img_items[] = {
522 {"scale", (attrFn) scalefn},
523 {"src", (attrFn) srcfn},
524};
525
526static attr_item br_items[] = {
527 {"align", (attrFn) alignfn},
528};
529
530/* doAttrs:
531 * General function for processing list of name/value attributes.
532 * Do binary search on items table. If match found, invoke action
533 * passing it tp and attribute value.
534 * Table size is given by nel
535 * Name/value pairs are in array atts, which is null terminated.
536 * s is the name of the HTML element being processed.
537 */
538static void doAttrs(htmllexstate_t *ctx, void *tp, attr_item *items, size_t nel, char **atts,
539 char *s) {
540 char *name;
541 char *val;
542 attr_item *ip;
543
544 while ((name = *atts++) != NULL) {
545 val = *atts++;
546 ip = bsearch(name, items, nel, ISIZE, icmp);
547 if (ip)
548 ctx->warn |= ip->action(tp, val);
549 else {
550 agwarningf("Illegal attribute %s in %s - ignored\n", name,
551 s);
552 ctx->warn = 1;
553 }
554 }
555}
556
557static void mkBR(htmllexstate_t *ctx, char **atts)
558{
559 ctx->htmllval->i = UNSET_ALIGN;
560 doAttrs(ctx, &ctx->htmllval->i, br_items, sizeof(br_items) / ISIZE, atts, "<BR>");
561}
562
563static htmlimg_t *mkImg(htmllexstate_t *ctx, char **atts)
564{
565 htmlimg_t *img = gv_alloc(sizeof(htmlimg_t));
566
567 doAttrs(ctx, img, img_items, sizeof(img_items) / ISIZE, atts, "<IMG>");
568
569 return img;
570}
571
572static textfont_t *mkFont(htmllexstate_t *ctx, char **atts, unsigned char flags) {
573 textfont_t tf = {NULL,NULL,NULL,0.0,0,0};
574
575 tf.size = -1.0; /* unassigned */
576 enum { FLAGS_MAX = (1 << GV_TEXTFONT_FLAGS_WIDTH) - 1 };
577 assert(flags <= FLAGS_MAX);
578 tf.flags = (unsigned char)(flags & FLAGS_MAX);
579 if (atts)
580 doAttrs(ctx, &tf, font_items, sizeof(font_items) / ISIZE, atts, "<FONT>");
581
582 return dtinsert(ctx->gvc->textfont_dt, &tf);
583}
584
585static htmlcell_t *mkCell(htmllexstate_t *ctx, char **atts)
586{
588
589 cell->colspan = 1;
590 cell->rowspan = 1;
591 doAttrs(ctx, cell, cell_items, sizeof(cell_items) / ISIZE, atts, "<TD>");
592
593 return cell;
594}
595
596static htmltbl_t *mkTbl(htmllexstate_t *ctx, char **atts)
597{
598 htmltbl_t *tbl = gv_alloc(sizeof(htmltbl_t));
599
600 tbl->row_count = SIZE_MAX; // flag that table is a raw, parsed table
601 tbl->cellborder = -1; // unset cell border attribute
602 doAttrs(ctx, tbl, tbl_items, sizeof(tbl_items) / ISIZE, atts, "<TABLE>");
603
604 return tbl;
605}
606
607static void startElement(void *user, const char *name, char **atts)
608{
609 htmllexstate_t *ctx = user;
610
611 if (strcasecmp(name, "TABLE") == 0) {
612 ctx->htmllval->tbl = mkTbl(ctx, atts);
613 ctx->inCell = 0;
614 ctx->tok = T_table;
615 } else if (strcasecmp(name, "TR") == 0 || strcasecmp(name, "TH") == 0) {
616 ctx->inCell = 0;
617 ctx->tok = T_row;
618 } else if (strcasecmp(name, "TD") == 0) {
619 ctx->inCell = 1;
620 ctx->htmllval->cell = mkCell(ctx, atts);
621 ctx->tok = T_cell;
622 } else if (strcasecmp(name, "FONT") == 0) {
623 ctx->htmllval->font = mkFont(ctx, atts, 0);
624 ctx->tok = T_font;
625 } else if (strcasecmp(name, "B") == 0) {
626 ctx->htmllval->font = mkFont(ctx, 0, HTML_BF);
627 ctx->tok = T_bold;
628 } else if (strcasecmp(name, "S") == 0) {
629 ctx->htmllval->font = mkFont(ctx, 0, HTML_S);
630 ctx->tok = T_s;
631 } else if (strcasecmp(name, "U") == 0) {
632 ctx->htmllval->font = mkFont(ctx, 0, HTML_UL);
633 ctx->tok = T_underline;
634 } else if (strcasecmp(name, "O") == 0) {
635 ctx->htmllval->font = mkFont(ctx, 0, HTML_OL);
636 ctx->tok = T_overline;
637 } else if (strcasecmp(name, "I") == 0) {
638 ctx->htmllval->font = mkFont(ctx, 0, HTML_IF);
639 ctx->tok = T_italic;
640 } else if (strcasecmp(name, "SUP") == 0) {
641 ctx->htmllval->font = mkFont(ctx, 0, HTML_SUP);
642 ctx->tok = T_sup;
643 } else if (strcasecmp(name, "SUB") == 0) {
644 ctx->htmllval->font = mkFont(ctx, 0, HTML_SUB);
645 ctx->tok = T_sub;
646 } else if (strcasecmp(name, "BR") == 0) {
647 mkBR(ctx, atts);
648 ctx->tok = T_br;
649 } else if (strcasecmp(name, "HR") == 0) {
650 ctx->tok = T_hr;
651 } else if (strcasecmp(name, "VR") == 0) {
652 ctx->tok = T_vr;
653 } else if (strcasecmp(name, "IMG") == 0) {
654 ctx->htmllval->img = mkImg(ctx, atts);
655 ctx->tok = T_img;
656 } else if (strcasecmp(name, "HTML") == 0) {
657 ctx->tok = T_html;
658 } else {
659 lexerror(ctx, name);
660 }
661}
662
663static void endElement(void *user, const char *name)
664{
665 htmllexstate_t *ctx = user;
666
667 if (strcasecmp(name, "TABLE") == 0) {
668 ctx->tok = T_end_table;
669 ctx->inCell = 1;
670 } else if (strcasecmp(name, "TR") == 0 || strcasecmp(name, "TH") == 0) {
671 ctx->tok = T_end_row;
672 } else if (strcasecmp(name, "TD") == 0) {
673 ctx->tok = T_end_cell;
674 ctx->inCell = 0;
675 } else if (strcasecmp(name, "HTML") == 0) {
676 ctx->tok = T_end_html;
677 } else if (strcasecmp(name, "FONT") == 0) {
678 ctx->tok = T_end_font;
679 } else if (strcasecmp(name, "B") == 0) {
680 ctx->tok = T_n_bold;
681 } else if (strcasecmp(name, "U") == 0) {
682 ctx->tok = T_n_underline;
683 } else if (strcasecmp(name, "O") == 0) {
684 ctx->tok = T_n_overline;
685 } else if (strcasecmp(name, "I") == 0) {
686 ctx->tok = T_n_italic;
687 } else if (strcasecmp(name, "SUP") == 0) {
688 ctx->tok = T_n_sup;
689 } else if (strcasecmp(name, "SUB") == 0) {
690 ctx->tok = T_n_sub;
691 } else if (strcasecmp(name, "S") == 0) {
692 ctx->tok = T_n_s;
693 } else if (strcasecmp(name, "BR") == 0) {
694 if (ctx->tok == T_br)
695 ctx->tok = T_BR;
696 else
697 ctx->tok = T_end_br;
698 } else if (strcasecmp(name, "HR") == 0) {
699 if (ctx->tok == T_hr)
700 ctx->tok = T_HR;
701 else
702 ctx->tok = T_end_hr;
703 } else if (strcasecmp(name, "VR") == 0) {
704 if (ctx->tok == T_vr)
705 ctx->tok = T_VR;
706 else
707 ctx->tok = T_end_vr;
708 } else if (strcasecmp(name, "IMG") == 0) {
709 if (ctx->tok == T_img)
710 ctx->tok = T_IMG;
711 else
712 ctx->tok = T_end_img;
713 } else {
714 lexerror(ctx, name);
715 }
716}
717
718/* characterData:
719 * Generate T_string token. Do this only when immediately in
720 * <TD>..</TD> or <HTML>..</HTML>, i.e., when inCell is true.
721 * Strip out formatting characters but keep spaces.
722 * Distinguish between all whitespace vs. strings with non-whitespace
723 * characters.
724 */
725static void characterData(void *user, const char *s, int length)
726{
727 htmllexstate_t *ctx = user;
728
729 int i, cnt = 0;
730 unsigned char c;
731
732 if (ctx->inCell) {
733 for (i = length; i; i--) {
734 c = *s++;
735 if (c >= ' ') {
736 cnt++;
737 agxbputc(ctx->xb, (char)c);
738 }
739 }
740 if (cnt) ctx->tok = T_string;
741 }
742}
743#endif
744
745int initHTMLlexer(htmlscan_t *scanner, char *src, agxbuf * xb, htmlenv_t *env)
746{
747#ifdef HAVE_EXPAT
748 htmllexstate_t *ctx = &scanner->lexer;
749
750 ctx->xb = xb;
751 ctx->lb = (agxbuf){0};
752 ctx->ptr = src;
753 ctx->mode = 0;
754 ctx->warn = 0;
755 ctx->error = 0;
756 ctx->currtok = (strview_t){0};
757 ctx->prevtok = (strview_t){0};
758 ctx->inCell = 1;
759 ctx->parser = XML_ParserCreate(charsetToStr(GD_charset(env->g)));
760 ctx->gvc = GD_gvc(env->g);
761 XML_SetUserData(ctx->parser, ctx);
762 XML_SetElementHandler(ctx->parser,
763 (XML_StartElementHandler) startElement,
764 endElement);
765 XML_SetCharacterDataHandler(ctx->parser, characterData);
766 return 0;
767#else
768 static int first;
769 if (!first) {
771 "Not built with libexpat. Table formatting is not available.\n");
772 first++;
773 }
774 return 1;
775#endif
776}
777
779{
780#ifdef HAVE_EXPAT
781 htmllexstate_t *ctx = &scanner->lexer;
782 int rv = ctx->error ? 3 : ctx->warn;
783 XML_ParserFree(ctx->parser);
784 agxbfree (&ctx->lb);
785 return rv;
786#else
787 return 1;
788#endif
789}
790
792static UNUSED void agxbput_move(agxbuf *dst, const char *src) {
793 // we cannot call `agxbput` itself because it calls `memcpy`, thereby
794 // implicitly assuming that source and destination do not overlap
795 char *src_copy = gv_strdup(src);
796 agxbput(dst, src_copy);
797 free(src_copy);
798}
799
800#ifdef HAVE_EXPAT
801/* eatComment:
802 * Given first character after open comment, eat characters
803 * up to comment close, returning pointer to closing > if it exists,
804 * or null character otherwise.
805 * We rely on HTML strings having matched nested <>.
806 */
807static char *eatComment(htmllexstate_t *ctx, char *p)
808{
809 int depth = 1;
810 char *s = p;
811 char c;
812
813 while (depth && (c = *s++)) {
814 if (c == '<')
815 depth++;
816 else if (c == '>')
817 depth--;
818 }
819 s--; /* move back to '\0' or '>' */
820 if (*s) {
821 char *t = s - 2;
822 if (t < p || !startswith(t, "--")) {
823 agwarningf("Unclosed comment\n");
824 ctx->warn = 1;
825 }
826 }
827 return s;
828}
829
830/* findNext:
831 * Return next XML unit. This is either <..>, an HTML
832 * comment <!-- ... -->, or characters up to next <.
833 */
834static char *findNext(htmllexstate_t *ctx, char *s, agxbuf* xb)
835{
836 char* t = s + 1;
837 char c;
838
839 if (*s == '<') {
840 if (startswith(t, "!--"))
841 t = eatComment(ctx, t + 3);
842 else
843 while (*t && *t != '>')
844 t++;
845 if (*t != '>') {
846 agwarningf("Label closed before end of HTML element\n");
847 ctx->warn = 1;
848 } else
849 t++;
850 } else {
851 t = s;
852 while ((c = *t) && c != '<') {
853 if (c == '&' && *(t+1) != '#') {
854 t = scanEntity(t + 1, xb);
855 }
856 else {
857 agxbputc(xb, c);
858 t++;
859 }
860 }
861 }
862 return t;
863}
864
881static void protect_rsqb(agxbuf *xb) {
882
883 // if the buffer is empty, we have nothing to do
884 if (agxblen(xb) == 0) {
885 return;
886 }
887
888 // check the last character and if it is not ], we have nothing to do
889 char *data = agxbuse(xb);
890 size_t size = strlen(data);
891 assert(size > 0);
892 if (data[size - 1] != ']') {
893 agxbput_move(xb, data);
894 return;
895 }
896
897 // truncate ] and write back the remaining prefix
898 data[size - 1] = '\0';
899 agxbput_move(xb, data);
900
901 // write an XML-escaped version of ] as a replacement
902 agxbput(xb, "&#93;");
903}
904#endif
905
906
908 return htmllineno_ctx(&scanner->lexer);
909}
910
911static unsigned long htmllineno_ctx(htmllexstate_t *ctx) {
912#ifdef HAVE_EXPAT
913 return XML_GetCurrentLineNumber(ctx->parser);
914#else
915 return 0;
916#endif
917}
918
919#ifdef DEBUG
920static void printTok(htmllexstate_t *ctx, int tok)
921{
922 char *s;
923
924 switch (tok) {
925 case T_end_br:
926 s = "T_end_br";
927 break;
928 case T_end_img:
929 s = "T_end_img";
930 break;
931 case T_row:
932 s = "T_row";
933 break;
934 case T_end_row:
935 s = "T_end_row";
936 break;
937 case T_html:
938 s = "T_html";
939 break;
940 case T_end_html:
941 s = "T_end_html";
942 break;
943 case T_end_table:
944 s = "T_end_table";
945 break;
946 case T_end_cell:
947 s = "T_end_cell";
948 break;
949 case T_end_font:
950 s = "T_end_font";
951 break;
952 case T_string:
953 s = "T_string";
954 break;
955 case T_error:
956 s = "T_error";
957 break;
958 case T_n_italic:
959 s = "T_n_italic";
960 break;
961 case T_n_bold:
962 s = "T_n_bold";
963 break;
964 case T_n_underline:
965 s = "T_n_underline";
966 break;
967 case T_n_overline:
968 s = "T_n_overline";
969 break;
970 case T_n_sup:
971 s = "T_n_sup";
972 break;
973 case T_n_sub:
974 s = "T_n_sub";
975 break;
976 case T_n_s:
977 s = "T_n_s";
978 break;
979 case T_HR:
980 s = "T_HR";
981 break;
982 case T_hr:
983 s = "T_hr";
984 break;
985 case T_end_hr:
986 s = "T_end_hr";
987 break;
988 case T_VR:
989 s = "T_VR";
990 break;
991 case T_vr:
992 s = "T_vr";
993 break;
994 case T_end_vr:
995 s = "T_end_vr";
996 break;
997 case T_BR:
998 s = "T_BR";
999 break;
1000 case T_br:
1001 s = "T_br";
1002 break;
1003 case T_IMG:
1004 s = "T_IMG";
1005 break;
1006 case T_img:
1007 s = "T_img";
1008 break;
1009 case T_table:
1010 s = "T_table";
1011 break;
1012 case T_cell:
1013 s = "T_cell";
1014 break;
1015 case T_font:
1016 s = "T_font";
1017 break;
1018 case T_italic:
1019 s = "T_italic";
1020 break;
1021 case T_bold:
1022 s = "T_bold";
1023 break;
1024 case T_underline:
1025 s = "T_underline";
1026 break;
1027 case T_overline:
1028 s = "T_overline";
1029 break;
1030 case T_sup:
1031 s = "T_sup";
1032 break;
1033 case T_sub:
1034 s = "T_sub";
1035 break;
1036 case T_s:
1037 s = "T_s";
1038 break;
1039 default:
1040 s = "<unknown>";
1041 }
1042 if (tok == T_string) {
1043 const char *token_text = agxbuse(ctx->xb);
1044 fprintf(stderr, "%s \"%s\"\n", s, token_text);
1045 agxbput_move(ctx->xb, token_text);
1046 } else
1047 fprintf(stderr, "%s\n", s);
1048}
1049
1050#endif
1051
1052int htmllex(union HTMLSTYPE *htmllval, htmlscan_t *scanner)
1053{
1054#ifdef HAVE_EXPAT
1055 static char *begin_html = "<HTML>";
1056 static char *end_html = "</HTML>";
1057
1058 char *s;
1059 char *endp = 0;
1060 size_t len, llen;
1061 int rv;
1062 htmllexstate_t *ctx = &scanner->lexer;
1063
1064 ctx->htmllval = htmllval;
1065 ctx->tok = 0;
1066 do {
1067 if (ctx->mode == 2)
1068 return EOF;
1069 if (ctx->mode == 0) {
1070 ctx->mode = 1;
1071 s = begin_html;
1072 len = strlen(s);
1073 endp = 0;
1074 } else {
1075 s = ctx->ptr;
1076 if (*s == '\0') {
1077 ctx->mode = 2;
1078 s = end_html;
1079 len = strlen(s);
1080 } else {
1081 endp = findNext(ctx, s,&ctx->lb);
1082 len = (size_t)(endp - s);
1083 }
1084 }
1085
1086 protect_rsqb(&ctx->lb);
1087
1088 ctx->prevtok = ctx->currtok;
1089 ctx->currtok = (strview_t){.data = s, .size = len};
1090 if ((llen = agxblen(&ctx->lb))) {
1091 assert(llen <= (size_t)INT_MAX && "XML token too long for expat API");
1092 rv = XML_Parse(ctx->parser, agxbuse(&ctx->lb), (int)llen, 0);
1093 } else {
1094 assert(len <= (size_t)INT_MAX && "XML token too long for expat API");
1095 rv = XML_Parse(ctx->parser, s, (int)len, len ? 0 : 1);
1096 }
1097 if (rv == XML_STATUS_ERROR) {
1098 if (!ctx->error) {
1099 agerrorf("%s in line %lu \n",
1100 XML_ErrorString(XML_GetErrorCode(ctx->parser)), htmllineno(scanner));
1101 error_context(ctx);
1102 ctx->error = 1;
1103 ctx->tok = T_error;
1104 }
1105 }
1106 if (endp)
1107 ctx->ptr = endp;
1108 } while (ctx->tok == 0);
1109#ifdef DEBUG
1110 printTok (ctx, ctx->tok);
1111#endif
1112 return ctx->tok;
1113#else
1114 return EOF;
1115#endif
1116}
1117
static void agxbfree(agxbuf *xb)
free any malloced resources
Definition agxbuf.h:78
static WUR char * agxbuse(agxbuf *xb)
Definition agxbuf.h:307
static size_t agxblen(const agxbuf *xb)
return number of characters currently stored
Definition agxbuf.h:89
static int agxbputc(agxbuf *xb, char c)
add character to buffer
Definition agxbuf.h:277
Memory allocation wrappers that exit on failure.
static char * gv_strdup(const char *original)
Definition alloc.h:101
static void * gv_alloc(size_t size)
Definition alloc.h:47
container data types API
#define dtinsert(d, o)
Definition cdt.h:185
char * scanEntity(char *t, agxbuf *xb)
Definition utils.c:1087
static int flags
Definition gc.c:61
static double len(glCompPoint p)
Definition glutils.c:150
void free(void *)
#define SIZE_MAX
Definition gmlscan.c:347
#define UINT16_MAX
Definition gmlscan.c:340
#define INT8_MAX
Definition gmlscan.c:328
node NULL
Definition grammar.y:163
static int cnt(Dict_t *d, Dtlink_t **set)
Definition graph.c:210
void agwarningf(const char *fmt,...)
Definition agerror.c:173
void agerrorf(const char *fmt,...)
Definition agerror.c:165
int agerr(agerrlevel_t level, const char *fmt,...)
Definition agerror.c:155
@ AGPREV
Definition cgraph.h:849
#define GD_charset(g)
Definition types.h:367
#define GD_gvc(g)
Definition types.h:355
replacements for ctype.h functions
static char gv_tolower(int c)
Definition gv_ctype.h:81
agxbput(xb, staging)
int htmllex(union HTMLSTYPE *htmllval, htmlscan_t *scanner)
Definition htmllex.c:1052
static UNUSED void agxbput_move(agxbuf *dst, const char *src)
agxbput, but assume that source and destination may overlap
Definition htmllex.c:792
unsigned long htmllineno(htmlscan_t *scanner)
Definition htmllex.c:907
#define XML_STATUS_ERROR
Definition htmllex.c:41
static void error_context(htmllexstate_t *ctx)
Definition htmllex.c:49
int clearHTMLlexer(htmlscan_t *scanner)
Definition htmllex.c:778
int initHTMLlexer(htmlscan_t *scanner, char *src, agxbuf *xb, htmlenv_t *env)
Definition htmllex.c:745
void htmlerror(htmlscan_t *scanner, const char *msg)
Definition htmllex.c:58
static unsigned long htmllineno_ctx(htmllexstate_t *ctx)
Definition htmllex.c:911
#define T_n_sup
Definition htmlparse.c:289
#define T_end_row
Definition htmlparse.c:277
#define T_end_table
Definition htmlparse.c:280
#define T_br
Definition htmlparse.c:299
#define T_vr
Definition htmlparse.c:296
#define T_error
Definition htmlparse.c:284
#define T_n_s
Definition htmlparse.c:291
#define T_end_cell
Definition htmlparse.c:281
#define T_n_sub
Definition htmlparse.c:290
#define T_n_bold
Definition htmlparse.c:286
#define T_html
Definition htmlparse.c:278
#define T_BR
Definition htmlparse.c:298
#define T_underline
Definition htmlparse.c:307
#define T_sup
Definition htmlparse.c:309
#define T_row
Definition htmlparse.c:276
#define T_table
Definition htmlparse.c:302
#define T_end_vr
Definition htmlparse.c:297
#define T_end_html
Definition htmlparse.c:279
#define T_IMG
Definition htmlparse.c:300
#define T_VR
Definition htmlparse.c:295
#define T_bold
Definition htmlparse.c:306
#define T_end_img
Definition htmlparse.c:275
#define T_sub
Definition htmlparse.c:310
#define T_s
Definition htmlparse.c:311
#define T_n_italic
Definition htmlparse.c:285
#define T_end_font
Definition htmlparse.c:282
#define T_overline
Definition htmlparse.c:308
#define T_hr
Definition htmlparse.c:293
#define T_font
Definition htmlparse.c:304
#define T_italic
Definition htmlparse.c:305
#define T_end_br
Definition htmlparse.c:274
#define T_n_underline
Definition htmlparse.c:287
#define T_cell
Definition htmlparse.c:303
#define T_end_hr
Definition htmlparse.c:294
#define T_string
Definition htmlparse.c:283
#define T_img
Definition htmlparse.c:301
#define T_HR
Definition htmlparse.c:292
#define T_n_overline
Definition htmlparse.c:288
cleanup & scanner
Definition htmlparse.y:295
#define PAD_SET
Definition htmltable.h:33
#define BORDER_RIGHT
Definition htmltable.h:40
#define BORDER_TOP
Definition htmltable.h:39
#define HALIGN_TEXT
Definition htmltable.h:28
#define UNSET_ALIGN
Definition htmltable.h:44
#define HALIGN_LEFT
Definition htmltable.h:26
#define VALIGN_BOTTOM
Definition htmltable.h:30
#define BALIGN_RIGHT
Definition htmltable.h:35
#define BALIGN_LEFT
Definition htmltable.h:36
#define BORDER_BOTTOM
Definition htmltable.h:41
#define SPACE_SET
Definition htmltable.h:34
#define BORDER_SET
Definition htmltable.h:32
#define BORDER_LEFT
Definition htmltable.h:38
#define BORDER_MASK
Definition htmltable.h:42
#define HALIGN_RIGHT
Definition htmltable.h:25
#define VALIGN_TOP
Definition htmltable.h:29
#define FIXED_FLAG
Definition htmltable.h:24
char * charsetToStr(int c)
Given an internal charset value, return a canonical string representation.
Definition input.c:806
static bool startswith(const char *s, const char *prefix)
does the string s begin with the string prefix?
Definition startswith.h:11
platform abstraction for case-insensitive string functions
Dt_t * textfont_dt
Definition gvcint.h:107
result of partitioning available space, part of maze
Definition grid.h:33
Definition legal.c:50
uint16_t rowspan
Definition htmltable.h:160
uint16_t colspan
Definition htmltable.h:159
char * bgcolor
Definition htmltable.h:87
unsigned char border
Definition htmltable.h:91
char * target
Definition htmltable.h:84
char * id
Definition htmltable.h:86
signed char space
Definition htmltable.h:90
unsigned short width
Definition htmltable.h:95
unsigned short height
Definition htmltable.h:96
int gradientangle
Definition htmltable.h:89
char * port
Definition htmltable.h:83
unsigned short flags
Definition htmltable.h:94
char * href
Definition htmltable.h:82
unsigned char pad
Definition htmltable.h:92
char * pencolor
Definition htmltable.h:88
htmlstyle_t style
Definition htmltable.h:97
char * title
Definition htmltable.h:85
graph_t * g
Definition htmltable.h:173
char * scale
Definition htmltable.h:70
char * src
Definition htmltable.h:69
agxbuf * xb
Definition htmlparse.c:375
HTMLSTYPE * htmllval
Definition htmlparse.c:384
strview_t prevtok
Definition htmlparse.c:382
strview_t currtok
Definition htmlparse.c:381
bool dashed
Definition htmltable.h:78
bool dotted
Definition htmltable.h:77
bool rounded
Definition htmltable.h:75
bool radial
Definition htmltable.h:74
bool invisible
Definition htmltable.h:76
bool vrule
vertical rule
Definition htmltable.h:145
size_t row_count
number of rows
Definition htmltable.h:141
bool hrule
horizontal rule
Definition htmltable.h:144
int8_t cellborder
Definition htmltable.h:138
Definition utils.c:747
a non-owning string reference
Definition strview.h:20
const char * data
start of the pointed to string
Definition strview.h:21
size_t size
extent of the string in bytes
Definition strview.h:22
char * color
Definition textspan.h:55
char * name
Definition textspan.h:54
unsigned int flags
Definition textspan.h:58
double size
Definition textspan.h:57
state for an in-progress string tokenization
Definition tokenize.h:36
Non-owning string references.
static bool strview_case_str_eq(strview_t a, const char *b)
compare a string reference to a string for case insensitive equality
Definition strview.h:62
#define HTML_OL
Definition textspan.h:35
#define HTML_IF
Definition textspan.h:30
#define HTML_UL
Definition textspan.h:31
#define HTML_BF
Definition textspan.h:29
#define HTML_SUP
Definition textspan.h:32
#define HTML_S
Definition textspan.h:34
#define GV_TEXTFONT_FLAGS_WIDTH
Definition textspan.h:24
#define HTML_SUB
Definition textspan.h:33
String tokenization.
static strview_t tok_get(const tok_t *t)
get the current token
Definition tokenize.h:76
static tok_t tok(const char *input, const char *separators)
begin tokenization of a new string
Definition tokenize.h:43
static bool tok_end(const tok_t *t)
is this tokenizer exhausted?
Definition tokenize.h:68
static void tok_next(tok_t *t)
advance to the next token in the string being scanned
Definition tokenize.h:85
htmltbl_t * tbl
Definition htmlparse.c:322
htmlcell_t * cell
Definition htmlparse.c:321
textfont_t * font
Definition htmlparse.c:323
htmlimg_t * img
Definition htmlparse.c:324
Definition grammar.c:93
abstraction for squashing compiler warnings for unused symbols
#define UNUSED
Definition unused.h:25