Graphviz 13.1.3~dev.20250831.0023
Loading...
Searching...
No Matches
htmllex.c
Go to the documentation of this file.
1
3/*************************************************************************
4 * Copyright (c) 2011 AT&T Intellectual Property
5 * All rights reserved. This program and the accompanying materials
6 * are made available under the terms of the Eclipse Public License v1.0
7 * which accompanies this distribution, and is available at
8 * https://www.eclipse.org/legal/epl-v10.html
9 *
10 * Contributors: Details at https://graphviz.org
11 *************************************************************************/
12
13#include <assert.h>
14#include <common/render.h>
15#include <common/htmltable.h>
16#include "htmlparse.h"
17#include <common/htmllex.h>
18#include <cdt/cdt.h>
19#include <limits.h>
20#include <stdbool.h>
21#include <stddef.h>
22#include <stdint.h>
23#include <util/alloc.h>
24#include <util/gv_ctype.h>
25#include <util/startswith.h>
26#include <util/strcasecmp.h>
27#include <util/strview.h>
28#include <util/tokenize.h>
29#include <util/unused.h>
30
31#ifdef HAVE_EXPAT
32#ifdef _WIN32
33// ensure that the expat functions get the correct storage class
34// declarations also on MinGW
35#define XML_USE_MSC_EXTENSIONS 1
36#endif
37#include <expat.h>
38#endif
39
40#ifndef XML_STATUS_ERROR
41#define XML_STATUS_ERROR 0
42#endif
43
44static unsigned long htmllineno_ctx(htmllexstate_t *ctx);
45
46/* error_context:
47 * Print the last 2 "token"s seen.
48 */
50{
51 agerr(AGPREV, "... %.*s%.*s ...\n", (int)ctx->prevtok.size,
52 ctx->prevtok.data, (int)ctx->currtok.size, ctx->currtok.data);
53}
54
55/* htmlerror:
56 * yyerror - called by yacc output
57 */
58void htmlerror(htmlscan_t *scanner, const char *msg)
59{
60 htmllexstate_t *ctx = &scanner->lexer;
61 if (ctx->error)
62 return;
63 ctx->error = 1;
64 agerrorf("%s in line %lu \n", msg, htmllineno(scanner));
65 error_context(&scanner->lexer);
66}
67
68#ifdef HAVE_EXPAT
69/* lexerror:
70 * called by lexer when unknown <..> is found.
71 */
72static void lexerror(htmllexstate_t *ctx, const char *name)
73{
74 ctx->tok = T_error;
75 ctx->error = 1;
76 agerrorf("Unknown HTML element <%s> on line %lu \n", name, htmllineno_ctx(ctx));
77}
78
/* Attribute handler: receives the object being filled in and the
 * attribute's value; the handlers in this file return 0 on success and
 * 1 when the value is rejected.
 */
typedef int (*attrFn) (void *, char *);
/* Comparison callback type with the signature bsearch expects. */
typedef int (*bcmpfn) (const void *, const void *);

/* Mechanism for automatically processing attributes */
typedef struct {
    char *name;			/* attribute name */
    attrFn action;		/* action to perform if name matches */
} attr_item;

/* size of one table entry, as passed to bsearch */
#define ISIZE (sizeof(attr_item))
89
90/* icmp:
91 * Compare an attr_item. Used in bsearch
92 */
93static int icmp(const void *name, const void *item) {
94 const attr_item *j = item;
95 return strcasecmp(name, j->name);
96}
97
98static int bgcolorfn(htmldata_t * p, char *v)
99{
100 p->bgcolor = strdup(v);
101 return 0;
102}
103
104static int pencolorfn(htmldata_t * p, char *v)
105{
106 p->pencolor = strdup(v);
107 return 0;
108}
109
110static int hreffn(htmldata_t * p, char *v)
111{
112 p->href = strdup(v);
113 return 0;
114}
115
116static int sidesfn(htmldata_t * p, char *v)
117{
118 unsigned short flags = 0;
119 char c;
120
121 while ((c = *v++)) {
122 switch (gv_tolower(c)) {
123 case 'l' :
125 break;
126 case 't' :
127 flags |= BORDER_TOP;
128 break;
129 case 'r' :
131 break;
132 case 'b' :
134 break;
135 default :
136 agwarningf("Unrecognized character '%c' (%d) in sides attribute\n", c, c);
137 break;
138 }
139 }
140 if (flags != BORDER_MASK)
141 p->flags |= flags;
142 return 0;
143}
144
145static int titlefn(htmldata_t * p, char *v)
146{
147 p->title = strdup(v);
148 return 0;
149}
150
151static int portfn(htmldata_t * p, char *v)
152{
153 p->port = strdup(v);
154 return 0;
155}
156
157#define DELIM " ,"
158
159static int stylefn(htmldata_t * p, char *v)
160{
161 int rv = 0;
162 for (tok_t t = tok(v, DELIM); !tok_end(&t); tok_next(&t)) {
163 strview_t tk = tok_get(&t);
164 if (strview_case_str_eq(tk, "ROUNDED")) p->style.rounded = true;
165 else if (strview_case_str_eq(tk, "RADIAL")) p->style.radial = true;
166 else if (strview_case_str_eq(tk,"SOLID")) {
167 p->style.dotted = false;
168 p->style.dashed = false;
169 } else if (strview_case_str_eq(tk,"INVISIBLE") ||
170 strview_case_str_eq(tk,"INVIS")) p->style.invisible = true;
171 else if (strview_case_str_eq(tk,"DOTTED")) p->style.dotted = true;
172 else if (strview_case_str_eq(tk,"DASHED")) p->style.dashed = true;
173 else {
174 agwarningf("Illegal value %.*s for STYLE - ignored\n", (int)tk.size,
175 tk.data);
176 rv = 1;
177 }
178 }
179 return rv;
180}
181
182static int targetfn(htmldata_t * p, char *v)
183{
184 p->target = strdup(v);
185 return 0;
186}
187
188static int idfn(htmldata_t * p, char *v)
189{
190 p->id = strdup(v);
191 return 0;
192}
193
194
/* doInt:
 * Scan v for a base-10 integral value. Check that
 * the value is >= min and <= max. Return value in ul.
 * String s is name of value, used only in warnings.
 * Only the leading number is examined: characters following it are
 * not rejected. *ul is written only on success.
 * Return 0 if okay; 1 otherwise (a warning has been issued).
 */
static int doInt(const char *v, const char *s, int min, int max, long *ul)
{
    char *ep;
    const long b = strtol(v, &ep, 10);

    if (ep == v) {		/* no digits were consumed */
        agwarningf("Improper %s value %s - ignored\n", s, v);
        return 1;
    }
    if (b > max) {
        agwarningf("%s value %s > %d - too large - ignored\n", s, v, max);
        return 1;
    }
    if (b < min) {
        agwarningf("%s value %s < %d - too small - ignored\n", s, v, min);
        return 1;
    }
    *ul = b;
    return 0;
}
220
221
222static int gradientanglefn(htmldata_t * p, char *v)
223{
224 long u;
225
226 if (doInt(v, "GRADIENTANGLE", 0, 360, &u))
227 return 1;
228 p->gradientangle = (unsigned short) u;
229 return 0;
230}
231
232
233static int borderfn(htmldata_t * p, char *v)
234{
235 long u;
236
237 if (doInt(v, "BORDER", 0, UCHAR_MAX, &u))
238 return 1;
239 p->border = (unsigned char) u;
240 p->flags |= BORDER_SET;
241 return 0;
242}
243
244static int cellpaddingfn(htmldata_t * p, char *v)
245{
246 long u;
247
248 if (doInt(v, "CELLPADDING", 0, UCHAR_MAX, &u))
249 return 1;
250 p->pad = (unsigned char) u;
251 p->flags |= PAD_SET;
252 return 0;
253}
254
255static int cellspacingfn(htmldata_t * p, char *v)
256{
257 long u;
258
259 if (doInt(v, "CELLSPACING", SCHAR_MIN, SCHAR_MAX, &u))
260 return 1;
261 p->space = (signed char) u;
262 p->flags |= SPACE_SET;
263 return 0;
264}
265
266static int cellborderfn(htmltbl_t * p, char *v)
267{
268 long u;
269
270 if (doInt(v, "CELLBORDER", 0, INT8_MAX, &u))
271 return 1;
272 p->cellborder = (int8_t)u;
273 return 0;
274}
275
276static int columnsfn(htmltbl_t * p, char *v)
277{
278 if (*v != '*') {
279 agwarningf("Unknown value %s for COLUMNS - ignored\n", v);
280 return 1;
281 }
282 p->vrule = true;
283 return 0;
284}
285
286static int rowsfn(htmltbl_t * p, char *v)
287{
288 if (*v != '*') {
289 agwarningf("Unknown value %s for ROWS - ignored\n", v);
290 return 1;
291 }
292 p->hrule = true;
293 return 0;
294}
295
296static int fixedsizefn(htmldata_t * p, char *v)
297{
298 int rv = 0;
299 if (!strcasecmp(v, "TRUE"))
300 p->flags |= FIXED_FLAG;
301 else if (strcasecmp(v, "FALSE")) {
302 agwarningf("Illegal value %s for FIXEDSIZE - ignored\n", v);
303 rv = 1;
304 }
305 return rv;
306}
307
308static int valignfn(htmldata_t * p, char *v)
309{
310 int rv = 0;
311 if (!strcasecmp(v, "BOTTOM"))
312 p->flags |= VALIGN_BOTTOM;
313 else if (!strcasecmp(v, "TOP"))
314 p->flags |= VALIGN_TOP;
315 else if (strcasecmp(v, "MIDDLE")) {
316 agwarningf("Illegal value %s for VALIGN - ignored\n", v);
317 rv = 1;
318 }
319 return rv;
320}
321
322static int halignfn(htmldata_t * p, char *v)
323{
324 int rv = 0;
325 if (!strcasecmp(v, "LEFT"))
326 p->flags |= HALIGN_LEFT;
327 else if (!strcasecmp(v, "RIGHT"))
328 p->flags |= HALIGN_RIGHT;
329 else if (strcasecmp(v, "CENTER")) {
330 agwarningf("Illegal value %s for ALIGN - ignored\n", v);
331 rv = 1;
332 }
333 return rv;
334}
335
336static int cell_halignfn(htmldata_t * p, char *v)
337{
338 int rv = 0;
339 if (!strcasecmp(v, "LEFT"))
340 p->flags |= HALIGN_LEFT;
341 else if (!strcasecmp(v, "RIGHT"))
342 p->flags |= HALIGN_RIGHT;
343 else if (!strcasecmp(v, "TEXT"))
344 p->flags |= HALIGN_TEXT;
345 else if (strcasecmp(v, "CENTER"))
346 rv = 1;
347 if (rv)
348 agwarningf("Illegal value %s for ALIGN in TD - ignored\n", v);
349 return rv;
350}
351
352static int balignfn(htmldata_t * p, char *v)
353{
354 int rv = 0;
355 if (!strcasecmp(v, "LEFT"))
356 p->flags |= BALIGN_LEFT;
357 else if (!strcasecmp(v, "RIGHT"))
358 p->flags |= BALIGN_RIGHT;
359 else if (strcasecmp(v, "CENTER"))
360 rv = 1;
361 if (rv)
362 agwarningf("Illegal value %s for BALIGN in TD - ignored\n", v);
363 return rv;
364}
365
366static int heightfn(htmldata_t * p, char *v)
367{
368 long u;
369
370 if (doInt(v, "HEIGHT", 0, USHRT_MAX, &u))
371 return 1;
372 p->height = (unsigned short) u;
373 return 0;
374}
375
376static int widthfn(htmldata_t * p, char *v)
377{
378 long u;
379
380 if (doInt(v, "WIDTH", 0, USHRT_MAX, &u))
381 return 1;
382 p->width = (unsigned short) u;
383 return 0;
384}
385
386static int rowspanfn(htmlcell_t * p, char *v)
387{
388 long u;
389
390 if (doInt(v, "ROWSPAN", 0, UINT16_MAX, &u))
391 return 1;
392 if (u == 0) {
393 agwarningf("ROWSPAN value cannot be 0 - ignored\n");
394 return 1;
395 }
396 p->rowspan = (uint16_t)u;
397 return 0;
398}
399
400static int colspanfn(htmlcell_t * p, char *v)
401{
402 long u;
403
404 if (doInt(v, "COLSPAN", 0, UINT16_MAX, &u))
405 return 1;
406 if (u == 0) {
407 agwarningf("COLSPAN value cannot be 0 - ignored\n");
408 return 1;
409 }
410 p->colspan = (uint16_t)u;
411 return 0;
412}
413
414static int fontcolorfn(textfont_t * p, char *v)
415{
416 p->color = v;
417 return 0;
418}
419
420static int facefn(textfont_t * p, char *v)
421{
422 p->name = v;
423 return 0;
424}
425
426static int ptsizefn(textfont_t * p, char *v)
427{
428 long u;
429
430 if (doInt(v, "POINT-SIZE", 0, UCHAR_MAX, &u))
431 return 1;
432 p->size = (double) u;
433 return 0;
434}
435
436static int srcfn(htmlimg_t * p, char *v)
437{
438 p->src = strdup(v);
439 return 0;
440}
441
442static int scalefn(htmlimg_t * p, char *v)
443{
444 p->scale = strdup(v);
445 return 0;
446}
447
/* ALIGN attribute of <BR>: map RIGHT/LEFT/CENTER (case-insensitive) to
 * the characters 'r'/'l'/'n' stored through p. Any other value is
 * warned about and *p is left unchanged.
 * Returns 0 on success, 1 otherwise.
 */
static int alignfn(int *p, char *v)
{
    int code;

    if (strcasecmp(v, "RIGHT") == 0) {
        code = 'r';
    } else if (strcasecmp(v, "LEFT") == 0) {
        code = 'l';
    } else if (strcasecmp(v, "CENTER") == 0) {
        code = 'n';
    } else {
        agwarningf("Illegal value %s for ALIGN - ignored\n", v);
        return 1;
    }
    *p = code;
    return 0;
}
463
464/* Tables used in binary search; MUST be alphabetized */
465static attr_item tbl_items[] = {
466 {"align", (attrFn) halignfn},
467 {"bgcolor", (attrFn) bgcolorfn},
468 {"border", (attrFn) borderfn},
469 {"cellborder", (attrFn) cellborderfn},
470 {"cellpadding", (attrFn) cellpaddingfn},
471 {"cellspacing", (attrFn) cellspacingfn},
472 {"color", (attrFn) pencolorfn},
473 {"columns", (attrFn) columnsfn},
474 {"fixedsize", (attrFn) fixedsizefn},
475 {"gradientangle", (attrFn) gradientanglefn},
476 {"height", (attrFn) heightfn},
477 {"href", (attrFn) hreffn},
478 {"id", (attrFn) idfn},
479 {"port", (attrFn) portfn},
480 {"rows", (attrFn) rowsfn},
481 {"sides", (attrFn) sidesfn},
482 {"style", (attrFn) stylefn},
483 {"target", (attrFn) targetfn},
484 {"title", (attrFn) titlefn},
485 {"tooltip", (attrFn) titlefn},
486 {"valign", (attrFn) valignfn},
487 {"width", (attrFn) widthfn},
488};
489
490static attr_item cell_items[] = {
491 {"align", (attrFn) cell_halignfn},
492 {"balign", (attrFn) balignfn},
493 {"bgcolor", (attrFn) bgcolorfn},
494 {"border", (attrFn) borderfn},
495 {"cellpadding", (attrFn) cellpaddingfn},
496 {"cellspacing", (attrFn) cellspacingfn},
497 {"color", (attrFn) pencolorfn},
498 {"colspan", (attrFn) colspanfn},
499 {"fixedsize", (attrFn) fixedsizefn},
500 {"gradientangle", (attrFn) gradientanglefn},
501 {"height", (attrFn) heightfn},
502 {"href", (attrFn) hreffn},
503 {"id", (attrFn) idfn},
504 {"port", (attrFn) portfn},
505 {"rowspan", (attrFn) rowspanfn},
506 {"sides", (attrFn) sidesfn},
507 {"style", (attrFn) stylefn},
508 {"target", (attrFn) targetfn},
509 {"title", (attrFn) titlefn},
510 {"tooltip", (attrFn) titlefn},
511 {"valign", (attrFn) valignfn},
512 {"width", (attrFn) widthfn},
513};
514
515static attr_item font_items[] = {
516 {"color", (attrFn) fontcolorfn},
517 {"face", (attrFn) facefn},
518 {"point-size", (attrFn) ptsizefn},
519};
520
521static attr_item img_items[] = {
522 {"scale", (attrFn) scalefn},
523 {"src", (attrFn) srcfn},
524};
525
526static attr_item br_items[] = {
527 {"align", (attrFn) alignfn},
528};
529
530/* doAttrs:
531 * General function for processing list of name/value attributes.
532 * Do binary search on items table. If match found, invoke action
533 * passing it tp and attribute value.
534 * Table size is given by nel
535 * Name/value pairs are in array atts, which is null terminated.
536 * s is the name of the HTML element being processed.
537 */
538static void doAttrs(htmllexstate_t *ctx, void *tp, attr_item *items, size_t nel, char **atts,
539 char *s) {
540 char *name;
541 char *val;
542 attr_item *ip;
543
544 while ((name = *atts++) != NULL) {
545 val = *atts++;
546 ip = bsearch(name, items, nel, ISIZE, icmp);
547 if (ip)
548 ctx->warn |= ip->action(tp, val);
549 else {
550 agwarningf("Illegal attribute %s in %s - ignored\n", name,
551 s);
552 ctx->warn = 1;
553 }
554 }
555}
556
557static void mkBR(htmllexstate_t *ctx, char **atts)
558{
559 ctx->htmllval->i = UNSET_ALIGN;
560 doAttrs(ctx, &ctx->htmllval->i, br_items, sizeof(br_items) / ISIZE, atts, "<BR>");
561}
562
563static htmlimg_t *mkImg(htmllexstate_t *ctx, char **atts)
564{
565 htmlimg_t *img = gv_alloc(sizeof(htmlimg_t));
566
567 doAttrs(ctx, img, img_items, sizeof(img_items) / ISIZE, atts, "<IMG>");
568
569 return img;
570}
571
572static textfont_t *mkFont(htmllexstate_t *ctx, char **atts, unsigned char flags) {
573 textfont_t tf = {NULL,NULL,NULL,0.0,0,0};
574
575 tf.size = -1.0; /* unassigned */
576 enum { FLAGS_MAX = (1 << GV_TEXTFONT_FLAGS_WIDTH) - 1 };
577 assert(flags <= FLAGS_MAX);
578 tf.flags = (unsigned char)(flags & FLAGS_MAX);
579 if (atts)
580 doAttrs(ctx, &tf, font_items, sizeof(font_items) / ISIZE, atts, "<FONT>");
581
582 return dtinsert(ctx->gvc->textfont_dt, &tf);
583}
584
585static htmlcell_t *mkCell(htmllexstate_t *ctx, char **atts)
586{
588
589 cell->colspan = 1;
590 cell->rowspan = 1;
591 doAttrs(ctx, cell, cell_items, sizeof(cell_items) / ISIZE, atts, "<TD>");
592
593 return cell;
594}
595
596static htmltbl_t *mkTbl(htmllexstate_t *ctx, char **atts)
597{
598 htmltbl_t *tbl = gv_alloc(sizeof(htmltbl_t));
599
600 tbl->row_count = SIZE_MAX; // flag that table is a raw, parsed table
601 tbl->u.p.rows = (rows_t){.dtor = free_ritem};
602 tbl->cellborder = -1; // unset cell border attribute
603 doAttrs(ctx, tbl, tbl_items, sizeof(tbl_items) / ISIZE, atts, "<TABLE>");
604
605 return tbl;
606}
607
608static void startElement(void *user, const char *name, char **atts)
609{
610 htmllexstate_t *ctx = user;
611
612 if (strcasecmp(name, "TABLE") == 0) {
613 ctx->htmllval->tbl = mkTbl(ctx, atts);
614 ctx->inCell = 0;
615 ctx->tok = T_table;
616 } else if (strcasecmp(name, "TR") == 0 || strcasecmp(name, "TH") == 0) {
617 ctx->inCell = 0;
618 ctx->tok = T_row;
619 } else if (strcasecmp(name, "TD") == 0) {
620 ctx->inCell = 1;
621 ctx->htmllval->cell = mkCell(ctx, atts);
622 ctx->tok = T_cell;
623 } else if (strcasecmp(name, "FONT") == 0) {
624 ctx->htmllval->font = mkFont(ctx, atts, 0);
625 ctx->tok = T_font;
626 } else if (strcasecmp(name, "B") == 0) {
627 ctx->htmllval->font = mkFont(ctx, 0, HTML_BF);
628 ctx->tok = T_bold;
629 } else if (strcasecmp(name, "S") == 0) {
630 ctx->htmllval->font = mkFont(ctx, 0, HTML_S);
631 ctx->tok = T_s;
632 } else if (strcasecmp(name, "U") == 0) {
633 ctx->htmllval->font = mkFont(ctx, 0, HTML_UL);
634 ctx->tok = T_underline;
635 } else if (strcasecmp(name, "O") == 0) {
636 ctx->htmllval->font = mkFont(ctx, 0, HTML_OL);
637 ctx->tok = T_overline;
638 } else if (strcasecmp(name, "I") == 0) {
639 ctx->htmllval->font = mkFont(ctx, 0, HTML_IF);
640 ctx->tok = T_italic;
641 } else if (strcasecmp(name, "SUP") == 0) {
642 ctx->htmllval->font = mkFont(ctx, 0, HTML_SUP);
643 ctx->tok = T_sup;
644 } else if (strcasecmp(name, "SUB") == 0) {
645 ctx->htmllval->font = mkFont(ctx, 0, HTML_SUB);
646 ctx->tok = T_sub;
647 } else if (strcasecmp(name, "BR") == 0) {
648 mkBR(ctx, atts);
649 ctx->tok = T_br;
650 } else if (strcasecmp(name, "HR") == 0) {
651 ctx->tok = T_hr;
652 } else if (strcasecmp(name, "VR") == 0) {
653 ctx->tok = T_vr;
654 } else if (strcasecmp(name, "IMG") == 0) {
655 ctx->htmllval->img = mkImg(ctx, atts);
656 ctx->tok = T_img;
657 } else if (strcasecmp(name, "HTML") == 0) {
658 ctx->tok = T_html;
659 } else {
660 lexerror(ctx, name);
661 }
662}
663
664static void endElement(void *user, const char *name)
665{
666 htmllexstate_t *ctx = user;
667
668 if (strcasecmp(name, "TABLE") == 0) {
669 ctx->tok = T_end_table;
670 ctx->inCell = 1;
671 } else if (strcasecmp(name, "TR") == 0 || strcasecmp(name, "TH") == 0) {
672 ctx->tok = T_end_row;
673 } else if (strcasecmp(name, "TD") == 0) {
674 ctx->tok = T_end_cell;
675 ctx->inCell = 0;
676 } else if (strcasecmp(name, "HTML") == 0) {
677 ctx->tok = T_end_html;
678 } else if (strcasecmp(name, "FONT") == 0) {
679 ctx->tok = T_end_font;
680 } else if (strcasecmp(name, "B") == 0) {
681 ctx->tok = T_n_bold;
682 } else if (strcasecmp(name, "U") == 0) {
683 ctx->tok = T_n_underline;
684 } else if (strcasecmp(name, "O") == 0) {
685 ctx->tok = T_n_overline;
686 } else if (strcasecmp(name, "I") == 0) {
687 ctx->tok = T_n_italic;
688 } else if (strcasecmp(name, "SUP") == 0) {
689 ctx->tok = T_n_sup;
690 } else if (strcasecmp(name, "SUB") == 0) {
691 ctx->tok = T_n_sub;
692 } else if (strcasecmp(name, "S") == 0) {
693 ctx->tok = T_n_s;
694 } else if (strcasecmp(name, "BR") == 0) {
695 if (ctx->tok == T_br)
696 ctx->tok = T_BR;
697 else
698 ctx->tok = T_end_br;
699 } else if (strcasecmp(name, "HR") == 0) {
700 if (ctx->tok == T_hr)
701 ctx->tok = T_HR;
702 else
703 ctx->tok = T_end_hr;
704 } else if (strcasecmp(name, "VR") == 0) {
705 if (ctx->tok == T_vr)
706 ctx->tok = T_VR;
707 else
708 ctx->tok = T_end_vr;
709 } else if (strcasecmp(name, "IMG") == 0) {
710 if (ctx->tok == T_img)
711 ctx->tok = T_IMG;
712 else
713 ctx->tok = T_end_img;
714 } else {
715 lexerror(ctx, name);
716 }
717}
718
719/* characterData:
720 * Generate T_string token. Do this only when immediately in
721 * <TD>..</TD> or <HTML>..</HTML>, i.e., when inCell is true.
722 * Strip out formatting characters but keep spaces.
723 * Distinguish between all whitespace vs. strings with non-whitespace
724 * characters.
725 */
726static void characterData(void *user, const char *s, int length)
727{
728 htmllexstate_t *ctx = user;
729
730 int i, cnt = 0;
731 unsigned char c;
732
733 if (ctx->inCell) {
734 for (i = length; i; i--) {
735 c = *s++;
736 if (c >= ' ') {
737 cnt++;
738 agxbputc(ctx->xb, (char)c);
739 }
740 }
741 if (cnt) ctx->tok = T_string;
742 }
743}
744#endif
745
746int initHTMLlexer(htmlscan_t *scanner, char *src, agxbuf * xb, htmlenv_t *env)
747{
748#ifdef HAVE_EXPAT
749 htmllexstate_t *ctx = &scanner->lexer;
750
751 ctx->xb = xb;
752 ctx->lb = (agxbuf){0};
753 ctx->ptr = src;
754 ctx->mode = 0;
755 ctx->warn = 0;
756 ctx->error = 0;
757 ctx->currtok = (strview_t){0};
758 ctx->prevtok = (strview_t){0};
759 ctx->inCell = 1;
760 ctx->parser = XML_ParserCreate(charsetToStr(GD_charset(env->g)));
761 ctx->gvc = GD_gvc(env->g);
762 XML_SetUserData(ctx->parser, ctx);
763 XML_SetElementHandler(ctx->parser,
764 (XML_StartElementHandler) startElement,
765 endElement);
766 XML_SetCharacterDataHandler(ctx->parser, characterData);
767 return 0;
768#else
769 (void)scanner;
770 (void)src;
771 (void)xb;
772 (void)env;
773
774 static int first;
775 if (!first) {
777 "Not built with libexpat. Table formatting is not available.\n");
778 first++;
779 }
780 return 1;
781#endif
782}
783
785{
786#ifdef HAVE_EXPAT
787 htmllexstate_t *ctx = &scanner->lexer;
788 int rv = ctx->error ? 3 : ctx->warn;
789 XML_ParserFree(ctx->parser);
790 agxbfree (&ctx->lb);
791 return rv;
792#else
793 (void)scanner;
794
795 return 1;
796#endif
797}
798
800static UNUSED void agxbput_move(agxbuf *dst, const char *src) {
801 // we cannot call `agxbput` itself because it calls `memcpy`, thereby
802 // implicitly assuming that source and destination do not overlap
803 char *src_copy = gv_strdup(src);
804 agxbput(dst, src_copy);
805 free(src_copy);
806}
807
808#ifdef HAVE_EXPAT
809/* eatComment:
810 * Given first character after open comment, eat characters
811 * up to comment close, returning pointer to closing > if it exists,
812 * or null character otherwise.
813 * We rely on HTML strings having matched nested <>.
814 */
815static char *eatComment(htmllexstate_t *ctx, char *p)
816{
817 int depth = 1;
818 char *s = p;
819 char c;
820
821 while (depth && (c = *s++)) {
822 if (c == '<')
823 depth++;
824 else if (c == '>')
825 depth--;
826 }
827 s--; /* move back to '\0' or '>' */
828 if (*s) {
829 char *t = s - 2;
830 if (t < p || !startswith(t, "--")) {
831 agwarningf("Unclosed comment\n");
832 ctx->warn = 1;
833 }
834 }
835 return s;
836}
837
838/* findNext:
839 * Return next XML unit. This is either <..>, an HTML
840 * comment <!-- ... -->, or characters up to next <.
841 */
842static char *findNext(htmllexstate_t *ctx, char *s, agxbuf* xb)
843{
844 char* t = s + 1;
845 char c;
846
847 if (*s == '<') {
848 if (startswith(t, "!--"))
849 t = eatComment(ctx, t + 3);
850 else
851 while (*t && *t != '>')
852 t++;
853 if (*t != '>') {
854 agwarningf("Label closed before end of HTML element\n");
855 ctx->warn = 1;
856 } else
857 t++;
858 } else {
859 t = s;
860 while ((c = *t) && c != '<') {
861 if (c == '&' && *(t+1) != '#') {
862 t = scanEntity(t + 1, xb);
863 }
864 else {
865 agxbputc(xb, c);
866 t++;
867 }
868 }
869 }
870 return t;
871}
872
889static void protect_rsqb(agxbuf *xb) {
890
891 // if the buffer is empty, we have nothing to do
892 if (agxblen(xb) == 0) {
893 return;
894 }
895
896 // check the last character and if it is not ], we have nothing to do
897 char *data = agxbuse(xb);
898 size_t size = strlen(data);
899 assert(size > 0);
900 if (data[size - 1] != ']') {
901 agxbput_move(xb, data);
902 return;
903 }
904
905 // truncate ] and write back the remaining prefix
906 data[size - 1] = '\0';
907 agxbput_move(xb, data);
908
909 // write an XML-escaped version of ] as a replacement
910 agxbput(xb, "&#93;");
911}
912#endif
913
914
916 return htmllineno_ctx(&scanner->lexer);
917}
918
919static unsigned long htmllineno_ctx(htmllexstate_t *ctx) {
920#ifdef HAVE_EXPAT
921 return XML_GetCurrentLineNumber(ctx->parser);
922#else
923 (void)ctx;
924
925 return 0;
926#endif
927}
928
929#ifdef DEBUG
930static void printTok(htmllexstate_t *ctx, int tok)
931{
932 char *s;
933
934 switch (tok) {
935 case T_end_br:
936 s = "T_end_br";
937 break;
938 case T_end_img:
939 s = "T_end_img";
940 break;
941 case T_row:
942 s = "T_row";
943 break;
944 case T_end_row:
945 s = "T_end_row";
946 break;
947 case T_html:
948 s = "T_html";
949 break;
950 case T_end_html:
951 s = "T_end_html";
952 break;
953 case T_end_table:
954 s = "T_end_table";
955 break;
956 case T_end_cell:
957 s = "T_end_cell";
958 break;
959 case T_end_font:
960 s = "T_end_font";
961 break;
962 case T_string:
963 s = "T_string";
964 break;
965 case T_error:
966 s = "T_error";
967 break;
968 case T_n_italic:
969 s = "T_n_italic";
970 break;
971 case T_n_bold:
972 s = "T_n_bold";
973 break;
974 case T_n_underline:
975 s = "T_n_underline";
976 break;
977 case T_n_overline:
978 s = "T_n_overline";
979 break;
980 case T_n_sup:
981 s = "T_n_sup";
982 break;
983 case T_n_sub:
984 s = "T_n_sub";
985 break;
986 case T_n_s:
987 s = "T_n_s";
988 break;
989 case T_HR:
990 s = "T_HR";
991 break;
992 case T_hr:
993 s = "T_hr";
994 break;
995 case T_end_hr:
996 s = "T_end_hr";
997 break;
998 case T_VR:
999 s = "T_VR";
1000 break;
1001 case T_vr:
1002 s = "T_vr";
1003 break;
1004 case T_end_vr:
1005 s = "T_end_vr";
1006 break;
1007 case T_BR:
1008 s = "T_BR";
1009 break;
1010 case T_br:
1011 s = "T_br";
1012 break;
1013 case T_IMG:
1014 s = "T_IMG";
1015 break;
1016 case T_img:
1017 s = "T_img";
1018 break;
1019 case T_table:
1020 s = "T_table";
1021 break;
1022 case T_cell:
1023 s = "T_cell";
1024 break;
1025 case T_font:
1026 s = "T_font";
1027 break;
1028 case T_italic:
1029 s = "T_italic";
1030 break;
1031 case T_bold:
1032 s = "T_bold";
1033 break;
1034 case T_underline:
1035 s = "T_underline";
1036 break;
1037 case T_overline:
1038 s = "T_overline";
1039 break;
1040 case T_sup:
1041 s = "T_sup";
1042 break;
1043 case T_sub:
1044 s = "T_sub";
1045 break;
1046 case T_s:
1047 s = "T_s";
1048 break;
1049 default:
1050 s = "<unknown>";
1051 }
1052 if (tok == T_string) {
1053 const char *token_text = agxbuse(ctx->xb);
1054 fprintf(stderr, "%s \"%s\"\n", s, token_text);
1055 agxbput_move(ctx->xb, token_text);
1056 } else
1057 fprintf(stderr, "%s\n", s);
1058}
1059
1060#endif
1061
1062int htmllex(union HTMLSTYPE *htmllval, htmlscan_t *scanner)
1063{
1064#ifdef HAVE_EXPAT
1065 static char *begin_html = "<HTML>";
1066 static char *end_html = "</HTML>";
1067
1068 char *s;
1069 char *endp = 0;
1070 size_t len, llen;
1071 int rv;
1072 htmllexstate_t *ctx = &scanner->lexer;
1073
1074 ctx->htmllval = htmllval;
1075 ctx->tok = 0;
1076 do {
1077 if (ctx->mode == 2)
1078 return EOF;
1079 if (ctx->mode == 0) {
1080 ctx->mode = 1;
1081 s = begin_html;
1082 len = strlen(s);
1083 endp = 0;
1084 } else {
1085 s = ctx->ptr;
1086 if (*s == '\0') {
1087 ctx->mode = 2;
1088 s = end_html;
1089 len = strlen(s);
1090 } else {
1091 endp = findNext(ctx, s,&ctx->lb);
1092 len = (size_t)(endp - s);
1093 }
1094 }
1095
1096 protect_rsqb(&ctx->lb);
1097
1098 ctx->prevtok = ctx->currtok;
1099 ctx->currtok = (strview_t){.data = s, .size = len};
1100 if ((llen = agxblen(&ctx->lb))) {
1101 assert(llen <= INT_MAX && "XML token too long for expat API");
1102 rv = XML_Parse(ctx->parser, agxbuse(&ctx->lb), (int)llen, 0);
1103 } else {
1104 assert(len <= INT_MAX && "XML token too long for expat API");
1105 rv = XML_Parse(ctx->parser, s, (int)len, len ? 0 : 1);
1106 }
1107 if (rv == XML_STATUS_ERROR) {
1108 if (!ctx->error) {
1109 agerrorf("%s in line %lu \n",
1110 XML_ErrorString(XML_GetErrorCode(ctx->parser)), htmllineno(scanner));
1111 error_context(ctx);
1112 ctx->error = 1;
1113 ctx->tok = T_error;
1114 }
1115 }
1116 if (endp)
1117 ctx->ptr = endp;
1118 } while (ctx->tok == 0);
1119#ifdef DEBUG
1120 printTok (ctx, ctx->tok);
1121#endif
1122 return ctx->tok;
1123#else
1124 (void)htmllval;
1125 (void)scanner;
1126
1127 return EOF;
1128#endif
1129}
1130
static void agxbfree(agxbuf *xb)
free any malloced resources
Definition agxbuf.h:77
static WUR char * agxbuse(agxbuf *xb)
Definition agxbuf.h:306
static size_t agxblen(const agxbuf *xb)
return number of characters currently stored
Definition agxbuf.h:88
static int agxbputc(agxbuf *xb, char c)
add character to buffer
Definition agxbuf.h:276
Memory allocation wrappers that exit on failure.
static char * gv_strdup(const char *original)
Definition alloc.h:101
static void * gv_alloc(size_t size)
Definition alloc.h:47
container data types API
#define dtinsert(d, o)
Definition cdt.h:185
char * scanEntity(char *t, agxbuf *xb)
Definition utils.c:1080
static Extype_t length(Exid_t *rhs, Exdisc_t *disc)
Definition compile.c:1606
static int flags
Definition gc.c:61
static double len(glCompPoint p)
Definition glutils.c:136
void free(void *)
#define SIZE_MAX
Definition gmlscan.c:347
#define UINT16_MAX
Definition gmlscan.c:340
#define INT8_MAX
Definition gmlscan.c:328
node NULL
Definition grammar.y:181
static int cnt(Dict_t *d, Dtlink_t **set)
Definition graph.c:196
void agwarningf(const char *fmt,...)
Definition agerror.c:173
void agerrorf(const char *fmt,...)
Definition agerror.c:165
int agerr(agerrlevel_t level, const char *fmt,...)
Definition agerror.c:155
@ AGPREV
Definition cgraph.h:957
#define GD_charset(g)
Definition types.h:367
#define GD_gvc(g)
Definition types.h:355
replacements for ctype.h functions
static char gv_tolower(int c)
Definition gv_ctype.h:81
agxbput(xb, staging)
int htmllex(union HTMLSTYPE *htmllval, htmlscan_t *scanner)
Definition htmllex.c:1062
static UNUSED void agxbput_move(agxbuf *dst, const char *src)
agxbput, but assume that source and destination may overlap
Definition htmllex.c:800
unsigned long htmllineno(htmlscan_t *scanner)
Definition htmllex.c:915
#define XML_STATUS_ERROR
Definition htmllex.c:41
static void error_context(htmllexstate_t *ctx)
Definition htmllex.c:49
int clearHTMLlexer(htmlscan_t *scanner)
Definition htmllex.c:784
int initHTMLlexer(htmlscan_t *scanner, char *src, agxbuf *xb, htmlenv_t *env)
Definition htmllex.c:746
void htmlerror(htmlscan_t *scanner, const char *msg)
Definition htmllex.c:58
static unsigned long htmllineno_ctx(htmllexstate_t *ctx)
Definition htmllex.c:919
#define T_n_sup
Definition htmlparse.h:138
#define T_end_row
Definition htmlparse.h:126
#define T_end_table
Definition htmlparse.h:129
#define T_br
Definition htmlparse.h:148
#define T_vr
Definition htmlparse.h:145
#define T_error
Definition htmlparse.h:133
#define T_n_s
Definition htmlparse.h:140
#define T_end_cell
Definition htmlparse.h:130
#define T_n_sub
Definition htmlparse.h:139
#define T_n_bold
Definition htmlparse.h:135
#define T_html
Definition htmlparse.h:127
#define T_BR
Definition htmlparse.h:147
#define T_underline
Definition htmlparse.h:156
#define T_sup
Definition htmlparse.h:158
#define T_row
Definition htmlparse.h:125
#define T_table
Definition htmlparse.h:151
#define T_end_vr
Definition htmlparse.h:146
#define T_end_html
Definition htmlparse.h:128
#define T_IMG
Definition htmlparse.h:149
#define T_VR
Definition htmlparse.h:144
#define T_bold
Definition htmlparse.h:155
#define T_end_img
Definition htmlparse.h:124
#define T_sub
Definition htmlparse.h:159
#define T_s
Definition htmlparse.h:160
#define T_n_italic
Definition htmlparse.h:134
#define T_end_font
Definition htmlparse.h:131
#define T_overline
Definition htmlparse.h:157
#define T_hr
Definition htmlparse.h:142
#define T_font
Definition htmlparse.h:153
#define T_italic
Definition htmlparse.h:154
#define T_end_br
Definition htmlparse.h:123
#define T_n_underline
Definition htmlparse.h:136
#define T_cell
Definition htmlparse.h:152
#define T_end_hr
Definition htmlparse.h:143
#define T_string
Definition htmlparse.h:132
#define T_img
Definition htmlparse.h:150
#define T_HR
Definition htmlparse.h:141
#define T_n_overline
Definition htmlparse.h:137
cleanup & scanner
Definition htmlparse.y:289
#define PAD_SET
Definition htmltable.h:33
#define BORDER_RIGHT
Definition htmltable.h:40
#define BORDER_TOP
Definition htmltable.h:39
#define HALIGN_TEXT
Definition htmltable.h:28
#define UNSET_ALIGN
Definition htmltable.h:44
#define HALIGN_LEFT
Definition htmltable.h:26
#define VALIGN_BOTTOM
Definition htmltable.h:30
#define BALIGN_RIGHT
Definition htmltable.h:35
#define BALIGN_LEFT
Definition htmltable.h:36
#define BORDER_BOTTOM
Definition htmltable.h:41
static void free_ritem(row_t *p)
Free row. This closes and frees row’s list, then the item itself is freed.
Definition htmltable.h:117
#define SPACE_SET
Definition htmltable.h:34
#define BORDER_SET
Definition htmltable.h:32
#define BORDER_LEFT
Definition htmltable.h:38
#define BORDER_MASK
Definition htmltable.h:42
#define HALIGN_RIGHT
Definition htmltable.h:25
#define VALIGN_TOP
Definition htmltable.h:29
#define FIXED_FLAG
Definition htmltable.h:24
char * charsetToStr(int c)
Given an internal charset value, return a canonical string representation.
Definition input.c:811
static bool startswith(const char *s, const char *prefix)
does the string s begin with the string prefix?
Definition startswith.h:11
platform abstraction for case-insensitive string functions
Dt_t * textfont_dt
Definition gvcint.h:108
result of partitioning available space, part of maze
Definition grid.h:33
Definition legal.c:50
uint16_t rowspan
Definition htmltable.h:158
uint16_t colspan
Definition htmltable.h:157
char * bgcolor
Definition htmltable.h:87
unsigned char border
Definition htmltable.h:91
char * target
Definition htmltable.h:84
char * id
Definition htmltable.h:86
signed char space
Definition htmltable.h:90
unsigned short width
Definition htmltable.h:95
unsigned short height
Definition htmltable.h:96
int gradientangle
Definition htmltable.h:89
char * port
Definition htmltable.h:83
unsigned short flags
Definition htmltable.h:94
char * href
Definition htmltable.h:82
unsigned char pad
Definition htmltable.h:92
char * pencolor
Definition htmltable.h:88
htmlstyle_t style
Definition htmltable.h:97
char * title
Definition htmltable.h:85
graph_t * g
Definition htmltable.h:171
char * scale
Definition htmltable.h:70
char * src
Definition htmltable.h:69
HTMLSTYPE * htmllval
Definition htmlparse.h:229
agxbuf * xb
Definition htmlparse.h:220
strview_t prevtok
Definition htmlparse.h:227
strview_t currtok
Definition htmlparse.h:226
bool dashed
Definition htmltable.h:78
bool dotted
Definition htmltable.h:77
bool rounded
Definition htmltable.h:75
bool radial
Definition htmltable.h:74
bool invisible
Definition htmltable.h:76
bool vrule
vertical rule
Definition htmltable.h:143
size_t row_count
number of rows
Definition htmltable.h:139
bool hrule
horizontal rule
Definition htmltable.h:142
struct htmltbl_t::@78::@80 p
union htmltbl_t::@78 u
rows_t rows
cells
Definition htmltable.h:133
int8_t cellborder
Definition htmltable.h:136
Definition utils.c:751
a non-owning string reference
Definition strview.h:20
const char * data
start of the pointed to string
Definition strview.h:21
size_t size
extent of the string in bytes
Definition strview.h:22
char * color
Definition textspan.h:55
char * name
Definition textspan.h:54
unsigned int flags
Definition textspan.h:58
double size
Definition textspan.h:57
state for an in-progress string tokenization
Definition tokenize.h:36
Non-owning string references.
static bool strview_case_str_eq(strview_t a, const char *b)
compare a string reference to a string for case insensitive equality
Definition strview.h:62
#define HTML_OL
Definition textspan.h:35
#define HTML_IF
Definition textspan.h:30
#define HTML_UL
Definition textspan.h:31
#define HTML_BF
Definition textspan.h:29
#define HTML_SUP
Definition textspan.h:32
#define HTML_S
Definition textspan.h:34
#define GV_TEXTFONT_FLAGS_WIDTH
Definition textspan.h:24
#define HTML_SUB
Definition textspan.h:33
String tokenization.
static strview_t tok_get(const tok_t *t)
get the current token
Definition tokenize.h:76
static tok_t tok(const char *input, const char *separators)
begin tokenization of a new string
Definition tokenize.h:43
static bool tok_end(const tok_t *t)
is this tokenizer exhausted?
Definition tokenize.h:68
static void tok_next(tok_t *t)
advance to the next token in the string being scanned
Definition tokenize.h:85
htmltbl_t * tbl
Definition htmlparse.h:171
htmlcell_t * cell
Definition htmlparse.h:170
textfont_t * font
Definition htmlparse.h:172
htmlimg_t * img
Definition htmlparse.h:173
Definition grammar.c:90
abstraction for squashing compiler warnings for unused symbols
#define UNUSED
Definition unused.h:25