Graphviz 14.1.6~dev.20260416.2202
Loading...
Searching...
No Matches
htmllex.c
Go to the documentation of this file.
1
3/*************************************************************************
4 * Copyright (c) 2011 AT&T Intellectual Property
5 * All rights reserved. This program and the accompanying materials
6 * are made available under the terms of the Eclipse Public License v2.0
7 * which accompanies this distribution, and is available at
8 * https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.html
9 *
10 * Contributors: Details at https://graphviz.org
11 *************************************************************************/
12
13#include "config.h"
14
15#include <assert.h>
16#include <common/render.h>
17#include <common/htmltable.h>
18#include "htmlparse.h"
19#include <common/htmllex.h>
20#include <cdt/cdt.h>
21#include <limits.h>
22#include <stdatomic.h>
23#include <stdbool.h>
24#include <stddef.h>
25#include <stdint.h>
26#include <util/alloc.h>
27#include <util/gv_ctype.h>
28#include <util/startswith.h>
29#include <util/strcasecmp.h>
30#include <util/strview.h>
31#include <util/tokenize.h>
32#include <util/unused.h>
33
34#ifdef HAVE_EXPAT
35#include <expat.h>
36#endif
37
38#ifndef XML_STATUS_ERROR
39#define XML_STATUS_ERROR 0
40#endif
41
42static unsigned long htmllineno_ctx(htmllexstate_t *ctx);
43
44/* error_context:
45 * Print the last 2 "token"s seen.
46 */
48{
49 agerr(AGPREV, "... %.*s%.*s ...\n", (int)ctx->prevtok.size,
50 ctx->prevtok.data, (int)ctx->currtok.size, ctx->currtok.data);
51}
52
53/* htmlerror:
54 * yyerror - called by yacc output
55 */
56void htmlerror(htmlscan_t *scanner, const char *msg)
57{
58 htmllexstate_t *ctx = &scanner->lexer;
59 if (ctx->error)
60 return;
61 ctx->error = 1;
62 agerrorf("%s in line %lu \n", msg, htmllineno(scanner));
63 error_context(&scanner->lexer);
64}
65
66#ifdef HAVE_EXPAT
67/* lexerror:
68 * called by lexer when unknown <..> is found.
69 */
70static void lexerror(htmllexstate_t *ctx, const char *name)
71{
72 ctx->tok = T_error;
73 ctx->error = 1;
74 agerrorf("Unknown HTML element <%s> on line %lu \n", name, htmllineno_ctx(ctx));
75}
76
/* Signature through which every attribute action is invoked: the first
 * argument is the object being filled in, the second the attribute value.
 * A non-zero result is OR-ed into the lexer's warning flag by doAttrs.
 */
typedef int (*attrFn) (void *, char *);
/* Comparison callback shape as expected by bsearch */
typedef int (*bcmpfn) (const void *, const void *);

/* Mechanism for automatically processing attributes */
typedef struct {
    char *name;			/* attribute name */
    attrFn action;		/* action to perform if name matches */
} attr_item;

/* Size of one table entry; used both as the bsearch element size and to
 * derive the number of entries in a table.
 */
#define ISIZE (sizeof(attr_item))
87
88/* icmp:
89 * Compare an attr_item. Used in bsearch
90 */
91static int icmp(const void *name, const void *item) {
92 const attr_item *j = item;
93 return strcasecmp(name, j->name);
94}
95
96static int bgcolorfn(htmldata_t * p, char *v)
97{
98 p->bgcolor = strdup(v);
99 return 0;
100}
101
102static int pencolorfn(htmldata_t * p, char *v)
103{
104 p->pencolor = strdup(v);
105 return 0;
106}
107
108static int hreffn(htmldata_t * p, char *v)
109{
110 p->href = strdup(v);
111 return 0;
112}
113
114static int sidesfn(htmldata_t * p, char *v)
115{
116 unsigned short flags = 0;
117 char c;
118
119 while ((c = *v++)) {
120 switch (gv_tolower(c)) {
121 case 'l' :
123 break;
124 case 't' :
125 flags |= BORDER_TOP;
126 break;
127 case 'r' :
129 break;
130 case 'b' :
132 break;
133 default :
134 agwarningf("Unrecognized character '%c' (%d) in sides attribute\n", c, c);
135 break;
136 }
137 }
138 if (flags != BORDER_MASK)
139 p->flags |= flags;
140 return 0;
141}
142
143static int titlefn(htmldata_t * p, char *v)
144{
145 p->title = strdup(v);
146 return 0;
147}
148
149static int portfn(htmldata_t * p, char *v)
150{
151 p->port = strdup(v);
152 return 0;
153}
154
155#define DELIM " ,"
156
157static int stylefn(htmldata_t * p, char *v)
158{
159 int rv = 0;
160 for (tok_t t = tok(v, DELIM); !tok_end(&t); tok_next(&t)) {
161 strview_t tk = tok_get(&t);
162 if (strview_case_str_eq(tk, "ROUNDED")) p->style.rounded = true;
163 else if (strview_case_str_eq(tk, "RADIAL")) p->style.radial = true;
164 else if (strview_case_str_eq(tk,"SOLID")) {
165 p->style.dotted = false;
166 p->style.dashed = false;
167 } else if (strview_case_str_eq(tk,"INVISIBLE") ||
168 strview_case_str_eq(tk,"INVIS")) p->style.invisible = true;
169 else if (strview_case_str_eq(tk,"DOTTED")) p->style.dotted = true;
170 else if (strview_case_str_eq(tk,"DASHED")) p->style.dashed = true;
171 else {
172 agwarningf("Illegal value %.*s for STYLE - ignored\n", (int)tk.size,
173 tk.data);
174 rv = 1;
175 }
176 }
177 return rv;
178}
179
180static int targetfn(htmldata_t * p, char *v)
181{
182 p->target = strdup(v);
183 return 0;
184}
185
186static int idfn(htmldata_t * p, char *v)
187{
188 p->id = strdup(v);
189 return 0;
190}
191
192
/* doInt:
 * Scan v for integral value (base 10). Check that
 * the value is >= min and <= max. Return value in ul.
 * String s is name of value; it is only used in warnings.
 * Return 0 if okay; 1 otherwise. On failure a warning is issued
 * and *ul is left untouched.
 * Only the leading digits are examined: text following the number
 * is not rejected.
 */
static int doInt(const char *v, const char *s, int min, int max, long *ul)
{
    char *ep;
    const long b = strtol(v, &ep, 10);

    if (ep == v) {              /* no digits consumed at all */
        agwarningf("Improper %s value %s - ignored\n", s, v);
        return 1;
    }
    if (b > max) {
        agwarningf("%s value %s > %d - too large - ignored\n", s, v, max);
        return 1;
    }
    if (b < min) {
        agwarningf("%s value %s < %d - too small - ignored\n", s, v, min);
        return 1;
    }
    *ul = b;
    return 0;
}
218
219
220static int gradientanglefn(htmldata_t * p, char *v)
221{
222 long u;
223
224 if (doInt(v, "GRADIENTANGLE", 0, 360, &u))
225 return 1;
226 p->gradientangle = (unsigned short) u;
227 return 0;
228}
229
230
231static int borderfn(htmldata_t * p, char *v)
232{
233 long u;
234
235 if (doInt(v, "BORDER", 0, UCHAR_MAX, &u))
236 return 1;
237 p->border = (unsigned char) u;
238 p->flags |= BORDER_SET;
239 return 0;
240}
241
242static int cellpaddingfn(htmldata_t * p, char *v)
243{
244 long u;
245
246 if (doInt(v, "CELLPADDING", 0, UCHAR_MAX, &u))
247 return 1;
248 p->pad = (unsigned char) u;
249 p->flags |= PAD_SET;
250 return 0;
251}
252
253static int cellspacingfn(htmldata_t * p, char *v)
254{
255 long u;
256
257 if (doInt(v, "CELLSPACING", SCHAR_MIN, SCHAR_MAX, &u))
258 return 1;
259 p->space = (signed char) u;
260 p->flags |= SPACE_SET;
261 return 0;
262}
263
264static int cellborderfn(htmltbl_t * p, char *v)
265{
266 long u;
267
268 if (doInt(v, "CELLBORDER", 0, INT8_MAX, &u))
269 return 1;
270 p->cellborder = (int8_t)u;
271 return 0;
272}
273
274static int columnsfn(htmltbl_t * p, char *v)
275{
276 if (*v != '*') {
277 agwarningf("Unknown value %s for COLUMNS - ignored\n", v);
278 return 1;
279 }
280 p->vrule = true;
281 return 0;
282}
283
284static int rowsfn(htmltbl_t * p, char *v)
285{
286 if (*v != '*') {
287 agwarningf("Unknown value %s for ROWS - ignored\n", v);
288 return 1;
289 }
290 p->hrule = true;
291 return 0;
292}
293
294static int fixedsizefn(htmldata_t * p, char *v)
295{
296 int rv = 0;
297 if (!strcasecmp(v, "TRUE"))
298 p->flags |= FIXED_FLAG;
299 else if (strcasecmp(v, "FALSE")) {
300 agwarningf("Illegal value %s for FIXEDSIZE - ignored\n", v);
301 rv = 1;
302 }
303 return rv;
304}
305
306static int valignfn(htmldata_t * p, char *v)
307{
308 int rv = 0;
309 if (!strcasecmp(v, "BOTTOM"))
310 p->flags |= VALIGN_BOTTOM;
311 else if (!strcasecmp(v, "TOP"))
312 p->flags |= VALIGN_TOP;
313 else if (strcasecmp(v, "MIDDLE")) {
314 agwarningf("Illegal value %s for VALIGN - ignored\n", v);
315 rv = 1;
316 }
317 return rv;
318}
319
320static int halignfn(htmldata_t * p, char *v)
321{
322 int rv = 0;
323 if (!strcasecmp(v, "LEFT"))
324 p->flags |= HALIGN_LEFT;
325 else if (!strcasecmp(v, "RIGHT"))
326 p->flags |= HALIGN_RIGHT;
327 else if (strcasecmp(v, "CENTER")) {
328 agwarningf("Illegal value %s for ALIGN - ignored\n", v);
329 rv = 1;
330 }
331 return rv;
332}
333
334static int cell_halignfn(htmldata_t * p, char *v)
335{
336 int rv = 0;
337 if (!strcasecmp(v, "LEFT"))
338 p->flags |= HALIGN_LEFT;
339 else if (!strcasecmp(v, "RIGHT"))
340 p->flags |= HALIGN_RIGHT;
341 else if (!strcasecmp(v, "TEXT"))
342 p->flags |= HALIGN_TEXT;
343 else if (strcasecmp(v, "CENTER"))
344 rv = 1;
345 if (rv)
346 agwarningf("Illegal value %s for ALIGN in TD - ignored\n", v);
347 return rv;
348}
349
350static int balignfn(htmldata_t * p, char *v)
351{
352 int rv = 0;
353 if (!strcasecmp(v, "LEFT"))
354 p->flags |= BALIGN_LEFT;
355 else if (!strcasecmp(v, "RIGHT"))
356 p->flags |= BALIGN_RIGHT;
357 else if (strcasecmp(v, "CENTER"))
358 rv = 1;
359 if (rv)
360 agwarningf("Illegal value %s for BALIGN in TD - ignored\n", v);
361 return rv;
362}
363
364static int heightfn(htmldata_t * p, char *v)
365{
366 long u;
367
368 if (doInt(v, "HEIGHT", 0, USHRT_MAX, &u))
369 return 1;
370 p->height = (unsigned short) u;
371 return 0;
372}
373
374static int widthfn(htmldata_t * p, char *v)
375{
376 long u;
377
378 if (doInt(v, "WIDTH", 0, USHRT_MAX, &u))
379 return 1;
380 p->width = (unsigned short) u;
381 return 0;
382}
383
384static int rowspanfn(htmlcell_t * p, char *v)
385{
386 long u;
387
388 if (doInt(v, "ROWSPAN", 0, UINT16_MAX, &u))
389 return 1;
390 if (u == 0) {
391 agwarningf("ROWSPAN value cannot be 0 - ignored\n");
392 return 1;
393 }
394 p->rowspan = (uint16_t)u;
395 return 0;
396}
397
398static int colspanfn(htmlcell_t * p, char *v)
399{
400 long u;
401
402 if (doInt(v, "COLSPAN", 0, UINT16_MAX, &u))
403 return 1;
404 if (u == 0) {
405 agwarningf("COLSPAN value cannot be 0 - ignored\n");
406 return 1;
407 }
408 p->colspan = (uint16_t)u;
409 return 0;
410}
411
412static int fontcolorfn(textfont_t * p, char *v)
413{
414 p->color = v;
415 return 0;
416}
417
418static int facefn(textfont_t * p, char *v)
419{
420 p->name = v;
421 return 0;
422}
423
424static int ptsizefn(textfont_t * p, char *v)
425{
426 long u;
427
428 if (doInt(v, "POINT-SIZE", 0, UCHAR_MAX, &u))
429 return 1;
430 p->size = (double) u;
431 return 0;
432}
433
434static int srcfn(htmlimg_t * p, char *v)
435{
436 p->src = strdup(v);
437 return 0;
438}
439
440static int scalefn(htmlimg_t * p, char *v)
441{
442 p->scale = strdup(v);
443 return 0;
444}
445
/* alignfn:
 * <BR> attribute action for ALIGN (case-insensitive). Writes a
 * justification character through p: 'r' for RIGHT, 'l' for LEFT,
 * 'n' for CENTER. Any other value is reported, leaves *p untouched
 * and yields 1.
 */
static int alignfn(int *p, char *v)
{
    if (strcasecmp(v, "RIGHT") == 0) {
        *p = 'r';
        return 0;
    }
    if (strcasecmp(v, "LEFT") == 0) {
        *p = 'l';
        return 0;
    }
    if (strcasecmp(v, "CENTER") == 0) {
        *p = 'n';
        return 0;
    }

    agwarningf("Illegal value %s for ALIGN - ignored\n", v);
    return 1;
}
461
/* Tables used in binary search; MUST be alphabetized */
/* Each table maps the attribute names legal on one element to the
 * action that stores the value. The actions take differently typed
 * first arguments and are cast to the common attrFn type; doAttrs
 * passes the target object as void *.
 */

/* attributes accepted on <TABLE> */
static attr_item tbl_items[] = {
    {"align", (attrFn) halignfn},
    {"bgcolor", (attrFn) bgcolorfn},
    {"border", (attrFn) borderfn},
    {"cellborder", (attrFn) cellborderfn},
    {"cellpadding", (attrFn) cellpaddingfn},
    {"cellspacing", (attrFn) cellspacingfn},
    {"color", (attrFn) pencolorfn},
    {"columns", (attrFn) columnsfn},
    {"fixedsize", (attrFn) fixedsizefn},
    {"gradientangle", (attrFn) gradientanglefn},
    {"height", (attrFn) heightfn},
    {"href", (attrFn) hreffn},
    {"id", (attrFn) idfn},
    {"port", (attrFn) portfn},
    {"rows", (attrFn) rowsfn},
    {"sides", (attrFn) sidesfn},
    {"style", (attrFn) stylefn},
    {"target", (attrFn) targetfn},
    {"title", (attrFn) titlefn},
    {"tooltip", (attrFn) titlefn},	/* alias of "title" */
    {"valign", (attrFn) valignfn},
    {"width", (attrFn) widthfn},
};

/* attributes accepted on <TD> */
static attr_item cell_items[] = {
    {"align", (attrFn) cell_halignfn},
    {"balign", (attrFn) balignfn},
    {"bgcolor", (attrFn) bgcolorfn},
    {"border", (attrFn) borderfn},
    {"cellpadding", (attrFn) cellpaddingfn},
    {"cellspacing", (attrFn) cellspacingfn},
    {"color", (attrFn) pencolorfn},
    {"colspan", (attrFn) colspanfn},
    {"fixedsize", (attrFn) fixedsizefn},
    {"gradientangle", (attrFn) gradientanglefn},
    {"height", (attrFn) heightfn},
    {"href", (attrFn) hreffn},
    {"id", (attrFn) idfn},
    {"port", (attrFn) portfn},
    {"rowspan", (attrFn) rowspanfn},
    {"sides", (attrFn) sidesfn},
    {"style", (attrFn) stylefn},
    {"target", (attrFn) targetfn},
    {"title", (attrFn) titlefn},
    {"tooltip", (attrFn) titlefn},	/* alias of "title" */
    {"valign", (attrFn) valignfn},
    {"width", (attrFn) widthfn},
};

/* attributes accepted on <FONT> */
static attr_item font_items[] = {
    {"color", (attrFn) fontcolorfn},
    {"face", (attrFn) facefn},
    {"point-size", (attrFn) ptsizefn},
};

/* attributes accepted on <IMG> */
static attr_item img_items[] = {
    {"scale", (attrFn) scalefn},
    {"src", (attrFn) srcfn},
};

/* attributes accepted on <BR> */
static attr_item br_items[] = {
    {"align", (attrFn) alignfn},
};
527
528/* doAttrs:
529 * General function for processing list of name/value attributes.
530 * Do binary search on items table. If match found, invoke action
531 * passing it tp and attribute value.
532 * Table size is given by nel
533 * Name/value pairs are in array atts, which is null terminated.
534 * s is the name of the HTML element being processed.
535 */
536static void doAttrs(htmllexstate_t *ctx, void *tp, attr_item *items, size_t nel, char **atts,
537 char *s) {
538 char *name;
539 char *val;
540 attr_item *ip;
541
542 while ((name = *atts++) != NULL) {
543 val = *atts++;
544 ip = bsearch(name, items, nel, ISIZE, icmp);
545 if (ip)
546 ctx->warn |= ip->action(tp, val);
547 else {
548 agwarningf("Illegal attribute %s in %s - ignored\n", name,
549 s);
550 ctx->warn = 1;
551 }
552 }
553}
554
555static void mkBR(htmllexstate_t *ctx, char **atts)
556{
557 ctx->htmllval->i = UNSET_ALIGN;
558 doAttrs(ctx, &ctx->htmllval->i, br_items, sizeof(br_items) / ISIZE, atts, "<BR>");
559}
560
561static htmlimg_t *mkImg(htmllexstate_t *ctx, char **atts)
562{
563 htmlimg_t *img = gv_alloc(sizeof(htmlimg_t));
564
565 doAttrs(ctx, img, img_items, sizeof(img_items) / ISIZE, atts, "<IMG>");
566
567 return img;
568}
569
570static textfont_t *mkFont(htmllexstate_t *ctx, char **atts, unsigned char flags) {
571 textfont_t tf = {NULL,NULL,NULL,0.0,0,0};
572
573 tf.size = -1.0; /* unassigned */
574 enum { FLAGS_MAX = (1 << GV_TEXTFONT_FLAGS_WIDTH) - 1 };
575 assert(flags <= FLAGS_MAX);
576 tf.flags = (unsigned char)(flags & FLAGS_MAX);
577 if (atts)
578 doAttrs(ctx, &tf, font_items, sizeof(font_items) / ISIZE, atts, "<FONT>");
579
580 return dtinsert(ctx->gvc->textfont_dt, &tf);
581}
582
583static htmlcell_t *mkCell(htmllexstate_t *ctx, char **atts)
584{
586
587 cell->colspan = 1;
588 cell->rowspan = 1;
589 doAttrs(ctx, cell, cell_items, sizeof(cell_items) / ISIZE, atts, "<TD>");
590
591 return cell;
592}
593
594static htmltbl_t *mkTbl(htmllexstate_t *ctx, char **atts)
595{
596 htmltbl_t *tbl = gv_alloc(sizeof(htmltbl_t));
597
598 tbl->row_count = SIZE_MAX; // flag that table is a raw, parsed table
599 tbl->rows = (rows_t){.dtor = free_ritem};
600 tbl->cellborder = -1; // unset cell border attribute
601 doAttrs(ctx, tbl, tbl_items, sizeof(tbl_items) / ISIZE, atts, "<TABLE>");
602
603 return tbl;
604}
605
606static void startElement(void *user, const char *name, char **atts)
607{
608 htmllexstate_t *ctx = user;
609
610 if (strcasecmp(name, "TABLE") == 0) {
611 ctx->htmllval->tbl = mkTbl(ctx, atts);
612 ctx->inCell = 0;
613 ctx->tok = T_table;
614 } else if (strcasecmp(name, "TR") == 0 || strcasecmp(name, "TH") == 0) {
615 ctx->inCell = 0;
616 ctx->tok = T_row;
617 } else if (strcasecmp(name, "TD") == 0) {
618 ctx->inCell = 1;
619 ctx->htmllval->cell = mkCell(ctx, atts);
620 ctx->tok = T_cell;
621 } else if (strcasecmp(name, "FONT") == 0) {
622 ctx->htmllval->font = mkFont(ctx, atts, 0);
623 ctx->tok = T_font;
624 } else if (strcasecmp(name, "B") == 0) {
625 ctx->htmllval->font = mkFont(ctx, 0, HTML_BF);
626 ctx->tok = T_bold;
627 } else if (strcasecmp(name, "S") == 0) {
628 ctx->htmllval->font = mkFont(ctx, 0, HTML_S);
629 ctx->tok = T_s;
630 } else if (strcasecmp(name, "U") == 0) {
631 ctx->htmllval->font = mkFont(ctx, 0, HTML_UL);
632 ctx->tok = T_underline;
633 } else if (strcasecmp(name, "O") == 0) {
634 ctx->htmllval->font = mkFont(ctx, 0, HTML_OL);
635 ctx->tok = T_overline;
636 } else if (strcasecmp(name, "I") == 0) {
637 ctx->htmllval->font = mkFont(ctx, 0, HTML_IF);
638 ctx->tok = T_italic;
639 } else if (strcasecmp(name, "SUP") == 0) {
640 ctx->htmllval->font = mkFont(ctx, 0, HTML_SUP);
641 ctx->tok = T_sup;
642 } else if (strcasecmp(name, "SUB") == 0) {
643 ctx->htmllval->font = mkFont(ctx, 0, HTML_SUB);
644 ctx->tok = T_sub;
645 } else if (strcasecmp(name, "BR") == 0) {
646 mkBR(ctx, atts);
647 ctx->tok = T_br;
648 } else if (strcasecmp(name, "HR") == 0) {
649 ctx->tok = T_hr;
650 } else if (strcasecmp(name, "VR") == 0) {
651 ctx->tok = T_vr;
652 } else if (strcasecmp(name, "IMG") == 0) {
653 ctx->htmllval->img = mkImg(ctx, atts);
654 ctx->tok = T_img;
655 } else if (strcasecmp(name, "HTML") == 0) {
656 ctx->tok = T_html;
657 } else {
658 lexerror(ctx, name);
659 }
660}
661
662static void endElement(void *user, const char *name)
663{
664 htmllexstate_t *ctx = user;
665
666 if (strcasecmp(name, "TABLE") == 0) {
667 ctx->tok = T_end_table;
668 ctx->inCell = 1;
669 } else if (strcasecmp(name, "TR") == 0 || strcasecmp(name, "TH") == 0) {
670 ctx->tok = T_end_row;
671 } else if (strcasecmp(name, "TD") == 0) {
672 ctx->tok = T_end_cell;
673 ctx->inCell = 0;
674 } else if (strcasecmp(name, "HTML") == 0) {
675 ctx->tok = T_end_html;
676 } else if (strcasecmp(name, "FONT") == 0) {
677 ctx->tok = T_end_font;
678 } else if (strcasecmp(name, "B") == 0) {
679 ctx->tok = T_n_bold;
680 } else if (strcasecmp(name, "U") == 0) {
681 ctx->tok = T_n_underline;
682 } else if (strcasecmp(name, "O") == 0) {
683 ctx->tok = T_n_overline;
684 } else if (strcasecmp(name, "I") == 0) {
685 ctx->tok = T_n_italic;
686 } else if (strcasecmp(name, "SUP") == 0) {
687 ctx->tok = T_n_sup;
688 } else if (strcasecmp(name, "SUB") == 0) {
689 ctx->tok = T_n_sub;
690 } else if (strcasecmp(name, "S") == 0) {
691 ctx->tok = T_n_s;
692 } else if (strcasecmp(name, "BR") == 0) {
693 if (ctx->tok == T_br)
694 ctx->tok = T_BR;
695 else
696 ctx->tok = T_end_br;
697 } else if (strcasecmp(name, "HR") == 0) {
698 if (ctx->tok == T_hr)
699 ctx->tok = T_HR;
700 else
701 ctx->tok = T_end_hr;
702 } else if (strcasecmp(name, "VR") == 0) {
703 if (ctx->tok == T_vr)
704 ctx->tok = T_VR;
705 else
706 ctx->tok = T_end_vr;
707 } else if (strcasecmp(name, "IMG") == 0) {
708 if (ctx->tok == T_img)
709 ctx->tok = T_IMG;
710 else
711 ctx->tok = T_end_img;
712 } else {
713 lexerror(ctx, name);
714 }
715}
716
717/* characterData:
718 * Generate T_string token. Do this only when immediately in
719 * <TD>..</TD> or <HTML>..</HTML>, i.e., when inCell is true.
720 * Strip out formatting characters but keep spaces.
721 * Distinguish between all whitespace vs. strings with non-whitespace
722 * characters.
723 */
724static void characterData(void *user, const char *s, int length)
725{
726 htmllexstate_t *ctx = user;
727
728 int i, cnt = 0;
729 unsigned char c;
730
731 if (ctx->inCell) {
732 for (i = length; i; i--) {
733 c = *s++;
734 if (c >= ' ') {
735 cnt++;
736 agxbputc(ctx->xb, (char)c);
737 }
738 }
739 if (cnt) ctx->tok = T_string;
740 }
741}
742#endif
743
744int initHTMLlexer(htmlscan_t *scanner, char *src, agxbuf * xb, htmlenv_t *env)
745{
746#ifdef HAVE_EXPAT
747 htmllexstate_t *ctx = &scanner->lexer;
748
749 ctx->xb = xb;
750 ctx->lb = (agxbuf){0};
751 ctx->ptr = src;
752 ctx->mode = 0;
753 ctx->warn = 0;
754 ctx->error = 0;
755 ctx->currtok = (strview_t){0};
756 ctx->prevtok = (strview_t){0};
757 ctx->inCell = 1;
758 ctx->parser = XML_ParserCreate(charsetToStr(GD_charset(env->g)));
759 ctx->gvc = GD_gvc(env->g);
760 XML_SetUserData(ctx->parser, ctx);
761 XML_SetElementHandler(ctx->parser,
762 (XML_StartElementHandler) startElement,
763 endElement);
764 XML_SetCharacterDataHandler(ctx->parser, characterData);
765 return 0;
766#else
767 (void)scanner;
768 (void)src;
769 (void)xb;
770 (void)env;
771
772 static atomic_flag first;
773 if (!atomic_flag_test_and_set(&first)) {
775 "Not built with libexpat. Table formatting is not available.\n");
776 }
777 return 1;
778#endif
779}
780
782{
783#ifdef HAVE_EXPAT
784 htmllexstate_t *ctx = &scanner->lexer;
785 int rv = ctx->error ? 3 : ctx->warn;
786 XML_ParserFree(ctx->parser);
787 agxbfree (&ctx->lb);
788 return rv;
789#else
790 (void)scanner;
791
792 return 1;
793#endif
794}
795
797static UNUSED void agxbput_move(agxbuf *dst, const char *src) {
798 // we cannot call `agxbput` itself because it calls `memcpy`, thereby
799 // implicitly assuming that source and destination do not overlap
800 char *src_copy = gv_strdup(src);
801 agxbput(dst, src_copy);
802 free(src_copy);
803}
804
805#ifdef HAVE_EXPAT
806/* eatComment:
807 * Given first character after open comment, eat characters
808 * up to comment close, returning pointer to closing > if it exists,
809 * or null character otherwise.
810 * We rely on HTML strings having matched nested <>.
811 */
812static char *eatComment(htmllexstate_t *ctx, char *p)
813{
814 int depth = 1;
815 char *s = p;
816 char c;
817
818 while (depth && (c = *s++)) {
819 if (c == '<')
820 depth++;
821 else if (c == '>')
822 depth--;
823 }
824 s--; /* move back to '\0' or '>' */
825 if (*s) {
826 char *t = s - 2;
827 if (t < p || !startswith(t, "--")) {
828 agwarningf("Unclosed comment\n");
829 ctx->warn = 1;
830 }
831 }
832 return s;
833}
834
835/* findNext:
836 * Return next XML unit. This is either <..>, an HTML
837 * comment <!-- ... -->, or characters up to next <.
838 */
839static char *findNext(htmllexstate_t *ctx, char *s, agxbuf* xb)
840{
841 char* t = s + 1;
842 char c;
843
844 if (*s == '<') {
845 if (startswith(t, "!--"))
846 t = eatComment(ctx, t + 3);
847 else
848 while (*t && *t != '>')
849 t++;
850 if (*t != '>') {
851 agwarningf("Label closed before end of HTML element\n");
852 ctx->warn = 1;
853 } else
854 t++;
855 } else {
856 t = s;
857 while ((c = *t) && c != '<') {
858 if (c == '&' && *(t+1) != '#') {
859 t = scanEntity(t + 1, xb);
860 }
861 else {
862 agxbputc(xb, c);
863 t++;
864 }
865 }
866 }
867 return t;
868}
869
886static void protect_rsqb(agxbuf *xb) {
887
888 // if the buffer is empty, we have nothing to do
889 if (agxblen(xb) == 0) {
890 return;
891 }
892
893 // check the last character and if it is not ], we have nothing to do
894 char *data = agxbuse(xb);
895 size_t size = strlen(data);
896 assert(size > 0);
897 if (data[size - 1] != ']') {
898 agxbput_move(xb, data);
899 return;
900 }
901
902 // truncate ] and write back the remaining prefix
903 data[size - 1] = '\0';
904 agxbput_move(xb, data);
905
906 // write an XML-escaped version of ] as a replacement
907 agxbput(xb, "&#93;");
908}
909#endif
910
911
913 return htmllineno_ctx(&scanner->lexer);
914}
915
916static unsigned long htmllineno_ctx(htmllexstate_t *ctx) {
917#ifdef HAVE_EXPAT
918 return XML_GetCurrentLineNumber(ctx->parser);
919#else
920 (void)ctx;
921
922 return 0;
923#endif
924}
925
926#ifdef DEBUG
927static void printTok(htmllexstate_t *ctx, int tok)
928{
929 char *s;
930
931 switch (tok) {
932 case T_end_br:
933 s = "T_end_br";
934 break;
935 case T_end_img:
936 s = "T_end_img";
937 break;
938 case T_row:
939 s = "T_row";
940 break;
941 case T_end_row:
942 s = "T_end_row";
943 break;
944 case T_html:
945 s = "T_html";
946 break;
947 case T_end_html:
948 s = "T_end_html";
949 break;
950 case T_end_table:
951 s = "T_end_table";
952 break;
953 case T_end_cell:
954 s = "T_end_cell";
955 break;
956 case T_end_font:
957 s = "T_end_font";
958 break;
959 case T_string:
960 s = "T_string";
961 break;
962 case T_error:
963 s = "T_error";
964 break;
965 case T_n_italic:
966 s = "T_n_italic";
967 break;
968 case T_n_bold:
969 s = "T_n_bold";
970 break;
971 case T_n_underline:
972 s = "T_n_underline";
973 break;
974 case T_n_overline:
975 s = "T_n_overline";
976 break;
977 case T_n_sup:
978 s = "T_n_sup";
979 break;
980 case T_n_sub:
981 s = "T_n_sub";
982 break;
983 case T_n_s:
984 s = "T_n_s";
985 break;
986 case T_HR:
987 s = "T_HR";
988 break;
989 case T_hr:
990 s = "T_hr";
991 break;
992 case T_end_hr:
993 s = "T_end_hr";
994 break;
995 case T_VR:
996 s = "T_VR";
997 break;
998 case T_vr:
999 s = "T_vr";
1000 break;
1001 case T_end_vr:
1002 s = "T_end_vr";
1003 break;
1004 case T_BR:
1005 s = "T_BR";
1006 break;
1007 case T_br:
1008 s = "T_br";
1009 break;
1010 case T_IMG:
1011 s = "T_IMG";
1012 break;
1013 case T_img:
1014 s = "T_img";
1015 break;
1016 case T_table:
1017 s = "T_table";
1018 break;
1019 case T_cell:
1020 s = "T_cell";
1021 break;
1022 case T_font:
1023 s = "T_font";
1024 break;
1025 case T_italic:
1026 s = "T_italic";
1027 break;
1028 case T_bold:
1029 s = "T_bold";
1030 break;
1031 case T_underline:
1032 s = "T_underline";
1033 break;
1034 case T_overline:
1035 s = "T_overline";
1036 break;
1037 case T_sup:
1038 s = "T_sup";
1039 break;
1040 case T_sub:
1041 s = "T_sub";
1042 break;
1043 case T_s:
1044 s = "T_s";
1045 break;
1046 default:
1047 s = "<unknown>";
1048 }
1049 if (tok == T_string) {
1050 const char *token_text = agxbuse(ctx->xb);
1051 fprintf(stderr, "%s \"%s\"\n", s, token_text);
1052 agxbput_move(ctx->xb, token_text);
1053 } else
1054 fprintf(stderr, "%s\n", s);
1055}
1056
1057#endif
1058
/* htmllex:
 * Lexer entry point for the generated parser. Feeds the label to the
 * XML parser one unit at a time until one of the parser callbacks
 * (startElement, endElement, characterData) or an error sets a token,
 * then returns that token. Semantic values are written through
 * htmllval by the callbacks.
 * ctx->mode tracks progress: 0 = nothing fed yet (a synthetic "<HTML>"
 * is fed first), 1 = scanning the label, 2 = the synthetic "</HTML>"
 * has been fed; the next loop iteration or call then returns EOF.
 * Without expat, EOF is returned immediately.
 */
int htmllex(union HTMLSTYPE *htmllval, htmlscan_t *scanner)
{
#ifdef HAVE_EXPAT
    static char *begin_html = "<HTML>";
    static char *end_html = "</HTML>";

    char *s;
    char *endp = 0;
    size_t len, llen;
    int rv;
    htmllexstate_t *ctx = &scanner->lexer;

    ctx->htmllval = htmllval;
    ctx->tok = 0;		/* 0 = no token produced yet */
    do {
	if (ctx->mode == 2)
	    return EOF;
	if (ctx->mode == 0) {
	    /* first unit: wrap the label in an <HTML> element */
	    ctx->mode = 1;
	    s = begin_html;
	    len = strlen(s);
	    endp = 0;
	} else {
	    s = ctx->ptr;
	    if (*s == '\0') {
		/* label exhausted: close the wrapper element */
		ctx->mode = 2;
		s = end_html;
		len = strlen(s);
	    } else {
		/* text runs are additionally copied into ctx->lb */
		endp = findNext(ctx, s,&ctx->lb);
		len = (size_t)(endp - s);
	    }
	}

	protect_rsqb(&ctx->lb);

	/* remember the raw text of the last two units for error_context */
	ctx->prevtok = ctx->currtok;
	ctx->currtok = (strview_t){.data = s, .size = len};
	if ((llen = agxblen(&ctx->lb))) {
	    /* a text run was buffered: parse the buffered form */
	    assert(llen <= INT_MAX && "XML token too long for expat API");
	    rv = XML_Parse(ctx->parser, agxbuse(&ctx->lb), (int)llen, 0);
	} else {
	    /* markup: parse the raw text; an empty unit is passed with
	     * the final-chunk argument set */
	    assert(len <= INT_MAX && "XML token too long for expat API");
	    rv = XML_Parse(ctx->parser, s, (int)len, len ? 0 : 1);
	}
	if (rv == XML_STATUS_ERROR) {
	    /* report only the first error of a scan */
	    if (!ctx->error) {
		agerrorf("%s in line %lu \n",
			 XML_ErrorString(XML_GetErrorCode(ctx->parser)), htmllineno(scanner));
		error_context(ctx);
		ctx->error = 1;
		ctx->tok = T_error;
	    }
	}
	/* advance past the unit just consumed */
	if (endp)
	    ctx->ptr = endp;
    } while (ctx->tok == 0);	/* keep going until some token was set */
#ifdef DEBUG
    printTok (ctx, ctx->tok);
#endif
    return ctx->tok;
#else
    (void)htmllval;
    (void)scanner;

    return EOF;
#endif
}
1127
static void agxbfree(agxbuf *xb)
free any malloced resources
Definition agxbuf.h:97
static WUR char * agxbuse(agxbuf *xb)
Definition agxbuf.h:325
static size_t agxblen(const agxbuf *xb)
return number of characters currently stored
Definition agxbuf.h:108
static int agxbputc(agxbuf *xb, char c)
add character to buffer
Definition agxbuf.h:295
Memory allocation wrappers that exit on failure.
static char * gv_strdup(const char *original)
Definition alloc.h:101
static void * gv_alloc(size_t size)
Definition alloc.h:47
container data types API
#define dtinsert(d, o)
Definition cdt.h:186
char * scanEntity(char *t, agxbuf *xb)
Definition utils.c:1082
static Extype_t length(Exid_t *rhs, Exdisc_t *disc)
Definition compile.c:1615
static int flags
Definition gc.c:63
static double len(glCompPoint p)
Definition glutils.c:138
void free(void *)
#define SIZE_MAX
Definition gmlscan.c:347
#define UINT16_MAX
Definition gmlscan.c:340
#define INT8_MAX
Definition gmlscan.c:328
node NULL
Definition grammar.y:181
static int cnt(Dict_t *d, Dtlink_t **set)
Definition graph.c:200
void agwarningf(const char *fmt,...)
Definition agerror.c:175
void agerrorf(const char *fmt,...)
Definition agerror.c:167
int agerr(agerrlevel_t level, const char *fmt,...)
Definition agerror.c:157
@ AGPREV
Definition cgraph.h:946
#define GD_charset(g)
Definition types.h:367
#define GD_gvc(g)
Definition types.h:355
replacements for ctype.h functions
static char gv_tolower(int c)
Definition gv_ctype.h:81
agxbput(xb, staging)
int htmllex(union HTMLSTYPE *htmllval, htmlscan_t *scanner)
Definition htmllex.c:1059
static UNUSED void agxbput_move(agxbuf *dst, const char *src)
agxbput, but assume that source and destination may overlap
Definition htmllex.c:797
unsigned long htmllineno(htmlscan_t *scanner)
Definition htmllex.c:912
#define XML_STATUS_ERROR
Definition htmllex.c:39
static void error_context(htmllexstate_t *ctx)
Definition htmllex.c:47
int clearHTMLlexer(htmlscan_t *scanner)
Definition htmllex.c:781
int initHTMLlexer(htmlscan_t *scanner, char *src, agxbuf *xb, htmlenv_t *env)
Definition htmllex.c:744
void htmlerror(htmlscan_t *scanner, const char *msg)
Definition htmllex.c:56
static unsigned long htmllineno_ctx(htmllexstate_t *ctx)
Definition htmllex.c:916
#define T_n_sup
Definition htmlparse.h:138
#define T_end_row
Definition htmlparse.h:126
#define T_end_table
Definition htmlparse.h:129
#define T_br
Definition htmlparse.h:148
#define T_vr
Definition htmlparse.h:145
#define T_error
Definition htmlparse.h:133
#define T_n_s
Definition htmlparse.h:140
#define T_end_cell
Definition htmlparse.h:130
#define T_n_sub
Definition htmlparse.h:139
#define T_n_bold
Definition htmlparse.h:135
#define T_html
Definition htmlparse.h:127
#define T_BR
Definition htmlparse.h:147
#define T_underline
Definition htmlparse.h:156
#define T_sup
Definition htmlparse.h:158
#define T_row
Definition htmlparse.h:125
#define T_table
Definition htmlparse.h:151
#define T_end_vr
Definition htmlparse.h:146
#define T_end_html
Definition htmlparse.h:128
#define T_IMG
Definition htmlparse.h:149
#define T_VR
Definition htmlparse.h:144
#define T_bold
Definition htmlparse.h:155
#define T_end_img
Definition htmlparse.h:124
#define T_sub
Definition htmlparse.h:159
#define T_s
Definition htmlparse.h:160
#define T_n_italic
Definition htmlparse.h:134
#define T_end_font
Definition htmlparse.h:131
#define T_overline
Definition htmlparse.h:157
#define T_hr
Definition htmlparse.h:142
#define T_font
Definition htmlparse.h:153
#define T_italic
Definition htmlparse.h:154
#define T_end_br
Definition htmlparse.h:123
#define T_n_underline
Definition htmlparse.h:136
#define T_cell
Definition htmlparse.h:152
#define T_end_hr
Definition htmlparse.h:143
#define T_string
Definition htmlparse.h:132
#define T_img
Definition htmlparse.h:150
#define T_HR
Definition htmlparse.h:141
#define T_n_overline
Definition htmlparse.h:137
cleanup & scanner
Definition htmlparse.y:289
#define PAD_SET
Definition htmltable.h:33
#define BORDER_RIGHT
Definition htmltable.h:40
#define BORDER_TOP
Definition htmltable.h:39
#define HALIGN_TEXT
Definition htmltable.h:28
#define UNSET_ALIGN
Definition htmltable.h:44
#define HALIGN_LEFT
Definition htmltable.h:26
#define VALIGN_BOTTOM
Definition htmltable.h:30
#define BALIGN_RIGHT
Definition htmltable.h:35
#define BALIGN_LEFT
Definition htmltable.h:36
#define BORDER_BOTTOM
Definition htmltable.h:41
static void free_ritem(row_t *p)
Free row. This closes and frees row’s list, then the item itself is freed.
Definition htmltable.h:117
#define SPACE_SET
Definition htmltable.h:34
#define BORDER_SET
Definition htmltable.h:32
#define BORDER_LEFT
Definition htmltable.h:38
#define BORDER_MASK
Definition htmltable.h:42
#define HALIGN_RIGHT
Definition htmltable.h:25
#define VALIGN_TOP
Definition htmltable.h:29
#define FIXED_FLAG
Definition htmltable.h:24
char * charsetToStr(int c)
Given an internal charset value, return a canonical string representation.
Definition input.c:814
static bool startswith(const char *s, const char *prefix)
does the string s begin with the string prefix?
Definition startswith.h:11
platform abstraction for case-insensitive string functions
Dt_t * textfont_dt
Definition gvcint.h:108
result of partitioning available space, part of maze
Definition grid.h:33
uint16_t rowspan
Definition htmltable.h:158
uint16_t colspan
Definition htmltable.h:157
char * bgcolor
Definition htmltable.h:87
unsigned char border
Definition htmltable.h:91
char * target
Definition htmltable.h:84
char * id
Definition htmltable.h:86
signed char space
Definition htmltable.h:90
unsigned short width
Definition htmltable.h:95
unsigned short height
Definition htmltable.h:96
int gradientangle
Definition htmltable.h:89
char * port
Definition htmltable.h:83
unsigned short flags
Definition htmltable.h:94
char * href
Definition htmltable.h:82
unsigned char pad
Definition htmltable.h:92
char * pencolor
Definition htmltable.h:88
htmlstyle_t style
Definition htmltable.h:97
char * title
Definition htmltable.h:85
graph_t * g
Definition htmltable.h:171
char * scale
Definition htmltable.h:70
char * src
Definition htmltable.h:69
HTMLSTYPE * htmllval
Definition htmlparse.h:229
agxbuf * xb
Definition htmlparse.h:220
strview_t prevtok
Definition htmlparse.h:227
strview_t currtok
Definition htmlparse.h:226
bool dashed
Definition htmltable.h:78
bool dotted
Definition htmltable.h:77
bool rounded
Definition htmltable.h:75
bool radial
Definition htmltable.h:74
bool invisible
Definition htmltable.h:76
bool vrule
vertical rule
Definition htmltable.h:143
size_t row_count
number of rows
Definition htmltable.h:139
bool hrule
horizontal rule
Definition htmltable.h:142
rows_t rows
cells
Definition htmltable.h:133
int8_t cellborder
Definition htmltable.h:136
Definition utils.c:752
a non-owning string reference
Definition strview.h:20
const char * data
start of the pointed to string
Definition strview.h:21
size_t size
extent of the string in bytes
Definition strview.h:22
char * color
Definition textspan.h:55
char * name
Definition textspan.h:54
unsigned int flags
Definition textspan.h:58
double size
Definition textspan.h:57
state for an in-progress string tokenization
Definition tokenize.h:36
Non-owning string references.
static bool strview_case_str_eq(strview_t a, const char *b)
compare a string reference to a string for case insensitive equality
Definition strview.h:62
#define HTML_OL
Definition textspan.h:35
#define HTML_IF
Definition textspan.h:30
#define HTML_UL
Definition textspan.h:31
#define HTML_BF
Definition textspan.h:29
#define HTML_SUP
Definition textspan.h:32
#define HTML_S
Definition textspan.h:34
#define GV_TEXTFONT_FLAGS_WIDTH
Definition textspan.h:24
#define HTML_SUB
Definition textspan.h:33
String tokenization.
static strview_t tok_get(const tok_t *t)
get the current token
Definition tokenize.h:76
static tok_t tok(const char *input, const char *separators)
begin tokenization of a new string
Definition tokenize.h:43
static bool tok_end(const tok_t *t)
is this tokenizer exhausted?
Definition tokenize.h:68
static void tok_next(tok_t *t)
advance to the next token in the string being scanned
Definition tokenize.h:85
htmltbl_t * tbl
Definition htmlparse.h:171
htmlcell_t * cell
Definition htmlparse.h:170
textfont_t * font
Definition htmlparse.h:172
htmlimg_t * img
Definition htmlparse.h:173
Definition grammar.c:90
abstraction for squashing compiler warnings for unused symbols
#define UNUSED
Definition unused.h:25