Graphviz 12.0.1~dev.20240715.2254
Loading...
Searching...
No Matches
htmllex.c
Go to the documentation of this file.
1
3/*************************************************************************
4 * Copyright (c) 2011 AT&T Intellectual Property
5 * All rights reserved. This program and the accompanying materials
6 * are made available under the terms of the Eclipse Public License v1.0
7 * which accompanies this distribution, and is available at
8 * https://www.eclipse.org/legal/epl-v10.html
9 *
10 * Contributors: Details at https://graphviz.org
11 *************************************************************************/
12
13#include <assert.h>
14#include <common/render.h>
15#include <common/htmltable.h>
16#include "htmlparse.h"
17#include <common/htmllex.h>
18#include <cdt/cdt.h>
19#include <cgraph/alloc.h>
20#include <cgraph/gv_ctype.h>
21#include <cgraph/startswith.h>
22#include <cgraph/strcasecmp.h>
23#include <cgraph/strview.h>
24#include <cgraph/tokenize.h>
25#include <cgraph/unused.h>
26#include <limits.h>
27#include <stdbool.h>
28#include <stddef.h>
29#include <stdint.h>
30
31#ifdef HAVE_EXPAT
32#ifdef _WIN32
33// ensure that the expat functions get the correct storage class
34// declarations also on MinGW
35#define XML_USE_MSC_EXTENSIONS 1
36#endif
37#include <expat.h>
38#endif
39
40#ifndef XML_STATUS_ERROR
41#define XML_STATUS_ERROR 0
42#endif
43
/* Lexer state used by the functions in this file (they access it through
 * the file-scope object `state`).
 * NOTE(review): the closing lines of this definition are not present in
 * this listing.
 */
44typedef struct {
45#ifdef HAVE_EXPAT
46 XML_Parser parser; // expat parser created in initHTMLlexer
47#endif
48 char* ptr; // input source
49 int tok; // token type
50 agxbuf* xb; // buffer to gather T_string data
51 agxbuf lb; // buffer for translating lexical data
52 int warn; // set if warning given
53 int error; // set if error given
54 char inCell; // set if in TD to allow T_string
55 char mode; // for handling artificial <HTML>..</HTML>
56 char *currtok; // for error reporting
57 char *prevtok; // for error reporting
58 size_t currtoklen; // presumably the length of currtok - TODO confirm
59 size_t prevtoklen; // presumably the length of prevtok - TODO confirm
62
63/* error_context:
64 * Print the last 2 "token"s seen.
65 */
74
75/* htmlerror:
76 * yyerror - called by yacc output
 * Reports msg together with the current input line through agerrorf.
 * Only the first error is reported: once state.error is set, later calls
 * return without printing anything.
77 */
78void htmlerror(const char *msg)
79{
80 if (state.error)
81 return; // an error has already been reported
82 state.error = 1;
83 agerrorf("%s in line %lu \n", msg, htmllineno());
// NOTE(review): source line 84 is missing from this listing.
85}
86
87#ifdef HAVE_EXPAT
88/* lexerror:
89 * called by lexer when unknown <..> is found.
 * name is the unrecognized element name; it is echoed in the error
 * message together with the current input line. Sets state.error.
90 */
91static void lexerror(const char *name)
92{
// NOTE(review): source line 93 is missing from this listing.
94 state.error = 1;
95 agerrorf("Unknown HTML element <%s> on line %lu \n", name, htmllineno());
96}
97
/* Attribute handler: receives the object being filled in and the attribute
 * value. The handlers in this file return 0 when the value was accepted and
 * 1 when it was rejected with a warning.
 */
98typedef int (*attrFn) (void *, char *);
/* Comparison callback taking two untyped pointers. */
99typedef int (*bcmpfn) (const void *, const void *);
100
101/* Mechanism for automatically processing attributes */
102typedef struct {
103 char *name; /* attribute name */
104 attrFn action; /* action to perform if name matches */
105} attr_item;
106
/* Size of one table entry; used to compute table lengths and as the
 * element size handed to bsearch.
 */
107#define ISIZE (sizeof(attr_item))
108
109/* icmp:
110 * Compare an attr_item. Used in bsearch
111 */
112static int icmp(const void *name, const void *item) {
113 const attr_item *j = item;
114 return strcasecmp(name, j->name);
115}
116
117static int bgcolorfn(htmldata_t * p, char *v)
118{
119 p->bgcolor = strdup(v);
120 return 0;
121}
122
123static int pencolorfn(htmldata_t * p, char *v)
124{
125 p->pencolor = strdup(v);
126 return 0;
127}
128
129static int hreffn(htmldata_t * p, char *v)
130{
131 p->href = strdup(v);
132 return 0;
133}
134
/* sidesfn:
 * Attribute handler for SIDES. Each character of v is lower-cased and
 * matched against 'l', 't', 'r' and 'b'; any other character produces a
 * warning but does not make the handler fail. Always returns 0.
 * NOTE(review): the statements for 'l', 'r' and 'b' (source lines 143, 149
 * and 152) are missing from this listing; presumably they OR in the
 * matching BORDER_* bit the way the 't' case does - confirm.
 */
135static int sidesfn(htmldata_t * p, char *v)
136{
137 unsigned short flags = 0;
138 char c;
139
140 while ((c = *v++)) {
141 switch (gv_tolower(c)) {
142 case 'l' :
144 break;
145 case 't' :
146 flags |= BORDER_TOP;
147 break;
148 case 'r' :
150 break;
151 case 'b' :
153 break;
154 default :
155 agwarningf("Unrecognized character '%c' (%d) in sides attribute\n", c, c);
156 break;
157 }
158 }
// the collected bits are stored only when they differ from BORDER_MASK
// (presumably "all sides", which needs no explicit flag - confirm)
159 if (flags != BORDER_MASK)
160 p->flags |= flags;
161 return 0;
162}
163
164static int titlefn(htmldata_t * p, char *v)
165{
166 p->title = strdup(v);
167 return 0;
168}
169
170static int portfn(htmldata_t * p, char *v)
171{
172 p->port = strdup(v);
173 return 0;
174}
175
176#define DELIM " ,"
177
178static int stylefn(htmldata_t * p, char *v)
179{
180 int rv = 0;
181 for (tok_t t = tok(v, DELIM); !tok_end(&t); tok_next(&t)) {
182 strview_t tk = tok_get(&t);
183 if (strview_case_str_eq(tk, "ROUNDED")) p->style.rounded = true;
184 else if (strview_case_str_eq(tk, "RADIAL")) p->style.radial = true;
185 else if (strview_case_str_eq(tk,"SOLID")) {
186 p->style.dotted = false;
187 p->style.dashed = false;
188 } else if (strview_case_str_eq(tk,"INVISIBLE") ||
189 strview_case_str_eq(tk,"INVIS")) p->style.invisible = true;
190 else if (strview_case_str_eq(tk,"DOTTED")) p->style.dotted = true;
191 else if (strview_case_str_eq(tk,"DASHED")) p->style.dashed = true;
192 else {
193 agwarningf("Illegal value %.*s for STYLE - ignored\n", (int)tk.size,
194 tk.data);
195 rv = 1;
196 }
197 }
198 return rv;
199}
200
201static int targetfn(htmldata_t * p, char *v)
202{
203 p->target = strdup(v);
204 return 0;
205}
206
207static int idfn(htmldata_t * p, char *v)
208{
209 p->id = strdup(v);
210 return 0;
211}
212
213
/* doInt:
 * Scan v for a leading decimal integer and check that it lies in
 * [min, max]. On success the value is stored in *ul and 0 is returned.
 * Otherwise a warning naming the attribute (s) is issued, *ul is left
 * untouched and 1 is returned.
 * Characters following the number are not examined.
 */
static int doInt(char *v, char *s, int min, int max, long *ul)
{
    char *ep;
    long b = strtol(v, &ep, 10);

    // Every warning ends in '\n' like the other diagnostics in this file;
    // without it the message runs into whatever is printed next.
    if (ep == v) {
        // strtol consumed nothing: v does not start with a number
        agwarningf("Improper %s value %s - ignored\n", s, v);
        return 1;
    }
    if (b > max) {
        agwarningf("%s value %s > %d - too large - ignored\n", s, v, max);
        return 1;
    }
    if (b < min) {
        agwarningf("%s value %s < %d - too small - ignored\n", s, v, min);
        return 1;
    }
    *ul = b;
    return 0;
}
239
240
241static int gradientanglefn(htmldata_t * p, char *v)
242{
243 long u;
244
245 if (doInt(v, "GRADIENTANGLE", 0, 360, &u))
246 return 1;
247 p->gradientangle = (unsigned short) u;
248 return 0;
249}
250
251
252static int borderfn(htmldata_t * p, char *v)
253{
254 long u;
255
256 if (doInt(v, "BORDER", 0, UCHAR_MAX, &u))
257 return 1;
258 p->border = (unsigned char) u;
259 p->flags |= BORDER_SET;
260 return 0;
261}
262
263static int cellpaddingfn(htmldata_t * p, char *v)
264{
265 long u;
266
267 if (doInt(v, "CELLPADDING", 0, UCHAR_MAX, &u))
268 return 1;
269 p->pad = (unsigned char) u;
270 p->flags |= PAD_SET;
271 return 0;
272}
273
274static int cellspacingfn(htmldata_t * p, char *v)
275{
276 long u;
277
278 if (doInt(v, "CELLSPACING", SCHAR_MIN, SCHAR_MAX, &u))
279 return 1;
280 p->space = (signed char) u;
281 p->flags |= SPACE_SET;
282 return 0;
283}
284
285static int cellborderfn(htmltbl_t * p, char *v)
286{
287 long u;
288
289 if (doInt(v, "CELLBORDER", 0, INT8_MAX, &u))
290 return 1;
291 p->cellborder = (int8_t)u;
292 return 0;
293}
294
295static int columnsfn(htmltbl_t * p, char *v)
296{
297 if (*v != '*') {
298 agwarningf("Unknown value %s for COLUMNS - ignored\n", v);
299 return 1;
300 }
301 p->vrule = true;
302 return 0;
303}
304
305static int rowsfn(htmltbl_t * p, char *v)
306{
307 if (*v != '*') {
308 agwarningf("Unknown value %s for ROWS - ignored\n", v);
309 return 1;
310 }
311 p->hrule = true;
312 return 0;
313}
314
315static int fixedsizefn(htmldata_t * p, char *v)
316{
317 int rv = 0;
318 if (!strcasecmp(v, "TRUE"))
319 p->flags |= FIXED_FLAG;
320 else if (strcasecmp(v, "FALSE")) {
321 agwarningf("Illegal value %s for FIXEDSIZE - ignored\n", v);
322 rv = 1;
323 }
324 return rv;
325}
326
327static int valignfn(htmldata_t * p, char *v)
328{
329 int rv = 0;
330 if (!strcasecmp(v, "BOTTOM"))
331 p->flags |= VALIGN_BOTTOM;
332 else if (!strcasecmp(v, "TOP"))
333 p->flags |= VALIGN_TOP;
334 else if (strcasecmp(v, "MIDDLE")) {
335 agwarningf("Illegal value %s for VALIGN - ignored\n", v);
336 rv = 1;
337 }
338 return rv;
339}
340
341static int halignfn(htmldata_t * p, char *v)
342{
343 int rv = 0;
344 if (!strcasecmp(v, "LEFT"))
345 p->flags |= HALIGN_LEFT;
346 else if (!strcasecmp(v, "RIGHT"))
347 p->flags |= HALIGN_RIGHT;
348 else if (strcasecmp(v, "CENTER")) {
349 agwarningf("Illegal value %s for ALIGN - ignored\n", v);
350 rv = 1;
351 }
352 return rv;
353}
354
355static int cell_halignfn(htmldata_t * p, char *v)
356{
357 int rv = 0;
358 if (!strcasecmp(v, "LEFT"))
359 p->flags |= HALIGN_LEFT;
360 else if (!strcasecmp(v, "RIGHT"))
361 p->flags |= HALIGN_RIGHT;
362 else if (!strcasecmp(v, "TEXT"))
363 p->flags |= HALIGN_TEXT;
364 else if (strcasecmp(v, "CENTER"))
365 rv = 1;
366 if (rv)
367 agwarningf("Illegal value %s for ALIGN in TD - ignored\n", v);
368 return rv;
369}
370
371static int balignfn(htmldata_t * p, char *v)
372{
373 int rv = 0;
374 if (!strcasecmp(v, "LEFT"))
375 p->flags |= BALIGN_LEFT;
376 else if (!strcasecmp(v, "RIGHT"))
377 p->flags |= BALIGN_RIGHT;
378 else if (strcasecmp(v, "CENTER"))
379 rv = 1;
380 if (rv)
381 agwarningf("Illegal value %s for BALIGN in TD - ignored\n", v);
382 return rv;
383}
384
385static int heightfn(htmldata_t * p, char *v)
386{
387 long u;
388
389 if (doInt(v, "HEIGHT", 0, USHRT_MAX, &u))
390 return 1;
391 p->height = (unsigned short) u;
392 return 0;
393}
394
395static int widthfn(htmldata_t * p, char *v)
396{
397 long u;
398
399 if (doInt(v, "WIDTH", 0, USHRT_MAX, &u))
400 return 1;
401 p->width = (unsigned short) u;
402 return 0;
403}
404
405static int rowspanfn(htmlcell_t * p, char *v)
406{
407 long u;
408
409 if (doInt(v, "ROWSPAN", 0, UINT16_MAX, &u))
410 return 1;
411 if (u == 0) {
412 agwarningf("ROWSPAN value cannot be 0 - ignored\n");
413 return 1;
414 }
415 p->rowspan = (uint16_t)u;
416 return 0;
417}
418
419static int colspanfn(htmlcell_t * p, char *v)
420{
421 long u;
422
423 if (doInt(v, "COLSPAN", 0, UINT16_MAX, &u))
424 return 1;
425 if (u == 0) {
426 agwarningf("COLSPAN value cannot be 0 - ignored\n");
427 return 1;
428 }
429 p->colspan = (uint16_t)u;
430 return 0;
431}
432
433static int fontcolorfn(textfont_t * p, char *v)
434{
435 p->color = v;
436 return 0;
437}
438
439static int facefn(textfont_t * p, char *v)
440{
441 p->name = v;
442 return 0;
443}
444
445static int ptsizefn(textfont_t * p, char *v)
446{
447 long u;
448
449 if (doInt(v, "POINT-SIZE", 0, UCHAR_MAX, &u))
450 return 1;
451 p->size = (double) u;
452 return 0;
453}
454
455static int srcfn(htmlimg_t * p, char *v)
456{
457 p->src = strdup(v);
458 return 0;
459}
460
461static int scalefn(htmlimg_t * p, char *v)
462{
463 p->scale = strdup(v);
464 return 0;
465}
466
/* alignfn:
 * <BR> attribute handler for ALIGN (case-insensitive). Writes 'r', 'l' or
 * 'n' to *p for "RIGHT", "LEFT" and "CENTER" respectively. Any other value
 * is warned about and leaves *p untouched.
 * Returns 0 on success, 1 otherwise.
 */
static int alignfn(int *p, char *v)
{
    if (strcasecmp(v, "RIGHT") == 0) {
        *p = 'r';
        return 0;
    }
    if (strcasecmp(v, "LEFT") == 0) {
        *p = 'l';
        return 0;
    }
    if (strcasecmp(v, "CENTER") == 0) {
        *p = 'n';
        return 0;
    }
    agwarningf("Illegal value %s for ALIGN - ignored\n", v);
    return 1;
}
482
483/* Tables used in binary search; MUST be alphabetized */
484static attr_item tbl_items[] = {
485 {"align", (attrFn) halignfn},
486 {"bgcolor", (attrFn) bgcolorfn},
487 {"border", (attrFn) borderfn},
488 {"cellborder", (attrFn) cellborderfn},
489 {"cellpadding", (attrFn) cellpaddingfn},
490 {"cellspacing", (attrFn) cellspacingfn},
491 {"color", (attrFn) pencolorfn},
492 {"columns", (attrFn) columnsfn},
493 {"fixedsize", (attrFn) fixedsizefn},
494 {"gradientangle", (attrFn) gradientanglefn},
495 {"height", (attrFn) heightfn},
496 {"href", (attrFn) hreffn},
497 {"id", (attrFn) idfn},
498 {"port", (attrFn) portfn},
499 {"rows", (attrFn) rowsfn},
500 {"sides", (attrFn) sidesfn},
501 {"style", (attrFn) stylefn},
502 {"target", (attrFn) targetfn},
503 {"title", (attrFn) titlefn},
504 {"tooltip", (attrFn) titlefn},
505 {"valign", (attrFn) valignfn},
506 {"width", (attrFn) widthfn},
507};
508
509static attr_item cell_items[] = {
510 {"align", (attrFn) cell_halignfn},
511 {"balign", (attrFn) balignfn},
512 {"bgcolor", (attrFn) bgcolorfn},
513 {"border", (attrFn) borderfn},
514 {"cellpadding", (attrFn) cellpaddingfn},
515 {"cellspacing", (attrFn) cellspacingfn},
516 {"color", (attrFn) pencolorfn},
517 {"colspan", (attrFn) colspanfn},
518 {"fixedsize", (attrFn) fixedsizefn},
519 {"gradientangle", (attrFn) gradientanglefn},
520 {"height", (attrFn) heightfn},
521 {"href", (attrFn) hreffn},
522 {"id", (attrFn) idfn},
523 {"port", (attrFn) portfn},
524 {"rowspan", (attrFn) rowspanfn},
525 {"sides", (attrFn) sidesfn},
526 {"style", (attrFn) stylefn},
527 {"target", (attrFn) targetfn},
528 {"title", (attrFn) titlefn},
529 {"tooltip", (attrFn) titlefn},
530 {"valign", (attrFn) valignfn},
531 {"width", (attrFn) widthfn},
532};
533
534static attr_item font_items[] = {
535 {"color", (attrFn) fontcolorfn},
536 {"face", (attrFn) facefn},
537 {"point-size", (attrFn) ptsizefn},
538};
539
540static attr_item img_items[] = {
541 {"scale", (attrFn) scalefn},
542 {"src", (attrFn) srcfn},
543};
544
545static attr_item br_items[] = {
546 {"align", (attrFn) alignfn},
547};
548
549/* doAttrs:
550 * General function for processing list of name/value attributes.
551 * Do binary search on items table. If match found, invoke action
552 * passing it tp and attribute value.
553 * Table size is given by nel
554 * Name/value pairs are in array atts, which is null terminated.
555 * s is the name of the HTML element being processed.
556 */
557static void doAttrs(void *tp, attr_item *items, size_t nel, char **atts,
558 char *s) {
559 char *name;
560 char *val;
561 attr_item *ip;
562
563 while ((name = *atts++) != NULL) {
564 val = *atts++;
565 ip = bsearch(name, items, nel, ISIZE, icmp);
566 if (ip)
567 state.warn |= ip->action(tp, val);
568 else {
569 agwarningf("Illegal attribute %s in %s - ignored\n", name,
570 s);
571 state.warn = 1;
572 }
573 }
574}
575
/* mkBR:
 * Process the attributes of a <BR> element; the ALIGN handler writes its
 * result into htmllval.i.
 * NOTE(review): source line 578 is missing from this listing; presumably it
 * gives htmllval.i its default before the attributes are applied - confirm.
 */
576static void mkBR(char **atts)
577{
579 doAttrs(&htmllval.i, br_items, sizeof(br_items) / ISIZE, atts, "<BR>");
580}
581
582static htmlimg_t *mkImg(char **atts)
583{
584 htmlimg_t *img = gv_alloc(sizeof(htmlimg_t));
585
586 doAttrs(img, img_items, sizeof(img_items) / ISIZE, atts, "<IMG>");
587
588 return img;
589}
590
591static textfont_t *mkFont(GVC_t *gvc, char **atts, unsigned char flags) {
592 textfont_t tf = {NULL,NULL,NULL,0.0,0,0};
593
594 tf.size = -1.0; /* unassigned */
595 enum { FLAGS_MAX = (1 << GV_TEXTFONT_FLAGS_WIDTH) - 1 };
596 assert(flags <= FLAGS_MAX);
597 tf.flags = (unsigned char)(flags & FLAGS_MAX);
598 if (atts)
599 doAttrs(&tf, font_items, sizeof(font_items) / ISIZE, atts, "<FONT>");
600
601 return dtinsert(gvc->textfont_dt, &tf);
602}
603
/* mkCell:
 * Create the cell object for a <TD> element: both spans default to 1 and
 * are then overridden by whatever attributes atts supplies.
 * NOTE(review): source line 606, which declares and presumably allocates
 * `cell`, is missing from this listing.
 */
604static htmlcell_t *mkCell(char **atts)
605{
607
608 cell->colspan = 1;
609 cell->rowspan = 1;
610 doAttrs(cell, cell_items, sizeof(cell_items) / ISIZE, atts, "<TD>");
611
612 return cell;
613}
614
615static htmltbl_t *mkTbl(char **atts)
616{
617 htmltbl_t *tbl = gv_alloc(sizeof(htmltbl_t));
618
619 tbl->row_count = SIZE_MAX; // flag that table is a raw, parsed table
620 tbl->cellborder = -1; // unset cell border attribute
621 doAttrs(tbl, tbl_items, sizeof(tbl_items) / ISIZE, atts, "<TABLE>");
622
623 return tbl;
624}
625
/* startElement:
 * expat start-tag callback. user is the GVC_t registered through
 * XML_SetUserData, name the element name (matched case-insensitively) and
 * atts its NULL-terminated name/value list. Depending on the element the
 * function builds the semantic value in htmllval, updates state.inCell and
 * sets state.tok; unknown elements are reported through lexerror.
 * NOTE(review): source lines 652, 655 and 658 are missing from this
 * listing; presumably they set state.tok for <U>, <O> and <I> - confirm.
 */
626static void startElement(void *user, const char *name, char **atts)
627{
628 GVC_t *gvc = user;
629
630 if (strcasecmp(name, "TABLE") == 0) {
631 htmllval.tbl = mkTbl(atts);
632 state.inCell = 0;
633 state.tok = T_table;
// <TR> and <TH> are treated identically
634 } else if (strcasecmp(name, "TR") == 0 || strcasecmp(name, "TH") == 0) {
635 state.inCell = 0;
636 state.tok = T_row;
637 } else if (strcasecmp(name, "TD") == 0) {
638 state.inCell = 1;
639 htmllval.cell = mkCell(atts);
640 state.tok = T_cell;
641 } else if (strcasecmp(name, "FONT") == 0) {
642 htmllval.font = mkFont(gvc, atts, 0);
643 state.tok = T_font;
// the pure style elements below pass no attributes to mkFont, only a flag
644 } else if (strcasecmp(name, "B") == 0) {
645 htmllval.font = mkFont(gvc, 0, HTML_BF);
646 state.tok = T_bold;
647 } else if (strcasecmp(name, "S") == 0) {
648 htmllval.font = mkFont(gvc, 0, HTML_S);
649 state.tok = T_s;
650 } else if (strcasecmp(name, "U") == 0) {
651 htmllval.font = mkFont(gvc, 0, HTML_UL);
653 } else if (strcasecmp(name, "O") == 0) {
654 htmllval.font = mkFont(gvc, 0, HTML_OL);
656 } else if (strcasecmp(name, "I") == 0) {
657 htmllval.font = mkFont(gvc, 0, HTML_IF);
659 } else if (strcasecmp(name, "SUP") == 0) {
660 htmllval.font = mkFont(gvc, 0, HTML_SUP);
661 state.tok = T_sup;
662 } else if (strcasecmp(name, "SUB") == 0) {
663 htmllval.font = mkFont(gvc, 0, HTML_SUB);
664 state.tok = T_sub;
665 } else if (strcasecmp(name, "BR") == 0) {
666 mkBR(atts);
667 state.tok = T_br;
668 } else if (strcasecmp(name, "HR") == 0) {
669 state.tok = T_hr;
670 } else if (strcasecmp(name, "VR") == 0) {
671 state.tok = T_vr;
672 } else if (strcasecmp(name, "IMG") == 0) {
673 htmllval.img = mkImg(atts);
674 state.tok = T_img;
675 } else if (strcasecmp(name, "HTML") == 0) {
676 state.tok = T_html;
677 } else {
678 lexerror(name);
679 }
680}
681
/* endElement:
 * expat end-tag callback; user is unused. name is matched
 * case-insensitively; unknown elements are reported through lexerror.
 * For BR, HR, VR and IMG the token becomes the upper-case variant (T_BR,
 * T_HR, T_VR, T_IMG) when state.tok still holds the matching start token.
 * NOTE(review): many statements of this function (source lines 687, 690,
 * 692, 695, 697, 699, 701, 703, 705, 716, 721, 726 and 731) are missing
 * from this listing, so several branches below appear empty and the bare
 * `else` lines have lost their statements.
 */
682static void endElement(void *user, const char *name)
683{
684 (void)user;
685
686 if (strcasecmp(name, "TABLE") == 0) {
688 state.inCell = 1;
689 } else if (strcasecmp(name, "TR") == 0 || strcasecmp(name, "TH") == 0) {
691 } else if (strcasecmp(name, "TD") == 0) {
693 state.inCell = 0;
694 } else if (strcasecmp(name, "HTML") == 0) {
696 } else if (strcasecmp(name, "FONT") == 0) {
698 } else if (strcasecmp(name, "B") == 0) {
700 } else if (strcasecmp(name, "U") == 0) {
702 } else if (strcasecmp(name, "O") == 0) {
704 } else if (strcasecmp(name, "I") == 0) {
706 } else if (strcasecmp(name, "SUP") == 0) {
707 state.tok = T_n_sup;
708 } else if (strcasecmp(name, "SUB") == 0) {
709 state.tok = T_n_sub;
710 } else if (strcasecmp(name, "S") == 0) {
711 state.tok = T_n_s;
712 } else if (strcasecmp(name, "BR") == 0) {
713 if (state.tok == T_br)
714 state.tok = T_BR;
715 else
717 } else if (strcasecmp(name, "HR") == 0) {
718 if (state.tok == T_hr)
719 state.tok = T_HR;
720 else
722 } else if (strcasecmp(name, "VR") == 0) {
723 if (state.tok == T_vr)
724 state.tok = T_VR;
725 else
727 } else if (strcasecmp(name, "IMG") == 0) {
728 if (state.tok == T_img)
729 state.tok = T_IMG;
730 else
732 } else {
733 lexerror(name);
734 }
735}
736
737/* characterData:
738 * Generate T_string token. Do this only when immediately in
739 * <TD>..</TD> or <HTML>..</HTML>, i.e., when inCell is true.
740 * Strip out formatting characters but keep spaces.
741 * Distinguish between all whitespace vs. strings with non-whitespace
742 * characters.
743 */
744static void characterData(void *user, const char *s, int length)
745{
746 (void)user;
747
748 int i, cnt = 0;
749 unsigned char c;
750
751 if (state.inCell) {
752 for (i = length; i; i--) {
753 c = *s++;
754 if (c >= ' ') {
755 cnt++;
756 agxbputc(state.xb, (char)c);
757 }
758 }
759 if (cnt) state.tok = T_string;
760 }
761}
762#endif
763
/* initHTMLlexer:
 * Prepare the lexer for scanning the HTML-like label src. xb is the buffer
 * that will receive T_string text; env supplies the graph whose charset and
 * GVC_t are handed to expat.
 * Returns 0 when built with expat. Without expat it returns 1; the message
 * text below suggests a one-time notice is emitted on the first call.
 */
764int initHTMLlexer(char *src, agxbuf * xb, htmlenv_t *env)
765{
766#ifdef HAVE_EXPAT
767 state.xb = xb;
768 state.lb = (agxbuf){0};
769 state.ptr = src;
770 state.mode = 0;
771 state.warn = 0;
772 state.error = 0;
773 state.currtoklen = 0;
774 state.prevtoklen = 0;
// text is accepted from the start, before any <TD> has been seen
775 state.inCell = 1;
// NOTE(review): the result of XML_ParserCreate is not checked for NULL
776 state.parser = XML_ParserCreate(charsetToStr(GD_charset(env->g)));
// the GVC_t becomes the `user` argument of the callbacks registered below
777 XML_SetUserData(state.parser, GD_gvc(env->g));
778 XML_SetElementHandler(state.parser,
779 (XML_StartElementHandler) startElement,
780 endElement);
781 XML_SetCharacterDataHandler(state.parser, characterData);
782 return 0;
783#else
784 static int first;
785 if (!first) {
// NOTE(review): source line 786, the call this string belongs to, is
// missing from this listing.
787 "Not built with libexpat. Table formatting is not available.\n");
788 first++;
789 }
790 return 1;
791#endif
792}
793
795{
// NOTE(review): the signature line (source line 794) is missing from this
// listing; the cross-reference index names this function
// `int clearHTMLlexer(void)`.
// Releases the expat parser and the translation buffer state.lb.
// Returns 3 if an error was recorded, otherwise the warning flag; without
// expat it always returns 1.
796#ifdef HAVE_EXPAT
797 int rv = state.error ? 3 : state.warn;
798 XML_ParserFree(state.parser);
799 agxbfree (&state.lb);
800 return rv;
801#else
802 return 1;
803#endif
804}
805
807static UNUSED void agxbput_move(agxbuf *dst, const char *src) {
808 // we cannot call `agxbput` itself because it calls `memcpy`, thereby
809 // implicitly assuming that source and destination do not overlap
810 char *src_copy = gv_strdup(src);
811 agxbput(dst, src_copy);
812 free(src_copy);
813}
814
815#ifdef HAVE_EXPAT
816/* eatComment:
817 * Given first character after open comment, eat characters
818 * up to comment close, returning pointer to closing > if it exists,
819 * or null character otherwise.
820 * We rely on HTML strings having matched nested <>.
821 */
822static char *eatComment(char *p)
823{
824 int depth = 1;
825 char *s = p;
826 char c;
827
828 while (depth && (c = *s++)) {
829 if (c == '<')
830 depth++;
831 else if (c == '>')
832 depth--;
833 }
834 s--; /* move back to '\0' or '>' */
835 if (*s) {
836 char *t = s - 2;
837 if (t < p || !startswith(t, "--")) {
838 agwarningf("Unclosed comment\n");
839 state.warn = 1;
840 }
841 }
842 return s;
843}
844
845/* findNext:
846 * Return next XML unit. This is either <..>, an HTML
847 * comment <!-- ... -->, or characters up to next <.
848 */
849static char *findNext(char *s, agxbuf* xb)
850{
851 char* t = s + 1;
852 char c;
853
854 if (*s == '<') {
855 if (startswith(t, "!--"))
856 t = eatComment(t + 3);
857 else
858 while (*t && *t != '>')
859 t++;
860 if (*t != '>') {
861 agwarningf("Label closed before end of HTML element\n");
862 state.warn = 1;
863 } else
864 t++;
865 } else {
866 t = s;
867 while ((c = *t) && c != '<') {
868 if (c == '&' && *(t+1) != '#') {
869 t = scanEntity(t + 1, xb);
870 }
871 else {
872 agxbputc(xb, c);
873 t++;
874 }
875 }
876 }
877 return t;
878}
879
896static void protect_rsqb(agxbuf *xb) {
897
898 // if the buffer is empty, we have nothing to do
899 if (agxblen(xb) == 0) {
900 return;
901 }
902
903 // check the last character and if it is not ], we have nothing to do
904 char *data = agxbuse(xb);
905 size_t size = strlen(data);
906 assert(size > 0);
907 if (data[size - 1] != ']') {
908 agxbput_move(xb, data);
909 return;
910 }
911
912 // truncate ] and write back the remaining prefix
913 data[size - 1] = '\0';
914 agxbput_move(xb, data);
915
916 // write an XML-escaped version of ] as a replacement
917 agxbput(xb, "&#93;");
918}
919#endif
920
/* htmllineno:
 * Return the line number expat reports for its current position, or 0 when
 * built without expat.
 */
unsigned long htmllineno(void)
{
    unsigned long line = 0;
#ifdef HAVE_EXPAT
    line = XML_GetCurrentLineNumber(state.parser);
#endif
    return line;
}
928
929#ifdef DEBUG
930static void printTok(int tok)
931{
932 char *s;
933
934 switch (tok) {
935 case T_end_br:
936 s = "T_end_br";
937 break;
938 case T_end_img:
939 s = "T_end_img";
940 break;
941 case T_row:
942 s = "T_row";
943 break;
944 case T_end_row:
945 s = "T_end_row";
946 break;
947 case T_html:
948 s = "T_html";
949 break;
950 case T_end_html:
951 s = "T_end_html";
952 break;
953 case T_end_table:
954 s = "T_end_table";
955 break;
956 case T_end_cell:
957 s = "T_end_cell";
958 break;
959 case T_end_font:
960 s = "T_end_font";
961 break;
962 case T_string:
963 s = "T_string";
964 break;
965 case T_error:
966 s = "T_error";
967 break;
968 case T_n_italic:
969 s = "T_n_italic";
970 break;
971 case T_n_bold:
972 s = "T_n_bold";
973 break;
974 case T_n_underline:
975 s = "T_n_underline";
976 break;
977 case T_n_overline:
978 s = "T_n_overline";
979 break;
980 case T_n_sup:
981 s = "T_n_sup";
982 break;
983 case T_n_sub:
984 s = "T_n_sub";
985 break;
986 case T_n_s:
987 s = "T_n_s";
988 break;
989 case T_HR:
990 s = "T_HR";
991 break;
992 case T_hr:
993 s = "T_hr";
994 break;
995 case T_end_hr:
996 s = "T_end_hr";
997 break;
998 case T_VR:
999 s = "T_VR";
1000 break;
1001 case T_vr:
1002 s = "T_vr";
1003 break;
1004 case T_end_vr:
1005 s = "T_end_vr";
1006 break;
1007 case T_BR:
1008 s = "T_BR";
1009 break;
1010 case T_br:
1011 s = "T_br";
1012 break;
1013 case T_IMG:
1014 s = "T_IMG";
1015 break;
1016 case T_img:
1017 s = "T_img";
1018 break;
1019 case T_table:
1020 s = "T_table";
1021 break;
1022 case T_cell:
1023 s = "T_cell";
1024 break;
1025 case T_font:
1026 s = "T_font";
1027 break;
1028 case T_italic:
1029 s = "T_italic";
1030 break;
1031 case T_bold:
1032 s = "T_bold";
1033 break;
1034 case T_underline:
1035 s = "T_underline";
1036 break;
1037 case T_overline:
1038 s = "T_overline";
1039 break;
1040 case T_sup:
1041 s = "T_sup";
1042 break;
1043 case T_sub:
1044 s = "T_sub";
1045 break;
1046 case T_s:
1047 s = "T_s";
1048 break;
1049 default:
1050 s = "<unknown>";
1051 }
1052 if (tok == T_string) {
1053 const char *token_text = agxbuse(state.xb);
1054 fprintf(stderr, "%s \"%s\"\n", s, token_text);
1055 agxbput_move(state.xb, token_text);
1056 } else
1057 fprintf(stderr, "%s\n", s);
1058}
1059
1060#endif
1061
/* htmllex:
 * Return the next token of the label being scanned. The input is fed to
 * expat one XML unit at a time until one of the callbacks (or an error)
 * sets state.tok. The label is wrapped in an artificial <HTML>..</HTML>
 * pair, tracked by state.mode:
 *   0 - nothing fed yet; feed "<HTML>"
 *   1 - feeding units taken from state.ptr
 *   2 - "</HTML>" has been fed; the next pass through the loop returns EOF
 * Without expat the function always returns EOF.
 * NOTE(review): source lines 1096, 1097 and 1099 are missing from this
 * listing; presumably they maintain prevtok/prevtoklen/currtoklen for error
 * reporting - confirm.
 */
1062int htmllex(void)
1063{
1064#ifdef HAVE_EXPAT
1065 static char *begin_html = "<HTML>";
1066 static char *end_html = "</HTML>";
1067
1068 char *s;
1069 char *endp = 0;
1070 size_t len, llen;
1071 int rv;
1072
1073 state.tok = 0;
1074 do {
1075 if (state.mode == 2)
1076 return EOF;
1077 if (state.mode == 0) {
1078 state.mode = 1;
1079 s = begin_html;
1080 len = strlen(s);
1081 endp = 0;
1082 } else {
1083 s = state.ptr;
1084 if (*s == '\0') {
// input exhausted: close the artificial wrapper
1085 state.mode = 2;
1086 s = end_html;
1087 len = strlen(s);
1088 } else {
// findNext copies text runs into state.lb; tags and comments leave it empty
1089 endp = findNext(s,&state.lb);
1090 len = (size_t)(endp - s);
1091 }
1092 }
1093
1094 protect_rsqb(&state.lb);
1095
1098 state.currtok = s;
// translated text in state.lb takes precedence over the raw input span
1100 if ((llen = (size_t)agxblen(&state.lb))) {
1101 assert(llen <= (size_t)INT_MAX && "XML token too long for expat API");
1102 rv = XML_Parse(state.parser, agxbuse(&state.lb), (int)llen, 0);
1103 } else {
1104 assert(len <= (size_t)INT_MAX && "XML token too long for expat API");
// a zero-length unit is passed to expat as the final chunk
1105 rv = XML_Parse(state.parser, s, (int)len, len ? 0 : 1);
1106 }
1107 if (rv == XML_STATUS_ERROR) {
// only the first error is reported
1108 if (!state.error) {
1109 agerrorf("%s in line %lu \n",
1110 XML_ErrorString(XML_GetErrorCode(state.parser)), htmllineno());
1111 error_context();
1112 state.error = 1;
1113 state.tok = T_error;
1114 }
1115 }
// advance past the unit just consumed
1116 if (endp)
1117 state.ptr = endp;
1118 } while (state.tok == 0);
1119#ifdef DEBUG
1120 printTok (state.tok);
1121#endif
1122 return state.tok;
1123#else
1124 return EOF;
1125#endif
1126}
1127
static void agxbfree(agxbuf *xb)
free any malloced resources
Definition agxbuf.h:77
static size_t agxbput(agxbuf *xb, const char *s)
append string s into xb
Definition agxbuf.h:249
static size_t agxbput_n(agxbuf *xb, const char *s, size_t ssz)
append string s of length ssz into xb
Definition agxbuf.h:229
static void agxbclear(agxbuf *xb)
resets pointer to data
Definition agxbuf.h:273
static size_t agxblen(const agxbuf *xb)
return number of characters currently stored
Definition agxbuf.h:88
static int agxbputc(agxbuf *xb, char c)
add character to buffer
Definition agxbuf.h:256
static char * agxbuse(agxbuf *xb)
Definition agxbuf.h:286
Memory allocation wrappers that exit on failure.
static char * gv_strdup(const char *original)
Definition alloc.h:101
static void * gv_alloc(size_t size)
Definition alloc.h:47
container data types API
#define dtinsert(d, o)
Definition cdt.h:193
char * scanEntity(char *t, agxbuf *xb)
Definition utils.c:1087
static int flags
Definition gc.c:61
static double len(glCompPoint p)
Definition glutils.c:150
void free(void *)
#define SIZE_MAX
Definition gmlscan.c:347
#define UINT16_MAX
Definition gmlscan.c:340
#define INT8_MAX
Definition gmlscan.c:328
node NULL
Definition grammar.y:149
static int cnt(Dict_t *d, Dtlink_t **set)
Definition graph.c:199
void agwarningf(const char *fmt,...)
Definition agerror.c:173
void agerrorf(const char *fmt,...)
Definition agerror.c:165
int agerr(agerrlevel_t level, const char *fmt,...)
Definition agerror.c:155
@ AGPREV
Definition cgraph.h:858
#define GD_charset(g)
Definition types.h:367
#define GD_gvc(g)
Definition types.h:355
replacements for ctype.h functions
static char gv_tolower(int c)
Definition gv_ctype.h:81
static UNUSED void agxbput_move(agxbuf *dst, const char *src)
agxbput, but assume that source and destination may overlap
Definition htmllex.c:807
#define XML_STATUS_ERROR
Definition htmllex.c:41
void htmlerror(const char *msg)
Definition htmllex.c:78
int htmllex(void)
Definition htmllex.c:1062
int initHTMLlexer(char *src, agxbuf *xb, htmlenv_t *env)
Definition htmllex.c:764
unsigned long htmllineno(void)
Definition htmllex.c:921
static lexstate_t state
Definition htmllex.c:61
int clearHTMLlexer(void)
Definition htmllex.c:794
static void error_context(void)
Definition htmllex.c:66
#define T_n_sup
Definition htmlparse.c:546
#define T_end_row
Definition htmlparse.c:534
#define T_end_table
Definition htmlparse.c:537
#define T_br
Definition htmlparse.c:556
#define T_vr
Definition htmlparse.c:553
#define T_error
Definition htmlparse.c:541
#define T_n_s
Definition htmlparse.c:548
#define T_end_cell
Definition htmlparse.c:538
#define T_n_sub
Definition htmlparse.c:547
#define T_n_bold
Definition htmlparse.c:543
#define T_html
Definition htmlparse.c:535
#define T_BR
Definition htmlparse.c:555
#define T_underline
Definition htmlparse.c:564
#define T_sup
Definition htmlparse.c:566
#define T_row
Definition htmlparse.c:533
GVC_t * gvc
Definition htmlparse.c:99
#define T_table
Definition htmlparse.c:559
#define T_end_vr
Definition htmlparse.c:554
#define T_end_html
Definition htmlparse.c:536
#define T_IMG
Definition htmlparse.c:557
#define T_VR
Definition htmlparse.c:552
#define T_bold
Definition htmlparse.c:563
#define T_end_img
Definition htmlparse.c:532
#define T_sub
Definition htmlparse.c:567
HTMLSTYPE htmllval
#define T_s
Definition htmlparse.c:568
#define T_n_italic
Definition htmlparse.c:542
#define T_end_font
Definition htmlparse.c:539
#define T_overline
Definition htmlparse.c:565
#define T_hr
Definition htmlparse.c:550
#define T_font
Definition htmlparse.c:561
#define T_italic
Definition htmlparse.c:562
#define T_end_br
Definition htmlparse.c:531
#define T_n_underline
Definition htmlparse.c:544
#define T_cell
Definition htmlparse.c:560
#define T_end_hr
Definition htmlparse.c:551
#define T_string
Definition htmlparse.c:540
#define T_img
Definition htmlparse.c:558
#define T_HR
Definition htmlparse.c:549
#define T_n_overline
Definition htmlparse.c:545
#define PAD_SET
Definition htmltable.h:32
#define BORDER_RIGHT
Definition htmltable.h:39
#define BORDER_TOP
Definition htmltable.h:38
#define HALIGN_TEXT
Definition htmltable.h:27
#define UNSET_ALIGN
Definition htmltable.h:43
#define HALIGN_LEFT
Definition htmltable.h:25
#define VALIGN_BOTTOM
Definition htmltable.h:29
#define BALIGN_RIGHT
Definition htmltable.h:34
#define BALIGN_LEFT
Definition htmltable.h:35
#define BORDER_BOTTOM
Definition htmltable.h:40
#define SPACE_SET
Definition htmltable.h:33
#define BORDER_SET
Definition htmltable.h:31
#define BORDER_LEFT
Definition htmltable.h:37
#define BORDER_MASK
Definition htmltable.h:41
#define HALIGN_RIGHT
Definition htmltable.h:24
#define VALIGN_TOP
Definition htmltable.h:28
#define FIXED_FLAG
Definition htmltable.h:23
char * charsetToStr(int c)
Given an internal charset value, return a canonical string representation.
Definition input.c:813
static bool startswith(const char *s, const char *prefix)
does the string s begin with the string prefix?
Definition startswith.h:11
platform abstraction for case-insensitive string functions
Definition gvcint.h:80
Dt_t * textfont_dt
Definition gvcint.h:107
result of partitioning available space, part of maze
Definition grid.h:33
Definition legal.c:50
uint16_t rowspan
Definition htmltable.h:145
uint16_t colspan
Definition htmltable.h:144
char * bgcolor
Definition htmltable.h:86
unsigned char border
Definition htmltable.h:90
char * target
Definition htmltable.h:83
char * id
Definition htmltable.h:85
signed char space
Definition htmltable.h:89
unsigned short width
Definition htmltable.h:94
unsigned short height
Definition htmltable.h:95
int gradientangle
Definition htmltable.h:88
char * port
Definition htmltable.h:82
unsigned short flags
Definition htmltable.h:93
char * href
Definition htmltable.h:81
unsigned char pad
Definition htmltable.h:91
char * pencolor
Definition htmltable.h:87
htmlstyle_t style
Definition htmltable.h:96
char * title
Definition htmltable.h:84
graph_t * g
Definition htmltable.h:171
char * scale
Definition htmltable.h:69
char * src
Definition htmltable.h:68
bool dashed
Definition htmltable.h:77
bool dotted
Definition htmltable.h:76
bool rounded
Definition htmltable.h:74
bool radial
Definition htmltable.h:73
bool invisible
Definition htmltable.h:75
bool vrule
vertical rule
Definition htmltable.h:130
size_t row_count
number of rows
Definition htmltable.h:126
bool hrule
horizontal rule
Definition htmltable.h:129
int8_t cellborder
Definition htmltable.h:123
Definition utils.c:748
int warn
Definition htmllex.c:52
char mode
Definition htmllex.c:55
int tok
Definition htmllex.c:49
int error
Definition htmllex.c:53
char * prevtok
Definition htmllex.c:57
size_t currtoklen
Definition htmllex.c:58
agxbuf * xb
Definition htmllex.c:50
char * currtok
Definition htmllex.c:56
char * ptr
Definition htmllex.c:48
size_t prevtoklen
Definition htmllex.c:59
agxbuf lb
Definition htmllex.c:51
char inCell
Definition htmllex.c:54
a non-owning string reference
Definition strview.h:20
const char * data
start of the pointed to string
Definition strview.h:21
size_t size
extent of the string in bytes
Definition strview.h:22
char * color
Definition textspan.h:55
char * name
Definition textspan.h:54
unsigned int flags
Definition textspan.h:58
double size
Definition textspan.h:57
state for an in-progress string tokenization
Definition tokenize.h:36
Non-owning string references.
static bool strview_case_str_eq(strview_t a, const char *b)
compare a string reference to a string for case insensitive equality
Definition strview.h:62
#define HTML_OL
Definition textspan.h:35
#define HTML_IF
Definition textspan.h:30
#define HTML_UL
Definition textspan.h:31
#define HTML_BF
Definition textspan.h:29
#define HTML_SUP
Definition textspan.h:32
#define HTML_S
Definition textspan.h:34
#define GV_TEXTFONT_FLAGS_WIDTH
Definition textspan.h:24
#define HTML_SUB
Definition textspan.h:33
String tokenization.
static strview_t tok_get(const tok_t *t)
get the current token
Definition tokenize.h:76
static tok_t tok(const char *input, const char *separators)
begin tokenization of a new string
Definition tokenize.h:43
static bool tok_end(const tok_t *t)
is this tokenizer exhausted?
Definition tokenize.h:68
static void tok_next(tok_t *t)
advance to the next token in the string being scanned
Definition tokenize.h:85
htmltbl_t * tbl
Definition htmlparse.c:579
htmlcell_t * cell
Definition htmlparse.c:578
textfont_t * font
Definition htmlparse.c:580
htmlimg_t * img
Definition htmlparse.c:581
Definition grammar.c:93
abstraction for squashing compiler warnings for unused symbols
#define UNUSED
Definition unused.h:25