Graphviz 14.0.3~dev.20251029.0425
Loading...
Searching...
No Matches
htmllex.c
Go to the documentation of this file.
1
3/*************************************************************************
4 * Copyright (c) 2011 AT&T Intellectual Property
5 * All rights reserved. This program and the accompanying materials
6 * are made available under the terms of the Eclipse Public License v1.0
7 * which accompanies this distribution, and is available at
8 * https://www.eclipse.org/legal/epl-v10.html
9 *
10 * Contributors: Details at https://graphviz.org
11 *************************************************************************/
12
13#include <assert.h>
14#include <common/render.h>
15#include <common/htmltable.h>
16#include "htmlparse.h"
17#include <common/htmllex.h>
18#include <cdt/cdt.h>
19#include <limits.h>
20#include <stdbool.h>
21#include <stddef.h>
22#include <stdint.h>
23#include <util/alloc.h>
24#include <util/gv_ctype.h>
25#include <util/startswith.h>
26#include <util/strcasecmp.h>
27#include <util/strview.h>
28#include <util/tokenize.h>
29#include <util/unused.h>
30
31#ifdef HAVE_EXPAT
32#include <expat.h>
33#endif
34
35#ifndef XML_STATUS_ERROR
36#define XML_STATUS_ERROR 0
37#endif
38
39static unsigned long htmllineno_ctx(htmllexstate_t *ctx);
40
41/* error_context:
42 * Print the last 2 "token"s seen.
43 */
45{
46 agerr(AGPREV, "... %.*s%.*s ...\n", (int)ctx->prevtok.size,
47 ctx->prevtok.data, (int)ctx->currtok.size, ctx->currtok.data);
48}
49
50/* htmlerror:
51 * yyerror - called by yacc output
52 */
53void htmlerror(htmlscan_t *scanner, const char *msg)
54{
55 htmllexstate_t *ctx = &scanner->lexer;
56 if (ctx->error)
57 return;
58 ctx->error = 1;
59 agerrorf("%s in line %lu \n", msg, htmllineno(scanner));
60 error_context(&scanner->lexer);
61}
62
63#ifdef HAVE_EXPAT
64/* lexerror:
65 * called by lexer when unknown <..> is found.
66 */
67static void lexerror(htmllexstate_t *ctx, const char *name)
68{
69 ctx->tok = T_error;
70 ctx->error = 1;
71 agerrorf("Unknown HTML element <%s> on line %lu \n", name, htmllineno_ctx(ctx));
72}
73
74typedef int (*attrFn) (void *, char *);
75typedef int (*bcmpfn) (const void *, const void *);
76
77/* Mechanism for automatically processing attributes */
78typedef struct {
79 char *name; /* attribute name */
80 attrFn action; /* action to perform if name matches */
81} attr_item;
82
83#define ISIZE (sizeof(attr_item))
84
85/* icmp:
86 * Compare an attr_item. Used in bsearch
87 */
88static int icmp(const void *name, const void *item) {
89 const attr_item *j = item;
90 return strcasecmp(name, j->name);
91}
92
93static int bgcolorfn(htmldata_t * p, char *v)
94{
95 p->bgcolor = strdup(v);
96 return 0;
97}
98
99static int pencolorfn(htmldata_t * p, char *v)
100{
101 p->pencolor = strdup(v);
102 return 0;
103}
104
105static int hreffn(htmldata_t * p, char *v)
106{
107 p->href = strdup(v);
108 return 0;
109}
110
111static int sidesfn(htmldata_t * p, char *v)
112{
113 unsigned short flags = 0;
114 char c;
115
116 while ((c = *v++)) {
117 switch (gv_tolower(c)) {
118 case 'l' :
120 break;
121 case 't' :
122 flags |= BORDER_TOP;
123 break;
124 case 'r' :
126 break;
127 case 'b' :
129 break;
130 default :
131 agwarningf("Unrecognized character '%c' (%d) in sides attribute\n", c, c);
132 break;
133 }
134 }
135 if (flags != BORDER_MASK)
136 p->flags |= flags;
137 return 0;
138}
139
140static int titlefn(htmldata_t * p, char *v)
141{
142 p->title = strdup(v);
143 return 0;
144}
145
146static int portfn(htmldata_t * p, char *v)
147{
148 p->port = strdup(v);
149 return 0;
150}
151
152#define DELIM " ,"
153
154static int stylefn(htmldata_t * p, char *v)
155{
156 int rv = 0;
157 for (tok_t t = tok(v, DELIM); !tok_end(&t); tok_next(&t)) {
158 strview_t tk = tok_get(&t);
159 if (strview_case_str_eq(tk, "ROUNDED")) p->style.rounded = true;
160 else if (strview_case_str_eq(tk, "RADIAL")) p->style.radial = true;
161 else if (strview_case_str_eq(tk,"SOLID")) {
162 p->style.dotted = false;
163 p->style.dashed = false;
164 } else if (strview_case_str_eq(tk,"INVISIBLE") ||
165 strview_case_str_eq(tk,"INVIS")) p->style.invisible = true;
166 else if (strview_case_str_eq(tk,"DOTTED")) p->style.dotted = true;
167 else if (strview_case_str_eq(tk,"DASHED")) p->style.dashed = true;
168 else {
169 agwarningf("Illegal value %.*s for STYLE - ignored\n", (int)tk.size,
170 tk.data);
171 rv = 1;
172 }
173 }
174 return rv;
175}
176
177static int targetfn(htmldata_t * p, char *v)
178{
179 p->target = strdup(v);
180 return 0;
181}
182
183static int idfn(htmldata_t * p, char *v)
184{
185 p->id = strdup(v);
186 return 0;
187}
188
189
/* doInt:
 * Parse v as a base-10 integer and range-check it.
 *   v    - attribute value text
 *   s    - attribute name, used only in warnings
 *   min  - smallest accepted value (inclusive)
 *   max  - largest accepted value (inclusive)
 *   ul   - receives the parsed value on success; untouched on failure
 * Return 0 if okay; 1 (after a warning) if no digits were consumed or
 * the value lies outside [min, max]. Trailing characters after the
 * number are not examined.
 */
static int doInt(char *v, char *s, int min, int max, long *ul)
{
    char *end;
    const long parsed = strtol(v, &end, 10);

    if (end == v) {
        agwarningf("Improper %s value %s - ignored", s, v);
        return 1;
    }
    if (parsed > max) {
        agwarningf("%s value %s > %d - too large - ignored", s, v, max);
        return 1;
    }
    if (parsed < min) {
        agwarningf("%s value %s < %d - too small - ignored", s, v, min);
        return 1;
    }
    *ul = parsed;
    return 0;
}
215
216
217static int gradientanglefn(htmldata_t * p, char *v)
218{
219 long u;
220
221 if (doInt(v, "GRADIENTANGLE", 0, 360, &u))
222 return 1;
223 p->gradientangle = (unsigned short) u;
224 return 0;
225}
226
227
228static int borderfn(htmldata_t * p, char *v)
229{
230 long u;
231
232 if (doInt(v, "BORDER", 0, UCHAR_MAX, &u))
233 return 1;
234 p->border = (unsigned char) u;
235 p->flags |= BORDER_SET;
236 return 0;
237}
238
239static int cellpaddingfn(htmldata_t * p, char *v)
240{
241 long u;
242
243 if (doInt(v, "CELLPADDING", 0, UCHAR_MAX, &u))
244 return 1;
245 p->pad = (unsigned char) u;
246 p->flags |= PAD_SET;
247 return 0;
248}
249
250static int cellspacingfn(htmldata_t * p, char *v)
251{
252 long u;
253
254 if (doInt(v, "CELLSPACING", SCHAR_MIN, SCHAR_MAX, &u))
255 return 1;
256 p->space = (signed char) u;
257 p->flags |= SPACE_SET;
258 return 0;
259}
260
261static int cellborderfn(htmltbl_t * p, char *v)
262{
263 long u;
264
265 if (doInt(v, "CELLBORDER", 0, INT8_MAX, &u))
266 return 1;
267 p->cellborder = (int8_t)u;
268 return 0;
269}
270
271static int columnsfn(htmltbl_t * p, char *v)
272{
273 if (*v != '*') {
274 agwarningf("Unknown value %s for COLUMNS - ignored\n", v);
275 return 1;
276 }
277 p->vrule = true;
278 return 0;
279}
280
281static int rowsfn(htmltbl_t * p, char *v)
282{
283 if (*v != '*') {
284 agwarningf("Unknown value %s for ROWS - ignored\n", v);
285 return 1;
286 }
287 p->hrule = true;
288 return 0;
289}
290
291static int fixedsizefn(htmldata_t * p, char *v)
292{
293 int rv = 0;
294 if (!strcasecmp(v, "TRUE"))
295 p->flags |= FIXED_FLAG;
296 else if (strcasecmp(v, "FALSE")) {
297 agwarningf("Illegal value %s for FIXEDSIZE - ignored\n", v);
298 rv = 1;
299 }
300 return rv;
301}
302
303static int valignfn(htmldata_t * p, char *v)
304{
305 int rv = 0;
306 if (!strcasecmp(v, "BOTTOM"))
307 p->flags |= VALIGN_BOTTOM;
308 else if (!strcasecmp(v, "TOP"))
309 p->flags |= VALIGN_TOP;
310 else if (strcasecmp(v, "MIDDLE")) {
311 agwarningf("Illegal value %s for VALIGN - ignored\n", v);
312 rv = 1;
313 }
314 return rv;
315}
316
317static int halignfn(htmldata_t * p, char *v)
318{
319 int rv = 0;
320 if (!strcasecmp(v, "LEFT"))
321 p->flags |= HALIGN_LEFT;
322 else if (!strcasecmp(v, "RIGHT"))
323 p->flags |= HALIGN_RIGHT;
324 else if (strcasecmp(v, "CENTER")) {
325 agwarningf("Illegal value %s for ALIGN - ignored\n", v);
326 rv = 1;
327 }
328 return rv;
329}
330
331static int cell_halignfn(htmldata_t * p, char *v)
332{
333 int rv = 0;
334 if (!strcasecmp(v, "LEFT"))
335 p->flags |= HALIGN_LEFT;
336 else if (!strcasecmp(v, "RIGHT"))
337 p->flags |= HALIGN_RIGHT;
338 else if (!strcasecmp(v, "TEXT"))
339 p->flags |= HALIGN_TEXT;
340 else if (strcasecmp(v, "CENTER"))
341 rv = 1;
342 if (rv)
343 agwarningf("Illegal value %s for ALIGN in TD - ignored\n", v);
344 return rv;
345}
346
347static int balignfn(htmldata_t * p, char *v)
348{
349 int rv = 0;
350 if (!strcasecmp(v, "LEFT"))
351 p->flags |= BALIGN_LEFT;
352 else if (!strcasecmp(v, "RIGHT"))
353 p->flags |= BALIGN_RIGHT;
354 else if (strcasecmp(v, "CENTER"))
355 rv = 1;
356 if (rv)
357 agwarningf("Illegal value %s for BALIGN in TD - ignored\n", v);
358 return rv;
359}
360
361static int heightfn(htmldata_t * p, char *v)
362{
363 long u;
364
365 if (doInt(v, "HEIGHT", 0, USHRT_MAX, &u))
366 return 1;
367 p->height = (unsigned short) u;
368 return 0;
369}
370
371static int widthfn(htmldata_t * p, char *v)
372{
373 long u;
374
375 if (doInt(v, "WIDTH", 0, USHRT_MAX, &u))
376 return 1;
377 p->width = (unsigned short) u;
378 return 0;
379}
380
381static int rowspanfn(htmlcell_t * p, char *v)
382{
383 long u;
384
385 if (doInt(v, "ROWSPAN", 0, UINT16_MAX, &u))
386 return 1;
387 if (u == 0) {
388 agwarningf("ROWSPAN value cannot be 0 - ignored\n");
389 return 1;
390 }
391 p->rowspan = (uint16_t)u;
392 return 0;
393}
394
395static int colspanfn(htmlcell_t * p, char *v)
396{
397 long u;
398
399 if (doInt(v, "COLSPAN", 0, UINT16_MAX, &u))
400 return 1;
401 if (u == 0) {
402 agwarningf("COLSPAN value cannot be 0 - ignored\n");
403 return 1;
404 }
405 p->colspan = (uint16_t)u;
406 return 0;
407}
408
409static int fontcolorfn(textfont_t * p, char *v)
410{
411 p->color = v;
412 return 0;
413}
414
415static int facefn(textfont_t * p, char *v)
416{
417 p->name = v;
418 return 0;
419}
420
421static int ptsizefn(textfont_t * p, char *v)
422{
423 long u;
424
425 if (doInt(v, "POINT-SIZE", 0, UCHAR_MAX, &u))
426 return 1;
427 p->size = (double) u;
428 return 0;
429}
430
431static int srcfn(htmlimg_t * p, char *v)
432{
433 p->src = strdup(v);
434 return 0;
435}
436
437static int scalefn(htmlimg_t * p, char *v)
438{
439 p->scale = strdup(v);
440 return 0;
441}
442
/* alignfn:
 * Handle ALIGN on <BR> (case-insensitive). Writes 'r', 'l' or 'n' to
 * *p for "RIGHT", "LEFT" or "CENTER" respectively; any other value is
 * warned about and *p is left untouched.
 * Returns 0 on success, 1 if the value was rejected.
 */
static int alignfn(int *p, char *v)
{
    if (strcasecmp(v, "RIGHT") == 0) {
        *p = 'r';
        return 0;
    }
    if (strcasecmp(v, "LEFT") == 0) {
        *p = 'l';
        return 0;
    }
    if (strcasecmp(v, "CENTER") == 0) {
        *p = 'n';
        return 0;
    }
    agwarningf("Illegal value %s for ALIGN - ignored\n", v);
    return 1;
}
458
459/* Tables used in binary search; MUST be alphabetized */
460static attr_item tbl_items[] = {
461 {"align", (attrFn) halignfn},
462 {"bgcolor", (attrFn) bgcolorfn},
463 {"border", (attrFn) borderfn},
464 {"cellborder", (attrFn) cellborderfn},
465 {"cellpadding", (attrFn) cellpaddingfn},
466 {"cellspacing", (attrFn) cellspacingfn},
467 {"color", (attrFn) pencolorfn},
468 {"columns", (attrFn) columnsfn},
469 {"fixedsize", (attrFn) fixedsizefn},
470 {"gradientangle", (attrFn) gradientanglefn},
471 {"height", (attrFn) heightfn},
472 {"href", (attrFn) hreffn},
473 {"id", (attrFn) idfn},
474 {"port", (attrFn) portfn},
475 {"rows", (attrFn) rowsfn},
476 {"sides", (attrFn) sidesfn},
477 {"style", (attrFn) stylefn},
478 {"target", (attrFn) targetfn},
479 {"title", (attrFn) titlefn},
480 {"tooltip", (attrFn) titlefn},
481 {"valign", (attrFn) valignfn},
482 {"width", (attrFn) widthfn},
483};
484
485static attr_item cell_items[] = {
486 {"align", (attrFn) cell_halignfn},
487 {"balign", (attrFn) balignfn},
488 {"bgcolor", (attrFn) bgcolorfn},
489 {"border", (attrFn) borderfn},
490 {"cellpadding", (attrFn) cellpaddingfn},
491 {"cellspacing", (attrFn) cellspacingfn},
492 {"color", (attrFn) pencolorfn},
493 {"colspan", (attrFn) colspanfn},
494 {"fixedsize", (attrFn) fixedsizefn},
495 {"gradientangle", (attrFn) gradientanglefn},
496 {"height", (attrFn) heightfn},
497 {"href", (attrFn) hreffn},
498 {"id", (attrFn) idfn},
499 {"port", (attrFn) portfn},
500 {"rowspan", (attrFn) rowspanfn},
501 {"sides", (attrFn) sidesfn},
502 {"style", (attrFn) stylefn},
503 {"target", (attrFn) targetfn},
504 {"title", (attrFn) titlefn},
505 {"tooltip", (attrFn) titlefn},
506 {"valign", (attrFn) valignfn},
507 {"width", (attrFn) widthfn},
508};
509
510static attr_item font_items[] = {
511 {"color", (attrFn) fontcolorfn},
512 {"face", (attrFn) facefn},
513 {"point-size", (attrFn) ptsizefn},
514};
515
516static attr_item img_items[] = {
517 {"scale", (attrFn) scalefn},
518 {"src", (attrFn) srcfn},
519};
520
521static attr_item br_items[] = {
522 {"align", (attrFn) alignfn},
523};
524
525/* doAttrs:
526 * General function for processing list of name/value attributes.
527 * Do binary search on items table. If match found, invoke action
528 * passing it tp and attribute value.
529 * Table size is given by nel
530 * Name/value pairs are in array atts, which is null terminated.
531 * s is the name of the HTML element being processed.
532 */
533static void doAttrs(htmllexstate_t *ctx, void *tp, attr_item *items, size_t nel, char **atts,
534 char *s) {
535 char *name;
536 char *val;
537 attr_item *ip;
538
539 while ((name = *atts++) != NULL) {
540 val = *atts++;
541 ip = bsearch(name, items, nel, ISIZE, icmp);
542 if (ip)
543 ctx->warn |= ip->action(tp, val);
544 else {
545 agwarningf("Illegal attribute %s in %s - ignored\n", name,
546 s);
547 ctx->warn = 1;
548 }
549 }
550}
551
552static void mkBR(htmllexstate_t *ctx, char **atts)
553{
554 ctx->htmllval->i = UNSET_ALIGN;
555 doAttrs(ctx, &ctx->htmllval->i, br_items, sizeof(br_items) / ISIZE, atts, "<BR>");
556}
557
558static htmlimg_t *mkImg(htmllexstate_t *ctx, char **atts)
559{
560 htmlimg_t *img = gv_alloc(sizeof(htmlimg_t));
561
562 doAttrs(ctx, img, img_items, sizeof(img_items) / ISIZE, atts, "<IMG>");
563
564 return img;
565}
566
567static textfont_t *mkFont(htmllexstate_t *ctx, char **atts, unsigned char flags) {
568 textfont_t tf = {NULL,NULL,NULL,0.0,0,0};
569
570 tf.size = -1.0; /* unassigned */
571 enum { FLAGS_MAX = (1 << GV_TEXTFONT_FLAGS_WIDTH) - 1 };
572 assert(flags <= FLAGS_MAX);
573 tf.flags = (unsigned char)(flags & FLAGS_MAX);
574 if (atts)
575 doAttrs(ctx, &tf, font_items, sizeof(font_items) / ISIZE, atts, "<FONT>");
576
577 return dtinsert(ctx->gvc->textfont_dt, &tf);
578}
579
580static htmlcell_t *mkCell(htmllexstate_t *ctx, char **atts)
581{
583
584 cell->colspan = 1;
585 cell->rowspan = 1;
586 doAttrs(ctx, cell, cell_items, sizeof(cell_items) / ISIZE, atts, "<TD>");
587
588 return cell;
589}
590
591static htmltbl_t *mkTbl(htmllexstate_t *ctx, char **atts)
592{
593 htmltbl_t *tbl = gv_alloc(sizeof(htmltbl_t));
594
595 tbl->row_count = SIZE_MAX; // flag that table is a raw, parsed table
596 tbl->rows = (rows_t){.dtor = free_ritem};
597 tbl->cellborder = -1; // unset cell border attribute
598 doAttrs(ctx, tbl, tbl_items, sizeof(tbl_items) / ISIZE, atts, "<TABLE>");
599
600 return tbl;
601}
602
603static void startElement(void *user, const char *name, char **atts)
604{
605 htmllexstate_t *ctx = user;
606
607 if (strcasecmp(name, "TABLE") == 0) {
608 ctx->htmllval->tbl = mkTbl(ctx, atts);
609 ctx->inCell = 0;
610 ctx->tok = T_table;
611 } else if (strcasecmp(name, "TR") == 0 || strcasecmp(name, "TH") == 0) {
612 ctx->inCell = 0;
613 ctx->tok = T_row;
614 } else if (strcasecmp(name, "TD") == 0) {
615 ctx->inCell = 1;
616 ctx->htmllval->cell = mkCell(ctx, atts);
617 ctx->tok = T_cell;
618 } else if (strcasecmp(name, "FONT") == 0) {
619 ctx->htmllval->font = mkFont(ctx, atts, 0);
620 ctx->tok = T_font;
621 } else if (strcasecmp(name, "B") == 0) {
622 ctx->htmllval->font = mkFont(ctx, 0, HTML_BF);
623 ctx->tok = T_bold;
624 } else if (strcasecmp(name, "S") == 0) {
625 ctx->htmllval->font = mkFont(ctx, 0, HTML_S);
626 ctx->tok = T_s;
627 } else if (strcasecmp(name, "U") == 0) {
628 ctx->htmllval->font = mkFont(ctx, 0, HTML_UL);
629 ctx->tok = T_underline;
630 } else if (strcasecmp(name, "O") == 0) {
631 ctx->htmllval->font = mkFont(ctx, 0, HTML_OL);
632 ctx->tok = T_overline;
633 } else if (strcasecmp(name, "I") == 0) {
634 ctx->htmllval->font = mkFont(ctx, 0, HTML_IF);
635 ctx->tok = T_italic;
636 } else if (strcasecmp(name, "SUP") == 0) {
637 ctx->htmllval->font = mkFont(ctx, 0, HTML_SUP);
638 ctx->tok = T_sup;
639 } else if (strcasecmp(name, "SUB") == 0) {
640 ctx->htmllval->font = mkFont(ctx, 0, HTML_SUB);
641 ctx->tok = T_sub;
642 } else if (strcasecmp(name, "BR") == 0) {
643 mkBR(ctx, atts);
644 ctx->tok = T_br;
645 } else if (strcasecmp(name, "HR") == 0) {
646 ctx->tok = T_hr;
647 } else if (strcasecmp(name, "VR") == 0) {
648 ctx->tok = T_vr;
649 } else if (strcasecmp(name, "IMG") == 0) {
650 ctx->htmllval->img = mkImg(ctx, atts);
651 ctx->tok = T_img;
652 } else if (strcasecmp(name, "HTML") == 0) {
653 ctx->tok = T_html;
654 } else {
655 lexerror(ctx, name);
656 }
657}
658
659static void endElement(void *user, const char *name)
660{
661 htmllexstate_t *ctx = user;
662
663 if (strcasecmp(name, "TABLE") == 0) {
664 ctx->tok = T_end_table;
665 ctx->inCell = 1;
666 } else if (strcasecmp(name, "TR") == 0 || strcasecmp(name, "TH") == 0) {
667 ctx->tok = T_end_row;
668 } else if (strcasecmp(name, "TD") == 0) {
669 ctx->tok = T_end_cell;
670 ctx->inCell = 0;
671 } else if (strcasecmp(name, "HTML") == 0) {
672 ctx->tok = T_end_html;
673 } else if (strcasecmp(name, "FONT") == 0) {
674 ctx->tok = T_end_font;
675 } else if (strcasecmp(name, "B") == 0) {
676 ctx->tok = T_n_bold;
677 } else if (strcasecmp(name, "U") == 0) {
678 ctx->tok = T_n_underline;
679 } else if (strcasecmp(name, "O") == 0) {
680 ctx->tok = T_n_overline;
681 } else if (strcasecmp(name, "I") == 0) {
682 ctx->tok = T_n_italic;
683 } else if (strcasecmp(name, "SUP") == 0) {
684 ctx->tok = T_n_sup;
685 } else if (strcasecmp(name, "SUB") == 0) {
686 ctx->tok = T_n_sub;
687 } else if (strcasecmp(name, "S") == 0) {
688 ctx->tok = T_n_s;
689 } else if (strcasecmp(name, "BR") == 0) {
690 if (ctx->tok == T_br)
691 ctx->tok = T_BR;
692 else
693 ctx->tok = T_end_br;
694 } else if (strcasecmp(name, "HR") == 0) {
695 if (ctx->tok == T_hr)
696 ctx->tok = T_HR;
697 else
698 ctx->tok = T_end_hr;
699 } else if (strcasecmp(name, "VR") == 0) {
700 if (ctx->tok == T_vr)
701 ctx->tok = T_VR;
702 else
703 ctx->tok = T_end_vr;
704 } else if (strcasecmp(name, "IMG") == 0) {
705 if (ctx->tok == T_img)
706 ctx->tok = T_IMG;
707 else
708 ctx->tok = T_end_img;
709 } else {
710 lexerror(ctx, name);
711 }
712}
713
714/* characterData:
715 * Generate T_string token. Do this only when immediately in
716 * <TD>..</TD> or <HTML>..</HTML>, i.e., when inCell is true.
717 * Strip out formatting characters but keep spaces.
718 * Distinguish between all whitespace vs. strings with non-whitespace
719 * characters.
720 */
721static void characterData(void *user, const char *s, int length)
722{
723 htmllexstate_t *ctx = user;
724
725 int i, cnt = 0;
726 unsigned char c;
727
728 if (ctx->inCell) {
729 for (i = length; i; i--) {
730 c = *s++;
731 if (c >= ' ') {
732 cnt++;
733 agxbputc(ctx->xb, (char)c);
734 }
735 }
736 if (cnt) ctx->tok = T_string;
737 }
738}
739#endif
740
741int initHTMLlexer(htmlscan_t *scanner, char *src, agxbuf * xb, htmlenv_t *env)
742{
743#ifdef HAVE_EXPAT
744 htmllexstate_t *ctx = &scanner->lexer;
745
746 ctx->xb = xb;
747 ctx->lb = (agxbuf){0};
748 ctx->ptr = src;
749 ctx->mode = 0;
750 ctx->warn = 0;
751 ctx->error = 0;
752 ctx->currtok = (strview_t){0};
753 ctx->prevtok = (strview_t){0};
754 ctx->inCell = 1;
755 ctx->parser = XML_ParserCreate(charsetToStr(GD_charset(env->g)));
756 ctx->gvc = GD_gvc(env->g);
757 XML_SetUserData(ctx->parser, ctx);
758 XML_SetElementHandler(ctx->parser,
759 (XML_StartElementHandler) startElement,
760 endElement);
761 XML_SetCharacterDataHandler(ctx->parser, characterData);
762 return 0;
763#else
764 (void)scanner;
765 (void)src;
766 (void)xb;
767 (void)env;
768
769 static int first;
770 if (!first) {
772 "Not built with libexpat. Table formatting is not available.\n");
773 first++;
774 }
775 return 1;
776#endif
777}
778
780{
781#ifdef HAVE_EXPAT
782 htmllexstate_t *ctx = &scanner->lexer;
783 int rv = ctx->error ? 3 : ctx->warn;
784 XML_ParserFree(ctx->parser);
785 agxbfree (&ctx->lb);
786 return rv;
787#else
788 (void)scanner;
789
790 return 1;
791#endif
792}
793
795static UNUSED void agxbput_move(agxbuf *dst, const char *src) {
796 // we cannot call `agxbput` itself because it calls `memcpy`, thereby
797 // implicitly assuming that source and destination do not overlap
798 char *src_copy = gv_strdup(src);
799 agxbput(dst, src_copy);
800 free(src_copy);
801}
802
803#ifdef HAVE_EXPAT
804/* eatComment:
805 * Given first character after open comment, eat characters
806 * up to comment close, returning pointer to closing > if it exists,
807 * or null character otherwise.
808 * We rely on HTML strings having matched nested <>.
809 */
810static char *eatComment(htmllexstate_t *ctx, char *p)
811{
812 int depth = 1;
813 char *s = p;
814 char c;
815
816 while (depth && (c = *s++)) {
817 if (c == '<')
818 depth++;
819 else if (c == '>')
820 depth--;
821 }
822 s--; /* move back to '\0' or '>' */
823 if (*s) {
824 char *t = s - 2;
825 if (t < p || !startswith(t, "--")) {
826 agwarningf("Unclosed comment\n");
827 ctx->warn = 1;
828 }
829 }
830 return s;
831}
832
833/* findNext:
834 * Return next XML unit. This is either <..>, an HTML
835 * comment <!-- ... -->, or characters up to next <.
836 */
837static char *findNext(htmllexstate_t *ctx, char *s, agxbuf* xb)
838{
839 char* t = s + 1;
840 char c;
841
842 if (*s == '<') {
843 if (startswith(t, "!--"))
844 t = eatComment(ctx, t + 3);
845 else
846 while (*t && *t != '>')
847 t++;
848 if (*t != '>') {
849 agwarningf("Label closed before end of HTML element\n");
850 ctx->warn = 1;
851 } else
852 t++;
853 } else {
854 t = s;
855 while ((c = *t) && c != '<') {
856 if (c == '&' && *(t+1) != '#') {
857 t = scanEntity(t + 1, xb);
858 }
859 else {
860 agxbputc(xb, c);
861 t++;
862 }
863 }
864 }
865 return t;
866}
867
884static void protect_rsqb(agxbuf *xb) {
885
886 // if the buffer is empty, we have nothing to do
887 if (agxblen(xb) == 0) {
888 return;
889 }
890
891 // check the last character and if it is not ], we have nothing to do
892 char *data = agxbuse(xb);
893 size_t size = strlen(data);
894 assert(size > 0);
895 if (data[size - 1] != ']') {
896 agxbput_move(xb, data);
897 return;
898 }
899
900 // truncate ] and write back the remaining prefix
901 data[size - 1] = '\0';
902 agxbput_move(xb, data);
903
904 // write an XML-escaped version of ] as a replacement
905 agxbput(xb, "&#93;");
906}
907#endif
908
909
911 return htmllineno_ctx(&scanner->lexer);
912}
913
914static unsigned long htmllineno_ctx(htmllexstate_t *ctx) {
915#ifdef HAVE_EXPAT
916 return XML_GetCurrentLineNumber(ctx->parser);
917#else
918 (void)ctx;
919
920 return 0;
921#endif
922}
923
924#ifdef DEBUG
925static void printTok(htmllexstate_t *ctx, int tok)
926{
927 char *s;
928
929 switch (tok) {
930 case T_end_br:
931 s = "T_end_br";
932 break;
933 case T_end_img:
934 s = "T_end_img";
935 break;
936 case T_row:
937 s = "T_row";
938 break;
939 case T_end_row:
940 s = "T_end_row";
941 break;
942 case T_html:
943 s = "T_html";
944 break;
945 case T_end_html:
946 s = "T_end_html";
947 break;
948 case T_end_table:
949 s = "T_end_table";
950 break;
951 case T_end_cell:
952 s = "T_end_cell";
953 break;
954 case T_end_font:
955 s = "T_end_font";
956 break;
957 case T_string:
958 s = "T_string";
959 break;
960 case T_error:
961 s = "T_error";
962 break;
963 case T_n_italic:
964 s = "T_n_italic";
965 break;
966 case T_n_bold:
967 s = "T_n_bold";
968 break;
969 case T_n_underline:
970 s = "T_n_underline";
971 break;
972 case T_n_overline:
973 s = "T_n_overline";
974 break;
975 case T_n_sup:
976 s = "T_n_sup";
977 break;
978 case T_n_sub:
979 s = "T_n_sub";
980 break;
981 case T_n_s:
982 s = "T_n_s";
983 break;
984 case T_HR:
985 s = "T_HR";
986 break;
987 case T_hr:
988 s = "T_hr";
989 break;
990 case T_end_hr:
991 s = "T_end_hr";
992 break;
993 case T_VR:
994 s = "T_VR";
995 break;
996 case T_vr:
997 s = "T_vr";
998 break;
999 case T_end_vr:
1000 s = "T_end_vr";
1001 break;
1002 case T_BR:
1003 s = "T_BR";
1004 break;
1005 case T_br:
1006 s = "T_br";
1007 break;
1008 case T_IMG:
1009 s = "T_IMG";
1010 break;
1011 case T_img:
1012 s = "T_img";
1013 break;
1014 case T_table:
1015 s = "T_table";
1016 break;
1017 case T_cell:
1018 s = "T_cell";
1019 break;
1020 case T_font:
1021 s = "T_font";
1022 break;
1023 case T_italic:
1024 s = "T_italic";
1025 break;
1026 case T_bold:
1027 s = "T_bold";
1028 break;
1029 case T_underline:
1030 s = "T_underline";
1031 break;
1032 case T_overline:
1033 s = "T_overline";
1034 break;
1035 case T_sup:
1036 s = "T_sup";
1037 break;
1038 case T_sub:
1039 s = "T_sub";
1040 break;
1041 case T_s:
1042 s = "T_s";
1043 break;
1044 default:
1045 s = "<unknown>";
1046 }
1047 if (tok == T_string) {
1048 const char *token_text = agxbuse(ctx->xb);
1049 fprintf(stderr, "%s \"%s\"\n", s, token_text);
1050 agxbput_move(ctx->xb, token_text);
1051 } else
1052 fprintf(stderr, "%s\n", s);
1053}
1054
1055#endif
1056
1057int htmllex(union HTMLSTYPE *htmllval, htmlscan_t *scanner)
1058{
1059#ifdef HAVE_EXPAT
1060 static char *begin_html = "<HTML>";
1061 static char *end_html = "</HTML>";
1062
1063 char *s;
1064 char *endp = 0;
1065 size_t len, llen;
1066 int rv;
1067 htmllexstate_t *ctx = &scanner->lexer;
1068
1069 ctx->htmllval = htmllval;
1070 ctx->tok = 0;
1071 do {
1072 if (ctx->mode == 2)
1073 return EOF;
1074 if (ctx->mode == 0) {
1075 ctx->mode = 1;
1076 s = begin_html;
1077 len = strlen(s);
1078 endp = 0;
1079 } else {
1080 s = ctx->ptr;
1081 if (*s == '\0') {
1082 ctx->mode = 2;
1083 s = end_html;
1084 len = strlen(s);
1085 } else {
1086 endp = findNext(ctx, s,&ctx->lb);
1087 len = (size_t)(endp - s);
1088 }
1089 }
1090
1091 protect_rsqb(&ctx->lb);
1092
1093 ctx->prevtok = ctx->currtok;
1094 ctx->currtok = (strview_t){.data = s, .size = len};
1095 if ((llen = agxblen(&ctx->lb))) {
1096 assert(llen <= INT_MAX && "XML token too long for expat API");
1097 rv = XML_Parse(ctx->parser, agxbuse(&ctx->lb), (int)llen, 0);
1098 } else {
1099 assert(len <= INT_MAX && "XML token too long for expat API");
1100 rv = XML_Parse(ctx->parser, s, (int)len, len ? 0 : 1);
1101 }
1102 if (rv == XML_STATUS_ERROR) {
1103 if (!ctx->error) {
1104 agerrorf("%s in line %lu \n",
1105 XML_ErrorString(XML_GetErrorCode(ctx->parser)), htmllineno(scanner));
1106 error_context(ctx);
1107 ctx->error = 1;
1108 ctx->tok = T_error;
1109 }
1110 }
1111 if (endp)
1112 ctx->ptr = endp;
1113 } while (ctx->tok == 0);
1114#ifdef DEBUG
1115 printTok (ctx, ctx->tok);
1116#endif
1117 return ctx->tok;
1118#else
1119 (void)htmllval;
1120 (void)scanner;
1121
1122 return EOF;
1123#endif
1124}
1125
static void agxbfree(agxbuf *xb)
free any malloced resources
Definition agxbuf.h:77
static WUR char * agxbuse(agxbuf *xb)
Definition agxbuf.h:305
static size_t agxblen(const agxbuf *xb)
return number of characters currently stored
Definition agxbuf.h:88
static int agxbputc(agxbuf *xb, char c)
add character to buffer
Definition agxbuf.h:275
Memory allocation wrappers that exit on failure.
static char * gv_strdup(const char *original)
Definition alloc.h:101
static void * gv_alloc(size_t size)
Definition alloc.h:47
container data types API
#define dtinsert(d, o)
Definition cdt.h:186
char * scanEntity(char *t, agxbuf *xb)
Definition utils.c:1079
static Extype_t length(Exid_t *rhs, Exdisc_t *disc)
Definition compile.c:1606
static int flags
Definition gc.c:61
static double len(glCompPoint p)
Definition glutils.c:136
void free(void *)
#define SIZE_MAX
Definition gmlscan.c:347
#define UINT16_MAX
Definition gmlscan.c:340
#define INT8_MAX
Definition gmlscan.c:328
node NULL
Definition grammar.y:181
static int cnt(Dict_t *d, Dtlink_t **set)
Definition graph.c:196
void agwarningf(const char *fmt,...)
Definition agerror.c:173
void agerrorf(const char *fmt,...)
Definition agerror.c:165
int agerr(agerrlevel_t level, const char *fmt,...)
Definition agerror.c:155
@ AGPREV
Definition cgraph.h:946
#define GD_charset(g)
Definition types.h:367
#define GD_gvc(g)
Definition types.h:355
replacements for ctype.h functions
static char gv_tolower(int c)
Definition gv_ctype.h:81
agxbput(xb, staging)
int htmllex(union HTMLSTYPE *htmllval, htmlscan_t *scanner)
Definition htmllex.c:1057
static UNUSED void agxbput_move(agxbuf *dst, const char *src)
agxbput, but assume that source and destination may overlap
Definition htmllex.c:795
unsigned long htmllineno(htmlscan_t *scanner)
Definition htmllex.c:910
#define XML_STATUS_ERROR
Definition htmllex.c:36
static void error_context(htmllexstate_t *ctx)
Definition htmllex.c:44
int clearHTMLlexer(htmlscan_t *scanner)
Definition htmllex.c:779
int initHTMLlexer(htmlscan_t *scanner, char *src, agxbuf *xb, htmlenv_t *env)
Definition htmllex.c:741
void htmlerror(htmlscan_t *scanner, const char *msg)
Definition htmllex.c:53
static unsigned long htmllineno_ctx(htmllexstate_t *ctx)
Definition htmllex.c:914
#define T_n_sup
Definition htmlparse.h:138
#define T_end_row
Definition htmlparse.h:126
#define T_end_table
Definition htmlparse.h:129
#define T_br
Definition htmlparse.h:148
#define T_vr
Definition htmlparse.h:145
#define T_error
Definition htmlparse.h:133
#define T_n_s
Definition htmlparse.h:140
#define T_end_cell
Definition htmlparse.h:130
#define T_n_sub
Definition htmlparse.h:139
#define T_n_bold
Definition htmlparse.h:135
#define T_html
Definition htmlparse.h:127
#define T_BR
Definition htmlparse.h:147
#define T_underline
Definition htmlparse.h:156
#define T_sup
Definition htmlparse.h:158
#define T_row
Definition htmlparse.h:125
#define T_table
Definition htmlparse.h:151
#define T_end_vr
Definition htmlparse.h:146
#define T_end_html
Definition htmlparse.h:128
#define T_IMG
Definition htmlparse.h:149
#define T_VR
Definition htmlparse.h:144
#define T_bold
Definition htmlparse.h:155
#define T_end_img
Definition htmlparse.h:124
#define T_sub
Definition htmlparse.h:159
#define T_s
Definition htmlparse.h:160
#define T_n_italic
Definition htmlparse.h:134
#define T_end_font
Definition htmlparse.h:131
#define T_overline
Definition htmlparse.h:157
#define T_hr
Definition htmlparse.h:142
#define T_font
Definition htmlparse.h:153
#define T_italic
Definition htmlparse.h:154
#define T_end_br
Definition htmlparse.h:123
#define T_n_underline
Definition htmlparse.h:136
#define T_cell
Definition htmlparse.h:152
#define T_end_hr
Definition htmlparse.h:143
#define T_string
Definition htmlparse.h:132
#define T_img
Definition htmlparse.h:150
#define T_HR
Definition htmlparse.h:141
#define T_n_overline
Definition htmlparse.h:137
cleanup & scanner
Definition htmlparse.y:289
#define PAD_SET
Definition htmltable.h:33
#define BORDER_RIGHT
Definition htmltable.h:40
#define BORDER_TOP
Definition htmltable.h:39
#define HALIGN_TEXT
Definition htmltable.h:28
#define UNSET_ALIGN
Definition htmltable.h:44
#define HALIGN_LEFT
Definition htmltable.h:26
#define VALIGN_BOTTOM
Definition htmltable.h:30
#define BALIGN_RIGHT
Definition htmltable.h:35
#define BALIGN_LEFT
Definition htmltable.h:36
#define BORDER_BOTTOM
Definition htmltable.h:41
static void free_ritem(row_t *p)
Free row. This closes and frees row’s list, then the item itself is freed.
Definition htmltable.h:117
#define SPACE_SET
Definition htmltable.h:34
#define BORDER_SET
Definition htmltable.h:32
#define BORDER_LEFT
Definition htmltable.h:38
#define BORDER_MASK
Definition htmltable.h:42
#define HALIGN_RIGHT
Definition htmltable.h:25
#define VALIGN_TOP
Definition htmltable.h:29
#define FIXED_FLAG
Definition htmltable.h:24
char * charsetToStr(int c)
Given an internal charset value, return a canonical string representation.
Definition input.c:811
static bool startswith(const char *s, const char *prefix)
does the string s begin with the string prefix?
Definition startswith.h:11
platform abstraction for case-insensitive string functions
Dt_t * textfont_dt
Definition gvcint.h:108
result of partitioning available space, part of maze
Definition grid.h:33
uint16_t rowspan
Definition htmltable.h:158
uint16_t colspan
Definition htmltable.h:157
char * bgcolor
Definition htmltable.h:87
unsigned char border
Definition htmltable.h:91
char * target
Definition htmltable.h:84
char * id
Definition htmltable.h:86
signed char space
Definition htmltable.h:90
unsigned short width
Definition htmltable.h:95
unsigned short height
Definition htmltable.h:96
int gradientangle
Definition htmltable.h:89
char * port
Definition htmltable.h:83
unsigned short flags
Definition htmltable.h:94
char * href
Definition htmltable.h:82
unsigned char pad
Definition htmltable.h:92
char * pencolor
Definition htmltable.h:88
htmlstyle_t style
Definition htmltable.h:97
char * title
Definition htmltable.h:85
graph_t * g
Definition htmltable.h:171
char * scale
Definition htmltable.h:70
char * src
Definition htmltable.h:69
HTMLSTYPE * htmllval
Definition htmlparse.h:229
agxbuf * xb
Definition htmlparse.h:220
strview_t prevtok
Definition htmlparse.h:227
strview_t currtok
Definition htmlparse.h:226
bool dashed
Definition htmltable.h:78
bool dotted
Definition htmltable.h:77
bool rounded
Definition htmltable.h:75
bool radial
Definition htmltable.h:74
bool invisible
Definition htmltable.h:76
bool vrule
vertical rule
Definition htmltable.h:143
size_t row_count
number of rows
Definition htmltable.h:139
bool hrule
horizontal rule
Definition htmltable.h:142
rows_t rows
cells
Definition htmltable.h:133
int8_t cellborder
Definition htmltable.h:136
Definition utils.c:750
a non-owning string reference
Definition strview.h:20
const char * data
start of the pointed to string
Definition strview.h:21
size_t size
extent of the string in bytes
Definition strview.h:22
char * color
Definition textspan.h:55
char * name
Definition textspan.h:54
unsigned int flags
Definition textspan.h:58
double size
Definition textspan.h:57
state for an in-progress string tokenization
Definition tokenize.h:36
Non-owning string references.
static bool strview_case_str_eq(strview_t a, const char *b)
compare a string reference to a string for case insensitive equality
Definition strview.h:62
#define HTML_OL
Definition textspan.h:35
#define HTML_IF
Definition textspan.h:30
#define HTML_UL
Definition textspan.h:31
#define HTML_BF
Definition textspan.h:29
#define HTML_SUP
Definition textspan.h:32
#define HTML_S
Definition textspan.h:34
#define GV_TEXTFONT_FLAGS_WIDTH
Definition textspan.h:24
#define HTML_SUB
Definition textspan.h:33
String tokenization.
static strview_t tok_get(const tok_t *t)
get the current token
Definition tokenize.h:76
static tok_t tok(const char *input, const char *separators)
begin tokenization of a new string
Definition tokenize.h:43
static bool tok_end(const tok_t *t)
is this tokenizer exhausted?
Definition tokenize.h:68
static void tok_next(tok_t *t)
advance to the next token in the string being scanned
Definition tokenize.h:85
htmltbl_t * tbl
Definition htmlparse.h:171
htmlcell_t * cell
Definition htmlparse.h:170
textfont_t * font
Definition htmlparse.h:172
htmlimg_t * img
Definition htmlparse.h:173
Definition grammar.c:90
abstraction for squashing compiler warnings for unused symbols
#define UNUSED
Definition unused.h:25