Graphviz 14.1.2~dev.20260123.1158
Loading...
Searching...
No Matches
htmllex.c
Go to the documentation of this file.
1
3/*************************************************************************
4 * Copyright (c) 2011 AT&T Intellectual Property
5 * All rights reserved. This program and the accompanying materials
6 * are made available under the terms of the Eclipse Public License v1.0
7 * which accompanies this distribution, and is available at
8 * https://www.eclipse.org/legal/epl-v10.html
9 *
10 * Contributors: Details at https://graphviz.org
11 *************************************************************************/
12
13#include "config.h"
14
15#include <assert.h>
16#include <common/render.h>
17#include <common/htmltable.h>
18#include "htmlparse.h"
19#include <common/htmllex.h>
20#include <cdt/cdt.h>
21#include <limits.h>
22#include <stdbool.h>
23#include <stddef.h>
24#include <stdint.h>
25#include <util/alloc.h>
26#include <util/gv_ctype.h>
27#include <util/startswith.h>
28#include <util/strcasecmp.h>
29#include <util/strview.h>
30#include <util/tokenize.h>
31#include <util/unused.h>
32
33#ifdef HAVE_EXPAT
34#include <expat.h>
35#endif
36
37#ifndef XML_STATUS_ERROR
38#define XML_STATUS_ERROR 0
39#endif
40
41static unsigned long htmllineno_ctx(htmllexstate_t *ctx);
42
43/* error_context:
44 * Print the last 2 "token"s seen.
45 */
47{
48 agerr(AGPREV, "... %.*s%.*s ...\n", (int)ctx->prevtok.size,
49 ctx->prevtok.data, (int)ctx->currtok.size, ctx->currtok.data);
50}
51
52/* htmlerror:
53 * yyerror - called by yacc output
54 */
55void htmlerror(htmlscan_t *scanner, const char *msg)
56{
57 htmllexstate_t *ctx = &scanner->lexer;
58 if (ctx->error)
59 return;
60 ctx->error = 1;
61 agerrorf("%s in line %lu \n", msg, htmllineno(scanner));
62 error_context(&scanner->lexer);
63}
64
65#ifdef HAVE_EXPAT
66/* lexerror:
67 * called by lexer when unknown <..> is found.
68 */
69static void lexerror(htmllexstate_t *ctx, const char *name)
70{
71 ctx->tok = T_error;
72 ctx->error = 1;
73 agerrorf("Unknown HTML element <%s> on line %lu \n", name, htmllineno_ctx(ctx));
74}
75
/* Attribute handler: receives the object being filled in and the attribute
 * value; returns 0 on success, non-zero if the value was rejected.
 */
typedef int (*attrFn)(void *, char *);
typedef int (*bcmpfn)(const void *, const void *);

/* Mechanism for automatically processing attributes:
 * one entry of a name -> handler lookup table.
 */
typedef struct {
    char *name;    /* attribute name */
    attrFn action; /* action to perform if name matches */
} attr_item;

/* size of one table entry, used to derive table lengths */
#define ISIZE (sizeof(attr_item))
86
87/* icmp:
88 * Compare an attr_item. Used in bsearch
89 */
90static int icmp(const void *name, const void *item) {
91 const attr_item *j = item;
92 return strcasecmp(name, j->name);
93}
94
95static int bgcolorfn(htmldata_t * p, char *v)
96{
97 p->bgcolor = strdup(v);
98 return 0;
99}
100
101static int pencolorfn(htmldata_t * p, char *v)
102{
103 p->pencolor = strdup(v);
104 return 0;
105}
106
107static int hreffn(htmldata_t * p, char *v)
108{
109 p->href = strdup(v);
110 return 0;
111}
112
113static int sidesfn(htmldata_t * p, char *v)
114{
115 unsigned short flags = 0;
116 char c;
117
118 while ((c = *v++)) {
119 switch (gv_tolower(c)) {
120 case 'l' :
122 break;
123 case 't' :
124 flags |= BORDER_TOP;
125 break;
126 case 'r' :
128 break;
129 case 'b' :
131 break;
132 default :
133 agwarningf("Unrecognized character '%c' (%d) in sides attribute\n", c, c);
134 break;
135 }
136 }
137 if (flags != BORDER_MASK)
138 p->flags |= flags;
139 return 0;
140}
141
142static int titlefn(htmldata_t * p, char *v)
143{
144 p->title = strdup(v);
145 return 0;
146}
147
148static int portfn(htmldata_t * p, char *v)
149{
150 p->port = strdup(v);
151 return 0;
152}
153
154#define DELIM " ,"
155
156static int stylefn(htmldata_t * p, char *v)
157{
158 int rv = 0;
159 for (tok_t t = tok(v, DELIM); !tok_end(&t); tok_next(&t)) {
160 strview_t tk = tok_get(&t);
161 if (strview_case_str_eq(tk, "ROUNDED")) p->style.rounded = true;
162 else if (strview_case_str_eq(tk, "RADIAL")) p->style.radial = true;
163 else if (strview_case_str_eq(tk,"SOLID")) {
164 p->style.dotted = false;
165 p->style.dashed = false;
166 } else if (strview_case_str_eq(tk,"INVISIBLE") ||
167 strview_case_str_eq(tk,"INVIS")) p->style.invisible = true;
168 else if (strview_case_str_eq(tk,"DOTTED")) p->style.dotted = true;
169 else if (strview_case_str_eq(tk,"DASHED")) p->style.dashed = true;
170 else {
171 agwarningf("Illegal value %.*s for STYLE - ignored\n", (int)tk.size,
172 tk.data);
173 rv = 1;
174 }
175 }
176 return rv;
177}
178
179static int targetfn(htmldata_t * p, char *v)
180{
181 p->target = strdup(v);
182 return 0;
183}
184
185static int idfn(htmldata_t * p, char *v)
186{
187 p->id = strdup(v);
188 return 0;
189}
190
191
/* doInt:
 * Scan v for a leading base-10 integral value. Check that
 * the value is >= min and <= max. Return value in ul.
 * String s is name of value, used only in warnings.
 * Characters following the number are not examined.
 * Return 0 if okay; 1 otherwise, in which case *ul is left unmodified and
 * a warning has been issued.
 */
static int doInt(char *v, char *s, int min, int max, long *ul)
{
    char *ep;
    long b = strtol(v, &ep, 10);

    if (ep == v) { /* no digits were consumed at all */
        agwarningf("Improper %s value %s - ignored\n", s, v);
        return 1;
    }
    if (b > max) {
        agwarningf("%s value %s > %d - too large - ignored\n", s, v, max);
        return 1;
    }
    if (b < min) {
        agwarningf("%s value %s < %d - too small - ignored\n", s, v, min);
        return 1;
    }
    *ul = b;
    return 0;
}
217
218
219static int gradientanglefn(htmldata_t * p, char *v)
220{
221 long u;
222
223 if (doInt(v, "GRADIENTANGLE", 0, 360, &u))
224 return 1;
225 p->gradientangle = (unsigned short) u;
226 return 0;
227}
228
229
230static int borderfn(htmldata_t * p, char *v)
231{
232 long u;
233
234 if (doInt(v, "BORDER", 0, UCHAR_MAX, &u))
235 return 1;
236 p->border = (unsigned char) u;
237 p->flags |= BORDER_SET;
238 return 0;
239}
240
241static int cellpaddingfn(htmldata_t * p, char *v)
242{
243 long u;
244
245 if (doInt(v, "CELLPADDING", 0, UCHAR_MAX, &u))
246 return 1;
247 p->pad = (unsigned char) u;
248 p->flags |= PAD_SET;
249 return 0;
250}
251
252static int cellspacingfn(htmldata_t * p, char *v)
253{
254 long u;
255
256 if (doInt(v, "CELLSPACING", SCHAR_MIN, SCHAR_MAX, &u))
257 return 1;
258 p->space = (signed char) u;
259 p->flags |= SPACE_SET;
260 return 0;
261}
262
263static int cellborderfn(htmltbl_t * p, char *v)
264{
265 long u;
266
267 if (doInt(v, "CELLBORDER", 0, INT8_MAX, &u))
268 return 1;
269 p->cellborder = (int8_t)u;
270 return 0;
271}
272
273static int columnsfn(htmltbl_t * p, char *v)
274{
275 if (*v != '*') {
276 agwarningf("Unknown value %s for COLUMNS - ignored\n", v);
277 return 1;
278 }
279 p->vrule = true;
280 return 0;
281}
282
283static int rowsfn(htmltbl_t * p, char *v)
284{
285 if (*v != '*') {
286 agwarningf("Unknown value %s for ROWS - ignored\n", v);
287 return 1;
288 }
289 p->hrule = true;
290 return 0;
291}
292
293static int fixedsizefn(htmldata_t * p, char *v)
294{
295 int rv = 0;
296 if (!strcasecmp(v, "TRUE"))
297 p->flags |= FIXED_FLAG;
298 else if (strcasecmp(v, "FALSE")) {
299 agwarningf("Illegal value %s for FIXEDSIZE - ignored\n", v);
300 rv = 1;
301 }
302 return rv;
303}
304
305static int valignfn(htmldata_t * p, char *v)
306{
307 int rv = 0;
308 if (!strcasecmp(v, "BOTTOM"))
309 p->flags |= VALIGN_BOTTOM;
310 else if (!strcasecmp(v, "TOP"))
311 p->flags |= VALIGN_TOP;
312 else if (strcasecmp(v, "MIDDLE")) {
313 agwarningf("Illegal value %s for VALIGN - ignored\n", v);
314 rv = 1;
315 }
316 return rv;
317}
318
319static int halignfn(htmldata_t * p, char *v)
320{
321 int rv = 0;
322 if (!strcasecmp(v, "LEFT"))
323 p->flags |= HALIGN_LEFT;
324 else if (!strcasecmp(v, "RIGHT"))
325 p->flags |= HALIGN_RIGHT;
326 else if (strcasecmp(v, "CENTER")) {
327 agwarningf("Illegal value %s for ALIGN - ignored\n", v);
328 rv = 1;
329 }
330 return rv;
331}
332
333static int cell_halignfn(htmldata_t * p, char *v)
334{
335 int rv = 0;
336 if (!strcasecmp(v, "LEFT"))
337 p->flags |= HALIGN_LEFT;
338 else if (!strcasecmp(v, "RIGHT"))
339 p->flags |= HALIGN_RIGHT;
340 else if (!strcasecmp(v, "TEXT"))
341 p->flags |= HALIGN_TEXT;
342 else if (strcasecmp(v, "CENTER"))
343 rv = 1;
344 if (rv)
345 agwarningf("Illegal value %s for ALIGN in TD - ignored\n", v);
346 return rv;
347}
348
349static int balignfn(htmldata_t * p, char *v)
350{
351 int rv = 0;
352 if (!strcasecmp(v, "LEFT"))
353 p->flags |= BALIGN_LEFT;
354 else if (!strcasecmp(v, "RIGHT"))
355 p->flags |= BALIGN_RIGHT;
356 else if (strcasecmp(v, "CENTER"))
357 rv = 1;
358 if (rv)
359 agwarningf("Illegal value %s for BALIGN in TD - ignored\n", v);
360 return rv;
361}
362
363static int heightfn(htmldata_t * p, char *v)
364{
365 long u;
366
367 if (doInt(v, "HEIGHT", 0, USHRT_MAX, &u))
368 return 1;
369 p->height = (unsigned short) u;
370 return 0;
371}
372
373static int widthfn(htmldata_t * p, char *v)
374{
375 long u;
376
377 if (doInt(v, "WIDTH", 0, USHRT_MAX, &u))
378 return 1;
379 p->width = (unsigned short) u;
380 return 0;
381}
382
383static int rowspanfn(htmlcell_t * p, char *v)
384{
385 long u;
386
387 if (doInt(v, "ROWSPAN", 0, UINT16_MAX, &u))
388 return 1;
389 if (u == 0) {
390 agwarningf("ROWSPAN value cannot be 0 - ignored\n");
391 return 1;
392 }
393 p->rowspan = (uint16_t)u;
394 return 0;
395}
396
397static int colspanfn(htmlcell_t * p, char *v)
398{
399 long u;
400
401 if (doInt(v, "COLSPAN", 0, UINT16_MAX, &u))
402 return 1;
403 if (u == 0) {
404 agwarningf("COLSPAN value cannot be 0 - ignored\n");
405 return 1;
406 }
407 p->colspan = (uint16_t)u;
408 return 0;
409}
410
411static int fontcolorfn(textfont_t * p, char *v)
412{
413 p->color = v;
414 return 0;
415}
416
417static int facefn(textfont_t * p, char *v)
418{
419 p->name = v;
420 return 0;
421}
422
423static int ptsizefn(textfont_t * p, char *v)
424{
425 long u;
426
427 if (doInt(v, "POINT-SIZE", 0, UCHAR_MAX, &u))
428 return 1;
429 p->size = (double) u;
430 return 0;
431}
432
433static int srcfn(htmlimg_t * p, char *v)
434{
435 p->src = strdup(v);
436 return 0;
437}
438
439static int scalefn(htmlimg_t * p, char *v)
440{
441 p->scale = strdup(v);
442 return 0;
443}
444
/* alignfn:
 * BR ALIGN handler: writes 'r', 'l' or 'n' to *p for "RIGHT", "LEFT" or
 * "CENTER" (case-insensitive). Any other value is warned about, leaves *p
 * untouched and yields 1; otherwise returns 0.
 */
static int alignfn(int *p, char *v)
{
    if (strcasecmp(v, "RIGHT") == 0) {
        *p = 'r';
        return 0;
    }
    if (strcasecmp(v, "LEFT") == 0) {
        *p = 'l';
        return 0;
    }
    if (strcasecmp(v, "CENTER") == 0) {
        *p = 'n';
        return 0;
    }
    agwarningf("Illegal value %s for ALIGN - ignored\n", v);
    return 1;
}
460
461/* Tables used in binary search; MUST be alphabetized */
462static attr_item tbl_items[] = {
463 {"align", (attrFn) halignfn},
464 {"bgcolor", (attrFn) bgcolorfn},
465 {"border", (attrFn) borderfn},
466 {"cellborder", (attrFn) cellborderfn},
467 {"cellpadding", (attrFn) cellpaddingfn},
468 {"cellspacing", (attrFn) cellspacingfn},
469 {"color", (attrFn) pencolorfn},
470 {"columns", (attrFn) columnsfn},
471 {"fixedsize", (attrFn) fixedsizefn},
472 {"gradientangle", (attrFn) gradientanglefn},
473 {"height", (attrFn) heightfn},
474 {"href", (attrFn) hreffn},
475 {"id", (attrFn) idfn},
476 {"port", (attrFn) portfn},
477 {"rows", (attrFn) rowsfn},
478 {"sides", (attrFn) sidesfn},
479 {"style", (attrFn) stylefn},
480 {"target", (attrFn) targetfn},
481 {"title", (attrFn) titlefn},
482 {"tooltip", (attrFn) titlefn},
483 {"valign", (attrFn) valignfn},
484 {"width", (attrFn) widthfn},
485};
486
487static attr_item cell_items[] = {
488 {"align", (attrFn) cell_halignfn},
489 {"balign", (attrFn) balignfn},
490 {"bgcolor", (attrFn) bgcolorfn},
491 {"border", (attrFn) borderfn},
492 {"cellpadding", (attrFn) cellpaddingfn},
493 {"cellspacing", (attrFn) cellspacingfn},
494 {"color", (attrFn) pencolorfn},
495 {"colspan", (attrFn) colspanfn},
496 {"fixedsize", (attrFn) fixedsizefn},
497 {"gradientangle", (attrFn) gradientanglefn},
498 {"height", (attrFn) heightfn},
499 {"href", (attrFn) hreffn},
500 {"id", (attrFn) idfn},
501 {"port", (attrFn) portfn},
502 {"rowspan", (attrFn) rowspanfn},
503 {"sides", (attrFn) sidesfn},
504 {"style", (attrFn) stylefn},
505 {"target", (attrFn) targetfn},
506 {"title", (attrFn) titlefn},
507 {"tooltip", (attrFn) titlefn},
508 {"valign", (attrFn) valignfn},
509 {"width", (attrFn) widthfn},
510};
511
512static attr_item font_items[] = {
513 {"color", (attrFn) fontcolorfn},
514 {"face", (attrFn) facefn},
515 {"point-size", (attrFn) ptsizefn},
516};
517
518static attr_item img_items[] = {
519 {"scale", (attrFn) scalefn},
520 {"src", (attrFn) srcfn},
521};
522
523static attr_item br_items[] = {
524 {"align", (attrFn) alignfn},
525};
526
527/* doAttrs:
528 * General function for processing list of name/value attributes.
529 * Do binary search on items table. If match found, invoke action
530 * passing it tp and attribute value.
531 * Table size is given by nel
532 * Name/value pairs are in array atts, which is null terminated.
533 * s is the name of the HTML element being processed.
534 */
535static void doAttrs(htmllexstate_t *ctx, void *tp, attr_item *items, size_t nel, char **atts,
536 char *s) {
537 char *name;
538 char *val;
539 attr_item *ip;
540
541 while ((name = *atts++) != NULL) {
542 val = *atts++;
543 ip = bsearch(name, items, nel, ISIZE, icmp);
544 if (ip)
545 ctx->warn |= ip->action(tp, val);
546 else {
547 agwarningf("Illegal attribute %s in %s - ignored\n", name,
548 s);
549 ctx->warn = 1;
550 }
551 }
552}
553
554static void mkBR(htmllexstate_t *ctx, char **atts)
555{
556 ctx->htmllval->i = UNSET_ALIGN;
557 doAttrs(ctx, &ctx->htmllval->i, br_items, sizeof(br_items) / ISIZE, atts, "<BR>");
558}
559
560static htmlimg_t *mkImg(htmllexstate_t *ctx, char **atts)
561{
562 htmlimg_t *img = gv_alloc(sizeof(htmlimg_t));
563
564 doAttrs(ctx, img, img_items, sizeof(img_items) / ISIZE, atts, "<IMG>");
565
566 return img;
567}
568
569static textfont_t *mkFont(htmllexstate_t *ctx, char **atts, unsigned char flags) {
570 textfont_t tf = {NULL,NULL,NULL,0.0,0,0};
571
572 tf.size = -1.0; /* unassigned */
573 enum { FLAGS_MAX = (1 << GV_TEXTFONT_FLAGS_WIDTH) - 1 };
574 assert(flags <= FLAGS_MAX);
575 tf.flags = (unsigned char)(flags & FLAGS_MAX);
576 if (atts)
577 doAttrs(ctx, &tf, font_items, sizeof(font_items) / ISIZE, atts, "<FONT>");
578
579 return dtinsert(ctx->gvc->textfont_dt, &tf);
580}
581
582static htmlcell_t *mkCell(htmllexstate_t *ctx, char **atts)
583{
585
586 cell->colspan = 1;
587 cell->rowspan = 1;
588 doAttrs(ctx, cell, cell_items, sizeof(cell_items) / ISIZE, atts, "<TD>");
589
590 return cell;
591}
592
593static htmltbl_t *mkTbl(htmllexstate_t *ctx, char **atts)
594{
595 htmltbl_t *tbl = gv_alloc(sizeof(htmltbl_t));
596
597 tbl->row_count = SIZE_MAX; // flag that table is a raw, parsed table
598 tbl->rows = (rows_t){.dtor = free_ritem};
599 tbl->cellborder = -1; // unset cell border attribute
600 doAttrs(ctx, tbl, tbl_items, sizeof(tbl_items) / ISIZE, atts, "<TABLE>");
601
602 return tbl;
603}
604
605static void startElement(void *user, const char *name, char **atts)
606{
607 htmllexstate_t *ctx = user;
608
609 if (strcasecmp(name, "TABLE") == 0) {
610 ctx->htmllval->tbl = mkTbl(ctx, atts);
611 ctx->inCell = 0;
612 ctx->tok = T_table;
613 } else if (strcasecmp(name, "TR") == 0 || strcasecmp(name, "TH") == 0) {
614 ctx->inCell = 0;
615 ctx->tok = T_row;
616 } else if (strcasecmp(name, "TD") == 0) {
617 ctx->inCell = 1;
618 ctx->htmllval->cell = mkCell(ctx, atts);
619 ctx->tok = T_cell;
620 } else if (strcasecmp(name, "FONT") == 0) {
621 ctx->htmllval->font = mkFont(ctx, atts, 0);
622 ctx->tok = T_font;
623 } else if (strcasecmp(name, "B") == 0) {
624 ctx->htmllval->font = mkFont(ctx, 0, HTML_BF);
625 ctx->tok = T_bold;
626 } else if (strcasecmp(name, "S") == 0) {
627 ctx->htmllval->font = mkFont(ctx, 0, HTML_S);
628 ctx->tok = T_s;
629 } else if (strcasecmp(name, "U") == 0) {
630 ctx->htmllval->font = mkFont(ctx, 0, HTML_UL);
631 ctx->tok = T_underline;
632 } else if (strcasecmp(name, "O") == 0) {
633 ctx->htmllval->font = mkFont(ctx, 0, HTML_OL);
634 ctx->tok = T_overline;
635 } else if (strcasecmp(name, "I") == 0) {
636 ctx->htmllval->font = mkFont(ctx, 0, HTML_IF);
637 ctx->tok = T_italic;
638 } else if (strcasecmp(name, "SUP") == 0) {
639 ctx->htmllval->font = mkFont(ctx, 0, HTML_SUP);
640 ctx->tok = T_sup;
641 } else if (strcasecmp(name, "SUB") == 0) {
642 ctx->htmllval->font = mkFont(ctx, 0, HTML_SUB);
643 ctx->tok = T_sub;
644 } else if (strcasecmp(name, "BR") == 0) {
645 mkBR(ctx, atts);
646 ctx->tok = T_br;
647 } else if (strcasecmp(name, "HR") == 0) {
648 ctx->tok = T_hr;
649 } else if (strcasecmp(name, "VR") == 0) {
650 ctx->tok = T_vr;
651 } else if (strcasecmp(name, "IMG") == 0) {
652 ctx->htmllval->img = mkImg(ctx, atts);
653 ctx->tok = T_img;
654 } else if (strcasecmp(name, "HTML") == 0) {
655 ctx->tok = T_html;
656 } else {
657 lexerror(ctx, name);
658 }
659}
660
661static void endElement(void *user, const char *name)
662{
663 htmllexstate_t *ctx = user;
664
665 if (strcasecmp(name, "TABLE") == 0) {
666 ctx->tok = T_end_table;
667 ctx->inCell = 1;
668 } else if (strcasecmp(name, "TR") == 0 || strcasecmp(name, "TH") == 0) {
669 ctx->tok = T_end_row;
670 } else if (strcasecmp(name, "TD") == 0) {
671 ctx->tok = T_end_cell;
672 ctx->inCell = 0;
673 } else if (strcasecmp(name, "HTML") == 0) {
674 ctx->tok = T_end_html;
675 } else if (strcasecmp(name, "FONT") == 0) {
676 ctx->tok = T_end_font;
677 } else if (strcasecmp(name, "B") == 0) {
678 ctx->tok = T_n_bold;
679 } else if (strcasecmp(name, "U") == 0) {
680 ctx->tok = T_n_underline;
681 } else if (strcasecmp(name, "O") == 0) {
682 ctx->tok = T_n_overline;
683 } else if (strcasecmp(name, "I") == 0) {
684 ctx->tok = T_n_italic;
685 } else if (strcasecmp(name, "SUP") == 0) {
686 ctx->tok = T_n_sup;
687 } else if (strcasecmp(name, "SUB") == 0) {
688 ctx->tok = T_n_sub;
689 } else if (strcasecmp(name, "S") == 0) {
690 ctx->tok = T_n_s;
691 } else if (strcasecmp(name, "BR") == 0) {
692 if (ctx->tok == T_br)
693 ctx->tok = T_BR;
694 else
695 ctx->tok = T_end_br;
696 } else if (strcasecmp(name, "HR") == 0) {
697 if (ctx->tok == T_hr)
698 ctx->tok = T_HR;
699 else
700 ctx->tok = T_end_hr;
701 } else if (strcasecmp(name, "VR") == 0) {
702 if (ctx->tok == T_vr)
703 ctx->tok = T_VR;
704 else
705 ctx->tok = T_end_vr;
706 } else if (strcasecmp(name, "IMG") == 0) {
707 if (ctx->tok == T_img)
708 ctx->tok = T_IMG;
709 else
710 ctx->tok = T_end_img;
711 } else {
712 lexerror(ctx, name);
713 }
714}
715
716/* characterData:
717 * Generate T_string token. Do this only when immediately in
718 * <TD>..</TD> or <HTML>..</HTML>, i.e., when inCell is true.
719 * Strip out formatting characters but keep spaces.
720 * Distinguish between all whitespace vs. strings with non-whitespace
721 * characters.
722 */
723static void characterData(void *user, const char *s, int length)
724{
725 htmllexstate_t *ctx = user;
726
727 int i, cnt = 0;
728 unsigned char c;
729
730 if (ctx->inCell) {
731 for (i = length; i; i--) {
732 c = *s++;
733 if (c >= ' ') {
734 cnt++;
735 agxbputc(ctx->xb, (char)c);
736 }
737 }
738 if (cnt) ctx->tok = T_string;
739 }
740}
741#endif
742
743int initHTMLlexer(htmlscan_t *scanner, char *src, agxbuf * xb, htmlenv_t *env)
744{
745#ifdef HAVE_EXPAT
746 htmllexstate_t *ctx = &scanner->lexer;
747
748 ctx->xb = xb;
749 ctx->lb = (agxbuf){0};
750 ctx->ptr = src;
751 ctx->mode = 0;
752 ctx->warn = 0;
753 ctx->error = 0;
754 ctx->currtok = (strview_t){0};
755 ctx->prevtok = (strview_t){0};
756 ctx->inCell = 1;
757 ctx->parser = XML_ParserCreate(charsetToStr(GD_charset(env->g)));
758 ctx->gvc = GD_gvc(env->g);
759 XML_SetUserData(ctx->parser, ctx);
760 XML_SetElementHandler(ctx->parser,
761 (XML_StartElementHandler) startElement,
762 endElement);
763 XML_SetCharacterDataHandler(ctx->parser, characterData);
764 return 0;
765#else
766 (void)scanner;
767 (void)src;
768 (void)xb;
769 (void)env;
770
771 static int first;
772 if (!first) {
774 "Not built with libexpat. Table formatting is not available.\n");
775 first++;
776 }
777 return 1;
778#endif
779}
780
782{
783#ifdef HAVE_EXPAT
784 htmllexstate_t *ctx = &scanner->lexer;
785 int rv = ctx->error ? 3 : ctx->warn;
786 XML_ParserFree(ctx->parser);
787 agxbfree (&ctx->lb);
788 return rv;
789#else
790 (void)scanner;
791
792 return 1;
793#endif
794}
795
797static UNUSED void agxbput_move(agxbuf *dst, const char *src) {
798 // we cannot call `agxbput` itself because it calls `memcpy`, thereby
799 // implicitly assuming that source and destination do not overlap
800 char *src_copy = gv_strdup(src);
801 agxbput(dst, src_copy);
802 free(src_copy);
803}
804
805#ifdef HAVE_EXPAT
806/* eatComment:
807 * Given first character after open comment, eat characters
808 * up to comment close, returning pointer to closing > if it exists,
809 * or null character otherwise.
810 * We rely on HTML strings having matched nested <>.
811 */
812static char *eatComment(htmllexstate_t *ctx, char *p)
813{
814 int depth = 1;
815 char *s = p;
816 char c;
817
818 while (depth && (c = *s++)) {
819 if (c == '<')
820 depth++;
821 else if (c == '>')
822 depth--;
823 }
824 s--; /* move back to '\0' or '>' */
825 if (*s) {
826 char *t = s - 2;
827 if (t < p || !startswith(t, "--")) {
828 agwarningf("Unclosed comment\n");
829 ctx->warn = 1;
830 }
831 }
832 return s;
833}
834
835/* findNext:
836 * Return next XML unit. This is either <..>, an HTML
837 * comment <!-- ... -->, or characters up to next <.
838 */
839static char *findNext(htmllexstate_t *ctx, char *s, agxbuf* xb)
840{
841 char* t = s + 1;
842 char c;
843
844 if (*s == '<') {
845 if (startswith(t, "!--"))
846 t = eatComment(ctx, t + 3);
847 else
848 while (*t && *t != '>')
849 t++;
850 if (*t != '>') {
851 agwarningf("Label closed before end of HTML element\n");
852 ctx->warn = 1;
853 } else
854 t++;
855 } else {
856 t = s;
857 while ((c = *t) && c != '<') {
858 if (c == '&' && *(t+1) != '#') {
859 t = scanEntity(t + 1, xb);
860 }
861 else {
862 agxbputc(xb, c);
863 t++;
864 }
865 }
866 }
867 return t;
868}
869
886static void protect_rsqb(agxbuf *xb) {
887
888 // if the buffer is empty, we have nothing to do
889 if (agxblen(xb) == 0) {
890 return;
891 }
892
893 // check the last character and if it is not ], we have nothing to do
894 char *data = agxbuse(xb);
895 size_t size = strlen(data);
896 assert(size > 0);
897 if (data[size - 1] != ']') {
898 agxbput_move(xb, data);
899 return;
900 }
901
902 // truncate ] and write back the remaining prefix
903 data[size - 1] = '\0';
904 agxbput_move(xb, data);
905
906 // write an XML-escaped version of ] as a replacement
907 agxbput(xb, "&#93;");
908}
909#endif
910
911
913 return htmllineno_ctx(&scanner->lexer);
914}
915
916static unsigned long htmllineno_ctx(htmllexstate_t *ctx) {
917#ifdef HAVE_EXPAT
918 return XML_GetCurrentLineNumber(ctx->parser);
919#else
920 (void)ctx;
921
922 return 0;
923#endif
924}
925
#ifdef DEBUG
/* printTok:
 * Debug helper: write the symbolic name of token tok to stderr. For
 * T_string the text held in ctx->xb is printed too and then put back into
 * the buffer. Tokens not listed print as "<unknown>".
 */
static void printTok(htmllexstate_t *ctx, int tok)
{
    const char *label;

/* one case per token; # yields the token's own spelling, unexpanded */
#define TOK_NAME(t)                                                           \
    case t:                                                                    \
        label = #t;                                                            \
        break

    switch (tok) {
    TOK_NAME(T_end_br);
    TOK_NAME(T_end_img);
    TOK_NAME(T_row);
    TOK_NAME(T_end_row);
    TOK_NAME(T_html);
    TOK_NAME(T_end_html);
    TOK_NAME(T_end_table);
    TOK_NAME(T_end_cell);
    TOK_NAME(T_end_font);
    TOK_NAME(T_string);
    TOK_NAME(T_error);
    TOK_NAME(T_n_italic);
    TOK_NAME(T_n_bold);
    TOK_NAME(T_n_underline);
    TOK_NAME(T_n_overline);
    TOK_NAME(T_n_sup);
    TOK_NAME(T_n_sub);
    TOK_NAME(T_n_s);
    TOK_NAME(T_HR);
    TOK_NAME(T_hr);
    TOK_NAME(T_end_hr);
    TOK_NAME(T_VR);
    TOK_NAME(T_vr);
    TOK_NAME(T_end_vr);
    TOK_NAME(T_BR);
    TOK_NAME(T_br);
    TOK_NAME(T_IMG);
    TOK_NAME(T_img);
    TOK_NAME(T_table);
    TOK_NAME(T_cell);
    TOK_NAME(T_font);
    TOK_NAME(T_italic);
    TOK_NAME(T_bold);
    TOK_NAME(T_underline);
    TOK_NAME(T_overline);
    TOK_NAME(T_sup);
    TOK_NAME(T_sub);
    TOK_NAME(T_s);
    default:
        label = "<unknown>";
    }
#undef TOK_NAME

    if (tok != T_string) {
        fprintf(stderr, "%s\n", label);
        return;
    }
    /* agxbuse hands the text out of the buffer; restore it after printing */
    const char *token_text = agxbuse(ctx->xb);
    fprintf(stderr, "%s \"%s\"\n", label, token_text);
    agxbput_move(ctx->xb, token_text);
}

#endif
1058
1059int htmllex(union HTMLSTYPE *htmllval, htmlscan_t *scanner)
1060{
1061#ifdef HAVE_EXPAT
1062 static char *begin_html = "<HTML>";
1063 static char *end_html = "</HTML>";
1064
1065 char *s;
1066 char *endp = 0;
1067 size_t len, llen;
1068 int rv;
1069 htmllexstate_t *ctx = &scanner->lexer;
1070
1071 ctx->htmllval = htmllval;
1072 ctx->tok = 0;
1073 do {
1074 if (ctx->mode == 2)
1075 return EOF;
1076 if (ctx->mode == 0) {
1077 ctx->mode = 1;
1078 s = begin_html;
1079 len = strlen(s);
1080 endp = 0;
1081 } else {
1082 s = ctx->ptr;
1083 if (*s == '\0') {
1084 ctx->mode = 2;
1085 s = end_html;
1086 len = strlen(s);
1087 } else {
1088 endp = findNext(ctx, s,&ctx->lb);
1089 len = (size_t)(endp - s);
1090 }
1091 }
1092
1093 protect_rsqb(&ctx->lb);
1094
1095 ctx->prevtok = ctx->currtok;
1096 ctx->currtok = (strview_t){.data = s, .size = len};
1097 if ((llen = agxblen(&ctx->lb))) {
1098 assert(llen <= INT_MAX && "XML token too long for expat API");
1099 rv = XML_Parse(ctx->parser, agxbuse(&ctx->lb), (int)llen, 0);
1100 } else {
1101 assert(len <= INT_MAX && "XML token too long for expat API");
1102 rv = XML_Parse(ctx->parser, s, (int)len, len ? 0 : 1);
1103 }
1104 if (rv == XML_STATUS_ERROR) {
1105 if (!ctx->error) {
1106 agerrorf("%s in line %lu \n",
1107 XML_ErrorString(XML_GetErrorCode(ctx->parser)), htmllineno(scanner));
1108 error_context(ctx);
1109 ctx->error = 1;
1110 ctx->tok = T_error;
1111 }
1112 }
1113 if (endp)
1114 ctx->ptr = endp;
1115 } while (ctx->tok == 0);
1116#ifdef DEBUG
1117 printTok (ctx, ctx->tok);
1118#endif
1119 return ctx->tok;
1120#else
1121 (void)htmllval;
1122 (void)scanner;
1123
1124 return EOF;
1125#endif
1126}
1127
static void agxbfree(agxbuf *xb)
free any malloced resources
Definition agxbuf.h:97
static WUR char * agxbuse(agxbuf *xb)
Definition agxbuf.h:325
static size_t agxblen(const agxbuf *xb)
return number of characters currently stored
Definition agxbuf.h:108
static int agxbputc(agxbuf *xb, char c)
add character to buffer
Definition agxbuf.h:295
Memory allocation wrappers that exit on failure.
static char * gv_strdup(const char *original)
Definition alloc.h:101
static void * gv_alloc(size_t size)
Definition alloc.h:47
container data types API
#define dtinsert(d, o)
Definition cdt.h:186
char * scanEntity(char *t, agxbuf *xb)
Definition utils.c:1081
static Extype_t length(Exid_t *rhs, Exdisc_t *disc)
Definition compile.c:1606
static int flags
Definition gc.c:63
static double len(glCompPoint p)
Definition glutils.c:138
void free(void *)
#define SIZE_MAX
Definition gmlscan.c:347
#define UINT16_MAX
Definition gmlscan.c:340
#define INT8_MAX
Definition gmlscan.c:328
node NULL
Definition grammar.y:181
static int cnt(Dict_t *d, Dtlink_t **set)
Definition graph.c:198
void agwarningf(const char *fmt,...)
Definition agerror.c:175
void agerrorf(const char *fmt,...)
Definition agerror.c:167
int agerr(agerrlevel_t level, const char *fmt,...)
Definition agerror.c:157
@ AGPREV
Definition cgraph.h:946
#define GD_charset(g)
Definition types.h:367
#define GD_gvc(g)
Definition types.h:355
replacements for ctype.h functions
static char gv_tolower(int c)
Definition gv_ctype.h:81
agxbput(xb, staging)
int htmllex(union HTMLSTYPE *htmllval, htmlscan_t *scanner)
Definition htmllex.c:1059
static UNUSED void agxbput_move(agxbuf *dst, const char *src)
agxbput, but assume that source and destination may overlap
Definition htmllex.c:797
unsigned long htmllineno(htmlscan_t *scanner)
Definition htmllex.c:912
#define XML_STATUS_ERROR
Definition htmllex.c:38
static void error_context(htmllexstate_t *ctx)
Definition htmllex.c:46
int clearHTMLlexer(htmlscan_t *scanner)
Definition htmllex.c:781
int initHTMLlexer(htmlscan_t *scanner, char *src, agxbuf *xb, htmlenv_t *env)
Definition htmllex.c:743
void htmlerror(htmlscan_t *scanner, const char *msg)
Definition htmllex.c:55
static unsigned long htmllineno_ctx(htmllexstate_t *ctx)
Definition htmllex.c:916
#define T_n_sup
Definition htmlparse.h:138
#define T_end_row
Definition htmlparse.h:126
#define T_end_table
Definition htmlparse.h:129
#define T_br
Definition htmlparse.h:148
#define T_vr
Definition htmlparse.h:145
#define T_error
Definition htmlparse.h:133
#define T_n_s
Definition htmlparse.h:140
#define T_end_cell
Definition htmlparse.h:130
#define T_n_sub
Definition htmlparse.h:139
#define T_n_bold
Definition htmlparse.h:135
#define T_html
Definition htmlparse.h:127
#define T_BR
Definition htmlparse.h:147
#define T_underline
Definition htmlparse.h:156
#define T_sup
Definition htmlparse.h:158
#define T_row
Definition htmlparse.h:125
#define T_table
Definition htmlparse.h:151
#define T_end_vr
Definition htmlparse.h:146
#define T_end_html
Definition htmlparse.h:128
#define T_IMG
Definition htmlparse.h:149
#define T_VR
Definition htmlparse.h:144
#define T_bold
Definition htmlparse.h:155
#define T_end_img
Definition htmlparse.h:124
#define T_sub
Definition htmlparse.h:159
#define T_s
Definition htmlparse.h:160
#define T_n_italic
Definition htmlparse.h:134
#define T_end_font
Definition htmlparse.h:131
#define T_overline
Definition htmlparse.h:157
#define T_hr
Definition htmlparse.h:142
#define T_font
Definition htmlparse.h:153
#define T_italic
Definition htmlparse.h:154
#define T_end_br
Definition htmlparse.h:123
#define T_n_underline
Definition htmlparse.h:136
#define T_cell
Definition htmlparse.h:152
#define T_end_hr
Definition htmlparse.h:143
#define T_string
Definition htmlparse.h:132
#define T_img
Definition htmlparse.h:150
#define T_HR
Definition htmlparse.h:141
#define T_n_overline
Definition htmlparse.h:137
cleanup & scanner
Definition htmlparse.y:289
#define PAD_SET
Definition htmltable.h:33
#define BORDER_RIGHT
Definition htmltable.h:40
#define BORDER_TOP
Definition htmltable.h:39
#define HALIGN_TEXT
Definition htmltable.h:28
#define UNSET_ALIGN
Definition htmltable.h:44
#define HALIGN_LEFT
Definition htmltable.h:26
#define VALIGN_BOTTOM
Definition htmltable.h:30
#define BALIGN_RIGHT
Definition htmltable.h:35
#define BALIGN_LEFT
Definition htmltable.h:36
#define BORDER_BOTTOM
Definition htmltable.h:41
static void free_ritem(row_t *p)
Free row. This closes and frees row’s list, then the item itself is freed.
Definition htmltable.h:117
#define SPACE_SET
Definition htmltable.h:34
#define BORDER_SET
Definition htmltable.h:32
#define BORDER_LEFT
Definition htmltable.h:38
#define BORDER_MASK
Definition htmltable.h:42
#define HALIGN_RIGHT
Definition htmltable.h:25
#define VALIGN_TOP
Definition htmltable.h:29
#define FIXED_FLAG
Definition htmltable.h:24
char * charsetToStr(int c)
Given an internal charset value, return a canonical string representation.
Definition input.c:814
static bool startswith(const char *s, const char *prefix)
does the string s begin with the string prefix?
Definition startswith.h:11
platform abstraction for case-insensitive string functions
Dt_t * textfont_dt
Definition gvcint.h:108
result of partitioning available space, part of maze
Definition grid.h:33
uint16_t rowspan
Definition htmltable.h:158
uint16_t colspan
Definition htmltable.h:157
char * bgcolor
Definition htmltable.h:87
unsigned char border
Definition htmltable.h:91
char * target
Definition htmltable.h:84
char * id
Definition htmltable.h:86
signed char space
Definition htmltable.h:90
unsigned short width
Definition htmltable.h:95
unsigned short height
Definition htmltable.h:96
int gradientangle
Definition htmltable.h:89
char * port
Definition htmltable.h:83
unsigned short flags
Definition htmltable.h:94
char * href
Definition htmltable.h:82
unsigned char pad
Definition htmltable.h:92
char * pencolor
Definition htmltable.h:88
htmlstyle_t style
Definition htmltable.h:97
char * title
Definition htmltable.h:85
graph_t * g
Definition htmltable.h:171
char * scale
Definition htmltable.h:70
char * src
Definition htmltable.h:69
HTMLSTYPE * htmllval
Definition htmlparse.h:229
agxbuf * xb
Definition htmlparse.h:220
strview_t prevtok
Definition htmlparse.h:227
strview_t currtok
Definition htmlparse.h:226
bool dashed
Definition htmltable.h:78
bool dotted
Definition htmltable.h:77
bool rounded
Definition htmltable.h:75
bool radial
Definition htmltable.h:74
bool invisible
Definition htmltable.h:76
bool vrule
vertical rule
Definition htmltable.h:143
size_t row_count
number of rows
Definition htmltable.h:139
bool hrule
horizontal rule
Definition htmltable.h:142
rows_t rows
cells
Definition htmltable.h:133
int8_t cellborder
Definition htmltable.h:136
Definition utils.c:752
a non-owning string reference
Definition strview.h:20
const char * data
start of the pointed to string
Definition strview.h:21
size_t size
extent of the string in bytes
Definition strview.h:22
char * color
Definition textspan.h:55
char * name
Definition textspan.h:54
unsigned int flags
Definition textspan.h:58
double size
Definition textspan.h:57
state for an in-progress string tokenization
Definition tokenize.h:36
Non-owning string references.
static bool strview_case_str_eq(strview_t a, const char *b)
compare a string reference to a string for case insensitive equality
Definition strview.h:62
#define HTML_OL
Definition textspan.h:35
#define HTML_IF
Definition textspan.h:30
#define HTML_UL
Definition textspan.h:31
#define HTML_BF
Definition textspan.h:29
#define HTML_SUP
Definition textspan.h:32
#define HTML_S
Definition textspan.h:34
#define GV_TEXTFONT_FLAGS_WIDTH
Definition textspan.h:24
#define HTML_SUB
Definition textspan.h:33
String tokenization.
static strview_t tok_get(const tok_t *t)
get the current token
Definition tokenize.h:76
static tok_t tok(const char *input, const char *separators)
begin tokenization of a new string
Definition tokenize.h:43
static bool tok_end(const tok_t *t)
is this tokenizer exhausted?
Definition tokenize.h:68
static void tok_next(tok_t *t)
advance to the next token in the string being scanned
Definition tokenize.h:85
htmltbl_t * tbl
Definition htmlparse.h:171
htmlcell_t * cell
Definition htmlparse.h:170
textfont_t * font
Definition htmlparse.h:172
htmlimg_t * img
Definition htmlparse.h:173
Definition grammar.c:90
abstraction for squashing compiler warnings for unused symbols
#define UNUSED
Definition unused.h:25