Graphviz 13.0.0~dev.20250121.0651
Loading...
Searching...
No Matches
htmlparse.y
Go to the documentation of this file.
1
3/*************************************************************************
4 * Copyright (c) 2011 AT&T Intellectual Property
5 * All rights reserved. This program and the accompanying materials
6 * are made available under the terms of the Eclipse Public License v1.0
7 * which accompanies this distribution, and is available at
8 * https://www.eclipse.org/legal/epl-v10.html
9 *
10 * Contributors: Details at https://graphviz.org
11 *************************************************************************/
12
13%require "3.0"
14
15 /* By default, Bison emits a parser using symbols prefixed with "yy". Graphviz
16 * contains multiple Bison-generated parsers, so we alter this prefix to avoid
17 * symbol clashes.
18 */
19%define api.prefix {html}
20
21 /* Generate a reentrant parser with no global state */
22%define api.pure full
23%param { htmlscan_t *scanner }
24
25
26%code requires {
27#include <common/htmllex.h>
28#include <common/htmltable.h>
29#include <common/textspan.h>
30#include <gvc/gvcext.h>
31#include <util/agxbuf.h>
32#include <util/list.h>
33#include <util/strview.h>
34}
35
36%code provides {
37
38DEFINE_LIST(sfont, textfont_t *)
39
40static inline void free_ti(textspan_t item) {
41 free(item.str);
42}
43
45
46static inline void free_hi(htextspan_t item) {
47 for (size_t i = 0; i < item.nitems; i++) {
48 free(item.items[i].str);
49 }
50 free(item.items);
51}
52
54
55struct htmlparserstate_s {
56 htmllabel_t* lbl; /* Generated label */
57 htmltbl_t* tblstack; /* Stack of tables maintained during parsing */
58 textspans_t fitemList;
59 htextspans_t fspanList;
60 agxbuf* str; /* Buffer for text */
61 sfont_t fontstack;
62 GVC_t* gvc;
63};
64
65typedef struct {
66#ifdef HAVE_EXPAT
67 struct XML_ParserStruct *parser;
68#endif
69 char* ptr; // input source
70 int tok; // token type
71 agxbuf* xb; // buffer to gather T_string data
72 agxbuf lb; // buffer for translating lexical data
73 int warn; // set if warning given
74 int error; // set if error given
75 char inCell; // set if in TD to allow T_string
76 char mode; // for handling artificial <HTML>..</HTML>
77 strview_t currtok; // for error reporting
78 strview_t prevtok; // for error reporting
79 GVC_t *gvc; // current GraphViz context
80 HTMLSTYPE *htmllval; // generated by htmlparse.y
82
83
84struct htmlscan_s {
87};
88}
89
90%{
91
92#include <common/render.h>
93#include <common/htmltable.h>
94#include <common/htmllex.h>
95#include <stdbool.h>
96#include <util/alloc.h>
97
99static void cleanCell(htmlcell_t *cp);
100
102static void cleanTbl(htmltbl_t *tp) {
103 rows_t *rows = &tp->u.p.rows;
104 for (size_t r = 0; r < rows_size(rows); ++r) {
105 row_t *rp = rows_get(rows, r);
106 for (size_t c = 0; c < cells_size(&rp->rp); ++c) {
107 cleanCell(cells_get(&rp->rp, c));
108 }
109 }
110 rows_free(rows);
111 free_html_data(&tp->data);
112 free(tp);
113}
114
116static void
118{
119 if (cp->child.kind == HTML_TBL) cleanTbl (cp->child.u.tbl);
120 else if (cp->child.kind == HTML_TEXT) free_html_text (cp->child.u.txt);
121 free_html_data (&cp->data);
122 free (cp);
123}
124
126static void
127appendFItemList (htmlparserstate_t *html_state, agxbuf *ag);
128
129static void
130appendFLineList (htmlparserstate_t *html_state, int v);
131
132static htmltxt_t*
133mkText(htmlparserstate_t *html_state);
134
135static row_t *lastRow(htmlparserstate_t *html_state);
136
138static void addRow(htmlparserstate_t *html_state);
139
141static void setCell(htmlparserstate_t *html_state, htmlcell_t *cp, void *obj, label_type_t kind);
142
144static htmllabel_t *mkLabel(void *obj, label_type_t kind) {
145 htmllabel_t* lp = gv_alloc(sizeof(htmllabel_t));
146
147 lp->kind = kind;
148 if (kind == HTML_TEXT)
149 lp->u.txt = obj;
150 else
151 lp->u.tbl = obj;
152 return lp;
153}
154
155/* Called on error. Frees resources allocated during parsing.
156 * This includes a label, plus a walk down the stack of
157 * tables. Note that `cleanTbl` frees the contained cells.
158 */
159static void cleanup (htmlparserstate_t *html_state);
160
/// Return true if s contains a non-space character.
///
/// Only the blank character ' ' counts as space; tabs, newlines and every
/// other character make the result true. An empty string yields false.
static bool nonSpace(const char *s) {
  for (const char *cursor = s; *cursor != '\0'; ++cursor) {
    if (*cursor != ' ') {
      return true;
    }
  }
  return false;
}
170
172static void
173pushFont (htmlparserstate_t *html_state, textfont_t *fp);
174
175static void
176popFont (htmlparserstate_t *html_state);
177
178%}
179
180%union {
181 int i;
182 htmltxt_t* txt;
184 htmltbl_t* tbl;
186 htmlimg_t* img;
187 row_t *p;
188}
189
193%token T_HR T_hr T_end_hr
194%token T_VR T_vr T_end_vr
195%token <i> T_BR T_br
196%token <img> T_IMG T_img
197%token <tbl> T_table
198%token <cell> T_cell
200
201%type <txt> fonttext
202%type <cell> cell cells
203%type <i> br
204%type <tbl> table fonttable
205%type <img> image
206%type <p> row rows
207
208%start html
209
210%%
211
/* Start symbol. A label is either text or a (possibly font-wrapped) table
 * enclosed in T_html ... T_end_html; the result is wrapped by mkLabel() and
 * stored in scanner->parser.lbl. On a parse error, everything built so far is
 * released by cleanup() and the parse is aborted.
 */
212html : T_html fonttext T_end_html { scanner->parser.lbl = mkLabel($2,HTML_TEXT); }
213 | T_html fonttable T_end_html { scanner->parser.lbl = mkLabel($2,HTML_TBL); }
214 | error { cleanup(&scanner->parser); YYABORT; }
215 ;
216
217fonttext : text { $$ = mkText(&scanner->parser); }
218 ;
219
220text : text textitem
221 | textitem
222 ;
223
224textitem : string { appendFItemList(&scanner->parser,scanner->parser.str);}
226 | font text n_font
227 | italic text n_italic
228 | underline text n_underline
229 | overline text n_overline
230 | bold text n_bold
231 | sup text n_sup
232 | sub text n_sub
233 | strike text n_strike
234 ;
235
236font : T_font { pushFont (&scanner->parser,$1); }
237 ;
238
240 ;
241
242italic : T_italic {pushFont(&scanner->parser,$1);}
243 ;
244
246 ;
247
248bold : T_bold {pushFont(&scanner->parser,$1);}
249 ;
250
252 ;
253
254strike : T_s {pushFont(&scanner->parser,$1);}
255 ;
256
258 ;
259
260underline : T_underline {pushFont(&scanner->parser,$1);}
261 ;
262
264 ;
265
266overline : T_overline {pushFont(&scanner->parser,$1);}
267 ;
268
270 ;
271
272sup : T_sup {pushFont(&scanner->parser,$1);}
273 ;
274
275n_sup : T_n_sup {popFont(&scanner->parser);}
276 ;
277
279 ;
280
281n_sub : T_n_sub {popFont(&scanner->parser);}
282 ;
283
285 | T_BR { $$ = $1; }
286 ;
287
288string : T_string
289 | string T_string
290 ;
291
293 if (nonSpace(agxbuse(scanner->parser.str))) {
294 htmlerror (scanner,"Syntax error: non-space string used before <TABLE>");
296 }
297 $2->u.p.prev = scanner->parser.tblstack;
298 $2->u.p.rows = (rows_t){0};
299 scanner->parser.tblstack = $2;
300 $2->font = *sfont_back(&scanner->parser.fontstack);
301 $<tbl>$ = $2;
302 }
304 if (nonSpace(agxbuse(scanner->parser.str))) {
305 htmlerror (scanner,"Syntax error: non-space string used after </TABLE>");
306 cleanup(&scanner->parser); YYABORT;
307 }
308 $$ = scanner->parser.tblstack;
309 scanner->parser.tblstack = scanner->parser.tblstack->u.p.prev;
310 }
311 ;
312
313fonttable : table { $$ = $1; }
315 | italic table n_italic { $$=$2; }
316 | underline table n_underline { $$=$2; }
317 | overline table n_overline { $$=$2; }
318 | bold table n_bold { $$=$2; }
319 ;
320
321opt_space : string
322 | /* empty*/
323 ;
324
/* One or more rows, optionally separated by HR. The value of `rows` is the
 * most recently parsed row; when an HR separates two rows, the row before the
 * HR is marked as ruled.
 */
325rows : row { $$ = $1; }
326 | rows row { $$ = $2; }
327 | rows HR row { $1->ruled = true; $$ = $3; }
328 ;
329
/* A single row. The mid-rule action appends a fresh row to the current table
 * before its cells are parsed (setCell() adds cells to that last row); the
 * value of the rule is that row, fetched back with lastRow().
 */
330row : T_row { addRow (&scanner->parser); } cells T_end_row { $$ = lastRow(&scanner->parser); }
331 ;
332
/* One or more cells, optionally separated by VR. The value of `cells` is the
 * most recently parsed cell; when a VR separates two cells, the cell before
 * the VR is marked as vertically ruled.
 */
333cells : cell { $$ = $1; }
334 | cells cell { $$ = $2; }
335 | cells VR cell { $1->vruled = true; $$ = $3; }
336 ;
337
338cell : T_cell fonttable { setCell(&scanner->parser,$1,$2,HTML_TBL); } T_end_cell { $$ = $1; }
341 | T_cell { setCell(&scanner->parser,$1,mkText(&scanner->parser),HTML_TEXT); } T_end_cell { $$ = $1; }
342 ;
343
345 | T_IMG { $$ = $1; }
346 ;
347
349 | T_HR
350 ;
351
353 | T_VR
354 ;
355
356
357%%
358
359static void
361{
362 const textspan_t ti = {.str = agxbdisown(ag),
363 .font = *sfont_back(&html_state->fontstack)};
364 textspans_append(&html_state->fitemList, ti);
365}
366
367static void
369{
370 htextspan_t lp = {0};
371 textspans_t *ilist = &html_state->fitemList;
372
373 size_t cnt = textspans_size(ilist);
374 lp.just = v;
375 if (cnt) {
376 lp.nitems = cnt;
377 lp.items = gv_calloc(cnt, sizeof(textspan_t));
378
379 for (size_t i = 0; i < textspans_size(ilist); ++i) {
380 // move this text span into the new list
381 textspan_t *ti = textspans_at(ilist, i);
382 lp.items[i] = *ti;
383 *ti = (textspan_t){0};
384 }
385 }
386 else {
387 lp.items = gv_alloc(sizeof(textspan_t));
388 lp.nitems = 1;
389 lp.items[0].str = gv_strdup("");
390 lp.items[0].font = *sfont_back(&html_state->fontstack);
391 }
392
393 textspans_clear(ilist);
394
395 htextspans_append(&html_state->fspanList, lp);
396}
397
398static htmltxt_t*
400{
401 htextspans_t *ispan = &html_state->fspanList;
402 htmltxt_t *hft = gv_alloc(sizeof(htmltxt_t));
403
404 if (!textspans_is_empty(&html_state->fitemList))
405 appendFLineList (html_state, UNSET_ALIGN);
406
407 size_t cnt = htextspans_size(ispan);
408 hft->nspans = cnt;
409
410 hft->spans = gv_calloc(cnt, sizeof(htextspan_t));
411 for (size_t i = 0; i < htextspans_size(ispan); ++i) {
412 // move this HTML text span into the new list
413 htextspan_t *hi = htextspans_at(ispan, i);
414 hft->spans[i] = *hi;
415 *hi = (htextspan_t){0};
416 }
417
418 htextspans_clear(ispan);
419
420 return hft;
421}
422
423static row_t *lastRow(htmlparserstate_t *html_state) {
424 htmltbl_t* tbl = html_state->tblstack;
425 row_t *sp = *rows_back(&tbl->u.p.rows);
426 return sp;
427}
428
429static void addRow(htmlparserstate_t *html_state) {
430 htmltbl_t* tbl = html_state->tblstack;
431 row_t *sp = gv_alloc(sizeof(row_t));
432 if (tbl->hrule)
433 sp->ruled = true;
434 rows_append(&tbl->u.p.rows, sp);
435}
436
437static void setCell(htmlparserstate_t *html_state, htmlcell_t *cp, void *obj, label_type_t kind) {
438 htmltbl_t* tbl = html_state->tblstack;
439 row_t *rp = *rows_back(&tbl->u.p.rows);
440 cells_t *row = &rp->rp;
441 cells_append(row, cp);
442 cp->child.kind = kind;
443 if (tbl->vrule) {
444 cp->vruled = true;
445 cp->hruled = false;
446 }
447
448 if(kind == HTML_TEXT)
449 cp->child.u.txt = obj;
450 else if (kind == HTML_IMAGE)
451 cp->child.u.img = obj;
452 else
453 cp->child.u.tbl = obj;
454}
455
456static void cleanup (htmlparserstate_t *html_state)
457{
458 htmltbl_t* tp = html_state->tblstack;
459 htmltbl_t* next;
460
461 if (html_state->lbl) {
462 free_html_label (html_state->lbl,1);
463 html_state->lbl = NULL;
464 }
465 while (tp) {
466 next = tp->u.p.prev;
467 cleanTbl (tp);
468 tp = next;
469 }
470
471 textspans_clear(&html_state->fitemList);
472 htextspans_clear(&html_state->fspanList);
473
474 sfont_free(&html_state->fontstack);
475}
476
477static void
479{
480 textfont_t* curfont = *sfont_back(&html_state->fontstack);
481 textfont_t f = *fp;
482
483 if (curfont) {
484 if (!f.color && curfont->color)
485 f.color = curfont->color;
486 if ((f.size < 0.0) && (curfont->size >= 0.0))
487 f.size = curfont->size;
488 if (!f.name && curfont->name)
489 f.name = curfont->name;
490 if (curfont->flags)
491 f.flags |= curfont->flags;
492 }
493
494 textfont_t *const ft = dtinsert(html_state->gvc->textfont_dt, &f);
495 sfont_push_back(&html_state->fontstack, ft);
496}
497
498static void
500{
501 (void)sfont_pop_back(&html_state->fontstack);
502}
503
504/* Return parsed label or NULL if failure.
505 * Set warn to 0 on success; 1 for warning message; 2 if no expat; 3 for error
506 * message.
507 */
509parseHTML (char* txt, int* warn, htmlenv_t *env)
510{
511 agxbuf str = {0};
512 htmllabel_t* l = NULL;
513 htmlscan_t scanner = {0};
514
515 sfont_push_back(&scanner.parser.fontstack, NULL);
516 scanner.parser.gvc = GD_gvc(env->g);
517 scanner.parser.str = &str;
518
519 if (initHTMLlexer (&scanner, txt, &str, env)) {/* failed: no libexpat - give up */
520 *warn = 2;
521 }
522 else {
524 *warn = clearHTMLlexer (&scanner);
525 l = scanner.parser.lbl;
526 }
527
528 textspans_free(&scanner.parser.fitemList);
529 htextspans_free(&scanner.parser.fspanList);
530
531 sfont_free(&scanner.parser.fontstack);
532
533 agxbfree (&str);
534
535 return l;
536}
static void agxbfree(agxbuf *xb)
free any malloced resources
Definition agxbuf.h:78
static WUR char * agxbuse(agxbuf *xb)
Definition agxbuf.h:307
static char * agxbdisown(agxbuf *xb)
Definition agxbuf.h:327
Memory allocation wrappers that exit on failure.
static char * gv_strdup(const char *original)
Definition alloc.h:101
static void * gv_calloc(size_t nmemb, size_t size)
Definition alloc.h:26
static void * gv_alloc(size_t size)
Definition alloc.h:47
#define dtinsert(d, o)
Definition cdt.h:185
#define sub(h, i)
Definition closest.c:67
mode
Definition cvtgxl.c:33
static void cleanup(void)
Definition gmlparse.c:128
void free(void *)
node NULL
Definition grammar.y:163
atom $3
Definition grammar.y:164
static int cnt(Dict_t *d, Dtlink_t **set)
Definition graph.c:206
#define GD_gvc(g)
Definition types.h:355
static GVC_t * gvc
Definition gv.cpp:23
int clearHTMLlexer(htmlscan_t *scanner)
Definition htmllex.c:783
int initHTMLlexer(htmlscan_t *scanner, char *src, agxbuf *xb, htmlenv_t *env)
Definition htmllex.c:745
void htmlerror(htmlscan_t *scanner, const char *msg)
Definition htmllex.c:58
#define T_n_sup
Definition htmlparse.c:289
int htmlparse(htmlscan_t *scanner)
#define T_end_table
Definition htmlparse.c:280
#define T_br
Definition htmlparse.c:299
#define T_vr
Definition htmlparse.c:296
#define T_error
Definition htmlparse.c:284
#define T_n_s
Definition htmlparse.c:291
#define T_n_sub
Definition htmlparse.c:290
#define T_n_bold
Definition htmlparse.c:286
#define T_html
Definition htmlparse.c:278
static void cleanTbl(htmltbl_t *tp)
Clean up table if error in parsing.
Definition htmlparse.c:88
static void free_ti(textspan_t item)
Definition htmlparse.c:344
#define T_underline
Definition htmlparse.c:307
#define T_sup
Definition htmlparse.c:309
#define T_row
Definition htmlparse.c:276
#define T_table
Definition htmlparse.c:302
#define T_end_vr
Definition htmlparse.c:297
#define T_end_html
Definition htmlparse.c:279
static void cleanCell(htmlcell_t *cp)
Clean up cell if error in parsing.
Definition htmlparse.c:103
#define T_VR
Definition htmlparse.c:295
#define T_bold
Definition htmlparse.c:306
#define T_end_img
Definition htmlparse.c:275
static void free_hi(htextspan_t item)
Definition htmlparse.c:350
#define T_sub
Definition htmlparse.c:310
#define T_s
Definition htmlparse.c:311
#define T_n_italic
Definition htmlparse.c:285
static htmllabel_t * mkLabel(void *obj, label_type_t kind)
Create label, given body and type.
Definition htmlparse.c:130
#define T_end_font
Definition htmlparse.c:282
#define T_overline
Definition htmlparse.c:308
#define T_hr
Definition htmlparse.c:293
#define T_font
Definition htmlparse.c:304
#define T_italic
Definition htmlparse.c:305
static bool nonSpace(const char *s)
Return 1 if s contains a non-space character.
Definition htmlparse.c:148
#define T_end_br
Definition htmlparse.c:274
#define T_n_underline
Definition htmlparse.c:287
#define T_end_hr
Definition htmlparse.c:294
#define T_string
Definition htmlparse.c:283
#define T_img
Definition htmlparse.c:301
#define T_HR
Definition htmlparse.c:292
#define T_n_overline
Definition htmlparse.c:288
YYABORT
Definition htmlparse.y:295
static void appendFItemList(htmlparserstate_t *html_state, agxbuf *ag)
Definition htmlparse.y:360
italic table n_italic
Definition htmlparse.y:315
font text n_font italic text n_italic underline text n_underline overline text n_overline bold text n_bold sup text n_sup sub text n_sub strike text n_strike
Definition htmlparse.y:234
static void popFont(htmlparserstate_t *html_state)
Definition htmlparse.y:499
static void addRow(htmlparserstate_t *html_state)
Definition htmlparse.y:429
font $1
Definition htmlparse.y:236
underline table n_underline
Definition htmlparse.y:316
static void appendFLineList(htmlparserstate_t *html_state, int v)
Definition htmlparse.y:368
cell $2
Definition htmlparse.y:338
htmllabel_t * parseHTML(char *txt, int *warn, htmlenv_t *env)
Definition htmlparse.y:509
T_BR
Definition htmlparse.y:285
T_cell fonttext
Definition htmlparse.y:339
static row_t * lastRow(htmlparserstate_t *html_state)
Definition htmlparse.y:423
overline table n_overline
Definition htmlparse.y:317
T_cell
Definition htmlparse.y:341
rows T_end_table opt_space
Definition htmlparse.y:303
bold table n_bold
Definition htmlparse.y:318
br
Definition htmlparse.y:225
static htmltxt_t * mkText(htmlparserstate_t *html_state)
Definition htmlparse.y:399
font table n_font
Definition htmlparse.y:314
cells T_end_row
Definition htmlparse.y:330
cells cell
Definition htmlparse.y:334
static void setCell(htmlparserstate_t *html_state, htmlcell_t *cp, void *obj, label_type_t kind)
Definition htmlparse.y:437
textitem scanner parser str
Definition htmlparse.y:224
$2 font
Definition htmlparse.y:300
$< tbl > $
Definition htmlparse.y:301
$2 u p rows
Definition htmlparse.y:298
rows row
Definition htmlparse.y:326
cell HTML_TBL
Definition htmlparse.y:338
table Syntax error
Definition htmlparse.y:294
T_IMG
Definition htmlparse.y:345
T_end_cell
Definition htmlparse.y:338
static void pushFont(htmlparserstate_t *html_state, textfont_t *fp)
Definition htmlparse.y:478
cleanup & scanner
Definition htmlparse.y:295
$$
Definition htmlparse.y:327
T_cell image
Definition htmlparse.y:340
void free_html_text(htmltxt_t *t)
Definition htmltable.c:803
void free_html_label(htmllabel_t *lp, int root)
Definition htmltable.c:864
void free_html_data(htmldata_t *dp)
Definition htmltable.c:792
#define UNSET_ALIGN
Definition htmltable.h:44
label_type_t
Definition htmltable.h:101
@ HTML_TEXT
Definition htmltable.h:101
@ HTML_IMAGE
Definition htmltable.h:101
#define DEFINE_LIST_WITH_DTOR(name, type, dtor)
Definition list.h:29
#define DEFINE_LIST(name, type)
Definition list.h:21
static int table[NTYPES][NTYPES]
Definition mincross.c:1755
Definition gvcint.h:80
Dt_t * textfont_dt
Definition gvcint.h:107
result of partitioning available space, part of maze
Definition grid.h:33
gridpt p
Definition grid.h:34
size_t nitems
Definition htmltable.h:54
textspan_t * items
Definition htmltable.h:53
htmllabel_t child
Definition htmltable.h:163
bool vruled
vertically ruled?
Definition htmltable.h:165
bool hruled
horizontally ruled?
Definition htmltable.h:166
htmldata_t data
Definition htmltable.h:158
graph_t * g
Definition htmltable.h:173
htmltxt_t * txt
Definition htmltable.h:151
union htmllabel_t::@76 u
htmltbl_t * tbl
Definition htmltable.h:150
htmlimg_t * img
Definition htmltable.h:152
label_type_t kind
Definition htmltable.h:154
htmltbl_t * tblstack
Definition htmlparse.c:361
textspans_t fitemList
Definition htmlparse.c:362
htextspans_t fspanList
Definition htmlparse.c:363
htmllabel_t * lbl
Definition htmlparse.c:360
htmlparserstate_t parser
Definition htmlparse.c:390
htmllexstate_t lexer
Definition htmlparse.c:389
union htmltbl_t::@73 u
bool vrule
vertical rule
Definition htmltable.h:145
struct htmltbl_t::@73::@75 p
bool hrule
horizontal rule
Definition htmltable.h:144
htmltbl_t * prev
Definition htmltable.h:134
htmldata_t data
Definition htmltable.h:127
rows_t rows
cells
Definition htmltable.h:135
size_t nspans
Definition htmltable.h:62
htextspan_t * spans
Definition htmltable.h:61
Definition utils.c:749
cells_t rp
Definition htmltable.h:114
bool ruled
Definition htmltable.h:115
a non-owning string reference
Definition strview.h:20
char * color
Definition textspan.h:55
char * name
Definition textspan.h:54
unsigned int flags
Definition textspan.h:58
double size
Definition textspan.h:57
char * str
Definition textspan.h:65
textfont_t * font
Definition textspan.h:66
Non-owning string references.
textspan_t, textfont_t, PostscriptAlias
static tok_t tok(const char *input, const char *separators)
begin tokenization of a new string
Definition tokenize.h:43
Definition grammar.c:93