Graphviz 13.1.3~dev.20250831.0023
Loading...
Searching...
No Matches
htmlparse.y
Go to the documentation of this file.
1
3/*************************************************************************
4 * Copyright (c) 2011 AT&T Intellectual Property
5 * All rights reserved. This program and the accompanying materials
6 * are made available under the terms of the Eclipse Public License v1.0
7 * which accompanies this distribution, and is available at
8 * https://www.eclipse.org/legal/epl-v10.html
9 *
10 * Contributors: Details at https://graphviz.org
11 *************************************************************************/
12
13%require "3.0"
14
15 /* By default, Bison emits a parser using symbols prefixed with "yy". Graphviz
16 * contains multiple Bison-generated parsers, so we alter this prefix to avoid
17 * symbol clashes.
18 */
19%define api.prefix {html}
20
21 /* Generate a reentrant parser with no global state */
22%define api.pure full
23%param { htmlscan_t *scanner }
24
25
26%code requires {
27#include <common/htmllex.h>
28#include <common/htmltable.h>
29#include <common/textspan.h>
30#include <gvc/gvcext.h>
31#include <util/agxbuf.h>
32#include <util/list.h>
33#include <util/strview.h>
34}
35
36%code provides {
37
38static inline void free_ti(textspan_t item) {
39 free(item.str);
40}
41
42static inline void free_hi(htextspan_t item) {
43 for (size_t i = 0; i < item.nitems; i++) {
44 free(item.items[i].str);
45 }
46 free(item.items);
47}
48
49struct htmlparserstate_s {
50 htmllabel_t* lbl; /* Generated label */
51 htmltbl_t* tblstack; /* Stack of tables maintained during parsing */
52 LIST(textspan_t) fitemList;
53 LIST(htextspan_t) fspanList;
54 agxbuf* str; /* Buffer for text */
55 LIST(textfont_t *) fontstack;
56 GVC_t* gvc;
57};
58
59typedef struct {
60#ifdef HAVE_EXPAT
61 struct XML_ParserStruct *parser;
62#endif
63 char* ptr; // input source
64 int tok; // token type
65 agxbuf* xb; // buffer to gather T_string data
66 agxbuf lb; // buffer for translating lexical data
67 int warn; // set if warning given
68 int error; // set if error given
69 char inCell; // set if in TD to allow T_string
70 char mode; // for handling artificial <HTML>..</HTML>
71 strview_t currtok; // for error reporting
72 strview_t prevtok; // for error reporting
73 GVC_t *gvc; // current GraphViz context
74 HTMLSTYPE *htmllval; // generated by htmlparse.y
76
77
78struct htmlscan_s {
81};
82}
83
84%{
85
86#include <common/render.h>
87#include <common/htmltable.h>
88#include <common/htmllex.h>
89#include <stdbool.h>
90#include <util/alloc.h>
91
93static void cleanCell(htmlcell_t *cp);
94
96static void cleanTbl(htmltbl_t *tp) {
97 rows_t *rows = &tp->u.p.rows;
98 for (size_t r = 0; r < LIST_SIZE(rows); ++r) {
99 row_t *rp = LIST_GET(rows, r);
100 for (size_t c = 0; c < LIST_SIZE(&rp->rp); ++c) {
101 cleanCell(LIST_GET(&rp->rp, c));
102 }
103 }
105 free_html_data(&tp->data);
106 free(tp);
107}
108
110static void
112{
113 if (cp->child.kind == HTML_TBL) cleanTbl (cp->child.u.tbl);
114 else if (cp->child.kind == HTML_TEXT) free_html_text (cp->child.u.txt);
115 free_html_data (&cp->data);
116 free (cp);
117}
118
120static void
121appendFItemList (htmlparserstate_t *html_state, agxbuf *ag);
122
123static void
124appendFLineList (htmlparserstate_t *html_state, int v);
125
126static htmltxt_t*
127mkText(htmlparserstate_t *html_state);
128
129static row_t *lastRow(htmlparserstate_t *html_state);
130
132static void addRow(htmlparserstate_t *html_state);
133
135static void setCell(htmlparserstate_t *html_state, htmlcell_t *cp, void *obj, label_type_t kind);
136
138static htmllabel_t *mkLabel(void *obj, label_type_t kind) {
139 htmllabel_t* lp = gv_alloc(sizeof(htmllabel_t));
140
141 lp->kind = kind;
142 if (kind == HTML_TEXT)
143 lp->u.txt = obj;
144 else
145 lp->u.tbl = obj;
146 return lp;
147}
148
149/* Called on error. Frees resources allocated during parsing.
150 * This includes a label, plus a walk down the stack of
151 * tables. Note that `cleanTbl` frees the contained cells.
152 */
153static void cleanup (htmlparserstate_t *html_state);
154
/* Return true if s contains any character other than ' '.
 * Only the plain space character counts as a space here.
 */
static bool nonSpace(const char *s) {
  for (const char *p = s; *p != '\0'; ++p) {
    if (*p != ' ') {
      return true;
    }
  }
  return false;
}
164
166static void
167pushFont (htmlparserstate_t *html_state, textfont_t *fp);
168
169static void
170popFont (htmlparserstate_t *html_state);
171
172%}
173
174%union {
175 int i;
176 htmltxt_t* txt;
178 htmltbl_t* tbl;
180 htmlimg_t* img;
181 row_t *p;
182}
183
187%token T_HR T_hr T_end_hr
188%token T_VR T_vr T_end_vr
189%token <i> T_BR T_br
190%token <img> T_IMG T_img
191%token <tbl> T_table
192%token <cell> T_cell
194
195%type <txt> fonttext
196%type <cell> cell cells
197%type <i> br
198%type <tbl> table fonttable
199%type <img> image
200%type <p> row rows
201
202%start html
203
204%%
205
206html : T_html fonttext T_end_html { scanner->parser.lbl = mkLabel($2,HTML_TEXT); }
207 | T_html fonttable T_end_html { scanner->parser.lbl = mkLabel($2,HTML_TBL); }
208 | error { cleanup(&scanner->parser); YYABORT; }
209 ;
210
211fonttext : text { $$ = mkText(&scanner->parser); }
212 ;
213
214text : text textitem
215 | textitem
216 ;
217
218textitem : string { appendFItemList(&scanner->parser,scanner->parser.str);}
220 | font text n_font
221 | italic text n_italic
222 | underline text n_underline
223 | overline text n_overline
224 | bold text n_bold
225 | sup text n_sup
226 | sub text n_sub
227 | strike text n_strike
228 ;
229
230font : T_font { pushFont (&scanner->parser,$1); }
231 ;
232
234 ;
235
236italic : T_italic {pushFont(&scanner->parser,$1);}
237 ;
238
240 ;
241
242bold : T_bold {pushFont(&scanner->parser,$1);}
243 ;
244
246 ;
247
248strike : T_s {pushFont(&scanner->parser,$1);}
249 ;
250
252 ;
253
254underline : T_underline {pushFont(&scanner->parser,$1);}
255 ;
256
258 ;
259
260overline : T_overline {pushFont(&scanner->parser,$1);}
261 ;
262
264 ;
265
266sup : T_sup {pushFont(&scanner->parser,$1);}
267 ;
268
269n_sup : T_n_sup {popFont(&scanner->parser);}
270 ;
271
273 ;
274
275n_sub : T_n_sub {popFont(&scanner->parser);}
276 ;
277
279 | T_BR { $$ = $1; }
280 ;
281
282string : T_string
283 | string T_string
284 ;
285
287 if (nonSpace(agxbuse(scanner->parser.str))) {
288 htmlerror (scanner,"Syntax error: non-space string used before <TABLE>");
290 }
291 $2->u.p.prev = scanner->parser.tblstack;
292 $2->u.p.rows = (rows_t){.dtor = free_ritem};
293 scanner->parser.tblstack = $2;
294 $2->font = *LIST_BACK(&scanner->parser.fontstack);
295 $<tbl>$ = $2;
296 }
298 if (nonSpace(agxbuse(scanner->parser.str))) {
299 htmlerror (scanner,"Syntax error: non-space string used after </TABLE>");
300 cleanup(&scanner->parser); YYABORT;
301 }
302 $$ = scanner->parser.tblstack;
303 scanner->parser.tblstack = scanner->parser.tblstack->u.p.prev;
304 }
305 ;
306
307fonttable : table { $$ = $1; }
309 | italic table n_italic { $$=$2; }
310 | underline table n_underline { $$=$2; }
311 | overline table n_overline { $$=$2; }
312 | bold table n_bold { $$=$2; }
313 ;
314
315opt_space : string
316 | /* empty*/
317 ;
318
/* The value of `rows` is always the most recently parsed row, so that an HR
 * following it can mark that row as ruled before the next row is taken.
 */
319rows : row { $$ = $1; }
320 | rows row { $$ = $2; }
321 | rows HR row { $1->ruled = true; $$ = $3; }
322 ;
323
324row : T_row { addRow (&scanner->parser); } cells T_end_row { $$ = lastRow(&scanner->parser); }
325 ;
326
/* The value of `cells` is always the most recently parsed cell, so that a VR
 * following it can mark that cell as vertically ruled.
 */
327cells : cell { $$ = $1; }
328 | cells cell { $$ = $2; }
329 | cells VR cell { $1->vruled = true; $$ = $3; }
330 ;
331
332cell : T_cell fonttable { setCell(&scanner->parser,$1,$2,HTML_TBL); } T_end_cell { $$ = $1; }
335 | T_cell { setCell(&scanner->parser,$1,mkText(&scanner->parser),HTML_TEXT); } T_end_cell { $$ = $1; }
336 ;
337
339 | T_IMG { $$ = $1; }
340 ;
341
343 | T_HR
344 ;
345
347 | T_VR
348 ;
349
350
351%%
352
353static void
355{
356 const textspan_t ti = {.str = agxbdisown(ag),
357 .font = *LIST_BACK(&html_state->fontstack)};
358 LIST_APPEND(&html_state->fitemList, ti);
359}
360
361static void
363{
364 htextspan_t lp = {0};
365
366 size_t cnt = LIST_SIZE(&html_state->fitemList);
367 lp.just = v;
368 if (cnt) {
369 lp.nitems = cnt;
370 lp.items = gv_calloc(cnt, sizeof(textspan_t));
371
372 for (size_t i = 0; i < LIST_SIZE(&html_state->fitemList); ++i) {
373 // move this text span into the new list
374 textspan_t *ti = LIST_AT(&html_state->fitemList, i);
375 lp.items[i] = *ti;
376 *ti = (textspan_t){0};
377 }
378 }
379 else {
380 lp.items = gv_alloc(sizeof(textspan_t));
381 lp.nitems = 1;
382 lp.items[0].str = gv_strdup("");
383 lp.items[0].font = *LIST_BACK(&html_state->fontstack);
384 }
385
386 LIST_CLEAR(&html_state->fitemList);
387
388 LIST_APPEND(&html_state->fspanList, lp);
389}
390
391static htmltxt_t*
393{
394 htmltxt_t *hft = gv_alloc(sizeof(htmltxt_t));
395
396 if (!LIST_IS_EMPTY(&html_state->fitemList))
397 appendFLineList (html_state, UNSET_ALIGN);
398
399 size_t cnt = LIST_SIZE(&html_state->fspanList);
400 hft->nspans = cnt;
401
402 hft->spans = gv_calloc(cnt, sizeof(htextspan_t));
403 for (size_t i = 0; i < LIST_SIZE(&html_state->fspanList); ++i) {
404 // move this HTML text span into the new list
405 htextspan_t *hi = LIST_AT(&html_state->fspanList, i);
406 hft->spans[i] = *hi;
407 *hi = (htextspan_t){0};
408 }
409
410 LIST_CLEAR(&html_state->fspanList);
411
412 return hft;
413}
414
415static row_t *lastRow(htmlparserstate_t *html_state) {
416 htmltbl_t* tbl = html_state->tblstack;
417 row_t *sp = *LIST_BACK(&tbl->u.p.rows);
418 return sp;
419}
420
421static void addRow(htmlparserstate_t *html_state) {
422 htmltbl_t* tbl = html_state->tblstack;
423 row_t *sp = gv_alloc(sizeof(row_t));
424 if (tbl->hrule)
425 sp->ruled = true;
426 LIST_APPEND(&tbl->u.p.rows, sp);
427}
428
429static void setCell(htmlparserstate_t *html_state, htmlcell_t *cp, void *obj, label_type_t kind) {
430 htmltbl_t* tbl = html_state->tblstack;
431 row_t *rp = *LIST_BACK(&tbl->u.p.rows);
432 LIST_APPEND(&rp->rp, cp);
433 cp->child.kind = kind;
434 if (tbl->vrule) {
435 cp->vruled = true;
436 cp->hruled = false;
437 }
438
439 if(kind == HTML_TEXT)
440 cp->child.u.txt = obj;
441 else if (kind == HTML_IMAGE)
442 cp->child.u.img = obj;
443 else
444 cp->child.u.tbl = obj;
445}
446
447static void cleanup (htmlparserstate_t *html_state)
448{
449 htmltbl_t* tp = html_state->tblstack;
450 htmltbl_t* next;
451
452 if (html_state->lbl) {
453 free_html_label (html_state->lbl,1);
454 html_state->lbl = NULL;
455 }
456 while (tp) {
457 next = tp->u.p.prev;
458 cleanTbl (tp);
459 tp = next;
460 }
461
462 LIST_CLEAR(&html_state->fitemList);
463 LIST_CLEAR(&html_state->fspanList);
464
465 LIST_FREE(&html_state->fontstack);
466}
467
468static void
470{
471 textfont_t* curfont = *LIST_BACK(&html_state->fontstack);
472 textfont_t f = *fp;
473
474 if (curfont) {
475 if (!f.color && curfont->color)
476 f.color = curfont->color;
477 if ((f.size < 0.0) && (curfont->size >= 0.0))
478 f.size = curfont->size;
479 if (!f.name && curfont->name)
480 f.name = curfont->name;
481 if (curfont->flags)
482 f.flags |= curfont->flags;
483 }
484
485 textfont_t *const ft = dtinsert(html_state->gvc->textfont_dt, &f);
486 LIST_PUSH_BACK(&html_state->fontstack, ft);
487}
488
489static void
491{
492 (void)LIST_POP_BACK(&html_state->fontstack);
493}
494
495/* Return parsed label or NULL if failure.
496 * Set warn to 0 on success; 1 for warning message; 2 if no expat; 3 for error
497 * message.
498 */
500parseHTML (char* txt, int* warn, htmlenv_t *env)
501{
502 agxbuf str = {0};
503 htmllabel_t* l = NULL;
504 htmlscan_t scanner = {0};
505
506 LIST_PUSH_BACK(&scanner.parser.fontstack, NULL);
507 scanner.parser.fitemList.dtor = free_ti;
508 scanner.parser.fspanList.dtor = free_hi;
509 scanner.parser.gvc = GD_gvc(env->g);
510 scanner.parser.str = &str;
511
512 if (initHTMLlexer (&scanner, txt, &str, env)) {/* failed: no libexpat - give up */
513 *warn = 2;
514 }
515 else {
517 *warn = clearHTMLlexer (&scanner);
518 l = scanner.parser.lbl;
519 }
520
521 LIST_FREE(&scanner.parser.fitemList);
522 LIST_FREE(&scanner.parser.fspanList);
523
524 LIST_FREE(&scanner.parser.fontstack);
525
526 agxbfree (&str);
527
528 return l;
529}
static void agxbfree(agxbuf *xb)
free any malloced resources
Definition agxbuf.h:77
static WUR char * agxbuse(agxbuf *xb)
Definition agxbuf.h:306
static char * agxbdisown(agxbuf *xb)
Definition agxbuf.h:326
Memory allocation wrappers that exit on failure.
static char * gv_strdup(const char *original)
Definition alloc.h:101
static void * gv_calloc(size_t nmemb, size_t size)
Definition alloc.h:26
static void * gv_alloc(size_t size)
Definition alloc.h:47
#define dtinsert(d, o)
Definition cdt.h:185
#define sub(h, i)
Definition closest.c:68
mode
Definition cvtgxl.c:33
static void cleanup(void)
Definition gmlparse.c:128
void free(void *)
node NULL
Definition grammar.y:181
hdr $3
Definition grammar.y:149
static int cnt(Dict_t *d, Dtlink_t **set)
Definition graph.c:196
#define GD_gvc(g)
Definition types.h:355
static GVC_t * gvc
Definition gv.cpp:23
int clearHTMLlexer(htmlscan_t *scanner)
Definition htmllex.c:784
int initHTMLlexer(htmlscan_t *scanner, char *src, agxbuf *xb, htmlenv_t *env)
Definition htmllex.c:746
void htmlerror(htmlscan_t *scanner, const char *msg)
Definition htmllex.c:58
static void cleanTbl(htmltbl_t *tp)
Clean up table if error in parsing.
Definition htmlparse.c:89
static void cleanCell(htmlcell_t *cp)
Clean up cell if error in parsing.
Definition htmlparse.c:104
static htmllabel_t * mkLabel(void *obj, label_type_t kind)
Create label, given body and type.
Definition htmlparse.c:131
static bool nonSpace(const char *s)
Return 1 if s contains a non-space character.
Definition htmlparse.c:149
#define T_n_sup
Definition htmlparse.h:138
int htmlparse(htmlscan_t *scanner)
#define T_end_table
Definition htmlparse.h:129
#define T_br
Definition htmlparse.h:148
#define T_vr
Definition htmlparse.h:145
#define T_error
Definition htmlparse.h:133
#define T_n_s
Definition htmlparse.h:140
#define T_n_sub
Definition htmlparse.h:139
#define T_n_bold
Definition htmlparse.h:135
#define T_html
Definition htmlparse.h:127
static void free_ti(textspan_t item)
Definition htmlparse.h:193
#define T_underline
Definition htmlparse.h:156
#define T_sup
Definition htmlparse.h:158
#define T_row
Definition htmlparse.h:125
#define T_table
Definition htmlparse.h:151
#define T_end_vr
Definition htmlparse.h:146
#define T_end_html
Definition htmlparse.h:128
#define T_VR
Definition htmlparse.h:144
#define T_bold
Definition htmlparse.h:155
#define T_end_img
Definition htmlparse.h:124
static void free_hi(htextspan_t item)
Definition htmlparse.h:197
#define T_sub
Definition htmlparse.h:159
#define T_s
Definition htmlparse.h:160
#define T_n_italic
Definition htmlparse.h:134
#define T_end_font
Definition htmlparse.h:131
#define T_overline
Definition htmlparse.h:157
#define T_hr
Definition htmlparse.h:142
#define T_font
Definition htmlparse.h:153
#define T_italic
Definition htmlparse.h:154
#define T_end_br
Definition htmlparse.h:123
#define T_n_underline
Definition htmlparse.h:136
#define T_end_hr
Definition htmlparse.h:143
#define T_string
Definition htmlparse.h:132
#define T_img
Definition htmlparse.h:150
#define T_HR
Definition htmlparse.h:141
#define T_n_overline
Definition htmlparse.h:137
YYABORT
Definition htmlparse.y:289
static void appendFItemList(htmlparserstate_t *html_state, agxbuf *ag)
Definition htmlparse.y:354
italic table n_italic
Definition htmlparse.y:309
font text n_font italic text n_italic underline text n_underline overline text n_overline bold text n_bold sup text n_sup sub text n_sub strike text n_strike
Definition htmlparse.y:228
static void popFont(htmlparserstate_t *html_state)
Definition htmlparse.y:490
static void addRow(htmlparserstate_t *html_state)
Definition htmlparse.y:421
font $1
Definition htmlparse.y:230
underline table n_underline
Definition htmlparse.y:310
static void appendFLineList(htmlparserstate_t *html_state, int v)
Definition htmlparse.y:362
cell $2
Definition htmlparse.y:332
htmllabel_t * parseHTML(char *txt, int *warn, htmlenv_t *env)
Definition htmlparse.y:500
T_BR
Definition htmlparse.y:279
T_cell fonttext
Definition htmlparse.y:333
static row_t * lastRow(htmlparserstate_t *html_state)
Definition htmlparse.y:415
overline table n_overline
Definition htmlparse.y:311
T_cell
Definition htmlparse.y:335
rows T_end_table opt_space
Definition htmlparse.y:297
bold table n_bold
Definition htmlparse.y:312
br
Definition htmlparse.y:219
static htmltxt_t * mkText(htmlparserstate_t *html_state)
Definition htmlparse.y:392
font table n_font
Definition htmlparse.y:308
cells T_end_row
Definition htmlparse.y:324
cells cell
Definition htmlparse.y:328
static void setCell(htmlparserstate_t *html_state, htmlcell_t *cp, void *obj, label_type_t kind)
Definition htmlparse.y:429
textitem scanner parser str
Definition htmlparse.y:218
$2 font
Definition htmlparse.y:294
$< tbl > $
Definition htmlparse.y:295
$2 u p rows
Definition htmlparse.y:292
rows row
Definition htmlparse.y:320
cell HTML_TBL
Definition htmlparse.y:332
table Syntax error
Definition htmlparse.y:288
T_IMG
Definition htmlparse.y:339
T_end_cell
Definition htmlparse.y:332
static void pushFont(htmlparserstate_t *html_state, textfont_t *fp)
Definition htmlparse.y:469
cleanup & scanner
Definition htmlparse.y:289
$$
Definition htmlparse.y:321
T_cell image
Definition htmlparse.y:334
void free_html_text(htmltxt_t *t)
Definition htmltable.c:802
void free_html_label(htmllabel_t *lp, int root)
Definition htmltable.c:859
void free_html_data(htmldata_t *dp)
Definition htmltable.c:791
#define UNSET_ALIGN
Definition htmltable.h:44
static void free_ritem(row_t *p)
Free row. This closes and frees row’s list, then the item itself is freed.
Definition htmltable.h:117
label_type_t
Definition htmltable.h:101
@ HTML_TEXT
Definition htmltable.h:101
@ HTML_IMAGE
Definition htmltable.h:101
type-generic dynamically expanding list
#define LIST_AT(list, index)
Definition list.h:178
#define LIST(type)
Definition list.h:55
#define LIST_BACK(list)
Definition list.h:201
#define LIST_SIZE(list)
Definition list.h:80
#define LIST_CLEAR(list)
Definition list.h:249
#define LIST_APPEND(list, item)
Definition list.h:132
#define LIST_FREE(list)
Definition list.h:379
#define LIST_POP_BACK(list)
Definition list.h:416
#define LIST_IS_EMPTY(list)
Definition list.h:90
#define LIST_PUSH_BACK(list, item)
Definition list.h:393
#define LIST_GET(list, index)
Definition list.h:165
static int table[NTYPES][NTYPES]
Definition mincross.c:1790
Definition gvcint.h:81
Dt_t * textfont_dt
Definition gvcint.h:108
result of partitioning available space, part of maze
Definition grid.h:33
gridpt p
Definition grid.h:34
size_t nitems
Definition htmltable.h:54
textspan_t * items
Definition htmltable.h:53
htmllabel_t child
Definition htmltable.h:161
bool vruled
vertically ruled?
Definition htmltable.h:163
bool hruled
horizontally ruled?
Definition htmltable.h:164
htmldata_t data
Definition htmltable.h:156
graph_t * g
Definition htmltable.h:171
htmltxt_t * txt
Definition htmltable.h:149
htmltbl_t * tbl
Definition htmltable.h:148
htmlimg_t * img
Definition htmltable.h:150
union htmllabel_t::@81 u
label_type_t kind
Definition htmltable.h:152
htmllabel_t * lbl
Definition htmlparse.h:205
htmltbl_t * tblstack
Definition htmlparse.h:206
htmlparserstate_t parser
Definition htmlparse.h:235
htmllexstate_t lexer
Definition htmlparse.h:234
bool vrule
vertical rule
Definition htmltable.h:143
bool hrule
horizontal rule
Definition htmltable.h:142
htmltbl_t * prev
Definition htmltable.h:132
struct htmltbl_t::@78::@80 p
union htmltbl_t::@78 u
htmldata_t data
Definition htmltable.h:125
rows_t rows
cells
Definition htmltable.h:133
size_t nspans
Definition htmltable.h:62
htextspan_t * spans
Definition htmltable.h:61
Definition utils.c:751
bool ruled
Definition htmltable.h:113
a non-owning string reference
Definition strview.h:20
char * color
Definition textspan.h:55
char * name
Definition textspan.h:54
unsigned int flags
Definition textspan.h:58
double size
Definition textspan.h:57
char * str
Definition textspan.h:65
textfont_t * font
Definition textspan.h:66
Non-owning string references.
textspan_t, textfont_t, PostscriptAlias
static tok_t tok(const char *input, const char *separators)
begin tokenization of a new string
Definition tokenize.h:43
Definition grammar.c:90