Graphviz 14.1.3~dev.20260124.0732
Loading...
Searching...
No Matches
xml.c
Go to the documentation of this file.
1
7#ifndef TEST_XML
8#include "config.h"
9#endif
10
11#include <errno.h>
12#include <inttypes.h>
13#include <stdbool.h>
14#include <stdint.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <util/exit.h>
19#include <util/gv_ctype.h>
20#include <util/prisize_t.h>
21#include <util/unreachable.h>
22#include <util/xml.h>
23
/* return true if *s points to &[A-Za-z]+; (e.g. &Ccedil; )
 * or &#[0-9]+; (e.g. &#38; )
 * or &#x[0-9a-fA-F]+; (e.g. &#x6C34; )
 *
 * At least one character is required between the introducer ("&", "&#" or
 * "&#x") and the terminating ';'. Hence "&;", "&#;" and "&#x;" are all
 * rejected, which causes the caller to escape their leading '&' instead of
 * passing a malformed reference through to the output.
 *
 * s must point at the '&' that starts the candidate sequence.
 */
static bool xml_isentity(const char *s) {
  s++; /* already known to be '&' */

  // first character following the introducer; used to detect an empty body
  const char *body;

  if (*s == '#') {
    s++;
    if (*s == 'x' || *s == 'X') {
      s++;
      body = s;
      while (gv_isxdigit(*s))
        s++;
    } else {
      body = s;
      while (gv_isdigit(*s))
        s++;
    }
  } else {
    body = s;
    while (gv_isalpha(*s))
      s++;
  }

  // a legal reference has a non-empty body immediately followed by ';'
  return s != body && *s == ';';
}
51
67static int xml_core(char previous, const char **current, xml_flags_t flags,
68 int (*cb)(void *state, const char *s), void *state) {
69
70 const char *s = *current;
71 char c = *s;
72
73 // we will consume at least one character, so note that now
74 ++*current;
75
76 // escape '&' only if not part of a legal entity sequence
77 if (c == '&' && (flags.raw || !xml_isentity(s)))
78 return cb(state, "&amp;");
79
80 // '<' '>' are safe to substitute even if string is already XML encoded since
81 // XML strings won’t contain '<' or '>'
82 if (c == '<')
83 return cb(state, "&lt;");
84
85 if (c == '>')
86 return cb(state, "&gt;");
87
88 // '-' cannot be used in XML comment strings
89 if (c == '-' && flags.dash)
90 return cb(state, "&#45;");
91
92 if (c == ' ' && previous == ' ' && flags.nbsp)
93 // substitute 2nd and subsequent spaces with required_spaces
94 return cb(state, "&#160;"); // Inkscape does not recognize &nbsp;
95
96 if (c == '"')
97 return cb(state, "&quot;");
98
99 if (c == '\'')
100 return cb(state, "&#39;");
101
102 if (c == '\n' && flags.raw)
103 return cb(state, "&#10;");
104
105 if (c == '\r' && flags.raw)
106 return cb(state, "&#13;");
107
108 unsigned char uc = (unsigned char)c;
109 if (uc > 0x7f && flags.utf8) {
110
111 // replicating a table from https://en.wikipedia.org/wiki/UTF-8:
112 //
113 // ┌────────────────┬───────────────┬────────┬────────┬────────┬────────┐
114 // │First code point│Last code point│Byte 1 │Byte 2 │Byte 3 │Byte 4 │
115 // ├────────────────┼───────────────┼────────┼────────┼────────┼────────┤
116 // │ U+0000│ U+007F│0xxxxxxx│ │ │ │
117 // │ U+0080│ U+07FF│110xxxxx│10xxxxxx│ │ │
118 // │ U+0800│ U+FFFF│1110xxxx│10xxxxxx│10xxxxxx│ │
119 // │ U+10000│ U+10FFFF│11110xxx│10xxxxxx│10xxxxxx│10xxxxxx│
120 // └────────────────┴───────────────┴────────┴────────┴────────┴────────┘
121 //
122 // from which we can calculate the byte length of the current character
123 size_t length = (uc >> 5) == 6 ? 2
124 : (uc >> 4) == 14 ? 3
125 : (uc >> 3) == 30 ? 4
126 : 0;
127
128 // was the length malformed or is the follow on sequence truncated?
129 bool is_invalid = length == 0;
130 for (size_t l = 1; !is_invalid && length > l; ++l)
131 is_invalid |= s[l] == '\0';
132
133 // TODO: a better strategy than aborting on malformed data
134 if (is_invalid) {
135 fprintf(stderr, "Error during conversion to \"UTF-8\". Quiting.\n");
136 graphviz_exit(EXIT_FAILURE);
137 }
138
139 // Decode the character. Refer again to the above table to understand this
140 // algorithm.
141 uint32_t utf8_char = 0;
142 switch (length) {
143 case 2: {
144 uint32_t low = ((uint32_t)s[1]) & ((1 << 6) - 1);
145 uint32_t high = ((uint32_t)s[0]) & ((1 << 5) - 1);
146 utf8_char = low | (high << 6);
147 break;
148 }
149 case 3: {
150 uint32_t low = ((uint32_t)s[2]) & ((1 << 6) - 1);
151 uint32_t mid = ((uint32_t)s[1]) & ((1 << 6) - 1);
152 uint32_t high = ((uint32_t)s[0]) & ((1 << 4) - 1);
153 utf8_char = low | (mid << 6) | (high << 12);
154 break;
155 }
156 case 4: {
157 uint32_t low = ((uint32_t)s[3]) & ((1 << 6) - 1);
158 uint32_t mid1 = ((uint32_t)s[2]) & ((1 << 6) - 1);
159 uint32_t mid2 = ((uint32_t)s[1]) & ((1 << 6) - 1);
160 uint32_t high = ((uint32_t)s[0]) & ((1 << 3) - 1);
161 utf8_char = low | (mid1 << 6) | (mid2 << 12) | (high << 18);
162 break;
163 }
164 default:
165 UNREACHABLE();
166 }
167
168 // setup a buffer that will fit the largest escape we need to print
169 char buffer[sizeof("&#xFFFFFFFF;")];
170
171 // emit the escape sequence itself
172 snprintf(buffer, sizeof(buffer), "&#x%" PRIx32 ";", utf8_char);
173
174 // note how many extra characters we consumed
175 *current += length - 1;
176
177 return cb(state, buffer);
178 }
179
180 // otherwise, output the character as-is
181 char buffer[2] = {c, '\0'};
182 return cb(state, buffer);
183}
184
186 int (*cb)(void *state, const char *s), void *state) {
187 char previous = '\0';
188 int rc = 0;
189 while (*s != '\0') {
190 char p = *s;
191 rc = xml_core(previous, &s, flags, cb, state);
192 if (rc < 0)
193 return rc;
194 previous = p;
195 }
196 return rc;
197}
198
199#ifdef TEST_XML
200// compile the below test stub with:
201//
202// ${CC} -std=c11 -DTEST_XML -Ilib lib/util/xml.c
203
// output callback for the test stub: stream is the FILE * to write s to;
// returns whatever fputs() returns
static int put(void *stream, const char *s) {
  FILE *const sink = stream;
  return fputs(s, sink);
}
205
206// stub for testing above functionality
207int main(int argc, char **argv) {
208
209 xml_flags_t flags = {0};
210
211 int i;
212 for (i = 1; i < argc; ++i) {
213 if (strcmp(argv[i], "--dash") == 0) {
214 flags.dash = 1;
215 } else if (strcmp(argv[i], "--nbsp") == 0) {
216 flags.nbsp = 1;
217 } else if (strcmp(argv[i], "--raw") == 0) {
218 flags.raw = 1;
219 } else if (strcmp(argv[i], "--utf8") == 0) {
220 flags.utf8 = 1;
221 } else if (argv[i][0] == '-') {
222 fprintf(stderr, "unrecognized argument %s\n", argv[i]);
223 graphviz_exit(EXIT_FAILURE);
224 } else {
225 // assume we have reached filenames
226 break;
227 }
228 }
229
230 if (i + 2 != argc) {
231 fprintf(stderr,
232 "usage: %s [--dash] [--nbsp] [--raw] [--utf8] <input> <output>\n",
233 argv[0]);
234 graphviz_exit(EXIT_FAILURE);
235 }
236
237 const char *const in = argv[i];
238 const char *const out = argv[i + 1];
239
240 // read in the input
241 char buffer[128] = {0};
242 {
243 FILE *const f = fopen(in, "rb");
244 if (f == NULL) {
245 fprintf(stderr, "failed to open %s: %s\n", in, strerror(errno));
246 graphviz_exit(EXIT_FAILURE);
247 }
248 const size_t read = fread(buffer, 1, sizeof(buffer), f);
249 (void)fclose(f);
250 if (read == 0 || read == sizeof(buffer)) {
251 fprintf(stderr,
252 "only escaping 1 - %" PRISIZE_T
253 " bytes is supported, not %" PRISIZE_T " bytes\n",
254 sizeof(buffer) - 1, read);
255 graphviz_exit(EXIT_FAILURE);
256 }
257 }
258
259 // open the output
260 FILE *const f = fopen(out, "wb");
261 if (f == NULL) {
262 fprintf(stderr, "failed to open %s: %s\n", out, strerror(errno));
263 graphviz_exit(EXIT_FAILURE);
264 }
265
266 // escape the buffer into the output
267 const int r = gv_xml_escape(buffer, flags, put, f);
268 (void)fclose(f);
269 if (r < 0) {
270 graphviz_exit(EXIT_FAILURE);
271 }
272
273 graphviz_exit(EXIT_SUCCESS);
274}
275#endif
static void out(agerrlevel_t level, const char *fmt, va_list args)
Report messages using a user-supplied or default write function.
Definition agerror.c:86
static Extype_t length(Exid_t *rhs, Exdisc_t *disc)
Definition compile.c:1606
static int in(Extype_t lhs, Exid_t *rhs, Exdisc_t *disc)
Definition compile.c:1632
static int put(void *buffer, const char *s)
Definition draw.c:293
static NORETURN void graphviz_exit(int status)
Definition exit.h:23
static int flags
Definition gc.c:63
node NULL
Definition grammar.y:181
Agraph_t * read(FILE *f)
Definition gv.cpp:64
replacements for ctype.h functions
static bool gv_isxdigit(int c)
Definition gv_ctype.h:71
static bool gv_isdigit(int c)
Definition gv_ctype.h:41
static bool gv_isalpha(int c)
Definition gv_ctype.h:29
#define PRISIZE_T
Definition prisize_t.h:25
options to tweak the behavior of XML escaping
Definition xml.h:13
unsigned dash
escape '-'
Definition xml.h:17
int main()
Definition grammar.c:90
#define UNREACHABLE()
Definition unreachable.h:30
int gv_xml_escape(const char *s, xml_flags_t flags, int(*cb)(void *state, const char *s), void *state)
Definition xml.c:185
static bool xml_isentity(const char *s)
Definition xml.c:28
static int xml_core(char previous, const char **current, xml_flags_t flags, int(*cb)(void *state, const char *s), void *state)
Definition xml.c:67
XML escaping functionality.