Graphviz 13.0.0~dev.20250607.1528
Loading...
Searching...
No Matches
xml.c
Go to the documentation of this file.
1
7#include <errno.h>
8#include <inttypes.h>
9#include <stdbool.h>
10#include <stdint.h>
11#include <stdio.h>
12#include <stdlib.h>
13#include <string.h>
14#include <util/exit.h>
15#include <util/gv_ctype.h>
16#include <util/prisize_t.h>
17#include <util/unreachable.h>
18#include <util/xml.h>
19
/* Return true if `s` points to the start of an entity reference, i.e. one of:
 *
 *   &[A-Za-z]+;        named reference        (e.g. &Ccedil; )
 *   &#[0-9]+;          decimal reference      (e.g. &#38; )
 *   &#x[0-9a-fA-F]+;   hexadecimal reference  (e.g. &#x6C34; )
 *
 * Every form requires at least one character between the introducer and the
 * closing ';'. "&;", "&#;" and "&#x;" are therefore NOT entities; the caller
 * must escape their '&', otherwise the output would contain a malformed
 * reference.
 *
 * `s` must point at the '&' itself; that character is skipped without being
 * inspected. Scanning stops at the first character that does not belong to the
 * reference, which includes the string terminator.
 */
static bool xml_isentity(const char *s) {
  s++; /* already known to be '&' */

  /* first character of the reference body, i.e. what follows "&", "&#" or
   * "&#x" */
  const char *body;

  if (*s == '#') {
    s++;
    if (*s == 'x' || *s == 'X') {
      s++;
      body = s;
      while (gv_isxdigit(*s))
        s++;
    } else {
      body = s;
      while (gv_isdigit(*s))
        s++;
    }
  } else {
    body = s;
    while (gv_isalpha(*s))
      s++;
  }

  /* a legal reference has a non-empty body that is terminated by ';' */
  return s > body && *s == ';';
}
47
63static int xml_core(char previous, const char **current, xml_flags_t flags,
64 int (*cb)(void *state, const char *s), void *state) {
65
66 const char *s = *current;
67 char c = *s;
68
69 // we will consume at least one character, so note that now
70 ++*current;
71
72 // escape '&' only if not part of a legal entity sequence
73 if (c == '&' && (flags.raw || !xml_isentity(s)))
74 return cb(state, "&amp;");
75
76 // '<' '>' are safe to substitute even if string is already XML encoded since
77 // XML strings won’t contain '<' or '>'
78 if (c == '<')
79 return cb(state, "&lt;");
80
81 if (c == '>')
82 return cb(state, "&gt;");
83
84 // '-' cannot be used in XML comment strings
85 if (c == '-' && flags.dash)
86 return cb(state, "&#45;");
87
88 if (c == ' ' && previous == ' ' && flags.nbsp)
89 // substitute 2nd and subsequent spaces with required_spaces
90 return cb(state, "&#160;"); // Inkscape does not recognize &nbsp;
91
92 if (c == '"')
93 return cb(state, "&quot;");
94
95 if (c == '\'')
96 return cb(state, "&#39;");
97
98 if (c == '\n' && flags.raw)
99 return cb(state, "&#10;");
100
101 if (c == '\r' && flags.raw)
102 return cb(state, "&#13;");
103
104 unsigned char uc = (unsigned char)c;
105 if (uc > 0x7f && flags.utf8) {
106
107 // replicating a table from https://en.wikipedia.org/wiki/UTF-8:
108 //
109 // ┌────────────────┬───────────────┬────────┬────────┬────────┬────────┐
110 // │First code point│Last code point│Byte 1 │Byte 2 │Byte 3 │Byte 4 │
111 // ├────────────────┼───────────────┼────────┼────────┼────────┼────────┤
112 // │ U+0000│ U+007F│0xxxxxxx│ │ │ │
113 // │ U+0080│ U+07FF│110xxxxx│10xxxxxx│ │ │
114 // │ U+0800│ U+FFFF│1110xxxx│10xxxxxx│10xxxxxx│ │
115 // │ U+10000│ U+10FFFF│11110xxx│10xxxxxx│10xxxxxx│10xxxxxx│
116 // └────────────────┴───────────────┴────────┴────────┴────────┴────────┘
117 //
118 // from which we can calculate the byte length of the current character
119 size_t length = (uc >> 5) == 6 ? 2
120 : (uc >> 4) == 14 ? 3
121 : (uc >> 3) == 30 ? 4
122 : 0;
123
124 // was the length malformed or is the follow on sequence truncated?
125 bool is_invalid = length == 0;
126 for (size_t l = 1; !is_invalid && length > l; ++l)
127 is_invalid |= s[l] == '\0';
128
129 // TODO: a better strategy than aborting on malformed data
130 if (is_invalid) {
131 fprintf(stderr, "Error during conversion to \"UTF-8\". Quiting.\n");
132 graphviz_exit(EXIT_FAILURE);
133 }
134
135 // Decode the character. Refer again to the above table to understand this
136 // algorithm.
137 uint32_t utf8_char = 0;
138 switch (length) {
139 case 2: {
140 uint32_t low = ((uint32_t)s[1]) & ((1 << 6) - 1);
141 uint32_t high = ((uint32_t)s[0]) & ((1 << 5) - 1);
142 utf8_char = low | (high << 6);
143 break;
144 }
145 case 3: {
146 uint32_t low = ((uint32_t)s[2]) & ((1 << 6) - 1);
147 uint32_t mid = ((uint32_t)s[1]) & ((1 << 6) - 1);
148 uint32_t high = ((uint32_t)s[0]) & ((1 << 4) - 1);
149 utf8_char = low | (mid << 6) | (high << 12);
150 break;
151 }
152 case 4: {
153 uint32_t low = ((uint32_t)s[3]) & ((1 << 6) - 1);
154 uint32_t mid1 = ((uint32_t)s[2]) & ((1 << 6) - 1);
155 uint32_t mid2 = ((uint32_t)s[1]) & ((1 << 6) - 1);
156 uint32_t high = ((uint32_t)s[0]) & ((1 << 3) - 1);
157 utf8_char = low | (mid1 << 6) | (mid2 << 12) | (high << 18);
158 break;
159 }
160 default:
161 UNREACHABLE();
162 }
163
164 // setup a buffer that will fit the largest escape we need to print
165 char buffer[sizeof("&#xFFFFFFFF;")];
166
167 // emit the escape sequence itself
168 snprintf(buffer, sizeof(buffer), "&#x%" PRIx32 ";", utf8_char);
169
170 // note how many extra characters we consumed
171 *current += length - 1;
172
173 return cb(state, buffer);
174 }
175
176 // otherwise, output the character as-is
177 char buffer[2] = {c, '\0'};
178 return cb(state, buffer);
179}
180
182 int (*cb)(void *state, const char *s), void *state) {
183 char previous = '\0';
184 int rc = 0;
185 while (*s != '\0') {
186 char p = *s;
187 rc = xml_core(previous, &s, flags, cb, state);
188 if (rc < 0)
189 return rc;
190 previous = p;
191 }
192 return rc;
193}
194
195#ifdef TEST_XML
196// compile the below test stub with:
197//
198// ${CC} -std=c11 -DTEST_XML -Ilib lib/util/xml.c
199
// output callback for the test stub: `stream` is the `FILE *` to append `s` to;
// returns the result of `fputs`
static int put(void *stream, const char *s) {
  FILE *const sink = stream;
  return fputs(s, sink);
}
201
202// stub for testing above functionality
203int main(int argc, char **argv) {
204
205 xml_flags_t flags = {0};
206
207 int i;
208 for (i = 1; i < argc; ++i) {
209 if (strcmp(argv[i], "--dash") == 0) {
210 flags.dash = 1;
211 } else if (strcmp(argv[i], "--nbsp") == 0) {
212 flags.nbsp = 1;
213 } else if (strcmp(argv[i], "--raw") == 0) {
214 flags.raw = 1;
215 } else if (strcmp(argv[i], "--utf8") == 0) {
216 flags.utf8 = 1;
217 } else if (argv[i][0] == '-') {
218 fprintf(stderr, "unrecognized argument %s\n", argv[i]);
219 graphviz_exit(EXIT_FAILURE);
220 } else {
221 // assume we have reached filenames
222 break;
223 }
224 }
225
226 if (i + 2 != argc) {
227 fprintf(stderr,
228 "usage: %s [--dash] [--nbsp] [--raw] [--utf8] <input> <output>\n",
229 argv[0]);
230 graphviz_exit(EXIT_FAILURE);
231 }
232
233 const char *const in = argv[i];
234 const char *const out = argv[i + 1];
235
236 // read in the input
237 char buffer[128] = {0};
238 {
239 FILE *const f = fopen(in, "rb");
240 if (f == NULL) {
241 fprintf(stderr, "failed to open %s: %s\n", in, strerror(errno));
242 graphviz_exit(EXIT_FAILURE);
243 }
244 const size_t read = fread(buffer, 1, sizeof(buffer), f);
245 (void)fclose(f);
246 if (read == 0 || read == sizeof(buffer)) {
247 fprintf(stderr,
248 "only escaping 1 - %" PRISIZE_T
249 " bytes is supported, not %" PRISIZE_T " bytes\n",
250 sizeof(buffer) - 1, read);
251 graphviz_exit(EXIT_FAILURE);
252 }
253 }
254
255 // open the output
256 FILE *const f = fopen(out, "wb");
257 if (f == NULL) {
258 fprintf(stderr, "failed to open %s: %s\n", out, strerror(errno));
259 graphviz_exit(EXIT_FAILURE);
260 }
261
262 // escape the buffer into the output
263 const int r = gv_xml_escape(buffer, flags, put, f);
264 (void)fclose(f);
265 if (r < 0) {
266 graphviz_exit(EXIT_FAILURE);
267 }
268
269 graphviz_exit(EXIT_SUCCESS);
270}
271#endif
static void out(agerrlevel_t level, const char *fmt, va_list args)
Report messages using a user-supplied or default write function.
Definition agerror.c:84
static Extype_t length(Exid_t *rhs, Exdisc_t *disc)
Definition compile.c:1614
static int in(Extype_t lhs, Exid_t *rhs, Exdisc_t *disc)
Definition compile.c:1638
static int put(void *buffer, const char *s)
Definition draw.c:288
static NORETURN void graphviz_exit(int status)
Definition exit.h:23
static int flags
Definition gc.c:61
node NULL
Definition grammar.y:180
Agraph_t * read(FILE *f)
Definition gv.cpp:60
replacements for ctype.h functions
static bool gv_isxdigit(int c)
Definition gv_ctype.h:71
static bool gv_isdigit(int c)
Definition gv_ctype.h:41
static bool gv_isalpha(int c)
Definition gv_ctype.h:29
#define PRISIZE_T
Definition prisize_t.h:25
options to tweak the behavior of XML escaping
Definition xml.h:30
unsigned dash
escape '-'
Definition xml.h:34
int main()
Definition grammar.c:89
#define UNREACHABLE()
Definition unreachable.h:30
int gv_xml_escape(const char *s, xml_flags_t flags, int(*cb)(void *state, const char *s), void *state)
Definition xml.c:181
static bool xml_isentity(const char *s)
Definition xml.c:24
static int xml_core(char previous, const char **current, xml_flags_t flags, int(*cb)(void *state, const char *s), void *state)
Definition xml.c:63
XML escaping functionality.