Graphviz 13.0.0~dev.20250424.1043
Loading...
Searching...
No Matches
xml.c
Go to the documentation of this file.
1
7#include <inttypes.h>
8#include <stdbool.h>
9#include <stdint.h>
10#include <stdio.h>
11#include <stdlib.h>
12#include <util/exit.h>
13#include <util/gv_ctype.h>
14#include <util/unreachable.h>
15#include <util/xml.h>
16
/* Return true if s points to the start of an entity or character reference:
 *   &[A-Za-z]+;          named entity,                e.g. &Ccedil;
 *   &#[0-9]+;            decimal character reference, e.g. &#38;
 *   &#[xX][0-9a-fA-F]+;  hex character reference,     e.g. &#x6C34;
 *
 * The name or digit run must contain at least one character, so "&;", "&#;"
 * and "&#x;" are all rejected. Passing those through unescaped would produce
 * malformed XML.
 *
 * s must point at a '&'; the caller has already checked this. Scanning stops
 * at the first character that does not belong to the run, so the terminating
 * NUL of s is never passed.
 */
static bool xml_isentity(const char *s) {
  s++; /* already known to be '&' */

  const char *run; // first character of the name or digit run
  if (*s == '#') {
    s++;
    if (*s == 'x' || *s == 'X') {
      s++;
      run = s;
      while (gv_isxdigit(*s))
        s++;
    } else {
      run = s;
      while (gv_isdigit(*s))
        s++;
    }
  } else {
    run = s;
    while (gv_isalpha(*s))
      s++;
  }

  // a reference needs a non-empty run that is closed by ';'
  return s > run && *s == ';';
}
44
60static int xml_core(char previous, const char **current, xml_flags_t flags,
61 int (*cb)(void *state, const char *s), void *state) {
62
63 const char *s = *current;
64 char c = *s;
65
66 // we will consume at least one character, so note that now
67 ++*current;
68
69 // escape '&' only if not part of a legal entity sequence
70 if (c == '&' && (flags.raw || !xml_isentity(s)))
71 return cb(state, "&amp;");
72
73 // '<' '>' are safe to substitute even if string is already XML encoded since
74 // XML strings won’t contain '<' or '>'
75 if (c == '<')
76 return cb(state, "&lt;");
77
78 if (c == '>')
79 return cb(state, "&gt;");
80
81 // '-' cannot be used in XML comment strings
82 if (c == '-' && flags.dash)
83 return cb(state, "&#45;");
84
85 if (c == ' ' && previous == ' ' && flags.nbsp)
86 // substitute 2nd and subsequent spaces with required_spaces
87 return cb(state, "&#160;"); // Inkscape does not recognize &nbsp;
88
89 if (c == '"')
90 return cb(state, "&quot;");
91
92 if (c == '\'')
93 return cb(state, "&#39;");
94
95 if (c == '\n' && flags.raw)
96 return cb(state, "&#10;");
97
98 if (c == '\r' && flags.raw)
99 return cb(state, "&#13;");
100
101 unsigned char uc = (unsigned char)c;
102 if (uc > 0x7f && flags.utf8) {
103
104 // replicating a table from https://en.wikipedia.org/wiki/UTF-8:
105 //
106 // ┌────────────────┬───────────────┬────────┬────────┬────────┬────────┐
107 // │First code point│Last code point│Byte 1 │Byte 2 │Byte 3 │Byte 4 │
108 // ├────────────────┼───────────────┼────────┼────────┼────────┼────────┤
109 // │ U+0000│ U+007F│0xxxxxxx│ │ │ │
110 // │ U+0080│ U+07FF│110xxxxx│10xxxxxx│ │ │
111 // │ U+0800│ U+FFFF│1110xxxx│10xxxxxx│10xxxxxx│ │
112 // │ U+10000│ U+10FFFF│11110xxx│10xxxxxx│10xxxxxx│10xxxxxx│
113 // └────────────────┴───────────────┴────────┴────────┴────────┴────────┘
114 //
115 // from which we can calculate the byte length of the current character
116 size_t length = (uc >> 5) == 6 ? 2
117 : (uc >> 4) == 14 ? 3
118 : (uc >> 3) == 30 ? 4
119 : 0;
120
121 // was the length malformed or is the follow on sequence truncated?
122 bool is_invalid = length == 0;
123 for (size_t l = 1; !is_invalid && length > l; ++l)
124 is_invalid |= s[l] == '\0';
125
126 // TODO: a better strategy than aborting on malformed data
127 if (is_invalid) {
128 fprintf(stderr, "Error during conversion to \"UTF-8\". Quiting.\n");
129 graphviz_exit(EXIT_FAILURE);
130 }
131
132 // Decode the character. Refer again to the above table to understand this
133 // algorithm.
134 uint32_t utf8_char = 0;
135 switch (length) {
136 case 2: {
137 uint32_t low = ((uint32_t)s[1]) & ((1 << 6) - 1);
138 uint32_t high = ((uint32_t)s[0]) & ((1 << 5) - 1);
139 utf8_char = low | (high << 6);
140 break;
141 }
142 case 3: {
143 uint32_t low = ((uint32_t)s[2]) & ((1 << 6) - 1);
144 uint32_t mid = ((uint32_t)s[1]) & ((1 << 6) - 1);
145 uint32_t high = ((uint32_t)s[0]) & ((1 << 4) - 1);
146 utf8_char = low | (mid << 6) | (high << 12);
147 break;
148 }
149 case 4: {
150 uint32_t low = ((uint32_t)s[3]) & ((1 << 6) - 1);
151 uint32_t mid1 = ((uint32_t)s[2]) & ((1 << 6) - 1);
152 uint32_t mid2 = ((uint32_t)s[1]) & ((1 << 6) - 1);
153 uint32_t high = ((uint32_t)s[0]) & ((1 << 3) - 1);
154 utf8_char = low | (mid1 << 6) | (mid2 << 12) | (high << 18);
155 break;
156 }
157 default:
158 UNREACHABLE();
159 }
160
161 // setup a buffer that will fit the largest escape we need to print
162 char buffer[sizeof("&#xFFFFFFFF;")];
163
164 // emit the escape sequence itself
165 snprintf(buffer, sizeof(buffer), "&#x%" PRIx32 ";", utf8_char);
166
167 // note how many extra characters we consumed
168 *current += length - 1;
169
170 return cb(state, buffer);
171 }
172
173 // otherwise, output the character as-is
174 char buffer[2] = {c, '\0'};
175 return cb(state, buffer);
176}
177
179 int (*cb)(void *state, const char *s), void *state) {
180 char previous = '\0';
181 int rc = 0;
182 while (*s != '\0') {
183 char p = *s;
184 rc = xml_core(previous, &s, flags, cb, state);
185 if (rc < 0)
186 return rc;
187 previous = p;
188 }
189 return rc;
190}
191
192#ifdef TEST_XML
193// compile the below test stub with:
194//
195// ${CC} -std=c99 -DTEST_XML -Ilib -Ilib/gvc -Ilib/pathplan -Ilib/cgraph
196// -Ilib/cdt lib/common/xml.c
197
198#include <getopt.h>
199
/* Output callback for the test stub: writes s to the stdio stream passed as
 * the opaque state pointer and returns the result of fputs. */
static int put(void *stream, const char *s) {
  FILE *out = stream;
  return fputs(s, out);
}
201
202// stub for testing above functionality
203int main(int argc, char **argv) {
204
205 xml_flags_t flags = {0};
206
207 while (true) {
208 static const struct option opts[] = {
209 {"dash", no_argument, 0, 'd'},
210 {"nbsp", no_argument, 0, 'n'},
211 {"raw", no_argument, 0, 'r'},
212 {"utf8", no_argument, 0, 'u'},
213 {0, 0, 0, 0},
214 };
215
216 int index;
217 int c = getopt_long(argc, argv, "dnru", opts, &index);
218
219 if (c == -1)
220 break;
221
222 switch (c) {
223
224 case 'd':
225 flags.dash = 1;
226 break;
227
228 case 'n':
229 flags.nbsp = 1;
230 break;
231
232 case 'r':
233 flags.raw = 1;
234 break;
235
236 case 'u':
237 flags.utf8 = 1;
238 break;
239
240 default:
241 fprintf(stderr, "unexpected error\n");
242 graphviz_exit(EXIT_FAILURE);
243 }
244 }
245
246 // escape all input we received
247 for (int i = optind; i < argc; ++i) {
248 int r = gv_xml_escape(argv[i], flags, put, stdout);
249 if (r < 0)
250 graphviz_exit(EXIT_FAILURE);
251 }
252
253 graphviz_exit(EXIT_SUCCESS);
254}
255#endif
static int put(void *buffer, const char *s)
Definition draw.c:310
static NORETURN void graphviz_exit(int status)
Definition exit.h:23
static int flags
Definition gc.c:61
replacements for ctype.h functions
static bool gv_isxdigit(int c)
Definition gv_ctype.h:71
static bool gv_isdigit(int c)
Definition gv_ctype.h:41
static bool gv_isalpha(int c)
Definition gv_ctype.h:29
static opts_t opts
Definition gvgen.c:401
options to tweak the behavior of XML escaping
Definition xml.h:30
int main()
Definition grammar.c:93
#define UNREACHABLE()
Definition unreachable.h:30
int gv_xml_escape(const char *s, xml_flags_t flags, int(*cb)(void *state, const char *s), void *state)
Definition xml.c:178
static bool xml_isentity(const char *s)
Definition xml.c:21
static int xml_core(char previous, const char **current, xml_flags_t flags, int(*cb)(void *state, const char *s), void *state)
Definition xml.c:60
XML escaping functionality.