Graphviz 12.0.1~dev.20240716.0800
Loading...
Searching...
No Matches
xml.c
Go to the documentation of this file.
1
7#include <cgraph/exit.h>
8#include <cgraph/gv_ctype.h>
10#include <common/types.h>
11#include <common/utils.h>
12#include <inttypes.h>
13#include <stdbool.h>
14#include <stdint.h>
15#include <stdio.h>
16#include <stdlib.h>
17
/* return true if *s points to &[A-Za-z]+; (e.g. &Ccedil; )
 * or &#[0-9]+; (e.g. &#38; )
 * or &#x[0-9a-fA-F]+; (e.g. &#x6C34; )
 *
 * The name or digit run must be non-empty: "&;", "&#;" and "&#x;" are not
 * well-formed references, so the caller has to escape their '&'.
 *
 * s must point at the '&' that starts the candidate entity.
 */
static bool xml_isentity(const char *s) {
  s++; /* already known to be '&' */

  // first character of the name/digit run; used to reject empty runs
  const char *run;

  if (*s == '#') {
    s++;
    if (*s == 'x' || *s == 'X') {
      s++;
      run = s;
      while (gv_isxdigit(*s))
        s++;
    } else {
      run = s;
      while (gv_isdigit(*s))
        s++;
    }
  } else {
    run = s;
    while (gv_isalpha(*s))
      s++;
  }

  // an entity needs at least one character in its run and a closing ';'
  return s != run && *s == ';';
}
45
61static int xml_core(char previous, const char **current, xml_flags_t flags,
62 int (*cb)(void *state, const char *s), void *state) {
63
64 const char *s = *current;
65 char c = *s;
66
67 // we will consume at least one character, so note that now
68 ++*current;
69
70 // escape '&' only if not part of a legal entity sequence
71 if (c == '&' && (flags.raw || !xml_isentity(s)))
72 return cb(state, "&amp;");
73
74 // '<' '>' are safe to substitute even if string is already XML encoded since
75 // XML strings won’t contain '<' or '>'
76 if (c == '<')
77 return cb(state, "&lt;");
78
79 if (c == '>')
80 return cb(state, "&gt;");
81
82 // '-' cannot be used in XML comment strings
83 if (c == '-' && flags.dash)
84 return cb(state, "&#45;");
85
86 if (c == ' ' && previous == ' ' && flags.nbsp)
87 // substitute 2nd and subsequent spaces with required_spaces
88 return cb(state, "&#160;"); // Inkscape does not recognize &nbsp;
89
90 if (c == '"')
91 return cb(state, "&quot;");
92
93 if (c == '\'')
94 return cb(state, "&#39;");
95
96 if (c == '\n' && flags.raw)
97 return cb(state, "&#10;");
98
99 if (c == '\r' && flags.raw)
100 return cb(state, "&#13;");
101
102 unsigned char uc = (unsigned char)c;
103 if (uc > 0x7f && flags.utf8) {
104
105 // replicating a table from https://en.wikipedia.org/wiki/UTF-8:
106 //
107 // ┌────────────────┬───────────────┬────────┬────────┬────────┬────────┐
108 // │First code point│Last code point│Byte 1 │Byte 2 │Byte 3 │Byte 4 │
109 // ├────────────────┼───────────────┼────────┼────────┼────────┼────────┤
110 // │ U+0000│ U+007F│0xxxxxxx│ │ │ │
111 // │ U+0080│ U+07FF│110xxxxx│10xxxxxx│ │ │
112 // │ U+0800│ U+FFFF│1110xxxx│10xxxxxx│10xxxxxx│ │
113 // │ U+10000│ U+10FFFF│11110xxx│10xxxxxx│10xxxxxx│10xxxxxx│
114 // └────────────────┴───────────────┴────────┴────────┴────────┴────────┘
115 //
116 // from which we can calculate the byte length of the current character
117 size_t length = (uc >> 5) == 6 ? 2
118 : (uc >> 4) == 14 ? 3
119 : (uc >> 3) == 30 ? 4
120 : 0;
121
122 // was the length malformed or is the follow on sequence truncated?
123 bool is_invalid = length == 0;
124 for (size_t l = 1; !is_invalid && length > l; ++l)
125 is_invalid |= s[l] == '\0';
126
127 // TODO: a better strategy than aborting on malformed data
128 if (is_invalid) {
129 fprintf(stderr, "Error during conversion to \"UTF-8\". Quiting.\n");
130 graphviz_exit(EXIT_FAILURE);
131 }
132
133 // Decode the character. Refer again to the above table to understand this
134 // algorithm.
135 uint32_t utf8_char = 0;
136 switch (length) {
137 case 2: {
138 uint32_t low = ((uint32_t)s[1]) & ((1 << 6) - 1);
139 uint32_t high = ((uint32_t)s[0]) & ((1 << 5) - 1);
140 utf8_char = low | (high << 6);
141 break;
142 }
143 case 3: {
144 uint32_t low = ((uint32_t)s[2]) & ((1 << 6) - 1);
145 uint32_t mid = ((uint32_t)s[1]) & ((1 << 6) - 1);
146 uint32_t high = ((uint32_t)s[0]) & ((1 << 4) - 1);
147 utf8_char = low | (mid << 6) | (high << 12);
148 break;
149 }
150 case 4: {
151 uint32_t low = ((uint32_t)s[3]) & ((1 << 6) - 1);
152 uint32_t mid1 = ((uint32_t)s[2]) & ((1 << 6) - 1);
153 uint32_t mid2 = ((uint32_t)s[1]) & ((1 << 6) - 1);
154 uint32_t high = ((uint32_t)s[0]) & ((1 << 3) - 1);
155 utf8_char = low | (mid1 << 6) | (mid2 << 12) | (high << 18);
156 break;
157 }
158 default:
159 UNREACHABLE();
160 }
161
162 // setup a buffer that will fit the largest escape we need to print
163 char buffer[sizeof("&#xFFFFFFFF;")];
164
165 // emit the escape sequence itself
166 snprintf(buffer, sizeof(buffer), "&#x%" PRIx32 ";", utf8_char);
167
168 // note how many extra characters we consumed
169 *current += length - 1;
170
171 return cb(state, buffer);
172 }
173
174 // otherwise, output the character as-is
175 char buffer[2] = {c, '\0'};
176 return cb(state, buffer);
177}
178
179int xml_escape(const char *s, xml_flags_t flags,
180 int (*cb)(void *state, const char *s), void *state) {
181 char previous = '\0';
182 int rc = 0;
183 while (*s != '\0') {
184 char p = *s;
185 rc = xml_core(previous, &s, flags, cb, state);
186 if (rc < 0)
187 return rc;
188 previous = p;
189 }
190 return rc;
191}
192
193#ifdef TEST_XML
194// compile the below test stub with:
195//
196// ${CC} -std=c99 -DTEST_XML -Ilib -Ilib/gvc -Ilib/pathplan -Ilib/cgraph
197// -Ilib/cdt lib/common/xml.c
198
199#include <getopt.h>
200
/// callback for xml_escape: write `s` to the `FILE *` passed as `stream`
static int put(void *stream, const char *s) {
  FILE *out = stream;
  return fputs(s, out);
}
202
203// stub for testing above functionality
204int main(int argc, char **argv) {
205
206 xml_flags_t flags = {0};
207
208 while (true) {
209 static const struct option opts[] = {
210 {"dash", no_argument, 0, 'd'},
211 {"nbsp", no_argument, 0, 'n'},
212 {"raw", no_argument, 0, 'r'},
213 {"utf8", no_argument, 0, 'u'},
214 {0, 0, 0, 0},
215 };
216
217 int index;
218 int c = getopt_long(argc, argv, "dnru", opts, &index);
219
220 if (c == -1)
221 break;
222
223 switch (c) {
224
225 case 'd':
226 flags.dash = 1;
227 break;
228
229 case 'n':
230 flags.nbsp = 1;
231 break;
232
233 case 'r':
234 flags.raw = 1;
235 break;
236
237 case 'u':
238 flags.utf8 = 1;
239 break;
240
241 default:
242 fprintf(stderr, "unexpected error\n");
243 graphviz_exit(EXIT_FAILURE);
244 }
245 }
246
247 // escape all input we received
248 for (int i = optind; i < argc; ++i) {
249 int r = xml_escape(argv[i], flags, put, stdout);
250 if (r < 0)
251 graphviz_exit(EXIT_FAILURE);
252 }
253
254 graphviz_exit(EXIT_SUCCESS);
255}
256#endif
static int put(void *buffer, const char *s)
Definition draw.c:313
static NORETURN void graphviz_exit(int status)
Definition exit.h:23
static int flags
Definition gc.c:61
replacements for ctype.h functions
static bool gv_isxdigit(int c)
Definition gv_ctype.h:71
static bool gv_isdigit(int c)
Definition gv_ctype.h:41
static bool gv_isalpha(int c)
Definition gv_ctype.h:29
static opts_t opts
Definition gvgen.c:394
static lexstate_t state
Definition htmllex.c:61
int main()
graphs, nodes and edges info: Agraphinfo_t, Agnodeinfo_t and Agedgeinfo_t
Definition grammar.c:93
#define UNREACHABLE()
Definition unreachable.h:30
int xml_escape(const char *s, xml_flags_t flags, int(*cb)(void *state, const char *s), void *state)
Definition xml.c:179
static bool xml_isentity(const char *s)
Definition xml.c:22
static int xml_core(char previous, const char **current, xml_flags_t flags, int(*cb)(void *state, const char *s), void *state)
Definition xml.c:61