/*
 * xml.c - xmlparser - XML parser
 * Source: git clone git://git.codemadness.org/xmlparser
 * Repository listing entries: Log, Files, Refs, README, LICENSE
 * xml.c (11462B)
 */
1 #include <errno.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5
6 #include "xml.h"
7
/*
 * ASCII-only character classification (no locale, no <ctype.h>).
 *
 * Each test converts the argument to unsigned and subtracts the lower bound
 * of the range: values below the range wrap around to a huge number, so one
 * "<" comparison checks both bounds at once.  "| 32" folds upper case onto
 * lower case for the alphabetic tests.
 *
 * The argument is parenthesized before the cast so that a compound
 * expression (for example "cond ? a : b") is converted as a whole.
 * ISSPACE and ISXDIGIT expand the argument more than once, so it must not
 * have side effects.
 */
#define ISALPHA(c)  ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISDIGIT(c)  (((unsigned)(c)) - '0' < 10)
/* ' ' plus the five consecutive codes '\t', '\n', '\v', '\f', '\r' */
#define ISSPACE(c)  ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
#define ISXDIGIT(c) ((((unsigned)(c)) - '0' < 10) || (((unsigned)(c)) | 32) - 'a' < 6)
12
13 static void
14 xml_parseattrs(XMLParser *x)
15 {
16 size_t namelen = 0, valuelen;
17 int c, endsep, endname = 0, valuestart = 0;
18
19 while ((c = GETNEXT()) != EOF) {
20 if (ISSPACE(c)) {
21 if (namelen)
22 endname = 1;
23 continue;
24 } else if (c == '?')
25 ; /* ignore */
26 else if (c == '=') {
27 x->name[namelen] = '\0';
28 valuestart = 1;
29 endname = 1;
30 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
31 /* attribute without value */
32 x->name[namelen] = '\0';
33 if (x->xmlattrstart)
34 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
35 if (x->xmlattr)
36 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
37 if (x->xmlattrend)
38 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
39 endname = 0;
40 x->name[0] = c;
41 namelen = 1;
42 } else if (namelen && valuestart) {
43 /* attribute with value */
44 if (x->xmlattrstart)
45 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
46
47 valuelen = 0;
48 if (c == '\'' || c == '"') {
49 endsep = c;
50 } else {
51 endsep = ' '; /* ISSPACE() */
52 goto startvalue;
53 }
54
55 while ((c = GETNEXT()) != EOF) {
56 startvalue:
57 if (c == '&') { /* entities */
58 x->data[valuelen] = '\0';
59 /* call data function with data before entity if there is data */
60 if (valuelen && x->xmlattr)
61 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
62 x->data[0] = c;
63 valuelen = 1;
64 while ((c = GETNEXT()) != EOF) {
65 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
66 break;
67 if (valuelen < sizeof(x->data) - 1)
68 x->data[valuelen++] = c;
69 else {
70 /* entity too long for buffer, handle as normal data */
71 x->data[valuelen] = '\0';
72 if (x->xmlattr)
73 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
74 x->data[0] = c;
75 valuelen = 1;
76 break;
77 }
78 if (c == ';') {
79 x->data[valuelen] = '\0';
80 if (x->xmlattrentity)
81 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
82 valuelen = 0;
83 break;
84 }
85 }
86 } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
87 if (valuelen < sizeof(x->data) - 1) {
88 x->data[valuelen++] = c;
89 } else {
90 x->data[valuelen] = '\0';
91 if (x->xmlattr)
92 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
93 x->data[0] = c;
94 valuelen = 1;
95 }
96 }
97 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
98 x->data[valuelen] = '\0';
99 if (x->xmlattr)
100 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
101 if (x->xmlattrend)
102 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
103 break;
104 }
105 }
106 namelen = endname = valuestart = 0;
107 } else if (namelen < sizeof(x->name) - 1) {
108 x->name[namelen++] = c;
109 }
110 if (c == '>') {
111 break;
112 } else if (c == '/') {
113 x->isshorttag = 1;
114 x->name[0] = '\0';
115 namelen = 0;
116 }
117 }
118 }
119
120 static void
121 xml_parsecomment(XMLParser *x)
122 {
123 size_t datalen = 0, i = 0;
124 int c;
125
126 if (x->xmlcommentstart)
127 x->xmlcommentstart(x);
128 while ((c = GETNEXT()) != EOF) {
129 if (c == '-' || c == '>') {
130 if (x->xmlcomment && datalen) {
131 x->data[datalen] = '\0';
132 x->xmlcomment(x, x->data, datalen);
133 datalen = 0;
134 }
135 }
136
137 if (c == '-') {
138 if (++i > 2) {
139 if (x->xmlcomment)
140 for (; i > 2; i--)
141 x->xmlcomment(x, "-", 1);
142 i = 2;
143 }
144 continue;
145 } else if (c == '>' && i == 2) {
146 if (x->xmlcommentend)
147 x->xmlcommentend(x);
148 return;
149 } else if (i) {
150 if (x->xmlcomment) {
151 for (; i > 0; i--)
152 x->xmlcomment(x, "-", 1);
153 }
154 i = 0;
155 }
156
157 if (datalen < sizeof(x->data) - 1) {
158 x->data[datalen++] = c;
159 } else {
160 x->data[datalen] = '\0';
161 if (x->xmlcomment)
162 x->xmlcomment(x, x->data, datalen);
163 x->data[0] = c;
164 datalen = 1;
165 }
166 }
167 }
168
169 static void
170 xml_parsecdata(XMLParser *x)
171 {
172 size_t datalen = 0, i = 0;
173 int c;
174
175 if (x->xmlcdatastart)
176 x->xmlcdatastart(x);
177 while ((c = GETNEXT()) != EOF) {
178 if (c == ']' || c == '>') {
179 if (x->xmlcdata && datalen) {
180 x->data[datalen] = '\0';
181 x->xmlcdata(x, x->data, datalen);
182 datalen = 0;
183 }
184 }
185
186 if (c == ']') {
187 if (++i > 2) {
188 if (x->xmlcdata)
189 for (; i > 2; i--)
190 x->xmlcdata(x, "]", 1);
191 i = 2;
192 }
193 continue;
194 } else if (c == '>' && i == 2) {
195 if (x->xmlcdataend)
196 x->xmlcdataend(x);
197 return;
198 } else if (i) {
199 if (x->xmlcdata)
200 for (; i > 0; i--)
201 x->xmlcdata(x, "]", 1);
202 i = 0;
203 }
204
205 if (datalen < sizeof(x->data) - 1) {
206 x->data[datalen++] = c;
207 } else {
208 x->data[datalen] = '\0';
209 if (x->xmlcdata)
210 x->xmlcdata(x, x->data, datalen);
211 x->data[0] = c;
212 datalen = 1;
213 }
214 }
215 }
216
/*
 * Encode the Unicode code point r as UTF-8 into s.
 *
 * s must have room for at least 4 bytes; no NUL terminator is written.
 * Returns the number of bytes written (1 to 4).  Returns 0 and writes
 * nothing for the NUL code point and for values that are not Unicode scalar
 * values (negative, surrogates U+D800..U+DFFF, above U+10FFFF), so malformed
 * UTF-8 is never produced.
 */
static int
codepointtoutf8(long r, char *s)
{
	if (r <= 0 || r > 0x10FFFF || (r >= 0xD800 && r <= 0xDFFF))
		return 0; /* NUL byte or not encodable */

	if (r <= 0x7F) {
		/* 1 byte: 0aaaaaaa */
		s[0] = (char)r;
		return 1;
	} else if (r <= 0x07FF) {
		/* 2 bytes: 00000aaa aabbbbbb */
		s[0] = (char)(0xC0 | ((r & 0x0007C0) >> 6)); /* 110aaaaa */
		s[1] = (char)(0x80 | (r & 0x00003F));        /* 10bbbbbb */
		return 2;
	} else if (r <= 0xFFFF) {
		/* 3 bytes: aaaabbbb bbcccccc */
		s[0] = (char)(0xE0 | ((r & 0x00F000) >> 12)); /* 1110aaaa */
		s[1] = (char)(0x80 | ((r & 0x000FC0) >> 6));  /* 10bbbbbb */
		s[2] = (char)(0x80 | (r & 0x00003F));         /* 10cccccc */
		return 3;
	} else {
		/* 4 bytes: 000aaabb bbbbcccc ccdddddd */
		s[0] = (char)(0xF0 | ((r & 0x1C0000) >> 18)); /* 11110aaa */
		s[1] = (char)(0x80 | ((r & 0x03F000) >> 12)); /* 10bbbbbb */
		s[2] = (char)(0x80 | ((r & 0x000FC0) >> 6));  /* 10cccccc */
		s[3] = (char)(0x80 | (r & 0x00003F));         /* 10dddddd */
		return 4;
	}
}
246
/*
 * Translate one of the five predefined XML named entities to its character.
 *
 * e is the entity text after the leading '&' and must include the trailing
 * ';' (for example "amp;"); the comparison is exact and case-sensitive.
 * On success the character and a NUL terminator are stored in buf and 1 is
 * returned.  Returns -1 when the name is unknown or when bufsiz is smaller
 * than 2.
 */
static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
	static const struct {
		const char *entity;
		int c;
	} entities[] = {
		{ "amp;",  '&'  },
		{ "lt;",   '<'  },
		{ "gt;",   '>'  },
		{ "apos;", '\'' },
		{ "quot;", '"'  },
	};
	size_t i;

	/* buffer is too small */
	if (bufsiz < 2)
		return -1;

	for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
		if (!strcmp(e, entities[i].entity)) {
			buf[0] = entities[i].c;
			buf[1] = '\0';
			return 1;
		}
	}
	return -1;
}
275
/*
 * Translate a numeric character reference to a NUL-terminated UTF-8 string.
 *
 * e is the entity text after the leading "&#": decimal digits, or 'x'
 * followed by hexadecimal digits, terminated by ';' and then NUL.
 * buf must hold at least 5 bytes (up to 4 bytes of UTF-8 plus the NUL).
 * Returns the number of bytes stored in buf excluding the NUL (0 for code
 * point 0), or -1 when the buffer is too small, the text is malformed, the
 * number does not fit a long, or the value is not a valid code point
 * (negative, above 0x10ffff or in the surrogate range).
 */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	long l;
	int base, len;
	const char *s;
	char *end;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* hex (base 16) or decimal (base 10) */
	/* the digits are checked by hand first: strtol() on its own would also
	 * accept leading whitespace, a sign and (for base 16) a "0x" prefix */
	if (*e == 'x') {
		e++;
		for (s = e; *s && *s != ';'; s++) {
			if (!ISXDIGIT((unsigned char)*s))
				return -1; /* invalid: no hex */
		}
		base = 16;
	} else {
		for (s = e; *s && *s != ';'; s++) {
			if (!ISDIGIT((unsigned char)*s))
				return -1; /* invalid: no digits */
		}
		base = 10;
	}
	if (*s != ';' || *(s + 1) != '\0')
		return -1; /* must end with ';' NUL */

	errno = 0;
	l = strtol(e, &end, base);

	/* invalid value or not a well-formed entity or invalid code point */
	if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
	    (l >= 0xd800 && l <= 0xdfff)) /* surrogate range */
		return -1;
	len = codepointtoutf8(l, buf);
	buf[len] = '\0';

	return len;
}
319
/*
 * Convert a named or numeric entity string to a buffer string.
 *
 * e is the complete entity including the leading '&' and the trailing ';',
 * for example "&amp;", "&#65;" or "&#x41;".  "&#" selects numeric
 * conversion, anything else is looked up as a named entity.
 * buf receives the NUL-terminated result; numeric entities need bufsiz of
 * at least 5, named entities at least 2.
 * Returns the byte-length of the string or -1 on failure.
 */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* doesn't start with & */
	if (e[0] != '&')
		return -1;
	/* numeric entity */
	if (e[1] == '#')
		return numericentitytostr(e + 2, buf, bufsiz);
	else /* named entity */
		return namedentitytostr(e + 1, buf, bufsiz);
}
334
335 void
336 xml_parse(XMLParser *x)
337 {
338 size_t datalen, tagdatalen;
339 int c, isend;
340
341 while ((c = GETNEXT()) != EOF && c != '<')
342 ; /* skip until < */
343
344 while (c != EOF) {
345 if (c == '<') { /* parse tag */
346 if ((c = GETNEXT()) == EOF)
347 return;
348
349 if (c == '!') { /* CDATA and comments */
350 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
351 /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
352 if (tagdatalen <= sizeof("[CDATA[") - 1)
353 x->data[tagdatalen++] = c;
354 if (c == '>')
355 break;
356 else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
357 (x->data[0] == '-')) {
358 xml_parsecomment(x);
359 break;
360 } else if (c == '[') {
361 if (tagdatalen == sizeof("[CDATA[") - 1 &&
362 !strncmp(x->data, "[CDATA[", tagdatalen)) {
363 xml_parsecdata(x);
364 break;
365 }
366 }
367 }
368 } else {
369 /* normal tag (open, short open, close), processing instruction. */
370 x->tag[0] = c;
371 x->taglen = 1;
372 x->isshorttag = isend = 0;
373
374 /* treat processing instruction as short tag, don't strip "?" prefix. */
375 if (c == '?') {
376 x->isshorttag = 1;
377 } else if (c == '/') {
378 if ((c = GETNEXT()) == EOF)
379 return;
380 x->tag[0] = c;
381 isend = 1;
382 }
383
384 while ((c = GETNEXT()) != EOF) {
385 if (c == '/')
386 x->isshorttag = 1; /* short tag */
387 else if (c == '>' || ISSPACE(c)) {
388 x->tag[x->taglen] = '\0';
389 if (isend) { /* end tag, starts with </ */
390 while (c != '>' && c != EOF) /* skip until > */
391 c = GETNEXT();
392 if (x->xmltagend)
393 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
394 x->tag[0] = '\0';
395 x->taglen = 0;
396 } else {
397 /* start tag */
398 if (x->xmltagstart)
399 x->xmltagstart(x, x->tag, x->taglen);
400 if (ISSPACE(c))
401 xml_parseattrs(x);
402 if (x->xmltagstartparsed)
403 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
404 }
405 /* call tagend for short tag or processing instruction */
406 if (x->isshorttag) {
407 if (x->xmltagend)
408 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
409 x->tag[0] = '\0';
410 x->taglen = 0;
411 }
412 break;
413 } else if (x->taglen < sizeof(x->tag) - 1)
414 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
415 }
416 }
417 } else {
418 /* parse tag data */
419 datalen = 0;
420 if (x->xmldatastart)
421 x->xmldatastart(x);
422 while ((c = GETNEXT()) != EOF) {
423 if (c == '&') { /* entities */
424 if (datalen) {
425 x->data[datalen] = '\0';
426 if (x->xmldata)
427 x->xmldata(x, x->data, datalen);
428 }
429 x->data[0] = c;
430 datalen = 1;
431 while ((c = GETNEXT()) != EOF) {
432 if (c == '<')
433 break;
434 if (datalen < sizeof(x->data) - 1)
435 x->data[datalen++] = c;
436 else {
437 /* entity too long for buffer, handle as normal data */
438 x->data[datalen] = '\0';
439 if (x->xmldata)
440 x->xmldata(x, x->data, datalen);
441 x->data[0] = c;
442 datalen = 1;
443 break;
444 }
445 if (c == ';') {
446 x->data[datalen] = '\0';
447 if (x->xmldataentity)
448 x->xmldataentity(x, x->data, datalen);
449 datalen = 0;
450 break;
451 }
452 }
453 } else if (c != '<') {
454 if (datalen < sizeof(x->data) - 1) {
455 x->data[datalen++] = c;
456 } else {
457 x->data[datalen] = '\0';
458 if (x->xmldata)
459 x->xmldata(x, x->data, datalen);
460 x->data[0] = c;
461 datalen = 1;
462 }
463 }
464 if (c == '<') {
465 x->data[datalen] = '\0';
466 if (x->xmldata && datalen)
467 x->xmldata(x, x->data, datalen);
468 if (x->xmldataend)
469 x->xmldataend(x);
470 break;
471 }
472 }
473 }
474 }
475 }