URI:
       xml.c - xmlparser - XML parser
  HTML git clone git://git.codemadness.org/xmlparser
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       xml.c (11462B)
       ---
            1 #include <errno.h>
            2 #include <stdio.h>
            3 #include <stdlib.h>
            4 #include <string.h>
            5 
            6 #include "xml.h"
            7 
            8 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
            9 #define ISDIGIT(c) (((unsigned)c) - '0' < 10)
           10 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
           11 #define ISXDIGIT(c) ((((unsigned)c) - '0' < 10) || (((unsigned)c) | 32) - 'a' < 6)
           12 
           13 static void
           14 xml_parseattrs(XMLParser *x)
           15 {
           16         size_t namelen = 0, valuelen;
           17         int c, endsep, endname = 0, valuestart = 0;
           18 
           19         while ((c = GETNEXT()) != EOF) {
           20                 if (ISSPACE(c)) {
           21                         if (namelen)
           22                                 endname = 1;
           23                         continue;
           24                 } else if (c == '?')
           25                         ; /* ignore */
           26                 else if (c == '=') {
           27                         x->name[namelen] = '\0';
           28                         valuestart = 1;
           29                         endname = 1;
           30                 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
           31                         /* attribute without value */
           32                         x->name[namelen] = '\0';
           33                         if (x->xmlattrstart)
           34                                 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
           35                         if (x->xmlattr)
           36                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
           37                         if (x->xmlattrend)
           38                                 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
           39                         endname = 0;
           40                         x->name[0] = c;
           41                         namelen = 1;
           42                 } else if (namelen && valuestart) {
           43                         /* attribute with value */
           44                         if (x->xmlattrstart)
           45                                 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
           46 
           47                         valuelen = 0;
           48                         if (c == '\'' || c == '"') {
           49                                 endsep = c;
           50                         } else {
           51                                 endsep = ' '; /* ISSPACE() */
           52                                 goto startvalue;
           53                         }
           54 
           55                         while ((c = GETNEXT()) != EOF) {
           56 startvalue:
           57                                 if (c == '&') { /* entities */
           58                                         x->data[valuelen] = '\0';
           59                                         /* call data function with data before entity if there is data */
           60                                         if (valuelen && x->xmlattr)
           61                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           62                                         x->data[0] = c;
           63                                         valuelen = 1;
           64                                         while ((c = GETNEXT()) != EOF) {
           65                                                 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
           66                                                         break;
           67                                                 if (valuelen < sizeof(x->data) - 1)
           68                                                         x->data[valuelen++] = c;
           69                                                 else {
           70                                                         /* entity too long for buffer, handle as normal data */
           71                                                         x->data[valuelen] = '\0';
           72                                                         if (x->xmlattr)
           73                                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           74                                                         x->data[0] = c;
           75                                                         valuelen = 1;
           76                                                         break;
           77                                                 }
           78                                                 if (c == ';') {
           79                                                         x->data[valuelen] = '\0';
           80                                                         if (x->xmlattrentity)
           81                                                                 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           82                                                         valuelen = 0;
           83                                                         break;
           84                                                 }
           85                                         }
           86                                 } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
           87                                         if (valuelen < sizeof(x->data) - 1) {
           88                                                 x->data[valuelen++] = c;
           89                                         } else {
           90                                                 x->data[valuelen] = '\0';
           91                                                 if (x->xmlattr)
           92                                                         x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           93                                                 x->data[0] = c;
           94                                                 valuelen = 1;
           95                                         }
           96                                 }
           97                                 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
           98                                         x->data[valuelen] = '\0';
           99                                         if (x->xmlattr)
          100                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          101                                         if (x->xmlattrend)
          102                                                 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
          103                                         break;
          104                                 }
          105                         }
          106                         namelen = endname = valuestart = 0;
          107                 } else if (namelen < sizeof(x->name) - 1) {
          108                         x->name[namelen++] = c;
          109                 }
          110                 if (c == '>') {
          111                         break;
          112                 } else if (c == '/') {
          113                         x->isshorttag = 1;
          114                         x->name[0] = '\0';
          115                         namelen = 0;
          116                 }
          117         }
          118 }
          119 
          120 static void
          121 xml_parsecomment(XMLParser *x)
          122 {
          123         size_t datalen = 0, i = 0;
          124         int c;
          125 
          126         if (x->xmlcommentstart)
          127                 x->xmlcommentstart(x);
          128         while ((c = GETNEXT()) != EOF) {
          129                 if (c == '-' || c == '>') {
          130                         if (x->xmlcomment && datalen) {
          131                                 x->data[datalen] = '\0';
          132                                 x->xmlcomment(x, x->data, datalen);
          133                                 datalen = 0;
          134                         }
          135                 }
          136 
          137                 if (c == '-') {
          138                         if (++i > 2) {
          139                                 if (x->xmlcomment)
          140                                         for (; i > 2; i--)
          141                                                 x->xmlcomment(x, "-", 1);
          142                                 i = 2;
          143                         }
          144                         continue;
          145                 } else if (c == '>' && i == 2) {
          146                         if (x->xmlcommentend)
          147                                 x->xmlcommentend(x);
          148                         return;
          149                 } else if (i) {
          150                         if (x->xmlcomment) {
          151                                 for (; i > 0; i--)
          152                                         x->xmlcomment(x, "-", 1);
          153                         }
          154                         i = 0;
          155                 }
          156 
          157                 if (datalen < sizeof(x->data) - 1) {
          158                         x->data[datalen++] = c;
          159                 } else {
          160                         x->data[datalen] = '\0';
          161                         if (x->xmlcomment)
          162                                 x->xmlcomment(x, x->data, datalen);
          163                         x->data[0] = c;
          164                         datalen = 1;
          165                 }
          166         }
          167 }
          168 
          169 static void
          170 xml_parsecdata(XMLParser *x)
          171 {
          172         size_t datalen = 0, i = 0;
          173         int c;
          174 
          175         if (x->xmlcdatastart)
          176                 x->xmlcdatastart(x);
          177         while ((c = GETNEXT()) != EOF) {
          178                 if (c == ']' || c == '>') {
          179                         if (x->xmlcdata && datalen) {
          180                                 x->data[datalen] = '\0';
          181                                 x->xmlcdata(x, x->data, datalen);
          182                                 datalen = 0;
          183                         }
          184                 }
          185 
          186                 if (c == ']') {
          187                         if (++i > 2) {
          188                                 if (x->xmlcdata)
          189                                         for (; i > 2; i--)
          190                                                 x->xmlcdata(x, "]", 1);
          191                                 i = 2;
          192                         }
          193                         continue;
          194                 } else if (c == '>' && i == 2) {
          195                         if (x->xmlcdataend)
          196                                 x->xmlcdataend(x);
          197                         return;
          198                 } else if (i) {
          199                         if (x->xmlcdata)
          200                                 for (; i > 0; i--)
          201                                         x->xmlcdata(x, "]", 1);
          202                         i = 0;
          203                 }
          204 
          205                 if (datalen < sizeof(x->data) - 1) {
          206                         x->data[datalen++] = c;
          207                 } else {
          208                         x->data[datalen] = '\0';
          209                         if (x->xmlcdata)
          210                                 x->xmlcdata(x, x->data, datalen);
          211                         x->data[0] = c;
          212                         datalen = 1;
          213                 }
          214         }
          215 }
          216 
          217 static int
          218 codepointtoutf8(long r, char *s)
          219 {
          220         if (r == 0) {
          221                 return 0; /* NUL byte */
          222         } else if (r <= 0x7F) {
          223                 /* 1 byte: 0aaaaaaa */
          224                 s[0] = r;
          225                 return 1;
          226         } else if (r <= 0x07FF) {
          227                 /* 2 bytes: 00000aaa aabbbbbb */
          228                 s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
          229                 s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
          230                 return 2;
          231         } else if (r <= 0xFFFF) {
          232                 /* 3 bytes: aaaabbbb bbcccccc */
          233                 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
          234                 s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
          235                 s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
          236                 return 3;
          237         } else {
          238                 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
          239                 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
          240                 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
          241                 s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
          242                 s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
          243                 return 4;
          244         }
          245 }
          246 
          247 static int
          248 namedentitytostr(const char *e, char *buf, size_t bufsiz)
          249 {
          250         static const struct {
          251                 const char *entity;
          252                 int c;
          253         } entities[] = {
          254                 { "amp;",  '&'  },
          255                 { "lt;",   '<'  },
          256                 { "gt;",   '>'  },
          257                 { "apos;", '\'' },
          258                 { "quot;", '"'  },
          259         };
          260         size_t i;
          261 
          262         /* buffer is too small */
          263         if (bufsiz < 2)
          264                 return -1;
          265 
          266         for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
          267                 if (!strcmp(e, entities[i].entity)) {
          268                         buf[0] = entities[i].c;
          269                         buf[1] = '\0';
          270                         return 1;
          271                 }
          272         }
          273         return -1;
          274 }
          275 
          276 static int
          277 numericentitytostr(const char *e, char *buf, size_t bufsiz)
          278 {
          279         long l;
          280         int base, len;
          281         const char *s;
          282         char *end;
          283 
          284         /* buffer is too small */
          285         if (bufsiz < 5)
          286                 return -1;
          287 
          288         /* hex (base 16) or decimal (base 10) */
          289         if (*e == 'x') {
          290                 e++;
          291                 for (s = e; *s && *s != ';'; s++) {
          292                         if (!ISXDIGIT((unsigned char)*s))
          293                                 return -1; /* invalid: no hex */
          294                 }
          295                 base = 16;
          296 
          297         } else {
          298                 for (s = e; *s && *s != ';'; s++) {
          299                         if (!ISDIGIT((unsigned char)*s))
          300                                 return -1; /* invalid: no digits */
          301                 }
          302                 base = 10;
          303         }
          304         if (*s != ';' || *(s + 1) != '\0')
          305                 return -1; /* must end with ';' NUL */
          306 
          307         errno = 0;
          308         l = strtol(e, &end, base);
          309 
          310         /* invalid value or not a well-formed entity or invalid code point */
          311         if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
          312             (l >= 0xd800 && l <= 0xdfff)) /* surrogate range */
          313                 return -1;
          314         len = codepointtoutf8(l, buf);
          315         buf[len] = '\0';
          316 
          317         return len;
          318 }
          319 
          320 /* convert named- or numeric entity string to buffer string
          321  * returns byte-length of string or -1 on failure. */
          322 int
          323 xml_entitytostr(const char *e, char *buf, size_t bufsiz)
          324 {
          325         /* doesn't start with & */
          326         if (e[0] != '&')
          327                 return -1;
          328         /* numeric entity */
          329         if (e[1] == '#')
          330                 return numericentitytostr(e + 2, buf, bufsiz);
          331         else /* named entity */
          332                 return namedentitytostr(e + 1, buf, bufsiz);
          333 }
          334 
          335 void
          336 xml_parse(XMLParser *x)
          337 {
          338         size_t datalen, tagdatalen;
          339         int c, isend;
          340 
          341         while ((c = GETNEXT()) != EOF && c != '<')
          342                 ; /* skip until < */
          343 
          344         while (c != EOF) {
          345                 if (c == '<') { /* parse tag */
          346                         if ((c = GETNEXT()) == EOF)
          347                                 return;
          348 
          349                         if (c == '!') { /* CDATA and comments */
          350                                 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
          351                                         /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
          352                                         if (tagdatalen <= sizeof("[CDATA[") - 1)
          353                                                 x->data[tagdatalen++] = c;
          354                                         if (c == '>')
          355                                                 break;
          356                                         else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
          357                                                         (x->data[0] == '-')) {
          358                                                 xml_parsecomment(x);
          359                                                 break;
          360                                         } else if (c == '[') {
          361                                                 if (tagdatalen == sizeof("[CDATA[") - 1 &&
          362                                                     !strncmp(x->data, "[CDATA[", tagdatalen)) {
          363                                                         xml_parsecdata(x);
          364                                                         break;
          365                                                 }
          366                                         }
          367                                 }
          368                         } else {
          369                                 /* normal tag (open, short open, close), processing instruction. */
          370                                 x->tag[0] = c;
          371                                 x->taglen = 1;
          372                                 x->isshorttag = isend = 0;
          373 
          374                                 /* treat processing instruction as short tag, don't strip "?" prefix. */
          375                                 if (c == '?') {
          376                                         x->isshorttag = 1;
          377                                 } else if (c == '/') {
          378                                         if ((c = GETNEXT()) == EOF)
          379                                                 return;
          380                                         x->tag[0] = c;
          381                                         isend = 1;
          382                                 }
          383 
          384                                 while ((c = GETNEXT()) != EOF) {
          385                                         if (c == '/')
          386                                                 x->isshorttag = 1; /* short tag */
          387                                         else if (c == '>' || ISSPACE(c)) {
          388                                                 x->tag[x->taglen] = '\0';
          389                                                 if (isend) { /* end tag, starts with </ */
          390                                                         while (c != '>' && c != EOF) /* skip until > */
          391                                                                 c = GETNEXT();
          392                                                         if (x->xmltagend)
          393                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          394                                                         x->tag[0] = '\0';
          395                                                         x->taglen = 0;
          396                                                 } else {
          397                                                         /* start tag */
          398                                                         if (x->xmltagstart)
          399                                                                 x->xmltagstart(x, x->tag, x->taglen);
          400                                                         if (ISSPACE(c))
          401                                                                 xml_parseattrs(x);
          402                                                         if (x->xmltagstartparsed)
          403                                                                 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
          404                                                 }
          405                                                 /* call tagend for short tag or processing instruction */
          406                                                 if (x->isshorttag) {
          407                                                         if (x->xmltagend)
          408                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          409                                                         x->tag[0] = '\0';
          410                                                         x->taglen = 0;
          411                                                 }
          412                                                 break;
          413                                         } else if (x->taglen < sizeof(x->tag) - 1)
          414                                                 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
          415                                 }
          416                         }
          417                 } else {
          418                         /* parse tag data */
          419                         datalen = 0;
          420                         if (x->xmldatastart)
          421                                 x->xmldatastart(x);
          422                         while ((c = GETNEXT()) != EOF) {
          423                                 if (c == '&') { /* entities */
          424                                         if (datalen) {
          425                                                 x->data[datalen] = '\0';
          426                                                 if (x->xmldata)
          427                                                         x->xmldata(x, x->data, datalen);
          428                                         }
          429                                         x->data[0] = c;
          430                                         datalen = 1;
          431                                         while ((c = GETNEXT()) != EOF) {
          432                                                 if (c == '<')
          433                                                         break;
          434                                                 if (datalen < sizeof(x->data) - 1)
          435                                                         x->data[datalen++] = c;
          436                                                 else {
          437                                                         /* entity too long for buffer, handle as normal data */
          438                                                         x->data[datalen] = '\0';
          439                                                         if (x->xmldata)
          440                                                                 x->xmldata(x, x->data, datalen);
          441                                                         x->data[0] = c;
          442                                                         datalen = 1;
          443                                                         break;
          444                                                 }
          445                                                 if (c == ';') {
          446                                                         x->data[datalen] = '\0';
          447                                                         if (x->xmldataentity)
          448                                                                 x->xmldataentity(x, x->data, datalen);
          449                                                         datalen = 0;
          450                                                         break;
          451                                                 }
          452                                         }
          453                                 } else if (c != '<') {
          454                                         if (datalen < sizeof(x->data) - 1) {
          455                                                 x->data[datalen++] = c;
          456                                         } else {
          457                                                 x->data[datalen] = '\0';
          458                                                 if (x->xmldata)
          459                                                         x->xmldata(x, x->data, datalen);
          460                                                 x->data[0] = c;
          461                                                 datalen = 1;
          462                                         }
          463                                 }
          464                                 if (c == '<') {
          465                                         x->data[datalen] = '\0';
          466                                         if (x->xmldata && datalen)
          467                                                 x->xmldata(x, x->data, datalen);
          468                                         if (x->xmldataend)
          469                                                 x->xmldataend(x);
          470                                         break;
          471                                 }
          472                         }
          473                 }
          474         }
          475 }