URI:
       thtml.c - plan9port - [fork] Plan 9 from user space
  HTML git clone git://src.adamsgaard.dk/plan9port
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       thtml.c (7691B)
       ---
            1 #include <u.h>
            2 #include <libc.h>
            3 #include <bio.h>
            4 #include "hdr.h"
            5 #include "conv.h"
            6 
            7 typedef struct Hchar Hchar;
            8 struct Hchar
            9 {
           10         char *s;
           11         Rune r;
           12 };
           13 
           /*
            * &lt;, &gt;, &quot; and &amp; are recognized on input but are never
            * generated on output: html_out writes runes below Runeself as-is.
            */
           15 
           16 /*
           17  * Names beginning with _ are names we recognize
           18  * (without the underscore) but will not generate,
           19  * because they are nonstandard.
           20  */
           21 static Hchar byname[] =
           22 {
           23         {"AElig", 198},
           24         {"Aacute", 193},
           25         {"Acirc", 194},
           26         {"Agrave", 192},
           27         {"Alpha", 913},
           28         {"Aring", 197},
           29         {"Atilde", 195},
           30         {"Auml", 196},
           31         {"Beta", 914},
           32         {"Ccedil", 199},
           33         {"Chi", 935},
           34         {"Dagger", 8225},
           35         {"Delta", 916},
           36         {"ETH", 208},
           37         {"Eacute", 201},
           38         {"Ecirc", 202},
           39         {"Egrave", 200},
           40         {"Epsilon", 917},
           41         {"Eta", 919},
           42         {"Euml", 203},
           43         {"Gamma", 915},
           44         {"Iacute", 205},
           45         {"Icirc", 206},
           46         {"Igrave", 204},
           47         {"Iota", 921},
           48         {"Iuml", 207},
           49         {"Kappa", 922},
           50         {"Lambda", 923},
           51         {"Mu", 924},
           52         {"Ntilde", 209},
           53         {"Nu", 925},
           54         {"OElig", 338},
           55         {"Oacute", 211},
           56         {"Ocirc", 212},
           57         {"Ograve", 210},
           58         {"Omega", 937},
           59         {"Omicron", 927},
           60         {"Oslash", 216},
           61         {"Otilde", 213},
           62         {"Ouml", 214},
           63         {"Phi", 934},
           64         {"Pi", 928},
           65         {"Prime", 8243},
           66         {"Psi", 936},
           67         {"Rho", 929},
           68         {"Scaron", 352},
           69         {"Sigma", 931},
           70         {"THORN", 222},
           71         {"Tau", 932},
           72         {"Theta", 920},
           73         {"Uacute", 218},
           74         {"Ucirc", 219},
           75         {"Ugrave", 217},
           76         {"Upsilon", 933},
           77         {"Uuml", 220},
           78         {"Xi", 926},
           79         {"Yacute", 221},
           80         {"Yuml", 376},
           81         {"Zeta", 918},
           82         {"aacute", 225},
           83         {"acirc", 226},
           84         {"acute", 180},
           85         {"aelig", 230},
           86         {"agrave", 224},
           87         {"alefsym", 8501},
           88         {"alpha", 945},
           89         {"amp", 38},
           90         {"and", 8743},
           91         {"ang", 8736},
           92         {"aring", 229},
           93         {"asymp", 8776},
           94         {"atilde", 227},
           95         {"auml", 228},
           96         {"bdquo", 8222},
           97         {"beta", 946},
           98         {"brvbar", 166},
           99         {"bull", 8226},
          100         {"cap", 8745},
          101         {"ccedil", 231},
          102         {"cdots", 8943},
          103         {"cedil", 184},
          104         {"cent", 162},
          105         {"chi", 967},
          106         {"circ", 710},
          107         {"clubs", 9827},
          108         {"cong", 8773},
          109         {"copy", 169},
          110         {"crarr", 8629},
          111         {"cup", 8746},
          112         {"curren", 164},
          113         {"dArr", 8659},
          114         {"dagger", 8224},
          115         {"darr", 8595},
          116         {"ddots", 8945},
          117         {"deg", 176},
          118         {"delta", 948},
          119         {"diams", 9830},
          120         {"divide", 247},
          121         {"eacute", 233},
          122         {"ecirc", 234},
          123         {"egrave", 232},
          124         {"_emdash", 8212},        /* non-standard but commonly used */
          125         {"empty", 8709},
          126         {"emsp", 8195},
          127         {"_endash", 8211},        /* non-standard but commonly used */
          128         {"ensp", 8194},
          129         {"epsilon", 949},
          130         {"equiv", 8801},
          131         {"eta", 951},
          132         {"eth", 240},
          133         {"euml", 235},
          134         {"euro", 8364},
          135         {"exist", 8707},
          136         {"fnof", 402},
          137         {"forall", 8704},
          138         {"frac12", 189},
          139         {"frac14", 188},
          140         {"frac34", 190},
          141         {"frasl", 8260},
          142         {"gamma", 947},
          143         {"ge", 8805},
          144         {"gt", 62},
          145         {"hArr", 8660},
          146         {"harr", 8596},
          147         {"hearts", 9829},
          148         {"hellip", 8230},
          149         {"iacute", 237},
          150         {"icirc", 238},
          151         {"iexcl", 161},
          152         {"igrave", 236},
          153         {"image", 8465},
          154         {"infin", 8734},
          155         {"int", 8747},
          156         {"iota", 953},
          157         {"iquest", 191},
          158         {"isin", 8712},
          159         {"iuml", 239},
          160         {"kappa", 954},
          161         {"lArr", 8656},
          162         {"lambda", 955},
          163         {"lang", 9001},
          164         {"laquo", 171},
          165         {"larr", 8592},
          166         {"lceil", 8968},
          167         {"_ldots", 8230},
          168         {"ldquo", 8220},
          169         {"le", 8804},
          170         {"lfloor", 8970},
          171         {"lowast", 8727},
          172         {"loz", 9674},
          173         {"lrm", 8206},
          174         {"lsaquo", 8249},
          175         {"lsquo", 8216},
          176         {"lt", 60},
          177         {"macr", 175},
          178         {"mdash", 8212},
          179         {"micro", 181},
          180         {"middot", 183},
          181         {"minus", 8722},
          182         {"mu", 956},
          183         {"nabla", 8711},
          184         {"nbsp", 160},
          185         {"ndash", 8211},
          186         {"ne", 8800},
          187         {"ni", 8715},
          188         {"not", 172},
          189         {"notin", 8713},
          190         {"nsub", 8836},
          191         {"ntilde", 241},
          192         {"nu", 957},
          193         {"oacute", 243},
          194         {"ocirc", 244},
          195         {"oelig", 339},
          196         {"ograve", 242},
          197         {"oline", 8254},
          198         {"omega", 969},
          199         {"omicron", 959},
          200         {"oplus", 8853},
          201         {"or", 8744},
          202         {"ordf", 170},
          203         {"ordm", 186},
          204         {"oslash", 248},
          205         {"otilde", 245},
          206         {"otimes", 8855},
          207         {"ouml", 246},
          208         {"para", 182},
          209         {"part", 8706},
          210         {"permil", 8240},
          211         {"perp", 8869},
          212         {"phi", 966},
          213         {"pi", 960},
          214         {"piv", 982},
          215         {"plusmn", 177},
          216         {"pound", 163},
          217         {"prime", 8242},
          218         {"prod", 8719},
          219         {"prop", 8733},
          220         {"psi", 968},
          221         {"quad", 8193},
          222         {"quot", 34},
          223         {"rArr", 8658},
          224         {"radic", 8730},
          225         {"rang", 9002},
          226         {"raquo", 187},
          227         {"rarr", 8594},
          228         {"rceil", 8969},
          229         {"rdquo", 8221},
          230         {"real", 8476},
          231         {"reg", 174},
          232         {"rfloor", 8971},
          233         {"rho", 961},
          234         {"rlm", 8207},
          235         {"rsaquo", 8250},
          236         {"rsquo", 8217},
          237         {"sbquo", 8218},
          238         {"scaron", 353},
          239         {"sdot", 8901},
          240         {"sect", 167},
          241         {"shy", 173},
          242         {"sigma", 963},
          243         {"sigmaf", 962},
          244         {"sim", 8764},
          245         {"_sp", 8194},
          246         {"spades", 9824},
          247         {"sub", 8834},
          248         {"sube", 8838},
          249         {"sum", 8721},
          250         {"sup", 8835},
          251         {"sup1", 185},
          252         {"sup2", 178},
          253         {"sup3", 179},
          254         {"supe", 8839},
          255         {"szlig", 223},
          256         {"tau", 964},
          257         {"there4", 8756},
          258         {"theta", 952},
          259         {"thetasym", 977},
          260         {"thinsp", 8201},
          261         {"thorn", 254},
          262         {"tilde", 732},
          263         {"times", 215},
          264         {"trade", 8482},
          265         {"uArr", 8657},
          266         {"uacute", 250},
          267         {"uarr", 8593},
          268         {"ucirc", 251},
          269         {"ugrave", 249},
          270         {"uml", 168},
          271         {"upsih", 978},
          272         {"upsilon", 965},
          273         {"uuml", 252},
          274         {"_varepsilon", 8712},
          275         {"varphi", 981},
          276         {"_varpi", 982},
          277         {"varrho", 1009},
          278         {"vdots", 8942},
          279         {"_vsigma", 962},
          280         {"_vtheta", 977},
          281         {"weierp", 8472},
          282         {"xi", 958},
          283         {"yacute", 253},
          284         {"yen", 165},
          285         {"yuml", 255},
          286         {"zeta", 950},
          287         {"zwj", 8205},
          288         {"zwnj", 8204}
          289 };
          290 
          291 static Hchar byrune[nelem(byname)];
          292 
          293 static int
          294 hnamecmp(const void *va, const void *vb)
          295 {
          296         Hchar *a, *b;
          297 
          298         a = (Hchar*)va;
          299         b = (Hchar*)vb;
          300         return strcmp(a->s, b->s);
          301 }
          302 
          303 static int
          304 hrunecmp(const void *va, const void *vb)
          305 {
          306         Hchar *a, *b;
          307 
          308         a = (Hchar*)va;
          309         b = (Hchar*)vb;
          310         return a->r - b->r;
          311 }
          312 
          313 static void
          314 html_init(void)
          315 {
          316         static int init;
          317         int i;
          318 
          319         if(init)
          320                 return;
          321         init = 1;
          322         memmove(byrune, byname, sizeof byrune);
          323 
          324         /* Eliminate names we aren't allowed to generate. */
          325         for(i=0; i<nelem(byrune); i++){
          326                 if(byrune[i].s[0] == '_'){
          327                         byrune[i].r = Runeerror;
          328                         byname[i].s++;
          329                 }
          330         }
          331 
          332         qsort(byname, nelem(byname), sizeof byname[0], hnamecmp);
          333         qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp);
          334 }
          335 
          336 static Rune
          337 findbyname(char *s)
          338 {
          339         Hchar *h;
          340         int n, m, x;
          341 
          342         h = byname;
          343         n = nelem(byname);
          344         while(n > 0){
          345                 m = n/2;
          346                 x = strcmp(h[m].s, s);
          347                 if(x == 0)
          348                         return h[m].r;
          349                 if(x < 0){
          350                         h += m+1;
          351                         n -= m+1;
          352                 }else
          353                         n = m;
          354         }
          355         return Runeerror;
          356 }
          357 
          358 static char*
          359 findbyrune(Rune r)
          360 {
          361         Hchar *h;
          362         int n, m;
          363 
          364         if(r == Runeerror)
          365                 return nil;
          366         h = byrune;
          367         n = nelem(byrune);
          368         while(n > 0){
          369                 m = n/2;
          370                 if(h[m].r == r)
          371                         return h[m].s;
          372                 if(h[m].r < r){
          373                         h += m+1;
          374                         n -= m+1;
          375                 }else
          376                         n = m;
          377         }
          378         return nil;
          379 }
          380 
          381 void
          382 html_in(int fd, long *x, struct convert *out)
          383 {
          384         char buf[100], *p;
          385         Biobuf b;
          386         Rune rbuf[N];
          387         Rune *r, *er;
          388         int c, i;
          389 
          390         USED(x);
          391 
          392         html_init();
          393         r = rbuf;
          394         er = rbuf+N;
          395         Binit(&b, fd, OREAD);
          396         while((c = Bgetrune(&b)) != Beof){
          397                 if(r >= er){
          398                         OUT(out, rbuf, r-rbuf);
          399                         r = rbuf;
          400                 }
          401                 if(c == '&'){
          402                         buf[0] = c;
          403                         for(i=1; i<nelem(buf)-1;){
          404                                 c = Bgetc(&b);
          405                                 if(c == Beof)
          406                                         break;
          407                                 buf[i++] = c;
          408                                 if(strchr("; \t\r\n", c))
          409                                         break;
          410                         }
          411                         buf[i] = 0;
          412                         if(buf[i-1] == ';'){
          413                                 buf[i-1] = 0;
          414                                 if((c = findbyname(buf+1)) != Runeerror){
          415                                         *r++ = c;
          416                                         continue;
          417                                 }
          418                                 buf[i-1] = ';';
          419                                 if(buf[1] == '#'){
          420                                         if(buf[2] == 'x')
          421                                                 c = strtol(buf+3, &p, 16);
          422                                         else
          423                                                 c = strtol(buf+2, &p, 10);
          424                                         if(*p != ';' || c >= NRUNE || c < 0)
          425                                                 goto bad;
          426                                         *r++ = c;
          427                                         continue;
          428                                 }
          429                         }
          430                 bad:
          431                         for(p=buf; p<buf+i; ){
          432                                 p += chartorune(r++, p);
          433                                 if(r >= er){
          434                                         OUT(out, rbuf, r-rbuf);
          435                                         r = rbuf;
          436                                 }
          437                         }
          438                         continue;
          439                 }
          440                 *r++ = c;
          441         }
          442         if(r > rbuf)
          443                 OUT(out, rbuf, r-rbuf);
          444         OUT(out, rbuf, 0);
          445 }
          446 
          447 /*
          448  * use biobuf because can use more than UTFmax bytes per rune
          449  */
          450 void
          451 html_out(Rune *r, int n, long *x)
          452 {
          453         char *s;
          454         Biobuf b;
          455         Rune *er;
          456 
          457         USED(x);
          458         html_init();
          459         Binit(&b, 1, OWRITE);
          460         er = r+n;
          461         for(; r<er; r++){
          462                 if(*r < Runeself)
          463                         Bputrune(&b, *r);
          464                 else if((s = findbyrune(*r)) != nil)
          465                         Bprint(&b, "&%s;", s);
          466                 else
          467                         Bprint(&b, "&#%d;", *r);
          468         }
          469         Bflush(&b);
          470 }