URI:
       tlex.c - plan9port - [fork] Plan 9 from user space
  HTML git clone git://src.adamsgaard.dk/plan9port
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       tlex.c (26465B)
       ---
            1 #include <u.h>
            2 #include <libc.h>
            3 #include <draw.h>
            4 #include <ctype.h>
            5 #include <html.h>
            6 #include "impl.h"
            7 
            8 typedef struct TokenSource TokenSource;
            9 struct TokenSource
           10 {
           11         int                        i;                /* index of next byte to use */
           12         uchar*                data;                /* all the data */
           13         int                        edata;        /* data[0:edata] is valid */
           14         int                        chset;        /* one of US_Ascii, etc. */
           15         int                        mtype;        /* TextHtml or TextPlain */
           16 };
           17 
           /* Negative sentinel values.  NOTE(review): presumably handed back in */
           /* place of a character at end of input/buffer -- confirm in getchar. */
           enum {
                   EOB = -1,
                   EOF = -2
           };
           22 
           /* True when c is a name character: alphabetic or digit (per <ctype.h>), */
           /* '-' or '.'.  The range test comes first so that the <ctype.h> calls */
           /* never see a negative sentinel or a value above 255, for which their */
           /* behavior is undefined.  Note that c is evaluated more than once. */
           #define ISNAMCHAR(c)        ((c) >= 0 && (c) < 256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))
           24 
           25 #define SMALLBUFSIZE 240        /* NOTE(review): presumably a smaller scratch size; confirm at its use sites */
           26 #define BIGBUFSIZE 2000        /* Rune capacity of the stack buffers in getplaindata, getdata, getscriptdata and gettag */
           27 
           28 /* HTML 4.0 tag names. */
           29 /* Keep sorted, and in correspondence with enum in iparse.h. */
           30 Rune **tagnames;
           /* ASCII spellings of the tag names, one row per initial letter. */
           /* The order of entries is significant and must stay sorted; lexinit */
           /* converts this array into tagnames. */
           char *_tagnames[] = {
                   " ", "!",
                   "a", "abbr", "acronym", "address", "applet", "area",
                   "b", "base", "basefont", "bdo", "big", "blink", "blockquote", "body", "bq", "br", "button",
                   "caption", "center", "cite", "code", "col", "colgroup",
                   "dd", "del", "dfn", "dir", "div", "dl", "dt",
                   "em",
                   "fieldset", "font", "form", "frame", "frameset",
                   "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html",
                   "i", "iframe", "img", "input", "ins", "isindex",
                   "kbd",
                   "label", "legend", "li", "link",
                   "map", "menu", "meta",
                   "nobr", "noframes", "noscript",
                   "object", "ol", "optgroup", "option",
                   "p", "param", "pre",
                   "q",
                   "s", "samp", "script", "select", "small", "span", "strike", "strong", "style", "sub", "sup",
                   "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title", "tr", "tt",
                   "u", "ul",
                   "var"
           };
          129 
          130 /* HTML 4.0 attribute names. */
          131 /* Keep sorted, and in correspondence with enum in i.h. */
          132 Rune **attrnames;
          /* ASCII spellings of the attribute names, one row per initial letter. */
          /* The order of entries is significant and must stay sorted; lexinit */
          /* converts this array into attrnames. */
          /* The third entry is spelled "accesskey", as HTML 4 defines it; the */
          /* earlier spelling "access-key" never matched a real attribute.  Its */
          /* position, and therefore its index, is unchanged. */
          char* _attrnames[] = {
                  "abbr", "accept-charset", "accesskey", "action", "align", "alink", "alt", "archive", "axis",
                  "background", "bgcolor", "border",
                  "cellpadding", "cellspacing", "char", "charoff", "charset", "checked", "cite",
                  "class", "classid", "clear", "code", "codebase", "codetype", "color", "cols",
                  "colspan", "compact", "content", "coords",
                  "data", "datetime", "declare", "defer", "dir", "disabled",
                  "enctype",
                  "face", "for", "frame", "frameborder",
                  "headers", "height", "href", "hreflang", "hspace", "http-equiv",
                  "id", "ismap",
                  "label", "lang", "link", "longdesc",
                  "marginheight", "marginwidth", "maxlength", "media", "method", "multiple",
                  "name", "nohref", "noresize", "noshade", "nowrap",
                  "object", "onblur", "onchange", "onclick", "ondblclick", "onfocus",
                  "onkeypress", "onkeyup", "onload", "onmousedown", "onmousemove", "onmouseout",
                  "onmouseover", "onmouseup", "onreset", "onselect", "onsubmit", "onunload",
                  "profile", "prompt",
                  "readonly", "rel", "rev", "rows", "rowspan", "rules",
                  "scheme", "scope", "scrolling", "selected", "shape", "size", "span", "src",
                  "standby", "start", "style", "summary",
                  "tabindex", "target", "text", "title", "type",
                  "usemap",
                  "valign", "value", "valuetype", "version", "vlink", "vspace",
                  "width"
          };
          251 
          252 
          253 /* Character entity to unicode character number map. */
          254 /* Keep sorted by name. */
          255 StringInt *chartab;
          256 AsciiInt _chartab[] = {
          257         {"AElig", 198},
          258         {"Aacute", 193},
          259         {"Acirc", 194},
          260         {"Agrave", 192},
          261         {"Aring", 197},
          262         {"Atilde", 195},
          263         {"Auml", 196},
          264         {"Ccedil", 199},
          265         {"ETH", 208},
          266         {"Eacute", 201},
          267         {"Ecirc", 202},
          268         {"Egrave", 200},
          269         {"Euml", 203},
          270         {"Iacute", 205},
          271         {"Icirc", 206},
          272         {"Igrave", 204},
          273         {"Iuml", 207},
          274         {"Ntilde", 209},
          275         {"Oacute", 211},
          276         {"Ocirc", 212},
          277         {"Ograve", 210},
          278         {"Oslash", 216},
          279         {"Otilde", 213},
          280         {"Ouml", 214},
          281         {"THORN", 222},
          282         {"Uacute", 218},
          283         {"Ucirc", 219},
          284         {"Ugrave", 217},
          285         {"Uuml", 220},
          286         {"Yacute", 221},
          287         {"aacute", 225},
          288         {"acirc", 226},
          289         {"acute", 180},
          290         {"aelig", 230},
          291         {"agrave", 224},
          292         {"alpha", 945},
          293         {"amp", 38},
          294         {"aring", 229},
          295         {"atilde", 227},
          296         {"auml", 228},
          297         {"beta", 946},
          298         {"brvbar", 166},
          299         {"ccedil", 231},
          300         {"cdots", 8943},
          301         {"cedil", 184},
          302         {"cent", 162},
          303         {"chi", 967},
          304         {"copy", 169},
          305         {"curren", 164},
          306         {"ddots", 8945},
          307         {"deg", 176},
          308         {"delta", 948},
          309         {"divide", 247},
          310         {"eacute", 233},
          311         {"ecirc", 234},
          312         {"egrave", 232},
          313         {"emdash", 8212},        /* non-standard but commonly used */
          314         {"emsp", 8195},
          315         {"endash", 8211},        /* non-standard but commonly used */
          316         {"ensp", 8194},
          317         {"epsilon", 949},
          318         {"eta", 951},
          319         {"eth", 240},
          320         {"euml", 235},
          321         {"frac12", 189},
          322         {"frac14", 188},
          323         {"frac34", 190},
          324         {"gamma", 947},
          325         {"gt", 62},
          326         {"iacute", 237},
          327         {"icirc", 238},
          328         {"iexcl", 161},
          329         {"igrave", 236},
          330         {"iota", 953},
          331         {"iquest", 191},
          332         {"iuml", 239},
          333         {"kappa", 954},
          334         {"lambda", 955},
          335         {"laquo", 171},
          336         {"ldquo", 8220},
          337         {"ldots", 8230},
          338         {"lsquo", 8216},
          339         {"lt", 60},
          340         {"macr", 175},
          341         {"mdash", 8212},
          342         {"micro", 181},
          343         {"middot", 183},
          344         {"mu", 956},
          345         {"nbsp", 160},
          346         {"ndash", 8211},
          347         {"not", 172},
          348         {"ntilde", 241},
          349         {"nu", 957},
          350         {"oacute", 243},
          351         {"ocirc", 244},
          352         {"ograve", 242},
          353         {"omega", 969},
          354         {"omicron", 959},
          355         {"ordf", 170},
          356         {"ordm", 186},
          357         {"oslash", 248},
          358         {"otilde", 245},
          359         {"ouml", 246},
          360         {"para", 182},
          361         {"phi", 966},
          362         {"pi", 960},
          363         {"plusmn", 177},
          364         {"pound", 163},
          365         {"psi", 968},
          366         {"quad", 8193},
          367         {"quot", 34},
          368         {"raquo", 187},
          369         {"rdquo", 8221},
          370         {"reg", 174},
          371         {"rho", 961},
          372         {"rsquo", 8217},
          373         {"sect", 167},
          374         {"shy", 173},
          375         {"sigma", 963},
          376         {"sp", 8194},
          377         {"sup1", 185},
          378         {"sup2", 178},
          379         {"sup3", 179},
          380         {"szlig", 223},
          381         {"tau", 964},
          382         {"theta", 952},
          383         {"thinsp", 8201},
          384         {"thorn", 254},
          385         {"times", 215},
          386         {"trade", 8482},
          387         {"uacute", 250},
          388         {"ucirc", 251},
          389         {"ugrave", 249},
          390         {"uml", 168},
          391         {"upsilon", 965},
          392         {"uuml", 252},
          393         {"varepsilon", 8712},
          394         {"varphi", 981},
          395         {"varpi", 982},
          396         {"varrho", 1009},
          397         {"vdots", 8942},
          398         {"vsigma", 962},
          399         {"vtheta", 977},
          400         {"xi", 958},
          401         {"yacute", 253},
          402         {"yen", 165},
          403         {"yuml", 255},
          404         {"zeta", 950}
          405 };
          406 #define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0]))
          407 
          408 /* Characters Winstart..Winend are those that Windows */
          409 /* uses interpolated into the Latin1 set. */
          410 /* They aren't supposed to appear in HTML, but they do.... */
          enum {
                  Winstart = 127,
                  Winend = 159
          };

          /* Unicode replacements for the bytes Winstart..Winend, one entry per byte */
          /* (Winend - Winstart + 1 entries).  NOTE(review): presumably indexed with */
          /* c - Winstart by the character reader, which is outside this view. */
          /* 8226 (a bullet) stands in for bytes that Windows-1252 leaves unassigned. */
          /* 128, 142 and 158 carry the current Windows-1252 assignments: */
          /* the euro sign and Z/z with caron. */
          static int        winchars[] = {
                  8226,                                                        /* 127 */
                  8364, 8226, 8218,  402, 8222, 8230, 8224, 8225,        /* 128-135 */
                   710, 8240,  352, 8249,  338, 8226,  381, 8226,        /* 136-143 */
                  8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,        /* 144-151 */
                   732, 8482,  353, 8250,  339, 8226,  382,  376         /* 152-159 */
          };
          421 
          422 static StringInt*        tagtable;                /* initialized from tagnames */
          423 static StringInt*        attrtable;                /* initialized from attrnames */
          424 
              /* Forward declarations of this file's static helpers. */
          425 static void                lexinit(void);
          426 static int                getplaindata(TokenSource* ts, Token* a, int* pai);
          427 static int                getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
          428 static int                getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
          429 static int                gettag(TokenSource* ts, int starti, Token* a, int* pai);
          430 static Rune*                buftostr(Rune* s, Rune* buf, int j);
          431 static int                comment(TokenSource* ts);
          432 static int                findstr(TokenSource* ts, Rune* s);
          433 static int                ampersand(TokenSource* ts);
          434 /*static int                lowerc(int c); */
          435 static int                getchar(TokenSource* ts);
          436 static void                ungetchar(TokenSource* ts, int c);
          437 static void                backup(TokenSource* ts, int savei);
          438 /*static void                freeinsidetoken(Token* t); */
          439 static void                freeattrs(Attr* ahead);
          440 static Attr*                newattr(int attid, Rune* value, Attr* link);
          441 static int                Tconv(Fmt* f);
          442 
          443 int        dbglex = 0;        /* debug level: nonzero makes _gettoks log its start and token count, >1 also logs tokens */
          444 static int lexinited = 0;        /* set to 1 by lexinit(); _gettoks calls lexinit() while it is 0 */
          445 
          446 static void
          447 lexinit(void)
          448 {
          449         chartab = _cvtstringinttab(_chartab, nelem(_chartab));
          450         tagnames = _cvtstringtab(_tagnames, nelem(_tagnames));
          451         tagtable = _makestrinttab(tagnames, Numtags);
          452         attrnames = _cvtstringtab(_attrnames, nelem(_attrnames));
          453         attrtable = _makestrinttab(attrnames, Numattrs);
          454         fmtinstall('T', Tconv);
          455         lexinited = 1;
          456 }
          457 
          458 static TokenSource*
          459 newtokensource(uchar* data, int edata, int chset, int mtype)
          460 {
          461         TokenSource*        ans;
          462 
          463         assert(chset == US_Ascii || chset == ISO_8859_1 ||
          464                         chset == UTF_8 || chset == Unicode);
          465         ans = (TokenSource*)emalloc(sizeof(TokenSource));
          466         ans->i = 0;
          467         ans->data = data;
          468         ans->edata = edata;
          469         ans->chset = chset;
          470         ans->mtype = mtype;
          471         return ans;
          472 }
          473 
          474 enum {
          475         ToksChunk = 500
          476 };
          477 
          478 /* Call this to get the tokens. */
          479 /*  The number of returned tokens is returned in *plen. */
          480 Token*
          481 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
          482 {
          483         TokenSource*        ts;
          484         Token*                a;
          485         int        alen;
          486         int        ai;
          487         int        starti;
          488         int        c;
          489         int        tag;
          490 
          491         if(!lexinited)
          492                 lexinit();
          493         ts = newtokensource(data, datalen, chset, mtype);
          494         alen = ToksChunk;
          495         a = (Token*)emalloc(alen * sizeof(Token));
          496         ai = 0;
          497         if(dbglex)
          498                 fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
          499         if(ts->mtype == TextHtml){
          500                 for(;;){
          501                         if(ai == alen){
          502                                 a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
          503                                 alen += ToksChunk;
          504                         }
          505                         starti = ts->i;
          506                         c = getchar(ts);
          507                         if(c < 0)
          508                                 break;
          509                         if(c == '<'){
          510                                 tag = gettag(ts, starti, a, &ai);
          511                                 if(tag == Tscript){
          512                                         /* special rules for getting Data after.... */
          513                                         starti = ts->i;
          514                                         c = getchar(ts);
          515                                         tag = getscriptdata(ts, c, starti, a, &ai);
          516                                 }
          517                         }
          518                         else
          519                                 tag = getdata(ts, c, starti, a, &ai);
          520                         if(tag == -1)
          521                                 break;
          522                         else if(dbglex > 1 && tag != Comment)
          523                                 fprint(2, "lex: got token %T\n", &a[ai-1]);
          524                 }
          525         }
          526         else {
          527                 /* plain text (non-html) tokens */
          528                 for(;;){
          529                         if(ai == alen){
          530                                 a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
          531                                 alen += ToksChunk;
          532                         }
          533                         tag = getplaindata(ts, a, &ai);
          534                         if(tag == -1)
          535                                 break;
          536                         if(dbglex > 1)
          537                                 fprint(2, "lex: got token %T\n", &a[ai]);
          538                 }
          539         }
          540         if(dbglex)
          541                 fprint(2, "lex: returning %d tokens\n", ai);
          542         *plen = ai;
          543         free(ts);
          544         if(ai == 0) {
          545                 free(a);
          546                 return nil;
          547         }
          548         return a;
          549 }
          550 
          551 /* For case where source isn't HTML. */
          552 /* Just make data tokens, one per line (or partial line, */
          553 /* at end of buffer), ignoring non-whitespace control */
          554 /* characters and dumping \r's. */
          555 /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */
          556 /* Otherwise return -1; */
          557 static int
          558 getplaindata(TokenSource* ts, Token* a, int* pai)
          559 {
          560         Rune*        s;
          561         int        j;
          562         int        starti;
          563         int        c;
          564         Token*        tok;
          565         Rune        buf[BIGBUFSIZE];
          566 
          567         s = nil;
          568         j = 0;
          569         starti = ts->i;
          570         for(c = getchar(ts); c >= 0; c = getchar(ts)){
          571                 if(c < ' '){
          572                         if(isspace(c)){
          573                                 if(c == '\r'){
          574                                         /* ignore it unless no following '\n', */
          575                                         /* in which case treat it like '\n' */
          576                                         c = getchar(ts);
          577                                         if(c != '\n'){
          578                                                 if(c >= 0)
          579                                                         ungetchar(ts, c);
          580                                                 c = '\n';
          581                                         }
          582                                 }
          583                         }
          584                         else
          585                                 c = 0;
          586                 }
          587                 if(c != 0){
          588                         buf[j++] = c;
          589                         if(j == BIGBUFSIZE-1){
          590                                 s = buftostr(s, buf, j);
          591                                 j = 0;
          592                         }
          593                 }
          594                 if(c == '\n')
          595                         break;
          596         }
          597         s = buftostr(s, buf, j);
          598         if(s == nil)
          599                 return -1;
          600         tok = &a[(*pai)++];
          601         tok->tag = Data;
          602         tok->text = s;
          603         tok->attr = nil;
          604         tok->starti = starti;
          605         return Data;
          606 }
          607 
          608 /* Return concatenation of s and buf[0:j] */
          609 /* Frees s. */
          610 static Rune*
          611 buftostr(Rune* s, Rune* buf, int j)
          612 {
          613         Rune *tmp;
          614         buf[j] = 0;
          615         if(s == nil)
          616                 tmp = _Strndup(buf, j);
          617         else
          618                 tmp = _Strdup2(s, buf);
          619         free(s);
          620         return tmp;
          621 }
          622 
          623 /* Gather data up to next start-of-tag or end-of-buffer. */
          624 /* Translate entity references (&amp;). */
          625 /* Ignore non-whitespace control characters and get rid of \r's. */
          626 /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */
          627 /* Otherwise return -1; */
          628 static int
          629 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
          630 {
          631         Rune*        s;
          632         int        j;
          633         int        c;
          634         Token*        tok;
          635         Rune        buf[BIGBUFSIZE];
          636 
          637         s = nil;
          638         j = 0;
          639         c = firstc;
          640         while(c >= 0){
          641                 if(c == '&'){
          642                         c = ampersand(ts);
          643                         if(c < 0)
          644                                 break;
          645                 }
          646                 else if(c < ' '){
          647                         if(isspace(c)){
          648                                 if(c == '\r'){
          649                                         /* ignore it unless no following '\n', */
          650                                         /* in which case treat it like '\n' */
          651                                         c = getchar(ts);
          652                                         if(c != '\n'){
          653                                                 if(c >= 0)
          654                                                         ungetchar(ts, c);
          655                                                 c = '\n';
          656                                         }
          657                                 }
          658                         }
          659                         else {
          660                                 if(warn)
          661                                         fprint(2, "warning: non-whitespace control character %d ignored\n", c);
          662                                 c = 0;
          663                         }
          664                 }
          665                 else if(c == '<'){
          666                         ungetchar(ts, c);
          667                         break;
          668                 }
          669                 if(c != 0){
          670                         buf[j++] = c;
          671                         if(j == BIGBUFSIZE-1){
          672                                 s = buftostr(s, buf, j);
          673                                 j = 0;
          674                         }
          675                 }
          676                 c = getchar(ts);
          677         }
          678         s = buftostr(s, buf, j);
          679         if(s == nil)
          680                 return -1;
          681         tok = &a[(*pai)++];
          682         tok->tag = Data;
          683         tok->text = s;
          684         tok->attr = nil;
          685         tok->starti = starti;
          686         return Data;
          687 }
          688 
          689 /* The rules for lexing scripts are different (ugh). */
          690 /* Gather up everything until see a </SCRIPT>. */
              /* firstc is the first character of the script text and starti the value */
              /* of ts->i from before it was read. */
              /* When </SCRIPT> is found, or ts->i has reached ts->edata, the gathered */
              /* text is stored as a Data token in a[*pai], *pai is incremented and */
              /* Data is returned; the </SCRIPT> tag itself is left unread. */
              /* Otherwise ts is rewound to starti and -1 is returned. */
              /* Note that a[*pai] is also used as scratch space for the trial gettag */
              /* calls, so that slot must exist when this function is entered. */
          691 static int
          692 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
          693 {
          694         Rune*        s;
          695         int        j;
          696         int        tstarti;
          697         int        savei;
          698         int        c;
          699         int        tag;
          700         int        done;
          701         Token*        tok;
          702         Rune        buf[BIGBUFSIZE];
          703 
          704         s = nil;
          705         j = 0;
          706         tstarti = starti;        /* ts->i from before the current character c was read */
          707         c = firstc;
          708         done = 0;
          709         while(c >= 0){
          710                 if(c == '<'){
          711                         /* other browsers ignore stuff to end of line after <! */
          712                         savei = ts->i;        /* position after the '<' */
          713                         c = getchar(ts);
          714                         if(c == '!'){
                                              /* skip to the end of the line; none of the skipped text is stored */
          715                                 while(c >= 0 && c != '\n' && c != '\r')
          716                                         c = getchar(ts);
          717                                 if(c == '\r')
          718                                         c = getchar(ts);
          719                                 if(c == '\n')
          720                                         c = getchar(ts);
          721                         }
          722                         else if(c >= 0){
                                              /* re-read from just after the '<' as a tag, purely to learn its kind */
          723                                 backup(ts, savei);
          724                                 tag = gettag(ts, tstarti, a, pai);
          725                                 if(tag == -1)
          726                                         break;
                                              /* gettag bumped *pai for anything but a Comment; undo that. */
                                              /* NOTE(review): whatever gettag allocated for the dropped token */
                                              /* is not released here -- confirm whether that leaks. */
          727                                 if(tag != Comment)
          728                                         (*pai)--;
          729                                 backup(ts, tstarti);        /* rewind to the '<' */
          730                                 if(tag == Tscript + RBRA){
          731                                         done = 1;
          732                                         break;
          733                                 }
          734                                 /* here tag was not </SCRIPT>, so take as regular data */
          735                                 c = getchar(ts);        /* reads the '<' again; it is stored below */
          736                         }
          737                 }
          738                 if(c < 0)
          739                         break;
          740                 if(c != 0){
          741                         buf[j++] = c;
                                      /* flush one short of full: buftostr writes a terminator at buf[j] */
          742                         if(j == BIGBUFSIZE-1){
          743                                 s = buftostr(s, buf, j);
          744                                 j = 0;
          745                         }
          746                 }
          747                 tstarti = ts->i;
          748                 c = getchar(ts);
          749         }
          750         if(done || ts->i == ts->edata){
          751                 s = buftostr(s, buf, j);
          752                 tok = &a[(*pai)++];
          753                 tok->tag = Data;
          754                 tok->text = s;
          755                 tok->attr = nil;
          756                 tok->starti = starti;
          757                 return Data;
          758         }
                      /* stopped early with input left over: undo everything read here */
          759         backup(ts, starti);
          760         return -1;
          761 }
          762 
          763 /* We've just seen a '<'.  Gather up stuff to closing '>' (if buffer */
          764 /* ends before then, return -1). */
          765 /* If it's a tag, look up the name, gather the attributes, and return */
          766 /* the appropriate token. */
          767 /* Else it's either just plain data or some kind of ignorable stuff: */
          768 /* return Data or Comment as appropriate. */
          769 /* If it's not a Comment, put it in a[*pai] and bump *pai. */
          770 static int
          771 gettag(TokenSource* ts, int starti, Token* a, int* pai)
          772 {
          773         int        rbra;
          774         int        ans;
          775         Attr*        al;
          776         int        nexti;
          777         int        c;
          778         int        ti;
          779         int        afnd;
          780         int        attid;
          781         int        quote;
          782         Rune*        val;
          783         int        nv;
          784         int        i;
          785         int        tag;
          786         Token*        tok;
          787         Rune        buf[BIGBUFSIZE];
          788 
          789         rbra = 0;
          790         nexti = ts->i;
          791         tok = &a[*pai];
          792         tok->tag = Notfound;
          793         tok->text = nil;
          794         tok->attr = nil;
          795         tok->starti = starti;
          796         c = getchar(ts);
          797         if(c == '/'){
          798                 rbra = RBRA;
          799                 c = getchar(ts);
          800         }
          801         if(c < 0)
          802                 goto eob_done;
          803         if(c >= 256 || !isalpha(c)){
          804                 /* not a tag */
          805                 if(c == '!'){
          806                         ans = comment(ts);
          807                         if(ans != -1)
          808                                 return ans;
          809                         goto eob_done;
          810                 }
          811                 else {
          812                         backup(ts, nexti);
          813                         tok->tag = Data;
          814                         tok->text = _Strdup(L(Llt));
          815                         (*pai)++;
          816                         return Data;
          817                 }
          818         }
          819         /* c starts a tagname */
          820         buf[0] = c;
          821         i = 1;
          822         for(;;){
          823                 c = getchar(ts);
          824                 if(c < 0)
          825                         goto eob_done;
          826                 if(!ISNAMCHAR(c))
          827                         break;
          828                 /* if name is bigger than buf it won't be found anyway... */
          829                 if(i < BIGBUFSIZE)
          830                         buf[i++] = c;
          831         }
          832         if(_lookup(tagtable, Numtags, buf, i, &tag))
          833                 tok->tag = tag + rbra;
          834         else
          835                 tok->text = _Strndup(buf, i);        /* for warning print, in build */
          836 
          837         /* attribute gathering loop */
          838         al = nil;
          839         for(;;){
          840                 /* look for "ws name" or "ws name ws = ws val"  (ws=whitespace) */
          841                 /* skip whitespace */
          842 attrloop_continue:
          843                 while(c < 256 && isspace(c)){
          844                         c = getchar(ts);
          845                         if(c < 0)
          846                                 goto eob_done;
          847                 }
          848                 if(c == '>')
          849                         goto attrloop_done;
          850                 if(c == '<'){
          851                         if(warn)
          852                                 fprint(2, "warning: unclosed tag\n");
          853                         ungetchar(ts, c);
          854                         goto attrloop_done;
          855                 }
          856                 if(c >= 256 || !isalpha(c)){
          857                         if(warn)
          858                                 fprint(2, "warning: expected attribute name\n");
          859                         /* skipt to next attribute name */
          860                         for(;;){
          861                                 c = getchar(ts);
          862                                 if(c < 0)
          863                                         goto eob_done;
          864                                 if(c < 256 && isalpha(c))
          865                                         goto attrloop_continue;
          866                                 if(c == '<'){
          867                                         if(warn)
          868                                                 fprint(2, "warning: unclosed tag\n");
          869                                         ungetchar(ts, 60);
          870                                         goto attrloop_done;
          871                                 }
          872                                 if(c == '>')
          873                                         goto attrloop_done;
          874                         }
          875                 }
          876                 /* gather attribute name */
          877                 buf[0] = c;
          878                 i = 1;
          879                 for(;;){
          880                         c = getchar(ts);
          881                         if(c < 0)
          882                                 goto eob_done;
          883                         if(!ISNAMCHAR(c))
          884                                 break;
          885                         if(i < BIGBUFSIZE-1)
          886                                 buf[i++] = c;
          887                 }
          888                 afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
          889                 if(warn && !afnd){
          890                         buf[i] = 0;
          891                         fprint(2, "warning: unknown attribute name %S\n", buf);
          892                 }
          893                 /* skip whitespace */
          894                 while(c < 256 && isspace(c)){
          895                         c = getchar(ts);
          896                         if(c < 0)
          897                                 goto eob_done;
          898                 }
          899                 if(c != '='){
          900                         if(afnd)
          901                                 al = newattr(attid, nil, al);
          902                         goto attrloop_continue;
          903                 }
          904                 /*# c is '=' here;  skip whitespace */
          905                 for(;;){
          906                         c = getchar(ts);
          907                         if(c < 0)
          908                                 goto eob_done;
          909                         if(c >= 256 || !isspace(c))
          910                                 break;
          911                 }
          912                 quote = 0;
          913                 if(c == '\'' || c == '"'){
          914                         quote = c;
          915                         c = getchar(ts);
          916                         if(c < 0)
          917                                 goto eob_done;
          918                 }
          919                 val = nil;
          920                 nv = 0;
          921                 for(;;){
          922 valloop_continue:
          923                         if(c < 0)
          924                                 goto eob_done;
          925                         if(c == '>'){
          926                                 if(quote){
          927                                         /* c might be part of string (though not good style) */
          928                                         /* but if line ends before close quote, assume */
          929                                         /* there was an unmatched quote */
          930                                         ti = ts->i;
          931                                         for(;;){
          932                                                 c = getchar(ts);
          933                                                 if(c < 0)
          934                                                         goto eob_done;
          935                                                 if(c == quote){
          936                                                         backup(ts, ti);
          937                                                         buf[nv++] = '>';
          938                                                         if(nv == BIGBUFSIZE-1){
          939                                                                 val = buftostr(val, buf, nv);
          940                                                                 nv = 0;
          941                                                         }
          942                                                         c = getchar(ts);
          943                                                         goto valloop_continue;
          944                                                 }
          945                                                 if(c == '\n'){
          946                                                         if(warn)
          947                                                                 fprint(2, "warning: apparent unmatched quote\n");
          948                                                         backup(ts, ti);
          949                                                         c = '>';
          950                                                         goto valloop_done;
          951                                                 }
          952                                         }
          953                                 }
          954                                 else
          955                                         goto valloop_done;
          956                         }
          957                         if(quote){
          958                                 if(c == quote){
          959                                         c = getchar(ts);
          960                                         if(c < 0)
          961                                                 goto eob_done;
          962                                         goto valloop_done;
          963                                 }
          964                                 if(c == '\r'){
          965                                         c = getchar(ts);
          966                                         goto valloop_continue;
          967                                 }
          968                                 if(c == '\t' || c == '\n')
          969                                         c = ' ';
          970                         }
          971                         else {
          972                                 if(c < 256 && isspace(c))
          973                                         goto valloop_done;
          974                         }
          975                         if(c == '&'){
          976                                 c = ampersand(ts);
          977                                 if(c == -1)
          978                                         goto eob_done;
          979                         }
          980                         buf[nv++] = c;
          981                         if(nv == BIGBUFSIZE-1){
          982                                 val = buftostr(val, buf, nv);
          983                                 nv = 0;
          984                         }
          985                         c = getchar(ts);
          986                 }
          987 valloop_done:
          988                 if(afnd){
          989                         val = buftostr(val, buf, nv);
          990                         al = newattr(attid, val, al);
          991                 }
          992         }
          993 
          994 attrloop_done:
          995         tok->attr = al;
          996         (*pai)++;
          997         return tok->tag;
          998 
          999 eob_done:
         1000         if(warn)
         1001                 fprint(2, "warning: incomplete tag at end of page\n");
         1002         backup(ts, nexti);
         1003         tok->tag = Data;
         1004         tok->text = _Strdup(L(Llt));
         1005         return Data;
         1006 }
         1007 
         1008 /* We've just read a '<!' at position starti, */
         1009 /* so this may be a comment or other ignored section, or it may */
         1010 /* be just a literal string if there is no close before end of file */
         1011 /* (other browsers do that). */
         1012 /* The accepted practice seems to be (note: contrary to SGML spec!): */
         1013 /* If see <!--, look for --> to close, or if none, > to close. */
         1014 /* If see <!(not --), look for > to close. */
         1015 /* If no close before end of file, leave original characters in as literal data. */
         1016 /* */
         1017 /* If we see ignorable stuff, return Comment. */
         1018 /* Else return nil (caller should back up and try again when more data arrives, */
         1019 /* unless at end of file, in which case caller should just make '<' a data token). */
         1020 static int
         1021 comment(TokenSource* ts)
         1022 {
         1023         int        nexti;
         1024         int        havecomment;
         1025         int        c;
         1026 
         1027         nexti = ts->i;
         1028         havecomment = 0;
         1029         c = getchar(ts);
         1030         if(c == '-'){
         1031                 c = getchar(ts);
         1032                 if(c == '-'){
         1033                         if(findstr(ts, L(Larrow)))
         1034                                 havecomment = 1;
         1035                         else
         1036                                 backup(ts, nexti);
         1037                 }
         1038         }
         1039         if(!havecomment){
         1040                 if(c == '>')
         1041                         havecomment = 1;
         1042                 else if(c >= 0){
         1043                         if(findstr(ts, L(Lgt)))
         1044                                 havecomment = 1;
         1045                 }
         1046         }
         1047         if(havecomment)
         1048                 return Comment;
         1049         return -1;
         1050 }
         1051 
         1052 /* Look for string s in token source. */
         1053 /* If found, return 1, with buffer at next char after s, */
         1054 /* else return 0 (caller should back up). */
         1055 static int
         1056 findstr(TokenSource* ts, Rune* s)
         1057 {
         1058         int        c0;
         1059         int        n;
         1060         int        nexti;
         1061         int        i;
         1062         int        c;
         1063 
         1064         c0 = s[0];
         1065         n = runestrlen(s);
         1066         for(;;){
         1067                 c = getchar(ts);
         1068                 if(c < 0)
         1069                         break;
         1070                 if(c == c0){
         1071                         if(n == 1)
         1072                                 return 1;
         1073                         nexti = ts->i;
         1074                         for(i = 1; i < n; i++){
         1075                                 c = getchar(ts);
         1076                                 if(c < 0)
         1077                                         goto mainloop_done;
         1078                                 if(c != s[i])
         1079                                         break;
         1080                         }
         1081                         if(i == n)
         1082                                 return 1;
         1083                         backup(ts, nexti);
         1084                 }
         1085         }
         1086 mainloop_done:
         1087         return 0;
         1088 }
         1089 
         /* Value of c as a hexadecimal digit (either case), or -1 if it isn't one. */
         static int
         xdigit(int c)
         {
                 int        v;

                 v = -1;
                 if(c >= '0' && c <= '9')
                         v = c - '0';
                 else if(c >= 'a' && c <= 'f')
                         v = 10 + (c - 'a');
                 else if(c >= 'A' && c <= 'F')
                         v = 10 + (c - 'A');
                 return v;
         }
         1101 
         1102 /* We've just read an '&'; look for an entity reference */
         1103 /* name, and if found, return translated char. */
         1104 /* if there is a complete entity name but it isn't known, */
         1105 /* try prefixes (gets around some buggy HTML out there), */
         1106 /* and if that fails, back up to just past the '&' and return '&'. */
         1107 /* If the entity can't be completed in the current buffer, back up */
         1108 /* to the '&' and return -1. */
         1109 static int
         1110 ampersand(TokenSource* ts)
         1111 {
         1112         int        savei;
         1113         int        c;
         1114         int        fnd;
         1115         int        ans;
         1116         int        v;
         1117         int        i;
         1118         int        k;
         1119         Rune        buf[SMALLBUFSIZE];
         1120 
         1121         savei = ts->i;
         1122         c = getchar(ts);
         1123         fnd = 0;
         1124         ans = -1;
         1125         if(c == '#'){
         1126                 c = getchar(ts);
         1127                 v = 0;
         1128                 if(c == 'x'){
         1129                         c = getchar(ts);
         1130                         while((i=xdigit(c)) != -1){
         1131                                 v = v*16 + i;
         1132                                 c = getchar(ts);
         1133                         }
         1134                 }else{
         1135                         while('0' <= c && c <= '9'){
         1136                                 v = v*10 + c - '0';
         1137                                 c = getchar(ts);
         1138                         }
         1139                 }
         1140                 if(c >= 0){
         1141                         if(!(c == ';' || c == '\n' || c == '\r'))
         1142                                 ungetchar(ts, c);
         1143                         c = v;
         1144                         if(c == 160)
         1145                                 c = 160;
         1146                         if(c >= Winstart && c <= Winend){
         1147                                 c = winchars[c - Winstart];
         1148                         }
         1149                         ans = c;
         1150                         fnd = 1;
         1151                 }
         1152         }
         1153         else if(c < 256 && isalpha(c)){
         1154                 buf[0] = c;
         1155                 k = 1;
         1156                 for(;;){
         1157                         c = getchar(ts);
         1158                         if(c < 0)
         1159                                 break;
         1160                         if(ISNAMCHAR(c)){
         1161                                 if(k < SMALLBUFSIZE-1)
         1162                                         buf[k++] = c;
         1163                         }
         1164                         else {
         1165                                 if(!(c == ';' || c == '\n' || c == '\r'))
         1166                                         ungetchar(ts, c);
         1167                                 break;
         1168                         }
         1169                 }
         1170                 if(c >= 0){
         1171                         fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
         1172                         if(!fnd){
         1173                                 /* Try prefixes of s */
         1174                                 if(c == ';' || c == '\n' || c == '\r')
         1175                                         ungetchar(ts, c);
         1176                                 i = k;
         1177                                 while(--k > 0){
         1178                                         fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
         1179                                         if(fnd){
         1180                                                 while(i > k){
         1181                                                         i--;
         1182                                                         ungetchar(ts, buf[i]);
         1183                                                 }
         1184                                                 break;
         1185                                         }
         1186                                 }
         1187                         }
         1188                 }
         1189         }
         1190         if(!fnd){
         1191                 backup(ts, savei);
         1192                 ans = '&';
         1193         }
         1194         return ans;
         1195 }
         1196 
         1197 /* Get next char, obeying ts.chset. */
         1198 /* Returns -1 if no complete character left before current end of data. */
         1199 static int
         1200 getchar(TokenSource* ts)
         1201 {
         1202         uchar*        buf;
         1203         int        c;
         1204         int        n;
         1205         int        ok;
         1206         Rune        r;
         1207 
         1208         if(ts->i >= ts->edata)
         1209                 return -1;
         1210         buf = ts->data;
         1211         c = buf[ts->i];
         1212         switch(ts->chset){
         1213         case ISO_8859_1:
         1214                 if(c >= Winstart && c <= Winend)
         1215                         c = winchars[c - Winstart];
         1216                 ts->i++;
         1217                 break;
         1218         case US_Ascii:
         1219                 if(c > 127){
         1220                         if(warn)
         1221                                 fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
         1222                 }
         1223                 ts->i++;
         1224                 break;
         1225         case UTF_8:
         1226                 ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
         1227                 n = chartorune(&r, (char*)(buf+ts->i));
         1228                 if(ok){
         1229                         if(warn && c == 0x80)
         1230                                 fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
         1231                         ts->i += n;
         1232                         c = r;
         1233                 }
         1234                 else {
         1235                         /* not enough bytes in buf to complete utf-8 char */
         1236                         ts->i = ts->edata;        /* mark "all used" */
         1237                         c = -1;
         1238                 }
         1239                 break;
         1240         case Unicode:
         1241                 if(ts->i < ts->edata - 1){
         1242                         /*standards say most-significant byte first */
         1243                         c = (c << 8)|(buf[ts->i + 1]);
         1244                         ts->i += 2;
         1245                 }
         1246                 else {
         1247                         ts->i = ts->edata;        /* mark "all used" */
         1248                         c = -1;
         1249                 }
         1250                 break;
         1251         }
         1252         return c;
         1253 }
         1254 
         1255 /* Assuming c was the last character returned by getchar, set */
         1256 /* things up so that next getchar will get that same character */
         1257 /* followed by the current 'next character', etc. */
         1258 static void
         1259 ungetchar(TokenSource* ts, int c)
         1260 {
         1261         int        n;
         1262         Rune        r;
         1263         char        a[UTFmax];
         1264 
         1265         n = 1;
         1266         switch(ts->chset){
         1267         case UTF_8:
         1268                 if(c >= 128){
         1269                         r = c;
         1270                         n = runetochar(a, &r);
         1271                 }
         1272                 break;
         1273         case Unicode:
         1274                 n = 2;
         1275                 break;
         1276         }
         1277         ts->i -= n;
         1278 }
         1279 
         1280 /* Restore ts so that it is at the state where the index was savei. */
         1281 static void
         1282 backup(TokenSource* ts, int savei)
         1283 {
         1284         if(dbglex)
         1285                 fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
         1286         ts->i = savei;
         1287 }
         1288 
         1289 
         1290 /* Look for value associated with attribute attid in token t. */
         1291 /* If there is one, return 1 and put the value in *pans, */
         1292 /* else return 0. */
         1293 /* If xfer is true, transfer ownership of the string to the caller */
         1294 /* (nil it out here); otherwise, caller must duplicate the answer */
         1295 /* if it needs to save it. */
         1296 /* OK to have pans==0, in which case this is just looking */
         1297 /* to see if token is present. */
         1298 int
         1299 _tokaval(Token* t, int attid, Rune** pans, int xfer)
         1300 {
         1301         Attr*        attr;
         1302 
         1303         attr = t->attr;
         1304         while(attr != nil){
         1305                 if(attr->attid == attid){
         1306                         if(pans != nil)
         1307                                 *pans = attr->value;
         1308                         if(xfer)
         1309                                 attr->value = nil;
         1310                         return 1;
         1311                 }
         1312                 attr = attr->next;
         1313         }
         1314         if(pans != nil)
         1315                 *pans = nil;
         1316         return 0;
         1317 }
         1318 
         1319 static int
         1320 Tconv(Fmt *f)
         1321 {
         1322         Token*        t;
         1323         int        i;
         1324         int        tag;
         1325         char*        srbra;
         1326         Rune*        aname;
         1327         Rune*        tname;
         1328         Attr*        a;
         1329         char        buf[BIGBUFSIZE];
         1330 
         1331         t = va_arg(f->args, Token*);
         1332         if(t == nil)
         1333                 sprint(buf, "<null>");
         1334         else {
         1335                 i = 0;
         1336                 if(dbglex > 1)
         1337                         i = snprint(buf, sizeof(buf), "[%d]", t->starti);
         1338                 tag = t->tag;
         1339                 if(tag == Data){
         1340                         i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
         1341                 }
         1342                 else {
         1343                         srbra = "";
         1344                         if(tag >= RBRA){
         1345                                 tag -= RBRA;
         1346                                 srbra = "/";
         1347                         }
         1348                         tname = tagnames[tag];
         1349                         if(tag == Notfound)
         1350                                 tname = L(Lquestion);
         1351                         i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
         1352                         for(a = t->attr; a != nil; a = a->next){
         1353                                 aname = attrnames[a->attid];
         1354                                 i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
         1355                                 if(a->value != nil)
         1356                                         i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
         1357                         }
         1358                         i += snprint(buf+i, sizeof(buf)-i-1, ">");
         1359                 }
         1360                 buf[i] = 0;
         1361         }
         1362         return fmtstrcpy(f, buf);
         1363 }
         1364 
         1365 /* Attrs own their constituent strings, but build may eventually */
         1366 /* transfer some values to its items and nil them out in the Attr. */
         1367 static Attr*
         1368 newattr(int attid, Rune* value, Attr* link)
         1369 {
         1370         Attr* ans;
         1371 
         1372         ans = (Attr*)emalloc(sizeof(Attr));
         1373         ans->attid = attid;
         1374         ans->value = value;
         1375         ans->next = link;
         1376         return ans;
         1377 }
         1378 
         1379 /* Free list of Attrs linked through next field */
         1380 static void
         1381 freeattrs(Attr* ahead)
         1382 {
         1383         Attr* a;
         1384         Attr* nexta;
         1385 
         1386         a = ahead;
         1387         while(a != nil){
         1388                 nexta = a->next;
         1389                 free(a->value);
         1390                 free(a);
         1391                 a = nexta;
         1392         }
         1393 }
         1394 
         1395 /* Free array of Tokens. */
         1396 /* Allocated space might have room for more than n tokens, */
         1397 /* but only n of them are initialized. */
         1398 /* If caller has transferred ownership of constitutent strings */
         1399 /* or attributes, it must have nil'd out the pointers in the Tokens. */
         1400 void
         1401 _freetokens(Token* tarray, int n)
         1402 {
         1403         int i;
         1404         Token* t;
         1405 
         1406         if(tarray == nil)
         1407                 return;
         1408         for(i = 0; i < n; i++){
         1409                 t = &tarray[i];
         1410                 free(t->text);
         1411                 freeattrs(t->attr);
         1412         }
         1413         free(tarray);
         1414 }