URI:
       webdump.c - webdump - HTML to plain-text converter for webpages
  HTML git clone git://git.codemadness.org/webdump
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       webdump.c (66874B)
       ---
            1 #include <errno.h>
            2 #include <limits.h>
            3 #include <stdio.h>
            4 #include <stdarg.h>
            5 #include <stdlib.h>
            6 #include <string.h>
            7 #include <strings.h>
            8 #include <unistd.h>
            9 
           10 #include "arg.h"
           11 char *argv0;
           12 
           13 #include "tree.h"
           14 #include "xml.h"
           15 
            16 static XMLParser parser; /* file-scope parser state; getnext_ignore() writes parser.getnext */
            17 
              /* when not compiling on OpenBSD every pledge(p1, p2) call expands to
                 the constant 0 */
            18 #ifndef __OpenBSD__
            19 #define pledge(p1,p2) 0
            20 #endif
            21 
              /* remove possible macro versions, then declare the functions; their
                 definitions are not in this part of the file */
            22 #undef strlcat
            23 size_t strlcat(char *, const char *, size_t);
            24 #undef strlcpy
            25 size_t strlcpy(char *, const char *, size_t);
           26 
            27 /* ctype-like macros, but always compatible with ASCII / UTF-8.
                 The (unsigned) cast makes values below the start of a range wrap
                 around to a huge number, so a single "<" tests both ends of the range.
                 "| 32" maps the ASCII upper-case letters onto the lower-case ones.
                 ISSPACE accepts ' ' and the five codes from '\t' up to '\r' (9..13).
                 The argument is not parenthesized inside the casts and several of the
                 macros evaluate it more than once: pass a simple expression without
                 side effects (the callers visible here pass an unsigned char cast). */
            28 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
            29 #define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f)
            30 #define ISDIGIT(c) (((unsigned)c) - '0' < 10)
            31 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
            32 #define TOLOWER(c) ((((unsigned)c) - 'A' < 26) ? ((c) | 32) : (c))
            33 
              /* element count of an array; only meaningful for a real array, not for
                 a pointer */
            34 #define LEN(x) (sizeof(x) / sizeof(x[0]))
           35 
            36 /* URI: the components a URI string is split into by uri_parse().
                 Every field is a fixed-size buffer holding a NUL-terminated string;
                 uri_parse() starts by making all of them empty and returns -1 when
                 for example the scheme, userinfo or host does not fit. */
            37 struct uri {
            38         char proto[48];     /* scheme including ":" or "://" */
            39         char userinfo[256]; /* username [:password] */
            40         char host[256];     /* host name, or IPv6 address including "[" and "]" */
            41         char port[6];       /* numeric port */
            42         char path[1024];
            43         char query[1024];
            44         char fragment[1024];
            45 };
           46 
            47 /* options: the values below are the defaults, the letter in parentheses
                 is the command-line flag belonging to the option (the argument parsing
                 itself is not in this part of the file) */
            48 static int allowansi     = 0;  /* (-a) allow ANSI escape codes */
            49 static int uniqrefs      = 0;  /* (-d) number unique references */
            50 static int showrefinline = 0;  /* (-i) show link reference number inline */
            51 static int showurlinline = 0;  /* (-I) show full link reference inline */
            52 static int showrefbottom = 0;  /* (-l) show link references at the bottom */
            53 static int allowlinewrap = 0;  /* (-r) line-wrapping */
            54 static int termwidth     = 77; /* (-w) terminal width */
            55 static int resources     = 0;  /* (-x) write resources line-by-line to fd 3? */
            56 
              /* display type of an element. Every value is a distinct bit, so values
                 can be OR-ed together, as the tags[] table does
                 (e.g. DisplayInline | DisplayButton). */
            57 enum DisplayType {
            58         DisplayUnknown     = 0,
            59         DisplayInline      = 1 << 0,
            60         DisplayInlineBlock = 1 << 1, /* unused for now */
            61         DisplayBlock       = 1 << 2,
            62         DisplayNone        = 1 << 3,
            63         DisplayPre         = 1 << 4,
            64         DisplayList        = 1 << 5,
            65         DisplayListOrdered = 1 << 6,
            66         DisplayListItem    = 1 << 7,
            67         DisplayTable       = 1 << 8,
            68         DisplayTableRow    = 1 << 9,
            69         DisplayTableCell   = 1 << 10,
            70         DisplayHeader      = 1 << 11,
            71         DisplayDl          = 1 << 12,
            72         DisplayInput       = 1 << 13,
            73         DisplayButton      = 1 << 14,
            74         DisplaySelect      = 1 << 15,
            75         DisplaySelectMulti = 1 << 16,
            76         DisplayOption      = 1 << 17
            77 };
           78 
            79 /* ANSI markup: one bit per text attribute. The "markup" column of the
                 tags[] table holds the value belonging to a tag; curmarkup (declared
                 further down) holds the current markup state. */
            80 enum MarkupType {
            81         MarkupNone        = 0,
            82         MarkupBold        = 1 << 0,
            83         MarkupItalic      = 1 << 1,
            84         MarkupUnderline   = 1 << 2,
            85         MarkupBlink       = 1 << 3, /* lol */
            86         MarkupReverse     = 1 << 4,
            87         MarkupStrike      = 1 << 5
            88 };
           89 
            90 /* String data / memory pool: a growable byte buffer.
                 string_append() keeps data NUL-terminated; len does not count that
                 NUL. data can still be NULL when nothing was appended yet, which is
                 why string_clear() checks it. bufsiz is set by string_buffer_realloc(). */
            91 typedef struct string {
            92         char   *data;   /* data */
            93         size_t  len;    /* string length */
            94         size_t  bufsiz; /* allocated size */
            95 } String;
            96 
              /* numeric id per known tag. The ids start at 1, so the value 0 does not
                 name any tag. The constants are listed in the same (alphabetical) order
                 as the rows of the tags[] table. */
            97 enum TagId { TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAudio,
            98         TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton,
            99         TagCite, TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDetails,
           100         TagDfn, TagDir, TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFieldset,
           101         TagFigcaption, TagFigure, TagFooter, TagForm, TagFrame, TagH1, TagH2,
           102         TagH3, TagH4, TagH5, TagH6, TagHead, TagHeader, TagHr, TagHtml, TagI,
           103         TagIframe, TagImg, TagInput, TagIns, TagLabel, TagLegend, TagLi,
           104         TagLink, TagMain, TagMark, TagMenu, TagMeta, TagNav, TagObject, TagOl,
           105         TagOption, TagP, TagParam, TagPre, TagS, TagScript, TagSearch,
           106         TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle,
           107         TagSummary, TagSvg, TagTable, TagTbody, TagTd, TagTemplate,
           108         TagTextarea, TagTfoot, TagTh, TagThead, TagTitle, TagTr, TagTrack,
           109         TagU, TagUl, TagVar, TagVideo, TagWbr, TagXmp };
           110 
              /* static properties of one HTML tag: the row type of the tags[] table.
                 A copy is also stored by value in every struct node. The order of the
                 fields is the order of the columns in the tags[] initializers. */
           111 struct tag {
           112         const char *name;
           113         enum TagId id;
           114         enum DisplayType displaytype;
           115         enum MarkupType markuptype; /* ANSI markup */
           116         enum DisplayType parenttype; /* display type belonging to element */
           117         int isvoid; /* "void" element */
           118         int isoptional; /* optional to close tag */
           119         int margintop; /* newlines when the tag starts */
           120         int marginbottom; /* newlines after the tag ends */
           121         int indent; /* indent in cells */
           122 };
           123 
              /* state of one element in the node tree; the nodes array declared further
                 down remembers one of these per nesting level. The names and attribute
                 values live in fixed-size buffers inside the struct. */
           124 struct node {
           125         char tagname[256];
           126         struct tag tag;
           127         size_t nchildren; /* child node count */
           128         size_t visnchildren; /* child node count which are visible */
           129         /* attributes */
           130         char id[256];
           131         char classnames[1024];
           132         int indent; /* indent per node, for formatting */
           133         int hasdata; /* tag contains some data, for formatting */
           134 };
           135 
              /* element type of the nodes[] array in struct selector. The tagname, id
                 and classnames buffers have the same sizes as the fields with the same
                 names in struct node. */
           136 struct selectornode {
           137         char tagname[256];
           138         long index; /* index of node to match on: -1 if not matching on index */
           139         /* attributes */
           140         char id[256];
           141         char classnames[1024];
           142 };
           143 
              /* one selector, holding at most 32 selectornode entries */
           144 struct selector {
           145         const char *text; /* presumably the selector as written by the user - TODO confirm */
           146         struct selectornode nodes[32];
           147         int depth; /* presumably the number of nodes[] entries in use - TODO confirm */
           148 };
          149 
           150 /* list of selectors: an array of pointers to struct selector together
                 with a count (sel_hide and sel_show further down point to such lists) */
           151 struct selectors {
           152         struct selector **selectors;
           153         size_t count; /* presumably the number of entries in selectors - TODO confirm */
           154 };
          155 
           156 /* RB tree of link references: one node of the tree. The nodes are
                 ordered by their url string, see linkrefcmp(). */
           157 struct linkref {
           158         char *type;
           159         enum TagId tagid;
           160         char *url; /* key of the tree; linkrefcmp() passes it to strcmp() */
           161         int ishidden;
           162         size_t linknr;
           163         RB_ENTRY(linkref) entry; /* tree linkage, named in RB_GENERATE() below */
           164 };
          165 
           166 /* link references and hidden link references: two arrays of pointers to
                 struct linkref, each with a separate element count and capacity */
           167 static struct linkref **visrefs;
           168 static size_t nvisrefs, ncapvisrefs; /* visible link count / capacity */
           169 static struct linkref **hiddenrefs;
           170 static size_t nhiddenrefs, ncaphiddenrefs; /* hidden link count / capacity */
          171 
           172 /* compare link by URL for link references RB-tree.
                 Returns the strcmp() result of the two url fields: negative, zero or
                 positive. Both r1->url and r2->url have to be valid strings, they are
                 passed to strcmp() without a NULL check. */
           173 static int
           174 linkrefcmp(struct linkref *r1, struct linkref *r2)
           175 {
           176         return strcmp(r1->url, r2->url);
           177 }
           178 
              /* head of the link reference tree and the tree code for it: the macros
                 (presumably the BSD ones from "tree.h") are given the tree name
                 linkreftree, the node type linkref, its linkage field entry and the
                 compare function linkrefcmp */
           179 RB_HEAD(linkreftree, linkref) linkrefhead = RB_INITIALIZER(&linkrefhead);
           180 RB_GENERATE(linkreftree, linkref, entry, linkrefcmp)
           181 
              /* fixed strings; going by the names presumably used in the output for
                 list bullets, checked checkboxes, rulers and selected radio buttons
                 (the code using them is not in this part of the file) */
           182 static const char *str_bullet_item = "* ";
           183 static const char *str_checkbox_checked = "x";
           184 static const char *str_ruler = "-";
           185 static const char *str_radio_checked = "*";
           186 
           187 /* base href, to make URLs absolute */
           188 static char basehrefdoc[4096]; /* buffer for base href in document, if any */
           189 static int basehrefset; /* base href set and can be used? */
           190 static struct uri base; /* parsed current base href */
           191 
           192 /* buffers for some attributes of the current tag */
           193 static String attr_alt; /* alt attribute */
           194 static String attr_checked; /* checked attribute */
           195 static String attr_class; /* class attribute */
           196 static int attr_class_set; /* class attribute is set already */
           197 static String attr_data; /* data attribute */
           198 static String attr_href; /* href attribute */
           199 static String attr_id; /* id attribute */
           200 static int attr_id_set; /* id attribute is set already */
           201 static String attr_src; /* src attribute */
           202 static String attr_type; /* type attribute */
           203 static String attr_value; /* value attribute */
           204 
           205 static String htmldata; /* buffered HTML data near the current tag */
          206 
           207 /* for white-space output handling:
           208    1 = whitespace emitted (suppress repeated), 2 = other characters on this line
           209    Behaviour:
           210    * White-space data before non-whitespace data in tags are ignored on a line.
           211    * Repeated white-space are ignored: a single space (' ') is emitted.
           212 */
           213 static int whitespace_mode;
           214 static int nbytesline; /* bytes on this line */
           215 static int ncells; /* current cell/column count */
           216 static int hadnewline; /* count for repeated newlines */
           217 /* flag for skipping initial white-space in tag: for HTML white-space handling */
           218 static int skipinitialws = 1;
              /* DEFAULT_INDENT is a macro so that it can also be used (negated) in the
                 constant initializers of the tags[] table */
           219 #define DEFAULT_INDENT 2
           220 static const int defaultindent = DEFAULT_INDENT; /* default indent / margin */
           221 static int indent; /* indent for the current line, in columns */
           222 /* previous output sequential newlines, used for calculating margins between
           223    elements and reducing excessive newlines */
           224 static int currentnewlines;
           225 
           226 /* buffers for line-wrapping (buffer per word boundary) */
           227 static char rbuf[1024];
           228 static int rbuflen; /* presumably the number of bytes used in rbuf - TODO confirm */
           229 static int rnbufcells; /* pending cell count to add */
           230 
           231 #define MAX_NODE_DEPTH 4096 /* absolute maximum node depth */
           232 static struct node *nodes; /* node tree (one per level is remembered) */
           233 static String *nodes_links; /* keep track of links per node */
           234 static size_t ncapnodes; /* current allocated node capacity */
           235 static int curnode; /* current node depth */
           236 
           237 /* reader / selector mode (-s) */
           238 static int reader_mode;
           239 /* flag if the tags and their children should be ignored in the current context */
           240 static int reader_ignore;
           241 
           242 static enum MarkupType curmarkup; /* current markup state (bold, underline, etc) */
           243 static int linewrap; /* allow linewrap in this context */
           244 
           245 /* selector to match (for -s and -u) */
           246 static struct selectors *sel_hide, *sel_show;
          247 
           248 /* tags table: needs to be sorted like tagcmp(), alphabetically.
                 One row per known tag, the columns follow the field order of struct tag.
                 Legend for the single-letter columns of the header line below:
                   v = isvoid, o = isoptional, b = margintop (newlines when the tag
                   starts), a = marginbottom (newlines after the tag ends), i = indent */
           249 
           250 /* tag          id             displaytype                       markup           parent           v  o  b  a  i */
           251 static struct tag tags[] = {
           252 { "a",          TagA,          DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
           253 { "address",    TagAddress,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           254 { "area",       TagArea,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
           255 { "article",    TagArticle,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           256 { "aside",      TagAside,      DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           257 { "audio",      TagAudio,      DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
           258 { "b",          TagB,          DisplayInline,                    MarkupBold,      0,               0, 0, 0, 0, 0 },
           259 { "base",       TagBase,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
           260 { "blink",      TagBlink,      DisplayInline,                    MarkupBlink,     0,               0, 0, 0, 0, 0 },
           261 { "blockquote", TagBlockquote, DisplayBlock,                     0,               0,               0, 0, 0, 0, 2 },
           262 { "body",       TagBody,       DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           263 { "br",         TagBr,         0,                                0,               0,               1, 0, 0, 0, 0 },
           264 { "button",     TagButton,     DisplayInline | DisplayButton,    0,               0,               0, 0, 0, 0, 0 },
           265 { "cite",       TagCite,       DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
           266 { "col",        TagCol,        DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
           267 { "colgroup",   TagColgroup,   DisplayInline,                    0,               0,               0, 1, 0, 0, 0 },
           268 { "datalist",   TagDatalist,   DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
           269 { "dd",         TagDd,         DisplayBlock,                     0,               0,               0, 1, 0, 0, 4 },
           270 { "del",        TagDel,        DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
           271 { "details",    TagDetails,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           272 { "dfn",        TagDfn,        DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
           273 { "dir",        TagDir,        DisplayList,                      0,               0,               0, 0, 1, 1, 2 },
           274 { "div",        TagDiv,        DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           275 { "dl",         TagDl,         DisplayBlock | DisplayDl,         0,               0,               0, 0, 0, 0, 0 },
           276 { "dt",         TagDt,         DisplayBlock,                     MarkupBold,      0,               0, 1, 0, 0, 0 },
           277 { "em",         TagEm,         DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
           278 { "embed",      TagEmbed,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
           279 { "fieldset",   TagFieldset,   DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           280 { "figcaption", TagFigcaption, DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           281 { "figure",     TagFigure,     DisplayBlock,                     0,               0,               0, 0, 1, 1, 4 },
           282 { "footer",     TagFooter,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           283 { "form",       TagForm,       DisplayBlock,                     0,               0,               0, 0, 0, 1, 0 },
           284 { "frame",      TagFrame,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
           285 { "h1",         TagH1,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
           286 { "h2",         TagH2,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
           287 { "h3",         TagH3,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
           288 { "h4",         TagH4,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
           289 { "h5",         TagH5,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
           290 { "h6",         TagH6,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
           291 { "head",       TagHead,       DisplayBlock,                     0,               0,               0, 1, 0, 0, 0 },
           292 { "header",     TagHeader,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           293 { "hr",         TagHr,         DisplayBlock,                     0,               0,               1, 0, 0, 0, 0 },
           294 { "html",       TagHtml,       DisplayBlock,                     0,               0,               0, 1, 0, 0, 0 },
           295 { "i",          TagI,          DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
           296 { "iframe",     TagIframe,     DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
           297 { "img",        TagImg,        DisplayInline,                    MarkupUnderline, 0,               1, 0, 0, 0, 0 },
           298 { "input",      TagInput,      DisplayInput,                     0,               0,               1, 0, 0, 0, 0 },
           299 { "ins",        TagIns,        DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
           300 { "label",      TagLabel,      DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
           301 { "legend",     TagLegend,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           302 { "li",         TagLi,         DisplayListItem,                  0,               DisplayList,     0, 1, 0, 0, 0 },
           303 { "link",       TagLink,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
           304 { "main",       TagMain,       DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           305 { "mark",       TagMark,       DisplayInline,                    MarkupReverse,   0,               0, 0, 0, 0, 0 },
           306 { "menu",       TagMenu,       DisplayList,                      0,               0,               0, 0, 1, 1, 2 },
           307 { "meta",       TagMeta,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
           308 { "nav",        TagNav,        DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           309 { "object",     TagObject,     DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
           310 { "ol",         TagOl,         DisplayList | DisplayListOrdered, 0,               0,               0, 0, 1, 1, 0 },
           311 { "option",     TagOption,     DisplayInline | DisplayOption,    0,               0,               0, 1, 0, 0, 0 },
           312 { "p",          TagP,          DisplayBlock,                     0,               0,               0, 1, 1, 1, 0 },
           313 { "param",      TagParam,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
           314 { "pre",        TagPre,        DisplayPre,                       0,               0,               0, 0, 1, 1, 4 },
           315 { "s",          TagS,          DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
           316 { "script",     TagScript,     DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
           317 { "search",     TagSearch,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           318 { "section",    TagSection,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           319 { "select",     TagSelect,     DisplayInline | DisplaySelect,    0,               0,               0, 0, 0, 0, 0 },
           320 { "source",     TagSource,     DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
           321 { "strike",     TagStrike,     DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
           322 { "strong",     TagStrong,     DisplayInline,                    MarkupBold,      0,               0, 0, 0, 0, 0 },
           323 { "style",      TagStyle,      DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
           324 { "summary",    TagSummary,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
           325 { "svg",        TagSvg,        DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
           326 { "table",      TagTable,      DisplayTable,                     0,               0,               0, 0, 0, 0, 0 },
           327 { "tbody",      TagTbody,      DisplayInline,                    0,               DisplayTable,    0, 1, 0, 0, 0 },
           328 { "td",         TagTd,         DisplayTableCell,                 0,               DisplayTableRow, 0, 1, 0, 0, 0 },
           329 { "template",   TagTemplate,   DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
           330 { "textarea",   TagTextarea,   DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
           331 { "tfoot",      TagTfoot,      DisplayInline,                    0,               DisplayTable,    0, 1, 0, 0, 0 },
           332 { "th",         TagTh,         DisplayTableCell,                 MarkupBold,      DisplayTableRow, 0, 1, 0, 0, 0 },
           333 { "thead",      TagThead,      DisplayInline,                    0,               DisplayTable,    0, 1, 0, 0, 0 },
           334 { "title",      TagTitle,      DisplayBlock,                     0,               0,               0, 0, 0, 1, -DEFAULT_INDENT },
           335 { "tr",         TagTr,         DisplayTableRow,                  0,               DisplayTable,    0, 1, 0, 0, 0 },
           336 { "track",      TagTrack,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
           337 { "u",          TagU,          DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
           338 { "ul",         TagUl,         DisplayList,                      0,               0,               0, 0, 1, 1, 2 },
           339 { "var",        TagVar,        DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
           340 { "video",      TagVideo,      DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
           341 { "wbr",        TagWbr,        DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
           342 { "xmp",        TagXmp,        DisplayPre,                       0,               0,               0, 0, 1, 1, 4 }
           343 };
          344 
           345 /* hint for compilers and static analyzers that a function exits.
                 When __dead is not defined yet it is defined as nothing here, so the
                 err() and errx() definitions below compile either way. */
           346 #ifndef __dead
           347 #define __dead
           348 #endif
          349 
           350 /* print to stderr, print error message of errno and exit().
                 Output: "webdump: ", then the printf-style message followed by ": "
                 (left out when fmt is NULL), then the strerror() text and a newline.
                 exitstatus is passed to exit(); the function does not return. */
           351 __dead static void
           352 err(int exitstatus, const char *fmt, ...)
           353 {
           354         va_list ap;
           355         int saved_errno;
           356 
                      /* copy errno before any stdio call gets the chance to change it */
           357         saved_errno = errno;
           358 
           359         fputs("webdump: ", stderr);
           360         if (fmt) {
           361                 va_start(ap, fmt);
           362                 vfprintf(stderr, fmt, ap);
           363                 va_end(ap);
           364                 fputs(": ", stderr);
           365         }
           366         fprintf(stderr, "%s\n", strerror(saved_errno));
           367 
           368         exit(exitstatus);
           369 }
          370 
           371 /* print to stderr and exit().
                 Like err() but without the errno text. Output: "webdump: ", then the
                 printf-style message (left out when fmt is NULL), then a newline.
                 exitstatus is passed to exit(); the function does not return. */
           372 __dead static void
           373 errx(int exitstatus, const char *fmt, ...)
           374 {
           375         va_list ap;
           376 
           377         fputs("webdump: ", stderr);
           378         if (fmt) {
           379                 va_start(ap, fmt);
           380                 vfprintf(stderr, fmt, ap);
           381                 va_end(ap);
           382         }
           383         fputs("\n", stderr);
           384 
           385         exit(exitstatus);
           386 }
           387 
              /* state used by getnext_ignore(): endtag is the string that is searched
                 for, ignorestate points at the character of it that has to match next,
                 and getnext is the function getnext_ignore() reads its input from and
                 stores back into parser.getnext once the whole string was seen.
                 All three are assigned outside this part of the file. */
           388 static const char *ignorestate, *endtag;
           389 static int (*getnext)(void);
          390 
           391 /* return a space for all data until some case-insensitive string occurs. This
           392    is used to parse incorrect HTML/XML that contains unescaped HTML in script
           393    or style tags. If you see some </script> tag in a CDATA or comment
           394    section then e-mail W3C and tell them the web is too complex. */
              /* Reads one character through getnext(). Returns EOF when getnext()
                 returns EOF, in every other case ' ', including for the characters of
                 the searched string itself. When the last character of the string
                 matched, parser.getnext is set back to getnext.
                 NOTE(review): after a mismatch the state is reset, but the character
                 that caused the mismatch is not compared against the first character
                 of endtag. If endtag were "</script>", the input "<</script>" looks
                 like it would go undetected - confirm whether that matters. */
           395 static inline int
           396 getnext_ignore(void)
           397 {
           398         int c;
           399 
           400         if ((c = getnext()) == EOF)
           401                 return EOF;
           402 
                      /* compare case-insensitively (ASCII) with the next expected character */
           403         if (TOLOWER((unsigned char)c) == TOLOWER((unsigned char)*ignorestate)) {
           404                 ignorestate++;
           405                 if (*ignorestate == '\0') {
           406                         parser.getnext = getnext; /* restore */
           407                         return ' ';
           408                 }
           409         } else {
           410                 ignorestate = endtag; /* no full match: reset to beginning */
           411         }
           412 
           413         return ' '; /* pretend there is just SPACEs */
           414 }
          415 
           416 /* Clear string only; don't free, prevents unnecessary reallocation.
                 Sets the length to 0 and, when a buffer exists, makes it an empty
                 string. bufsiz and the buffer itself stay as they are. */
           417 static void
           418 string_clear(String *s)
           419 {
                      /* data is NULL as long as no buffer was allocated for s */
           420         if (s->data)
           421                 s->data[0] = '\0';
           422         s->len = 0;
           423 }
           424 
              /* (Re)allocate the buffer of s so that it is larger than newlen bytes.
                 The size is 64 doubled until it exceeds newlen, so always a power of
                 two of at least 64. s->bufsiz is updated, s->len is left alone. On
                 allocation failure the program exits through err().
                 NOTE(review): for a newlen close to SIZE_MAX the doubling would wrap
                 around; presumably such lengths cannot occur - confirm. */
           425 static void
           426 string_buffer_realloc(String *s, size_t newlen)
           427 {
           428         size_t alloclen;
           429 
           430         for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
           431                 ;
           432         if (!(s->data = realloc(s->data, alloclen)))
           433                 err(1, "realloc");
           434         s->bufsiz = alloclen;
           435 }
           436 
              /* Append exactly len bytes from data to s and keep s NUL-terminated.
                 data does not have to be NUL-terminated itself. A len of 0 does
                 nothing at all (no buffer is allocated either). The buffer grows
                 through string_buffer_realloc() when needed. */
           437 static void
           438 string_append(String *s, const char *data, size_t len)
           439 {
           440         if (!len)
           441                 return;
           442         /* check if allocation is necessary, don't shrink buffer,
           443          * should be more than bufsiz of course. ">=" instead of ">" keeps
                       * one byte free for the terminating NUL written below. */
           444         if (s->len + len >= s->bufsiz)
           445                 string_buffer_realloc(s, s->len + len + 1);
           446         memcpy(s->data + s->len, data, len);
           447         s->len += len;
           448         s->data[s->len] = '\0';
           449 }
           450 
              /* strdup() that does not return on failure: returns a newly allocated
                 copy of s, or exits through err() when strdup() returns NULL */
           451 static char *
           452 estrdup(const char *s)
           453 {
           454         char *p;
           455 
           456         if (!(p = strdup(s)))
           457                 err(1, "strdup");
           458         return p;
           459 }
           460 
              /* strndup() that does not return on failure: returns a newly allocated
                 string made by strndup(s, n), or exits through err() when strndup()
                 returns NULL */
           461 static char *
           462 estrndup(const char *s, size_t n)
           463 {
           464         char *p;
           465 
           466         if (!(p = strndup(s, n)))
           467                 err(1, "strndup");
           468         return p;
           469 }
           470 
              /* realloc() that does not return on failure: returns the result of
                 realloc(p, siz), or exits through err() when that result is NULL.
                 As with realloc() the caller has to use the returned pointer instead
                 of the old p afterwards. */
           471 static void *
           472 erealloc(void *p, size_t siz)
           473 {
           474         if (!(p = realloc(p, siz)))
           475                 err(1, "realloc");
           476 
           477         return p;
           478 }
           479 
              /* calloc() that does not return on failure: returns the result of
                 calloc(nmemb, size), or exits through err() when that result is NULL */
           480 static void *
           481 ecalloc(size_t nmemb, size_t size)
           482 {
           483         void *p;
           484 
           485         if (!(p = calloc(nmemb, size)))
           486                 err(1, "calloc");
           487         return p;
           488 }
          489 
           490 /* check if string has a non-empty scheme / protocol part.
                 Returns 1 when s starts with one or more characters out of ASCII
                 letters, digits, '+', '-' and '.' that are directly followed by ':',
                 otherwise 0. The first character is not required to be a letter.
                 uri_parse() scans the scheme with the same loop. */
           491 static int
           492 uri_hasscheme(const char *s)
           493 {
           494         const char *p = s;
           495 
                      /* skip all characters that are allowed in a scheme name */
           496         for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) ||
           497                        *p == '+' || *p == '-' || *p == '.'; p++)
           498                 ;
           499         /* scheme, except if empty and starts with ":" then it is a path */
           500         return (*p == ':' && p != s);
           501 }
          502 
          503 static int
          504 uri_parse(const char *s, struct uri *u)
          505 {
          506         const char *p = s;
          507         char *endptr;
          508         size_t i;
          509         long l;
          510 
          511         u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
          512         u->path[0] = u->query[0] = u->fragment[0] = '\0';
          513 
          514         /* protocol-relative */
          515         if (*p == '/' && *(p + 1) == '/') {
          516                 p += 2; /* skip "//" */
          517                 goto parseauth;
          518         }
          519 
          520         /* scheme / protocol part */
          521         for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) ||
          522                        *p == '+' || *p == '-' || *p == '.'; p++)
          523                 ;
          524         /* scheme, except if empty and starts with ":" then it is a path */
          525         if (*p == ':' && p != s) {
          526                 if (*(p + 1) == '/' && *(p + 2) == '/')
          527                         p += 3; /* skip "://" */
          528                 else
          529                         p++; /* skip ":" */
          530 
          531                 if ((size_t)(p - s) >= sizeof(u->proto))
          532                         return -1; /* protocol too long */
          533                 memcpy(u->proto, s, p - s);
          534                 u->proto[p - s] = '\0';
          535 
          536                 if (*(p - 1) != '/')
          537                         goto parsepath;
          538         } else {
          539                 p = s; /* no scheme format, reset to start */
          540                 goto parsepath;
          541         }
          542 
          543 parseauth:
          544         /* userinfo (username:password) */
          545         i = strcspn(p, "@/?#");
          546         if (p[i] == '@') {
          547                 if (i >= sizeof(u->userinfo))
          548                         return -1; /* userinfo too long */
          549                 memcpy(u->userinfo, p, i);
          550                 u->userinfo[i] = '\0';
          551                 p += i + 1;
          552         }
          553 
          554         /* IPv6 address */
          555         if (*p == '[') {
          556                 /* bracket not found, host too short or too long */
          557                 i = strcspn(p, "]");
          558                 if (p[i] != ']' || i < 3)
          559                         return -1;
          560                 i++; /* including "]" */
          561         } else {
          562                 /* domain / host part, skip until port, path or end. */
          563                 i = strcspn(p, ":/?#");
          564         }
          565         if (i >= sizeof(u->host))
          566                 return -1; /* host too long */
          567         memcpy(u->host, p, i);
          568         u->host[i] = '\0';
          569         p += i;
          570 
          571         /* port */
          572         if (*p == ':') {
          573                 p++;
          574                 if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
          575                         return -1; /* port too long */
          576                 memcpy(u->port, p, i);
          577                 u->port[i] = '\0';
          578                 /* check for valid port: range 1 - 65535, may be empty */
          579                 errno = 0;
          580                 l = strtol(u->port, &endptr, 10);
          581                 if (i && (errno || *endptr || l <= 0 || l > 65535))
          582                         return -1;
          583                 p += i;
          584         }
          585 
          586 parsepath:
          587         /* path */
          588         if ((i = strcspn(p, "?#")) >= sizeof(u->path))
          589                 return -1; /* path too long */
          590         memcpy(u->path, p, i);
          591         u->path[i] = '\0';
          592         p += i;
          593 
          594         /* query */
          595         if (*p == '?') {
          596                 p++;
          597                 if ((i = strcspn(p, "#")) >= sizeof(u->query))
          598                         return -1; /* query too long */
          599                 memcpy(u->query, p, i);
          600                 u->query[i] = '\0';
          601                 p += i;
          602         }
          603 
          604         /* fragment */
          605         if (*p == '#') {
          606                 p++;
          607                 if ((i = strlen(p)) >= sizeof(u->fragment))
          608                         return -1; /* fragment too long */
          609                 memcpy(u->fragment, p, i);
          610                 u->fragment[i] = '\0';
          611         }
          612 
          613         return 0;
          614 }
          615 
          616 /* Transform and try to make the URI `u` absolute using base URI `b` into `a`.
          617    Follows some of the logic from "RFC 3986 - 5.2.2. Transform References".
          618    Returns 0 on success, -1 on error or truncation. */
          619 static int
          620 uri_makeabs(struct uri *a, struct uri *u, struct uri *b)
          621 {
          622         char *p;
          623         int c;
          624 
          625         strlcpy(a->fragment, u->fragment, sizeof(a->fragment));
          626 
          627         if (u->proto[0] || u->host[0]) {
          628                 strlcpy(a->proto, u->proto[0] ? u->proto : b->proto, sizeof(a->proto));
          629                 strlcpy(a->host, u->host, sizeof(a->host));
          630                 strlcpy(a->userinfo, u->userinfo, sizeof(a->userinfo));
          631                 strlcpy(a->host, u->host, sizeof(a->host));
          632                 strlcpy(a->port, u->port, sizeof(a->port));
          633                 strlcpy(a->path, u->path, sizeof(a->path));
          634                 strlcpy(a->query, u->query, sizeof(a->query));
          635                 return 0;
          636         }
          637 
          638         strlcpy(a->proto, b->proto, sizeof(a->proto));
          639         strlcpy(a->host, b->host, sizeof(a->host));
          640         strlcpy(a->userinfo, b->userinfo, sizeof(a->userinfo));
          641         strlcpy(a->host, b->host, sizeof(a->host));
          642         strlcpy(a->port, b->port, sizeof(a->port));
          643 
          644         if (!u->path[0]) {
          645                 strlcpy(a->path, b->path, sizeof(a->path));
          646         } else if (u->path[0] == '/') {
          647                 strlcpy(a->path, u->path, sizeof(a->path));
          648         } else {
          649                 a->path[0] = (b->host[0] && b->path[0] != '/') ? '/' : '\0';
          650                 a->path[1] = '\0';
          651 
          652                 if ((p = strrchr(b->path, '/'))) {
          653                         c = *(++p);
          654                         *p = '\0'; /* temporary NUL-terminate */
          655                         if (strlcat(a->path, b->path, sizeof(a->path)) >= sizeof(a->path))
          656                                 return -1;
          657                         *p = c; /* restore */
          658                 }
          659                 if (strlcat(a->path, u->path, sizeof(a->path)) >= sizeof(a->path))
          660                         return -1;
          661         }
          662 
          663         if (u->path[0] || u->query[0])
          664                 strlcpy(a->query, u->query, sizeof(a->query));
          665         else
          666                 strlcpy(a->query, b->query, sizeof(a->query));
          667 
          668         return 0;
          669 }
          670 
          671 static int
          672 uri_format(char *buf, size_t bufsiz, struct uri *u)
          673 {
          674         return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s",
          675                 u->proto,
          676                 u->userinfo[0] ? u->userinfo : "",
          677                 u->userinfo[0] ? "@" : "",
          678                 u->host,
          679                 u->port[0] ? ":" : "",
          680                 u->port,
          681                 u->host[0] && u->path[0] && u->path[0] != '/' ? "/" : "",
          682                 u->path,
          683                 u->query[0] ? "?" : "",
          684                 u->query,
          685                 u->fragment[0] ? "#" : "",
          686                 u->fragment);
          687 }
          688 
          /* Compare two tag names, ignoring case.
             Returns the strcasecmp() result: 0 when equal, otherwise non-zero. */
          static int
          tagcmp(const char *s1, const char *s2)
          {
                  int order;

                  order = strcasecmp(s1, s2);

                  return order;
          }
          695 
          /* Compare two attribute names, ignoring case.
             Returns the strcasecmp() result: 0 when equal, otherwise non-zero. */
          static int
          attrcmp(const char *s1, const char *s2)
          {
                  int order;

                  order = strcasecmp(s1, s2);

                  return order;
          }
          702 
          703 static void
          704 rindent(void)
          705 {
          706         int i, total;
          707 
          708         total = indent + defaultindent;
          709         if (total < 0)
          710                 total = 0;
          711         for (i = 0; i < total; i++)
          712                 putchar(' ');
          713 
          714         nbytesline += total;
          715         ncells += total;
          716 }
          717 
          718 static void
          719 emitmarkup(int markuptype)
          720 {
          721         if (!allowansi)
          722                 return;
          723 
          724         if (!markuptype)
          725                 fputs("\033[0m", stdout); /* reset all attributes */
          726 
          727         /* set */
          728         if (markuptype & MarkupBold)
          729                 fputs("\033[1m", stdout);
          730         if (markuptype & MarkupItalic)
          731                 fputs("\033[3m", stdout);
          732         if (markuptype & MarkupUnderline)
          733                 fputs("\033[4m", stdout);
          734         if (markuptype & MarkupBlink)
          735                 fputs("\033[5m", stdout);
          736         if (markuptype & MarkupReverse)
          737                 fputs("\033[7m", stdout);
          738         if (markuptype & MarkupStrike)
          739                 fputs("\033[9m", stdout);
          740 }
          741 
          742 /* flush remaining buffer (containing a word): used for word-wrap handling */
          743 static void
          744 hflush(void)
          745 {
          746         int i;
          747 
          748         if (!rbuflen)
          749                 return;
          750 
          751         if (!nbytesline) {
          752                 if (curmarkup)
          753                         emitmarkup(0);
          754                 rindent();
          755                 /* emit code again per line, needed for GNU/less -R */
          756                 if (curmarkup)
          757                         emitmarkup(curmarkup);
          758         }
          759 
          760         for (i = 0; i < rbuflen; i++)
          761                 putchar(rbuf[i]);
          762 
          763         nbytesline += rbuflen;
          764         ncells += rnbufcells;
          765         rbuflen = 0;
          766         rnbufcells = 0;
          767 }
          768 
          769 static void
          770 printansi(const char *s)
          771 {
          772         size_t len;
          773 
          774         if (!allowansi)
          775                 return;
          776 
          777         if (linewrap) {
          778                 len = strlen(s);
          779                 if (rbuflen + len + 1 >= sizeof(rbuf))
          780                         hflush();
          781                 if (rbuflen + len + 1 < sizeof(rbuf)) {
          782                         memcpy(rbuf + rbuflen, s, len);
          783                         rbuflen += len;
          784                         /* NOTE: nbytesline and ncells are not counted for markup */
          785                 }
          786         } else {
          787                 fputs(s, stdout);
          788         }
          789 }
          790 
          791 static void
          792 setmarkup(int markuptype)
          793 {
          794         if (!allowansi)
          795                 return;
          796 
          797         /* need change? */
          798         if (curmarkup == markuptype)
          799                 return;
          800 
          801         if (!markuptype) {
          802                 printansi("\033[0m"); /* reset all attributes */
          803                 curmarkup = markuptype;
          804                 return;
          805         }
          806 
          807         /* set */
          808         if (!(curmarkup & MarkupBold) && (markuptype & MarkupBold))
          809                 printansi("\033[1m");
          810         if (!(curmarkup & MarkupItalic) && (markuptype & MarkupItalic))
          811                 printansi("\033[3m");
          812         if (!(curmarkup & MarkupUnderline) && (markuptype & MarkupUnderline))
          813                 printansi("\033[4m");
          814         if (!(curmarkup & MarkupBlink) && (markuptype & MarkupBlink))
          815                 printansi("\033[5m");
          816         if (!(curmarkup & MarkupReverse) && (markuptype & MarkupReverse))
          817                 printansi("\033[7m");
          818         if (!(curmarkup & MarkupStrike) && (markuptype & MarkupStrike))
          819                 printansi("\033[9m");
          820 
          821         /* unset */
          822         if ((curmarkup & MarkupBold) && !(markuptype & MarkupBold))
          823                 printansi("\033[22m"); /* reset bold or faint */
          824         if ((curmarkup & MarkupItalic) && !(markuptype & MarkupItalic))
          825                 printansi("\033[23m"); /* reset italic */
          826         if ((curmarkup & MarkupUnderline) && !(markuptype & MarkupUnderline))
          827                 printansi("\033[24m"); /* reset underline */
          828         if ((curmarkup & MarkupBlink) && !(markuptype & MarkupBlink))
          829                 printansi("\033[25m"); /* reset blink */
          830         if ((curmarkup & MarkupReverse) && !(markuptype & MarkupReverse))
          831                 printansi("\033[27m"); /* reset reverse */
          832         if ((curmarkup & MarkupStrike) && !(markuptype & MarkupStrike))
          833                 printansi("\033[29m"); /* reset strike */
          834 
          835         curmarkup = markuptype;
          836 }
          837 
          838 static void
          839 startmarkup(int markuptype)
          840 {
          841         setmarkup(curmarkup | markuptype);
          842 }
          843 
          844 static void
          845 endmarkup(int markuptype)
          846 {
          847         setmarkup(curmarkup & ~markuptype);
          848 }
          849 
          /* Rough cell width of one byte of UTF-8 text: a TAB counts as 8 cells, a
             continuation byte (10xxxxxx, not the start of a codepoint) as 0 and
             anything else as 1.
             NOTE: this is of course incorrect since characters can be 2 cells wide as
             well, in the future maybe replace this with wcwidth() or similar */
          static int
          utfwidth(int c)
          {
                  if (c == '\t')
                          return 8;

                  /* only the start of a codepoint takes up a cell */
                  return ((c & 0xc0) == 0x80) ? 0 : 1;
          }
          865 
          866 /* write a character, handling state of repeated newlines, some HTML
          867    white-space rules, indentation and word-wrapping */
          868 static void
          869 hputchar(int c)
          870 {
          871         struct node *cur = &nodes[curnode];
          872         cur->hasdata = 1;
          873 
          874         if (c == '\n') {
          875                 /* previous line had characters, so not a repeated newline */
          876                 if (nbytesline > 0)
          877                         hadnewline = 0;
          878 
          879                 /* start a new line, no chars on this line yet */
          880                 whitespace_mode &= ~2; /* no chars on this line yet */
          881                 nbytesline = 0;
          882                 ncells = 0;
          883 
          884                 if (hadnewline)
          885                         currentnewlines++; /* repeating newlines */
          886                 hadnewline = 1;
          887         } else {
          888                 hadnewline = 0;
          889                 currentnewlines = 0;
          890         }
          891 
          892         /* skip initial/leading white-space */
          893         if (ISSPACE((unsigned char)c)) {
          894                 if (skipinitialws)
          895                         return;
          896         } else {
          897                 skipinitialws = 0;
          898         }
          899 
          900         if (!(c == '\n' || c == '\t' || !ISCNTRL((unsigned char)c)))
          901                 return;
          902 
          903         if (!linewrap) {
          904                 if (c == '\n') {
          905                         putchar('\n');
          906                         nbytesline = 0;
          907                         ncells = 0;
          908                 } else {
          909                         if (!nbytesline) {
          910                                 if (curmarkup)
          911                                         emitmarkup(0);
          912                                 rindent();
          913                                 /* emit code again per line, needed for GNU/less -R */
          914                                 if (curmarkup)
          915                                         emitmarkup(curmarkup);
          916                         }
          917                         putchar(c);
          918                         nbytesline++;
          919                         ncells += utfwidth(c);
          920                 }
          921                 return;
          922         }
          923 
          924         /* really too long: the whole word doesn't even fit, flush it */
          925         if (ncells + rnbufcells >= termwidth || rbuflen >= sizeof(rbuf) - 1) {
          926                 putchar('\n');
          927                 nbytesline = 0;
          928                 ncells = 0;
          929                 hflush();
          930         }
          931 
          932         if (c == '\n') {
          933                 putchar('\n');
          934                 hflush();
          935                 return;
          936         } else if (ISSPACE((unsigned char)c) || c == '-') {
          937                 if (ncells + rnbufcells >= termwidth) {
          938                         putchar('\n');
          939                         nbytesline = 0;
          940                         ncells = 0;
          941                 }
          942                 rbuf[rbuflen++] = c;
          943                 rnbufcells += utfwidth(c);
          944                 hflush();
          945                 return;
          946         }
          947 
          948         rbuf[rbuflen++] = c;
          949         rnbufcells += utfwidth(c);
          950 }
          951 
          952 /* calculate indentation of current node depth, using the sum of each
          953    indentation per node */
          954 static int
          955 calcindent(void)
          956 {
          957         int i, n = 0;
          958 
          959         for (i = curnode; i >= 0; i--)
          960                 n += nodes[i].indent;
          961 
          962         return n;
          963 }
          964 
          /* Print the string `s` character by character using hputchar(). */
          static void
          hprint(const char *s)
          {
                  while (*s) {
                          hputchar(*s);
                          s++;
                  }
          }
          971 
          /* printf()-like output using the formatting logic of hprint().
             The formatted text is limited to the size of a 256 byte buffer for now,
             anything longer is cut off. */
          static void
          hprintf(const char *fmt, ...)
          {
                  char text[256];
                  va_list args;

                  va_start(args, fmt);
                  vsnprintf(text, sizeof(text), fmt, args);
                  va_end(args);

                  hprint(text);
          }
          986 
          987 static void
          988 newline(void)
          989 {
          990         if (skipinitialws)
          991                 return;
          992         hputchar('\n');
          993 }
          994 
          995 static int
          996 parentcontainerhasdata(int curtype, int n)
          997 {
          998         int i;
          999 
         1000         for (i = n; i >= 0; i--) {
         1001                 if (nodes[i].tag.displaytype & (DisplayList|DisplayTable))
         1002                         break;
         1003                 if (nodes[i].hasdata)
         1004                         return 1;
         1005         }
         1006 
         1007         return 0;
         1008 }
         1009 
         1010 /* start on a newline for the start of a block element or not */
         1011 static void
         1012 startblock(void)
         1013 {
         1014         hflush();
         1015         whitespace_mode &= ~2; /* no characters on this line yet */
         1016         if (nbytesline <= 0)
         1017                 return;
         1018         if (!hadnewline && curnode >= 0 && nodes[curnode - 1].hasdata)
         1019                 hputchar('\n');
         1020 }
         1021 
         1022 /* start on a newline for the end of a block element or not */
         1023 static void
         1024 endblock(void)
         1025 {
         1026         hflush();
         1027         whitespace_mode &= ~2; /* no characters on this line yet */
         1028         if (nbytesline <= 0)
         1029                 return;
         1030         if (!hadnewline)
         1031                 hputchar('\n');
         1032 }
         1033 
         1034 /* print one character safely: no control characters,
         1035    handle HTML white-space rules */
         1036 static void
         1037 printc(int c)
         1038 {
         1039         if (ISSPACE((unsigned char)c)) {
         1040                 if (whitespace_mode == 2)
         1041                         hputchar(' ');
         1042                 whitespace_mode |= 1;
         1043         } else {
         1044                 whitespace_mode = 2;
         1045                 if (!ISCNTRL((unsigned char)c))
         1046                         hputchar(c);
         1047         }
         1048 }
         1049 
         1050 static void
         1051 printpre(const char *s, size_t len)
         1052 {
         1053         struct node *cur;
         1054         size_t i;
         1055 
         1056         /* reset state of newlines because this data is printed literally */
         1057         hadnewline = 0;
         1058         currentnewlines = 0;
         1059 
         1060         /* skip leading newline */
         1061         i = 0;
         1062         if (skipinitialws) {
         1063                 if (*s == '\n' && i < len) {
         1064                         s++;
         1065                         i++;
         1066                 }
         1067         }
         1068 
         1069         hflush();
         1070 
         1071         skipinitialws = 0;
         1072 
         1073         if (*s) {
         1074                 cur = &nodes[curnode];
         1075                 cur->hasdata = 1;
         1076         }
         1077 
         1078         for (; *s && i < len; s++, i++) {
         1079                 switch (*s) {
         1080                 case '\n':
         1081                         putchar('\n');
         1082                         nbytesline = 0;
         1083                         ncells = 0;
         1084                         break;
         1085                 case '\t':
         1086                         hadnewline = 0;
         1087                         if (!nbytesline) {
         1088                                 if (curmarkup)
         1089                                         emitmarkup(0);
         1090                                 rindent();
         1091                                 /* emit code again per line, needed for GNU/less -R */
         1092                                 if (curmarkup)
         1093                                         emitmarkup(curmarkup);
         1094                         }
         1095 
         1096                         /* TAB to 8 spaces */
         1097                         fputs("        ", stdout);
         1098                         nbytesline += 8;
         1099                         ncells += 8;
         1100                         break;
         1101                 default:
         1102                         if (ISCNTRL((unsigned char)*s))
         1103                                 continue;
         1104 
         1105                         if (!nbytesline) {
         1106                                 if (curmarkup)
         1107                                         emitmarkup(0);
         1108                                 rindent();
         1109                                 /* emit code again per line, needed for GNU/less -R */
         1110                                 if (curmarkup)
         1111                                         emitmarkup(curmarkup);
         1112                         }
         1113 
         1114                         putchar(*s);
         1115                         nbytesline++;
         1116                         /* start of rune: incorrectly assume 1 rune is 1 cell for now */
         1117                         ncells += utfwidth((unsigned char)*s);
         1118                 }
         1119         }
         1120 }
         1121 
         1122 static struct node *
         1123 findparenttype(int cur, int findtype)
         1124 {
         1125         int i;
         1126 
         1127         for (i = cur; i >= 0; i--) {
         1128                 if ((nodes[i].tag.displaytype & findtype))
         1129                         return &nodes[i];
         1130         }
         1131         return NULL;
         1132 }
         1133 
         /* Check if the class name `needle` is one of the white-space separated
            class names in `haystack` (the value of a class attribute).
            A class name only matches as a whole and the comparison is
            case-sensitive: "foo" matches "bar foo", but not "xfoo" or "foobar".
            Returns 1 on a match, otherwise 0. An empty `needle` never matches. */
         static int
         isclassmatch(const char *haystack, const char *needle)
         {
                 /* the same characters as ISSPACE() */
                 static const char ws[] = " \t\n\v\f\r";
                 const char *p;
                 size_t needlelen, toklen;

                 needlelen = strlen(needle);
                 for (p = haystack; ; p += toklen) {
                         p += strspn(p, ws); /* skip to the start of a class name */
                         if (*p == '\0')
                                 break;
                         /* compare against the whole name, never a part of it */
                         toklen = strcspn(p, ws);
                         if (toklen == needlelen && !memcmp(p, needle, needlelen))
                                 return 1;
                 }

                 return 0;
         }
         1159 
         1160 /* very limited CSS-like selector, supports: main, main#id, main.class,
         1161    ".class", "#id", "ul li a" */
         1162 static int
         1163 compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
         1164 {
         1165         int depth = 0, len;
         1166         long l;
         1167         const char *s, *start;
         1168         char tmp[256];
         1169         int nameset = 0;
         1170 
         1171         memset(&nodes[0], 0, sizeof(nodes[0]));
         1172         nodes[0].index = -1;
         1173 
         1174         s = sel;
         1175         for (; *s && ISSPACE((unsigned char)*s); s++)
         1176                 ;
         1177 
         1178         start = s;
         1179         for (; ; s++) {
         1180                 /* end of tag */
         1181                 if (!nameset &&
         1182                     (*s == '#' || *s == '.' || *s == '@' ||
         1183                      *s == '\0' || ISSPACE((unsigned char)*s))) {
         1184                         nameset = 1;
         1185                         len = s - start; /* tag name */
         1186                         if (len >= sizeof(tmp))
         1187                                 return 0;
         1188                         if (len)
         1189                                 memcpy(tmp, start, len);
         1190                         tmp[len] = '\0';
         1191 
         1192                         memcpy(nodes[depth].tagname, tmp, len + 1);
         1193                 }
         1194 
         1195                 /* end */
         1196                 if (*s == '\0' || ISSPACE((unsigned char)*s)) {
         1197                         for (; ISSPACE((unsigned char)*s); s++)
         1198                                 ;
         1199                         start = s; /* start of a new tag */
         1200                         depth++;
         1201                         if (depth >= maxnodes)
         1202                                 return 0;
         1203 
         1204                         nameset = 0;
         1205                         memset(&nodes[depth], 0, sizeof(nodes[depth]));
         1206                         nodes[depth].index = -1;
         1207 
         1208                         /* end of selector */
         1209                         if (*s == '\0')
         1210                                 break;
         1211                 }
         1212 
         1213                 /* index */
         1214                 if (*s == '@') {
         1215                         len = strcspn(s + 1, ".#@ \t\n");
         1216                         if (len >= sizeof(tmp))
         1217                                 return 0;
         1218                         memcpy(tmp, s + 1, len);
         1219                         tmp[len] = '\0';
         1220 
         1221                         l = strtol(tmp, NULL, 10);
         1222                         if (l >= 0)
         1223                                 nodes[depth].index = l;
         1224                         s += len;
         1225                         start = s + 1;
         1226                         continue;
         1227                 }
         1228 
         1229                 /* id */
         1230                 if (*s == '#') {
         1231                         len = strcspn(s + 1, ".#@ \t\n");
         1232                         if (len >= sizeof(tmp))
         1233                                 return 0;
         1234                         memcpy(tmp, s + 1, len);
         1235                         tmp[len] = '\0';
         1236                         memcpy(nodes[depth].id, tmp, len + 1);
         1237                         s += len;
         1238                         start = s + 1;
         1239                         continue;
         1240                 }
         1241 
         1242                 /* class */
         1243                 if (*s == '.') {
         1244                         len = strcspn(s + 1, ".#@ \t\n");
         1245                         if (len >= sizeof(tmp))
         1246                                 return 0;
         1247                         memcpy(tmp, s + 1, len);
         1248                         tmp[len] = '\0';
         1249                         /* allow only one classname for now */
         1250                         memcpy(nodes[depth].classnames, tmp, len + 1);
         1251                         s += len;
         1252                         start = s + 1;
         1253                         continue;
         1254                 }
         1255         }
         1256 
         1257         return depth;
         1258 }
         1259 
         1260 static struct selector *
         1261 newselector(const char *q)
         1262 {
         1263         struct selector *sel;
         1264         int r;
         1265 
         1266         sel = ecalloc(1, sizeof(*sel));
         1267         sel->text = estrdup(q);
         1268 
         1269         r = compileselector(sel->text, sel->nodes, LEN(sel->nodes));
         1270         if (r <= 0) {
         1271                 free(sel);
         1272                 return NULL;
         1273         }
         1274         sel->depth = r;
         1275 
         1276         return sel;
         1277 }
         1278 
         1279 static struct selectors *
         1280 compileselectors(const char *q)
         1281 {
         1282         struct selectors *sels = NULL;
         1283         struct selector *sel;
         1284         const char *start;
         1285         char *qe;
         1286         int count = 0;
         1287         size_t siz;
         1288 
         1289         sels = ecalloc(1, sizeof(*sels));
         1290 
         1291         start = q;
         1292         for (; ; q++) {
         1293                 if (*q == ',' || *q == '\0') {
         1294                         qe = estrndup(start, q - start);
         1295                         sel = newselector(qe);
         1296                         free(qe);
         1297 
         1298                         /* add new selector */
         1299                         siz = (count + 1) * sizeof(struct selector *);
         1300                         sels->selectors = erealloc(sels->selectors, siz);
         1301                         sels->selectors[count] = sel;
         1302                         count++;
         1303 
         1304                         if (*q == '\0')
         1305                                 break;
         1306                         start = q + 1;
         1307                 }
         1308         }
         1309         sels->count = count;
         1310 
         1311         return sels;
         1312 }
         1313 
         1314 /* very limited CSS-like matcher, supports: main, main#id, main.class,
         1315    ".class", "#id", "ul li a" */
         1316 static int
         1317 iscssmatch(struct selector *sel, struct node *root, int maxdepth)
         1318 {
         1319         int d, md = 0;
         1320 
         1321         for (d = 0; d <= maxdepth; d++) {
         1322                 /* tag matched? */
         1323                 if (sel->nodes[md].tagname[0] &&
         1324                     strcasecmp(sel->nodes[md].tagname, root[d].tagname))
         1325                         continue; /* no */
         1326 
         1327                 /* id matched? */
         1328                 if (sel->nodes[md].id[0] && strcmp(sel->nodes[md].id, root[d].id))
         1329                         continue; /* no */
         1330 
         1331                 /* class matched, for now allow only one classname in the selector,
         1332                    matching multiple classnames */
         1333                 if (sel->nodes[md].classnames[0] &&
         1334                     !isclassmatch(root[d].classnames, sel->nodes[md].classnames))
         1335                         continue; /* no */
         1336 
         1337                 /* index matched */
         1338                 if (sel->nodes[md].index != -1 &&
         1339                     (d == 0 ||
         1340                     root[d - 1].nchildren == 0 ||
         1341                     sel->nodes[md].index != root[d - 1].nchildren - 1))
         1342                         continue;
         1343 
         1344                 md++;
         1345                 /* all matched of one selector */
         1346                 if (md == sel->depth)
         1347                         return 1;
         1348         }
         1349 
         1350         return 0;
         1351 }
         1352 
         1353 static int
         1354 iscssmatchany(struct selectors *sels, struct node *root, int maxdepth)
         1355 {
         1356         struct selector *sel;
         1357         int i;
         1358 
         1359         for (i = 0; i < sels->count; i++) {
         1360                 sel = sels->selectors[i];
         1361                 if (iscssmatch(sel, root, maxdepth))
         1362                         return 1;
         1363         }
         1364         return 0;
         1365 }
         1366 
         1367 static void
         1368 handleinlinealt(void)
         1369 {
         1370         struct node *cur;
         1371         char *start, *s, *e;
         1372 
         1373         /* do not show the alt text if the element is hidden */
         1374         cur = &nodes[curnode];
         1375         if (cur->tag.displaytype & DisplayNone)
         1376                 return;
         1377 
         1378         /* show img alt attribute as text. */
         1379         if (attr_alt.len) {
         1380                 start = attr_alt.data;
         1381                 e = attr_alt.data + attr_alt.len;
         1382 
         1383                 for (s = start; s < e; s++)
         1384                         printc((unsigned char)*s);
         1385                 hflush();
         1386         } else if (cur->tag.id == TagImg && !showurlinline) {
         1387                 /* if there is no alt text and no URL is shown inline, then
         1388                    show "[IMG]" to indicate there was an image there */
         1389                 hprint("[IMG]");
         1390         }
         1391 }
         1392 
         1393 /* lookup a link reference by URL in the red-black tree */
         1394 static struct linkref *
         1395 findlinkref(const char *url)
         1396 {
         1397         struct linkref find;
         1398 
         1399         find.url = (char *)url;
         1400 
         1401         return RB_FIND(linkreftree, &linkrefhead, &find);
         1402 }
         1403 
         1404 /* add a link reference. Returns the added link reference, or the existing link
         1405    reference if links are deduplicated */
         1406 static struct linkref *
         1407 addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden)
         1408 {
         1409         struct linkref *link;
         1410         size_t linknr;
         1411 
         1412         /* if links are deduplicates return the existing link */
         1413         if (uniqrefs && (link = findlinkref(url)))
         1414                 return link;
         1415 
         1416         if (tagid == TagA)
         1417                 _type = "link";
         1418 
         1419         link = ecalloc(1, sizeof(*link));
         1420 
         1421         if (!ishidden) {
         1422                 linknr = ++nvisrefs;
         1423                 if (nvisrefs >= ncapvisrefs) {
         1424                         ncapvisrefs += 256; /* greedy alloc */
         1425                         visrefs = erealloc(visrefs, sizeof(*visrefs) * ncapvisrefs);
         1426                 }
         1427                 visrefs[linknr - 1] = link; /* add pointer to list */
         1428         } else {
         1429                 linknr = ++nhiddenrefs;
         1430                 if (nhiddenrefs >= ncaphiddenrefs) {
         1431                         ncaphiddenrefs += 256; /* greedy alloc */
         1432                         hiddenrefs = erealloc(hiddenrefs, sizeof(*hiddenrefs) * ncaphiddenrefs);
         1433                 }
         1434                 hiddenrefs[linknr - 1] = link; /* add pointer to list */
         1435         }
         1436 
         1437         link->url = estrdup(url);
         1438         link->type = estrdup(_type);
         1439         link->tagid = tagid;
         1440         link->ishidden = ishidden;
         1441         link->linknr = linknr;
         1442 
         1443         /* add to tree: the tree is only used for checking unique link references */
         1444         if (uniqrefs)
         1445                 RB_INSERT(linkreftree, &linkrefhead, link);
         1446 
         1447         return link;
         1448 }
         1449 
         1450 static void
         1451 handleinlinelink(void)
         1452 {
         1453         struct uri newuri, olduri;
         1454         struct node *cur;
         1455         char buf[4096], *url;
         1456         int r;
         1457 
         1458         if (!showrefbottom && !showrefinline && !showurlinline && !resources)
         1459                 return; /* there is no need to collect the reference */
         1460 
         1461         if (!attr_href.len && !attr_src.len && !attr_data.len)
         1462                 return; /* there is no reference */
         1463 
         1464         /* by default use the original URL */
         1465         if (attr_src.len)
         1466                 url = attr_src.data;
         1467         else if (attr_href.len)
         1468                 url = attr_href.data;
         1469         else
         1470                 url = attr_data.data;
         1471 
         1472         if (!url)
         1473                 return;
         1474 
         1475         /* Not an absolute URL yet: try to make it absolute.
         1476            If it is not possible use the relative URL */
         1477         if (!uri_hasscheme(url) && basehrefset &&
         1478             uri_parse(url, &olduri) != -1 &&
         1479             uri_makeabs(&newuri, &olduri, &base) != -1 &&
         1480             newuri.proto[0]) {
         1481                 r = uri_format(buf, sizeof(buf), &newuri);
         1482                 if (r >= 0 && (size_t)r < sizeof(buf))
         1483                         url = buf;
         1484         }
         1485 
         1486         if (!url[0])
         1487                 return;
         1488 
         1489         cur = &nodes[curnode];
         1490 
         1491         if (!(cur->tag.displaytype & DisplayNone)) {
         1492                 string_clear(&nodes_links[curnode]);
         1493                 string_append(&nodes_links[curnode], url, strlen(url));
         1494         }
         1495 
         1496         /* add hidden links directly to the reference,
         1497            the order doesn't matter */
         1498         if (cur->tag.displaytype & DisplayNone)
         1499                 addlinkref(url, cur->tag.name, cur->tag.id, 1);
         1500 }
         1501 
         1502 static void
         1503 printlinkrefs(void)
         1504 {
         1505         struct linkref *ref;
         1506         size_t i;
         1507 
         1508         if (!nvisrefs && !nhiddenrefs)
         1509                 return;
         1510 
         1511         if (resources) {
         1512                 for (i = 0; i < nvisrefs; i++) {
         1513                         ref = visrefs[i];
         1514                         dprintf(3, "%s\t%s\n", ref->type, ref->url);
         1515                 }
         1516                 for (i = 0; i < nhiddenrefs; i++) {
         1517                         ref = hiddenrefs[i];
         1518                         dprintf(3, "%s\t%s\n", ref->type, ref->url);
         1519                 }
         1520         }
         1521 
         1522         printf("\nReferences\n\n");
         1523 
         1524         for (i = 0; i < nvisrefs; i++) {
         1525                 ref = visrefs[i];
         1526                 printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->type);
         1527         }
         1528 
         1529         if (nhiddenrefs > 0)
         1530                 printf("\n\nHidden references\n\n");
         1531         /* hidden links don't have a link number, just count them */
         1532         for (i = 0; i < nhiddenrefs; i++) {
         1533                 ref = hiddenrefs[i];
         1534                 printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->type);
         1535         }
         1536 }
         1537 
         1538 /* size to grow node capacity (greedy) */
         1539 #define NODE_CAP_INC 16
         1540 
         1541 /* increase node depth, allocate space for nodes if needed */
         1542 static void
         1543 incnode(void)
         1544 {
         1545         size_t i;
         1546 
         1547         curnode++;
         1548 
         1549         if (curnode >= MAX_NODE_DEPTH)
         1550                 errx(1, "max node depth reached: %d", curnode);
         1551 
         1552         if (curnode >= ncapnodes) {
         1553                 nodes = erealloc(nodes, sizeof(*nodes) * (ncapnodes + NODE_CAP_INC));
         1554                 nodes_links = erealloc(nodes_links, sizeof(*nodes_links) * (ncapnodes + NODE_CAP_INC));
         1555 
         1556                 /* clear new region */
         1557                 memset(&nodes[ncapnodes], 0, sizeof(*nodes) * NODE_CAP_INC);
         1558                 memset(&nodes_links[ncapnodes], 0, sizeof(*nodes_links) * NODE_CAP_INC);
         1559 
         1560                 for (i = 0; i < ncapnodes; i++)
         1561                         nodes[i].tag.name = nodes[i].tagname; /* assign to use fixed-size buffer */
         1562 
         1563                 for (i = ncapnodes; i < ncapnodes + NODE_CAP_INC; i++) {
         1564                         nodes[i].tag.displaytype = DisplayInline;
         1565                         nodes[i].tag.name = nodes[i].tagname; /* assign to use fixed-size buffer */
         1566                 }
         1567 
         1568                 ncapnodes += NODE_CAP_INC; /* greedy alloc */
         1569         }
         1570 }
         1571 
              /* Start-of-data handler: the body is intentionally empty. The buffered data
                 is printed in xmldataend(). xmlcdatastart() forwards to this function. */
         1572 static void
         1573 xmldatastart(XMLParser *p)
         1574 {
         1575 }
         1576 
         1577 static void
         1578 xmldataend(XMLParser *p)
         1579 {
         1580         struct node *cur;
         1581         char *start, *s, *e;
         1582 
         1583         if (!htmldata.data || !htmldata.len)
         1584                 return;
         1585 
         1586         cur = &nodes[curnode];
         1587 
         1588         if (reader_ignore || (cur->tag.displaytype & DisplayNone)) {
         1589                 /* print nothing */
         1590         } else if ((cur->tag.displaytype & DisplayPre) ||
         1591                    findparenttype(curnode - 1, DisplayPre)) {
         1592                 printpre(htmldata.data, htmldata.len);
         1593         } else {
         1594                 start = htmldata.data;
         1595                 e = htmldata.data + htmldata.len;
         1596 
         1597                 for (s = start; s < e; s++)
         1598                         printc((unsigned char)*s);
         1599         }
         1600 
         1601         string_clear(&htmldata);
         1602 }
         1603 
         1604 static void
         1605 xmldata(XMLParser *p, const char *data, size_t datalen)
         1606 {
         1607         struct node *cur;
         1608 
         1609         if (reader_ignore)
         1610                 return;
         1611 
         1612         cur = &nodes[curnode];
         1613         if (cur->tag.displaytype & DisplayNone)
         1614                 return;
         1615 
         1616         string_append(&htmldata, data, datalen);
         1617 }
         1618 
         1619 static void
         1620 xmldataentity(XMLParser *p, const char *data, size_t datalen)
         1621 {
         1622         struct node *cur;
         1623         char buf[8];
         1624         int len;
         1625 
         1626         if (reader_ignore)
         1627                 return;
         1628 
         1629         cur = &nodes[curnode];
         1630         if (cur->tag.displaytype & DisplayNone)
         1631                 return;
         1632 
         1633         len = xml_entitytostr(data, buf, sizeof(buf));
         1634         if (len > 0)
         1635                 xmldata(p, buf, (size_t)len);
         1636         else
         1637                 xmldata(p, data, datalen);
         1638 }
         1639 
              /* Start of a CDATA section: forwards to xmldatastart(), so CDATA is handled
                 the same way as regular data. */
         1640 static void
         1641 xmlcdatastart(XMLParser *p)
         1642 {
         1643         xmldatastart(p);
         1644 }
         1645 
              /* End of a CDATA section: forwards to xmldataend(), which prints and clears
                 the buffered text. */
         1646 static void
         1647 xmlcdataend(XMLParser *p)
         1648 {
         1649         xmldataend(p); /* treat CDATA as data */
         1650 }
         1651 
              /* CDATA content: forwards the chunk (data, datalen) to xmldata(), which
                 buffers it like regular text. */
         1652 static void
         1653 xmlcdata(XMLParser *p, const char *data, size_t datalen)
         1654 {
         1655         xmldata(p, data, datalen); /* treat CDATA as data */
         1656 }
         1657 
         1658 /* lookup function to compare tag name (case-insensitive) for sort functions */
         1659 static int
         1660 findtagcmp(const void *v1, const void *v2)
         1661 {
         1662         struct tag *t1 = (struct tag *)v1;
         1663         struct tag *t2 = (struct tag *)v2;
         1664 
         1665         return strcasecmp(t1->name, t2->name);
         1666 }
         1667 
         1668 /* binary search tag by tag name */
         1669 static struct tag *
         1670 findtag(const char *t)
         1671 {
         1672         struct tag find = { 0 };
         1673 
         1674         find.name = t;
         1675 
         1676         return bsearch(&find, tags, LEN(tags), sizeof(*tags), findtagcmp);
         1677 }
         1678 
         1679 static void
         1680 handleendtag(struct tag *tag)
         1681 {
         1682         int i, marginbottom;
         1683 
         1684         if (tag->displaytype & DisplayNone)
         1685                 return;
         1686         if (reader_ignore)
         1687                 return;
         1688 
         1689         if (tag->displaytype & (DisplayButton | DisplayOption)) {
         1690                 hputchar(']');
         1691                 hflush();
         1692         }
         1693 
         1694         if (tag->displaytype & (DisplayBlock | DisplayHeader | DisplayTable | DisplayTableRow |
         1695                 DisplayList | DisplayListItem | DisplayPre)) {
         1696                 endblock(); /* break line if needed */
         1697         }
         1698 
         1699         /* when a list ends and its not inside a list add an extra bottom margin */
         1700         marginbottom = tag->marginbottom;
         1701 
         1702         if (marginbottom > 0) {
         1703                 if (tag->displaytype & DisplayList) {
         1704                         if (findparenttype(curnode - 1, DisplayList))
         1705                                 marginbottom--;
         1706                 }
         1707         }
         1708 
         1709         if (marginbottom > 0) {
         1710                 hflush();
         1711                 for (i = currentnewlines; i < marginbottom; i++) {
         1712                         putchar('\n');
         1713                         nbytesline = 0;
         1714                         ncells = 0;
         1715                         currentnewlines++;
         1716                 }
         1717                 hadnewline = 1;
         1718         }
         1719 }
         1720 
         1721 static void
         1722 endnode(struct node *cur)
         1723 {
         1724         struct linkref *ref;
         1725         int i, ishidden;
         1726 
         1727         /* set a flag indicating the element and its parent containers have data.
         1728            This is used for some formatting */
         1729         if (cur->hasdata) {
         1730                 for (i = curnode; i >= 0; i--)
         1731                         nodes[i].hasdata = 1;
         1732         }
         1733 
         1734         endmarkup(cur->tag.markuptype);
         1735 
         1736         ishidden = reader_ignore || (cur->tag.displaytype & DisplayNone);
         1737 
         1738         /* add link and show the link number in the visible order */
         1739         if (!ishidden && nodes_links[curnode].len > 0) {
         1740                 ref = addlinkref(nodes_links[curnode].data,
         1741                         cur->tag.name, cur->tag.id, ishidden);
         1742 
         1743                 if (showrefinline || showurlinline) {
         1744                         hflush();
         1745                         startmarkup(MarkupReverse);
         1746                 }
         1747 
         1748                 if (showrefinline)
         1749                         hprintf("[%zu]", ref->linknr);
         1750                 if (showurlinline) {
         1751                         if (ref->tagid == TagA)
         1752                                 hprintf("[%s]", ref->url);
         1753                         else
         1754                                 hprintf("[%s: %s]", ref->type, ref->url);
         1755                 }
         1756                 if (showrefinline || showurlinline) {
         1757                         endmarkup(MarkupReverse);
         1758                         hflush();
         1759                 }
         1760         }
         1761 
         1762         handleendtag(&(cur->tag));
         1763 }
         1764 
              /* Handler for a closing tag named `t`.
                 Steps, in order:
                 - a closing tag of a void element is ignored unless `isshort` is set
                   (presumably the short form such as <br/> - confirm against the parser);
                 - depending on the display type of the closing tag, still-open optional
                   child tags (li, td, option, p, dd, dt) are closed first;
                 - the matching open node is ended: either the current node or the
                   nearest node with the same name further up;
                 - indentation and markup of the node that is now current are restored;
                 - in reader mode, output is switched off again when the current node
                   path no longer matches the "show" selectors.
                 `tl` is not used in this function. */
         1765 static void
         1766 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
         1767 {
         1768         struct tag *found, *tag;
         1769         enum TagId childs[16];
         1770         size_t nchilds;
         1771         int i, j, k, nchildfound, parenttype;
         1772 
         1773         /* match tag and lookup metadata */
         1774         /* ignore closing of void elements, like </br>, which is not allowed */
         1775         if ((found = findtag(t))) {
         1776                 if (!isshort && found->isvoid)
         1777                         return;
         1778         }
         1779 
         1780         /* TODO: implement more complete optional tag handling.
         1781            in reality the optional tag rules are more complex, see:
         1782            https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
         1783 
         1784         nchilds = 0;
         1785         nchildfound = 0;
         1786         parenttype = 0; /* by default, seek until the root */
         1787 
                      /* select which open child tags (childs[]) have to be closed implicitly
                         and at which kind of parent (parenttype) the upward search stops */
         1788         if (found && found->displaytype & DisplayPre) {
         1789                 skipinitialws = 0; /* do not skip white-space, for margins */
         1790         } else if (found && found->displaytype & DisplayList) {
         1791                 childs[0] = TagLi;
         1792                 nchilds = 1;
         1793                 parenttype = DisplayList;
         1794         } else if (found && found->displaytype & DisplayTableRow) {
         1795                 childs[0] = TagTd;
         1796                 nchilds = 1;
         1797                 parenttype = DisplayTableRow;
         1798         } else if (found && found->displaytype & DisplayTable) {
         1799                 childs[0] = TagTd;
         1800                 nchilds = 1;
         1801                 parenttype = DisplayTable;
         1802         } else if (found && found->displaytype & DisplaySelect) {
         1803                 childs[0] = TagOption;
         1804                 nchilds = 1;
         1805                 parenttype = DisplaySelect;
         1806         } else if (found && found->displaytype & DisplayDl) {
         1807                 childs[0] = TagP;
         1808                 childs[1] = TagDd;
         1809                 childs[2] = TagDt;
         1810                 nchilds = 3;
         1811                 parenttype = DisplayDl;
         1812         } else if (found && found->displaytype & DisplayBlock) {
         1813                 childs[0] = TagP;
         1814                 nchilds = 1;
         1815                 parenttype = 0; /* seek until the root */
         1816         }
         1817 
                      /* walk from the current node towards the root; stop at the first
                         node of the parent type or once one listed child was closed */
         1818         if (nchilds > 0) {
         1819                 for (i = curnode; i >= 0; i--) {
         1820                         if (nchildfound)
         1821                                 break;
         1822                         if ((nodes[i].tag.displaytype & parenttype))
         1823                                 break;
         1824                         for (j = 0; j < nchilds; j++) {
         1825                                 if (nodes[i].tag.id == childs[j]) {
         1826                                         /* fake closing the previous tags */
         1827                                         for (k = curnode; k >= i; k--)
         1828                                                 endnode(&nodes[k]);
                                                      /* k is i - 1 here: the parent of the closed child */
         1829                                         curnode = k;
         1830                                         nchildfound = 1;
         1831                                         break;
         1832                                 }
         1833                         }
         1834                 }
         1835         }
         1836 
         1837         /* if the current closing tag matches the current open tag */
         1838         if (nodes[curnode].tag.name &&
         1839             !tagcmp(nodes[curnode].tag.name, t)) {
         1840                 endnode(&nodes[curnode]);
                              /* never go below index 0 */
         1841                 if (curnode)
         1842                         curnode--;
         1843         } else {
         1844                 /* ... else lookup the first matching start tag. This is also
         1845                    for handling optional closing tags */
         1846                 tag = NULL;
         1847                 for (i = curnode; i >= 0; i--) {
         1848                         if (nodes[i].tag.name &&
         1849                             !tagcmp(nodes[i].tag.name, t)) {
         1850                                 endnode(&nodes[i]);
         1851                                 curnode = i > 0 ? i - 1 : 0;
         1852                                 tag = &nodes[i].tag;
         1853                                 break;
         1854                         }
         1855                 }
         1856                 /* unmatched closing tag found */
         1857                 if (!tag && found)
         1858                         handleendtag(found);
         1859         }
         1860         indent = calcindent();
         1861 
                      /* the region below is compiled out (#if 0) */
         1862 #if 0
         1863         /* check if linewrap is enabled, but currently is disabled and needs to
         1864            be restored */
         1865         if (allowlinewrap && !linewrap) {
         1866                 tag = NULL;
         1867                 for (i = curnode; i >= 0; i--) {
         1868                         if (nodes[i].tag.id == TagTable) {
         1869                                 tag = &nodes[i].tag;
         1870                                 break;
         1871                         }
         1872                 }
         1873                 if (!tag)
         1874                         linewrap = allowlinewrap;
         1875         }
         1876 #endif
         1877 
         1878         /* restore markup of the tag we are in now */
         1879         startmarkup(nodes[curnode].tag.markuptype);
         1880 
         1881         /* check if the current node still matches the visible selector */
         1882         if (reader_mode && sel_show && !reader_ignore) {
         1883                 if (!iscssmatchany(sel_show, nodes, curnode)) {
         1884                         reader_ignore = 1;
         1885                         newline();
         1886                 }
         1887         }
         1888 }
         1889 
         1890 static void
         1891 xmltagstart(XMLParser *p, const char *t, size_t tl)
         1892 {
         1893         struct tag *found;
         1894         struct node *cur;
         1895         enum TagId tagid;
         1896         enum TagId childs[16];
         1897         size_t nchilds;
         1898         char *s;
         1899         int i, j, k, nchildfound, parenttype;
         1900 
         1901         cur = &nodes[curnode];
         1902 
         1903         string_clear(&attr_alt);
         1904         string_clear(&attr_checked);
         1905         string_clear(&attr_class);
         1906         attr_class_set = 0;
         1907         string_clear(&attr_data);
         1908         string_clear(&attr_href);
         1909         string_clear(&attr_id);
         1910         attr_id_set = 0;
         1911         string_clear(&attr_src);
         1912         string_clear(&attr_type);
         1913         string_clear(&attr_value);
         1914 
         1915         /* match tag and lookup metadata */
         1916         found = findtag(t);
         1917 
         1918         /* TODO: implement more complete optional tag handling.
         1919            in reality the optional tag rules are more complex, see:
         1920            https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
         1921 
         1922         nchilds = 0;
         1923         nchildfound = 0;
         1924         parenttype = 0; /* by default, seek until the root */
         1925 
         1926         /* if optional tag <p> is open and a list element is found, close </p>. */
         1927         if (found && found->displaytype & DisplayList) {
         1928                 /* not inside a list */
         1929                 childs[0] = TagP;
         1930                 nchilds = 1;
         1931                 parenttype = DisplayList;
         1932         } else if (found && found->isoptional) {
         1933                 tagid = found->id;
         1934                 if (tagid == TagLi) {
         1935                         childs[0] = TagLi;
         1936                         nchilds = 1;
         1937                         parenttype = DisplayList;
         1938                 } else if (tagid == TagTd) {
         1939                         childs[0] = TagTd;
         1940                         nchilds = 1;
         1941                         parenttype = DisplayTableRow;
         1942                 } else if (tagid == TagTr) {
         1943                         childs[0] = TagTr;
         1944                         nchilds = 1;
         1945                         parenttype = DisplayTable;
         1946                 } else if (tagid == TagP) {
         1947                         childs[0] = TagP;
         1948                         nchilds = 1;
         1949                         parenttype = 0; /* seek until the root */
         1950                 } else if (tagid == TagOption) {
         1951                         childs[0] = TagOption;
         1952                         nchilds = 1;
         1953                         parenttype = DisplaySelect;
         1954                 } else if (tagid == TagDt) {
         1955                         childs[0] = TagDd;
         1956                         nchilds = 1;
         1957                         parenttype = DisplayDl;
         1958                 } else if (tagid == TagDd) {
         1959                         childs[0] = TagDd;
         1960                         childs[1] = TagDt;
         1961                         nchilds = 2;
         1962                         parenttype = DisplayDl;
         1963                 } else if (tagid == cur->tag.id) {
         1964                         /* fake closing the previous tag if it is the same and repeated */
         1965                         xmltagend(p, t, tl, 0);
         1966                 }
         1967         } else if (found && found->displaytype & DisplayBlock) {
         1968                 /* check if we have an open "<p>" tag */
         1969                 childs[0] = TagP;
         1970                 childs[1] = TagDl;
         1971                 nchilds = 2;
         1972                 parenttype = DisplayDl;
         1973         }
         1974 
         1975         if (nchilds > 0) {
         1976                 for (i = curnode; i >= 0; i--) {
         1977                         if (nchildfound)
         1978                                 break;
         1979                         if ((nodes[i].tag.displaytype & parenttype))
         1980                                 break;
         1981                         for (j = 0; j < nchilds; j++) {
         1982                                 if (nodes[i].tag.id == childs[j]) {
         1983                                         /* fake closing the previous tags */
         1984                                         for (k = curnode; k >= i; k--)
         1985                                                 xmltagend(p, nodes[k].tag.name, strlen(nodes[k].tag.name), 0);
         1986                                         nchildfound = 1;
         1987                                         break;
         1988                                 }
         1989                         }
         1990                 }
         1991         }
         1992 
         1993         incnode();
         1994         string_clear(&nodes_links[curnode]); /* clear possible link reference for this node */
         1995         cur = &nodes[curnode];
         1996         memset(cur, 0, sizeof(*cur)); /* clear / reset node */
         1997         /* tag defaults */
         1998         cur->tag.displaytype = DisplayInline;
         1999         cur->tag.name = cur->tagname; /* assign fixed-size buffer */
         2000         strlcpy(cur->tagname, t, sizeof(cur->tagname));
         2001 
         2002         /* force to lowercase */
         2003         for (s = cur->tagname; *s; s++)
         2004                 *s = TOLOWER((unsigned char)*s);
         2005 
         2006         /* matched tag: copy tag information to current node */
         2007         if (found)
         2008                 memcpy(&(cur->tag), found, sizeof(*found));
         2009 
         2010         /* if parent tag is hidden then hide itself too */
         2011         if (curnode > 0 && (nodes[curnode - 1].tag.displaytype & DisplayNone))
         2012                 cur->tag.displaytype |= DisplayNone;
         2013 }
         2014 
         2015 static void
         2016 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
         2017 {
         2018         struct tag *found;
         2019         enum TagId tagid;
         2020         struct node *cur, *parent;
         2021         int i, margintop;
         2022 
         2023         /* match tag and lookup metadata */
         2024         tagid = 0;
         2025         if ((found = findtag(t)))
         2026                 tagid = found->id;
         2027 
         2028         /* temporary replace the callback except the reader and end of tag
         2029            restore the context once we receive the same ignored tag in the
         2030            end tag handler */
         2031         if (tagid == TagScript) {
         2032                 ignorestate = endtag = "</script>";
         2033                 getnext = p->getnext; /* for restore */
         2034                 p->getnext = getnext_ignore;
         2035                 xmltagend(p, t, tl, 0); /* fake the call the tag was ended */
         2036                 return;
         2037         } else if (tagid == TagStyle) {
         2038                 ignorestate = endtag = "</style>";
         2039                 getnext = p->getnext; /* for restore */
         2040                 p->getnext = getnext_ignore;
         2041                 xmltagend(p, t, tl, 0); /* fake the call the tag was ended */
         2042                 return;
         2043         }
         2044 
         2045 #if 0
         2046         /* disable line-wrapping inside tables */
         2047         if (tagid == TagTable)
         2048                 linewrap = 0;
         2049 #endif
         2050 
         2051         cur = &nodes[curnode];
         2052 
         2053         /* copy attributes if set */
         2054         if (attr_id.len)
         2055                 strlcpy(cur->id, attr_id.data, sizeof(cur->id));
         2056         else
         2057                 cur->id[0] = '\0';
         2058         if (attr_class.len)
         2059                 strlcpy(cur->classnames, attr_class.data, sizeof(cur->classnames));
         2060         else
         2061                 cur->classnames[0] = '\0';
         2062 
         2063         /* parent node */
         2064         if (curnode > 0) {
         2065                 parent = &nodes[curnode - 1];
         2066                 parent->nchildren++; /* increase child node count */
         2067                 /* count visible childnodes */
         2068                 if (!(cur->tag.displaytype & DisplayNone))
         2069                         parent->visnchildren++;
         2070         } else {
         2071                 parent = NULL;
         2072         }
         2073 
         2074         if (reader_mode && sel_show && reader_ignore &&
         2075             iscssmatchany(sel_show, nodes, curnode))
         2076                 reader_ignore = 0;
         2077 
         2078         /* hide element */
         2079         if (reader_mode && sel_hide &&
         2080             iscssmatchany(sel_hide, nodes, curnode))
         2081                 cur->tag.displaytype |= DisplayNone;
         2082 
         2083         /* indent for this tag */
         2084         cur->indent = cur->tag.indent;
         2085 
         2086         if (!reader_ignore) {
         2087                 /* add link reference, print links and alt text */
         2088                 handleinlinelink();
         2089                 handleinlinealt();
         2090         }
         2091 
         2092         /* <select><option> */
         2093         if ((cur->tag.displaytype & DisplayOption) && parent) {
         2094                 /* <select multiple>: show all options */
         2095                 if (parent->tag.displaytype & DisplaySelectMulti)
         2096                         cur->tag.displaytype |= DisplayBlock;
         2097                 else if (parent->nchildren > 1) /* show the first item as selected */
         2098                         cur->tag.displaytype |= DisplayNone; /* else hide */
         2099         }
         2100 
         2101         if (cur->tag.displaytype & DisplayNone)
         2102                 return;
         2103 
         2104         if (reader_ignore)
         2105                 return;
         2106 
         2107         indent = calcindent();
         2108 
         2109         if ((cur->tag.displaytype & (DisplayBlock | DisplayHeader | DisplayPre |
         2110                 DisplayTable | DisplayTableRow |
         2111                 DisplayList | DisplayListItem))) {
         2112                 startblock(); /* break line if needed */
         2113         }
         2114 
         2115         if (cur->tag.displaytype & (DisplayButton | DisplayOption)) {
         2116                 hflush();
         2117                 hputchar('[');
         2118         }
         2119 
         2120         margintop = cur->tag.margintop;
         2121         if (cur->tag.displaytype & (DisplayList)) {
         2122                 for (i = curnode - 1; i >= 0; i--) {
         2123                         if (nodes[i].tag.displaytype & DisplayList)
         2124                                 break;
         2125                         if (!(nodes[i].tag.displaytype & DisplayListItem))
         2126                                 continue;
         2127                         if (nodes[i].hasdata && margintop > 0) {
         2128                                 margintop--;
         2129                                 break;
         2130                         }
         2131                 }
         2132         } else if (cur->tag.displaytype & (DisplayBlock|DisplayTable)) {
         2133                 if (!parentcontainerhasdata(cur->tag.displaytype, curnode - 1)) {
         2134                         if (margintop > 0)
         2135                                 margintop--;
         2136                 }
         2137         }
         2138 
         2139         if (margintop > 0) {
         2140                 hflush();
         2141                 for (i = currentnewlines; i < margintop; i++) {
         2142                         putchar('\n');
         2143                         nbytesline = 0;
         2144                         ncells = 0;
         2145                         currentnewlines++;
         2146                 }
         2147                 hadnewline = 1;
         2148         }
         2149 
         2150         if (cur->tag.displaytype & DisplayPre) {
         2151                 skipinitialws = 1;
         2152         } else if (cur->tag.displaytype & DisplayTableCell) {
         2153                 if (parent && parent->visnchildren > 1)
         2154                         hputchar('\t');
         2155         } else if (cur->tag.displaytype & DisplayListItem) {
         2156                 /* find first parent node and ordered numbers or unordered */
         2157                 if (parent) {
         2158                         skipinitialws = 0;
         2159 
         2160                         /* print bullet, add columns to indentation level */
         2161                         if (parent->tag.displaytype & DisplayListOrdered) {
         2162                                 hprintf("%4zu. ", parent->nchildren);
         2163                                 cur->indent = 6;
         2164                                 indent += cur->indent; /* align to number */
         2165                         } else if (parent->tag.displaytype & DisplayList) {
         2166                                 hprint(str_bullet_item);
         2167                                 cur->indent = 2;
         2168                                 indent += 2; /* align to bullet */
         2169                         }
         2170                 }
         2171                 skipinitialws = 0;
         2172         } else if (cur->tag.displaytype & DisplayInput) {
         2173                 if (!attr_type.len) {
         2174                         hprintf("[%-15s]", attr_value.len ? attr_value.data : ""); /* default: text */
         2175                 } else if (!strcasecmp(attr_type.data, "button")) {
         2176                         hprintf("[%s]", attr_value.len ? attr_value.data : "");
         2177                 } else if (!strcasecmp(attr_type.data, "submit")) {
         2178                         hprintf("[%s]", attr_value.len ? attr_value.data : "Submit Query");
         2179                 } else if (!strcasecmp(attr_type.data, "reset")) {
         2180                         hprintf("[%s]", attr_value.len ? attr_value.data : "Reset");
         2181                 } else if (!strcasecmp(attr_type.data, "checkbox")) {
         2182                         hprintf("[%s]",
         2183                                 attr_checked.len &&
         2184                                 !strcasecmp(attr_checked.data, "checked") ? str_checkbox_checked : " ");
         2185                 } else if (!strcasecmp(attr_type.data, "radio")) {
         2186                         hprintf("[%s]",
         2187                                 attr_checked.len &&
         2188                                 !strcasecmp(attr_checked.data, "checked") ? str_radio_checked : " ");
         2189                 } else if (!strcasecmp(attr_type.data, "hidden")) {
         2190                         cur->tag.displaytype |= DisplayNone;
         2191                 } else {
         2192                         /* unrecognized / default case is text */
         2193                         hprintf("[%-15s]", attr_value.len ? attr_value.data : "");
         2194                 }
         2195         }
         2196 
         2197         startmarkup(cur->tag.markuptype);
         2198 
         2199         /* do not count data such as an item bullet as part of the data for
         2200            the node */
         2201         cur->hasdata = 0;
         2202 
         2203         if (tagid == TagHr) { /* ruler */
         2204                 i = termwidth - indent - defaultindent;
         2205                 for (; i > 0; i--)
         2206                         hprint(str_ruler);
         2207                 cur->hasdata = 1; /* treat <hr/> as data */
         2208         } else if (tagid == TagBr) {
         2209                 hflush();
         2210                 hadnewline = 0; /* forced newline */
         2211                 hputchar('\n');
         2212                 cur->hasdata = 1; /* treat <br/> as data */
         2213         }
         2214 
         2215         /* autoclose tags, such as <br>, pretend we are <br/> */
         2216         if (!isshort && cur->tag.isvoid)
         2217                 xmltagend(p, t, tl, 1); /* pretend close of short tag */
         2218 }
         2219 
         2220 static void
         2221 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n,
         2222         size_t nl, const char *v, size_t vl)
         2223 {
         2224         struct node *cur;
         2225         enum TagId tagid;
         2226 
         2227         cur = &nodes[curnode];
         2228         tagid = cur->tag.id;
         2229 
         2230         /* hide tags with attribute aria-hidden or hidden */
         2231         if (!attrcmp(n, "aria-hidden") || !attrcmp(n, "hidden"))
         2232                 cur->tag.displaytype |= DisplayNone;
         2233 
         2234         if (!attr_class_set && !attrcmp(n, "class")) /* use the first set attribute */
         2235                 string_append(&attr_class, v, vl);
         2236         else if (!attr_id_set && !attrcmp(n, "id")) /* use the first set attribute */
         2237                 string_append(&attr_id, v, vl);
         2238         else if (!attrcmp(n, "type"))
         2239                 string_append(&attr_type, v, vl);
         2240         else if (!attrcmp(n, "value"))
         2241                 string_append(&attr_value, v, vl);
         2242 
         2243         /* <base href="..." /> */
         2244         if (!basehrefset && tagid == TagBase && !attrcmp(n, "href"))
         2245                 strlcat(basehrefdoc, v, sizeof(basehrefdoc));
         2246 
         2247         if (tagid == TagA && !attrcmp(n, "href"))
         2248                 string_append(&attr_href, v, vl);
         2249 
         2250         if (tagid == TagSelect && !attrcmp(n, "multiple"))
         2251                 cur->tag.displaytype |= DisplaySelectMulti;
         2252 
         2253         if (tagid == TagObject && !attrcmp(n, "data"))
         2254                 string_append(&attr_data, v, vl);
         2255 
         2256         /* show img alt attribute as text. */
         2257         if (tagid == TagImg && !attrcmp(n, "alt"))
         2258                 string_append(&attr_alt, v, vl);
         2259 
         2260         if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked"))
         2261                 string_append(&attr_checked, v, vl);
         2262 
         2263         /* src attribute */
         2264         switch (tagid) {
         2265         case TagAudio:
         2266         case TagEmbed:
         2267         case TagFrame:
         2268         case TagIframe:
         2269         case TagImg:
         2270         case TagSource:
         2271         case TagTrack:
         2272         case TagVideo:
         2273                 if (!attrcmp(n, "src"))
         2274                         string_append(&attr_src, v, vl);
         2275                 break;
         2276         default:
         2277                 break;
         2278         }
         2279 }
         2280 
         2281 static void
         2282 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n,
         2283         size_t nl, const char *v, size_t vl)
         2284 {
         2285         char buf[8];
         2286         int len;
         2287 
         2288         len = xml_entitytostr(v, buf, sizeof(buf));
         2289         if (len > 0)
         2290                 xmlattr(p, t, tl, n, nl, buf, (size_t)len);
         2291         else
         2292                 xmlattr(p, t, tl, n, nl, v, vl);
         2293 }
         2294 
         2295 static void
         2296 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n,
         2297         size_t nl)
         2298 {
         2299         struct node *cur;
         2300         enum TagId tagid;
         2301 
         2302         cur = &nodes[curnode];
         2303         tagid = cur->tag.id;
         2304 
         2305         if (!attr_class_set && !attrcmp(n, "class"))
         2306                 attr_class_set = 1;
         2307         else if (!attr_id_set && !attrcmp(n, "id"))
         2308                 attr_id_set = 1;
         2309 
         2310         /* set base URL, if it is set it cannot be overwritten again */
         2311         if (!basehrefset && basehrefdoc[0] &&
         2312             tagid == TagBase && !attrcmp(n, "href"))
         2313                 basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : 0;
         2314 
         2315         /* if attribute checked is set but it has no value then set it to "checked" */
         2316         if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked") && !attr_checked.len)
         2317                 string_append(&attr_checked, "checked", sizeof("checked") - 1);
         2318 }
         2319 
         2320 static void
         2321 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
         2322         size_t nl)
         2323 {
         2324         struct node *cur;
         2325         enum TagId tagid;
         2326 
         2327         cur = &nodes[curnode];
         2328         tagid = cur->tag.id;
         2329 
         2330         if (!attrcmp(n, "alt"))
         2331                 string_clear(&attr_alt);
         2332         else if (!attrcmp(n, "checked"))
         2333                 string_clear(&attr_checked);
         2334         else if (!attr_class_set && !attrcmp(n, "class"))
         2335                 string_clear(&attr_class);
         2336         else if (!attrcmp(n, "data"))
         2337                 string_clear(&attr_data);
         2338         else if (!attrcmp(n, "href"))
         2339                 string_clear(&attr_href);
         2340         else if (!attr_id_set && !attrcmp(n, "id"))
         2341                 string_clear(&attr_id);
         2342         else if (!attrcmp(n, "src"))
         2343                 string_clear(&attr_src);
         2344         else if (!attrcmp(n, "type"))
         2345                 string_clear(&attr_type);
         2346         else if (!attrcmp(n, "value"))
         2347                 string_clear(&attr_value);
         2348 
         2349         if (basehrefdoc[0] && tagid == TagBase && !attrcmp(n, "href"))
         2350                 basehrefdoc[0] = '\0';
         2351 }
         2352 
         2353 static void
         2354 usage(void)
         2355 {
         2356         fprintf(stderr, "%s [-8adiIlrx] [-b basehref] [-s selector] [-u selector] [-w termwidth]\n", argv0);
         2357         exit(1);
         2358 }
         2359 
         2360 int
         2361 main(int argc, char **argv)
         2362 {
         2363         char *basehref;
         2364 
         2365         if (pledge("stdio", NULL) < 0)
         2366                 err(1, "pledge");
         2367 
         2368         ARGBEGIN {
         2369         case '8':
         2370                 str_bullet_item = "\xe2\x80\xa2 ";
         2371                 str_ruler = "\xe2\x94\x80"; /* symbol: "light horizontal" */
         2372                 break;
         2373         case 'a':
         2374                 allowansi = !allowansi;
         2375                 break;
         2376         case 'b':
         2377                 basehref = EARGF(usage());
         2378                 if (uri_parse(basehref, &base) == -1 ||
         2379                     !base.proto[0])
         2380                         usage();
         2381                 basehrefset = 1;
         2382                 break;
         2383         case 'd':
         2384                 uniqrefs = !uniqrefs;
         2385                 break;
         2386         case 'i':
         2387                 showrefinline = !showrefinline;
         2388                 break;
         2389         case 'I':
         2390                 showurlinline = !showurlinline;
         2391                 break;
         2392         case 'l':
         2393                 showrefbottom = !showrefbottom;
         2394                 break;
         2395         case 'r':
         2396                 allowlinewrap = !allowlinewrap;
         2397                 break;
         2398         case 's':
         2399                 sel_show = compileselectors(EARGF(usage()));
         2400                 /* switch to reader/selector mode, ignore all data except when matched */
         2401                 reader_mode = 1;
         2402                 reader_ignore = 1;
         2403                 break;
         2404         case 'u':
         2405                 sel_hide = compileselectors(EARGF(usage()));
         2406                 /* switch to reader/selector mode */
         2407                 reader_mode = 1;
         2408                 break;
         2409         case 'w':
         2410                 if ((termwidth = strtol(EARGF(usage()), NULL, 10)) < 1)
         2411                         usage();
         2412                 break;
         2413         case 'x':
         2414                 resources = !resources;
         2415                 break;
         2416         default:
         2417                 usage();
         2418         } ARGEND
         2419 
         2420         linewrap = allowlinewrap;
         2421 
         2422         /* initial nodes */
         2423         ncapnodes = NODE_CAP_INC;
         2424         nodes = ecalloc(ncapnodes, sizeof(*nodes));
         2425         nodes_links = ecalloc(ncapnodes, sizeof(*nodes_links));
         2426 
         2427         parser.xmlattrstart = xmlattrstart;
         2428         parser.xmlattr = xmlattr;
         2429         parser.xmlattrentity = xmlattrentity;
         2430         parser.xmlattrend = xmlattrend;
         2431         parser.xmlcdatastart = xmlcdatastart;
         2432         parser.xmlcdata = xmlcdata;
         2433         parser.xmlcdataend = xmlcdataend;
         2434         parser.xmldatastart = xmldatastart;
         2435         parser.xmldata = xmldata;
         2436         parser.xmldataentity = xmldataentity;
         2437         parser.xmldataend = xmldataend;
         2438         parser.xmltagstart = xmltagstart;
         2439         parser.xmltagstartparsed = xmltagstartparsed;
         2440         parser.xmltagend = xmltagend;
         2441 
         2442         parser.getnext = getchar;
         2443         xml_parse(&parser);
         2444 
         2445         hflush();
         2446         if (ncells > 0)
         2447                 newline();
         2448 
         2449         if (showrefbottom || resources)
         2450                 printlinkrefs();
         2451 
         2452         hflush();
         2453         setmarkup(0);
         2454 
         2455         return 0;
         2456 }