webdump.c - webdump - HTML to plain-text converter for webpages
HTML git clone git://git.codemadness.org/webdump
DIR Log
DIR Files
DIR Refs
DIR README
DIR LICENSE
---
webdump.c (66874B)
---
1 #include <errno.h>
2 #include <limits.h>
3 #include <stdio.h>
4 #include <stdarg.h>
5 #include <stdlib.h>
6 #include <string.h>
7 #include <strings.h>
8 #include <unistd.h>
9
10 #include "arg.h"
11 char *argv0;
12
13 #include "tree.h"
14 #include "xml.h"
15
16 static XMLParser parser;
17
18 #ifndef __OpenBSD__
19 #define pledge(p1,p2) 0
20 #endif
21
22 #undef strlcat
23 size_t strlcat(char *, const char *, size_t);
24 #undef strlcpy
25 size_t strlcpy(char *, const char *, size_t);
26
/* ctype-like macros, but always compatible with ASCII / UTF-8.
   They are plain arithmetic on the character value and do not consult the
   locale. Every use of the argument is parenthesized so that an expression
   argument is classified as a whole.
   NOTE: several of these evaluate their argument more than once, do not
   pass expressions with side effects. */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f)
#define ISDIGIT(c) (((unsigned)(c)) - '0' < 10)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
#define TOLOWER(c) ((((unsigned)(c)) - 'A' < 26) ? ((c) | 32) : (c))

/* number of elements of an array (not usable on a pointer) */
#define LEN(x) (sizeof(x) / sizeof((x)[0]))
35
/* URI split into its components, each stored as a fixed-size string */
struct uri {
	char proto[48];      /* scheme including ":" or "://" */
	char userinfo[256];  /* username [:password] */
	char host[256];
	char port[6];        /* numeric port */
	char path[1024];
	char query[1024];
	char fragment[1024];
};
46
/* options; the flag that controls each one is shown in parentheses */
static int allowansi = 0;     /* (-a) allow ANSI escape codes */
static int uniqrefs = 0;      /* (-d) number unique references */
static int showrefinline = 0; /* (-i) show link reference number inline */
static int showurlinline = 0; /* (-I) show full link reference inline */
static int showrefbottom = 0; /* (-l) show link references at the bottom */
static int allowlinewrap = 0; /* (-r) line-wrapping */
static int termwidth = 77;    /* (-w) terminal width */
static int resources = 0;     /* (-x) write resources line-by-line to fd 3? */
56
/* display type of an element; the values are single bits so that they can
   be combined and tested with a bitmask */
enum DisplayType {
	DisplayUnknown     = 0,
	DisplayInline      = 1 << 0,
	DisplayInlineBlock = 1 << 1, /* unused for now */
	DisplayBlock       = 1 << 2,
	DisplayNone        = 1 << 3,
	DisplayPre         = 1 << 4,
	DisplayList        = 1 << 5,
	DisplayListOrdered = 1 << 6,
	DisplayListItem    = 1 << 7,
	DisplayTable       = 1 << 8,
	DisplayTableRow    = 1 << 9,
	DisplayTableCell   = 1 << 10,
	DisplayHeader      = 1 << 11,
	DisplayDl          = 1 << 12,
	DisplayInput       = 1 << 13,
	DisplayButton      = 1 << 14,
	DisplaySelect      = 1 << 15,
	DisplaySelectMulti = 1 << 16,
	DisplayOption      = 1 << 17
};
78
/* ANSI markup attributes, one bit each so they can be combined */
enum MarkupType {
	MarkupNone      = 0,
	MarkupBold      = 1 << 0,
	MarkupItalic    = 1 << 1,
	MarkupUnderline = 1 << 2,
	MarkupBlink     = 1 << 3, /* lol */
	MarkupReverse   = 1 << 4,
	MarkupStrike    = 1 << 5
};
89
/* String data / memory pool: a growable, NUL-terminated byte buffer */
typedef struct string {
	char *data;    /* data, may be NULL when nothing was allocated yet */
	size_t len;    /* string length (excluding the terminating NUL) */
	size_t bufsiz; /* allocated size */
} String;
96
/* numeric id per known tag: starts at 1 and follows the alphabetical order
   of the tag names */
enum TagId {
	TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAudio,
	TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton,
	TagCite, TagCol, TagColgroup,
	TagDatalist, TagDd, TagDel, TagDetails, TagDfn, TagDir, TagDiv, TagDl,
	TagDt,
	TagEm, TagEmbed,
	TagFieldset, TagFigcaption, TagFigure, TagFooter, TagForm, TagFrame,
	TagH1, TagH2, TagH3, TagH4, TagH5, TagH6, TagHead, TagHeader, TagHr,
	TagHtml,
	TagI, TagIframe, TagImg, TagInput, TagIns,
	TagLabel, TagLegend, TagLi, TagLink,
	TagMain, TagMark, TagMenu, TagMeta,
	TagNav,
	TagObject, TagOl, TagOption,
	TagP, TagParam, TagPre,
	TagS, TagScript, TagSearch, TagSection, TagSelect, TagSource,
	TagStrike, TagStrong, TagStyle, TagSummary, TagSvg,
	TagTable, TagTbody, TagTd, TagTemplate, TagTextarea, TagTfoot, TagTh,
	TagThead, TagTitle, TagTr, TagTrack,
	TagU, TagUl,
	TagVar, TagVideo,
	TagWbr,
	TagXmp
};
110
111 struct tag {
112 const char *name;
113 enum TagId id;
114 enum DisplayType displaytype;
115 enum MarkupType markuptype; /* ANSI markup */
116 enum DisplayType parenttype; /* display type belonging to element */
117 int isvoid; /* "void" element */
118 int isoptional; /* optional to close tag */
119 int margintop; /* newlines when the tag starts */
120 int marginbottom; /* newlines after the tag ends */
121 int indent; /* indent in cells */
122 };
123
124 struct node {
125 char tagname[256];
126 struct tag tag;
127 size_t nchildren; /* child node count */
128 size_t visnchildren; /* child node count which are visible */
129 /* attributes */
130 char id[256];
131 char classnames[1024];
132 int indent; /* indent per node, for formatting */
133 int hasdata; /* tag contains some data, for formatting */
134 };
135
/* one level of a selector: what to match against a node */
struct selectornode {
	char tagname[256];
	long index; /* index of node to match on: -1 if not matching on index */
	/* attributes */
	char id[256];
	char classnames[1024];
};
143
144 struct selector {
145 const char *text;
146 struct selectornode nodes[32];
147 int depth;
148 };
149
/* list of selectors: `count` pointers to a selector */
struct selectors {
	struct selector **selectors;
	size_t count;
};
155
156 /* RB tree of link references */
157 struct linkref {
158 char *type;
159 enum TagId tagid;
160 char *url;
161 int ishidden;
162 size_t linknr;
163 RB_ENTRY(linkref) entry;
164 };
165
/* visible link references: array with its count and capacity */
static struct linkref **visrefs;
static size_t nvisrefs, ncapvisrefs;
/* hidden link references: array with its count and capacity */
static struct linkref **hiddenrefs;
static size_t nhiddenrefs, ncaphiddenrefs;
171
172 /* compare link by URL for link references RB-tree */
173 static int
174 linkrefcmp(struct linkref *r1, struct linkref *r2)
175 {
176 return strcmp(r1->url, r2->url);
177 }
178
179 RB_HEAD(linkreftree, linkref) linkrefhead = RB_INITIALIZER(&linkrefhead);
180 RB_GENERATE(linkreftree, linkref, entry, linkrefcmp)
181
/* strings used in the output for list items, form controls and rulers */
static const char *str_bullet_item      = "* ";
static const char *str_checkbox_checked = "x";
static const char *str_ruler            = "-";
static const char *str_radio_checked    = "*";
186
187 /* base href, to make URLs absolute */
188 static char basehrefdoc[4096]; /* buffer for base href in document, if any */
189 static int basehrefset; /* base href set and can be used? */
190 static struct uri base; /* parsed current base href */
191
192 /* buffers for some attributes of the current tag */
193 static String attr_alt; /* alt attribute */
194 static String attr_checked; /* checked attribute */
195 static String attr_class; /* class attribute */
196 static int attr_class_set; /* class attribute is set already */
197 static String attr_data; /* data attribute */
198 static String attr_href; /* href attribute */
199 static String attr_id; /* id attribute */
200 static int attr_id_set; /* class attribute is set already */
201 static String attr_src; /* src attribute */
202 static String attr_type; /* type attribute */
203 static String attr_value; /* value attribute */
204
205 static String htmldata; /* buffered HTML data near the current tag */
206
/* State for white-space output handling, a bitmask:
     1 = white-space emitted (suppress repeated),
     2 = other characters on this line.
   Behaviour:
   * White-space data before non-white-space data in tags is ignored on a
     line.
   * Repeated white-space is ignored: a single space (' ') is emitted. */
static int whitespace_mode;
static int nbytesline; /* bytes on this line */
static int ncells;     /* current cell/column count */
static int hadnewline; /* count for repeated newlines */
/* flag for skipping initial white-space in tag: for HTML white-space
   handling */
static int skipinitialws = 1;
#define DEFAULT_INDENT 2
static const int defaultindent = DEFAULT_INDENT; /* default indent / margin */
static int indent; /* indent for the current line, in columns */
/* previously output sequential newlines, used for calculating margins
   between elements and reducing excessive newlines */
static int currentnewlines;

/* buffers for line-wrapping (buffer per word boundary) */
static char rbuf[1024];
static int rbuflen;    /* bytes used in rbuf */
static int rnbufcells; /* pending cell count to add */
230
231 #define MAX_NODE_DEPTH 4096 /* absolute maximum node depth */
232 static struct node *nodes; /* node tree (one per level is remembered) */
233 static String *nodes_links; /* keep track of links per node */
234 static size_t ncapnodes; /* current allocated node capacity */
235 static int curnode; /* current node depth */
236
237 /* reader / selector mode (-s) */
238 static int reader_mode;
239 /* flag if the tags and their children should be ignored in the current context */
240 static int reader_ignore;
241
242 static enum MarkupType curmarkup; /* current markup state (bold, underline, etc) */
243 static int linewrap; /* allow linewrap in this context */
244
245 /* selector to match (for -s and -u) */
246 static struct selectors *sel_hide, *sel_show;
247
248 /* tags table: needs to be sorted like tagcmp(), alphabetically */
249
250 /* tag id displaytype markup parent v o b a i */
251 static struct tag tags[] = {
252 { "a", TagA, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
253 { "address", TagAddress, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
254 { "area", TagArea, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
255 { "article", TagArticle, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
256 { "aside", TagAside, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
257 { "audio", TagAudio, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
258 { "b", TagB, DisplayInline, MarkupBold, 0, 0, 0, 0, 0, 0 },
259 { "base", TagBase, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
260 { "blink", TagBlink, DisplayInline, MarkupBlink, 0, 0, 0, 0, 0, 0 },
261 { "blockquote", TagBlockquote, DisplayBlock, 0, 0, 0, 0, 0, 0, 2 },
262 { "body", TagBody, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
263 { "br", TagBr, 0, 0, 0, 1, 0, 0, 0, 0 },
264 { "button", TagButton, DisplayInline | DisplayButton, 0, 0, 0, 0, 0, 0, 0 },
265 { "cite", TagCite, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
266 { "col", TagCol, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
267 { "colgroup", TagColgroup, DisplayInline, 0, 0, 0, 1, 0, 0, 0 },
268 { "datalist", TagDatalist, DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
269 { "dd", TagDd, DisplayBlock, 0, 0, 0, 1, 0, 0, 4 },
270 { "del", TagDel, DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 },
271 { "details", TagDetails, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
272 { "dfn", TagDfn, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
273 { "dir", TagDir, DisplayList, 0, 0, 0, 0, 1, 1, 2 },
274 { "div", TagDiv, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
275 { "dl", TagDl, DisplayBlock | DisplayDl, 0, 0, 0, 0, 0, 0, 0 },
276 { "dt", TagDt, DisplayBlock, MarkupBold, 0, 0, 1, 0, 0, 0 },
277 { "em", TagEm, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
278 { "embed", TagEmbed, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
279 { "fieldset", TagFieldset, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
280 { "figcaption", TagFigcaption, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
281 { "figure", TagFigure, DisplayBlock, 0, 0, 0, 0, 1, 1, 4 },
282 { "footer", TagFooter, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
283 { "form", TagForm, DisplayBlock, 0, 0, 0, 0, 0, 1, 0 },
284 { "frame", TagFrame, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
285 { "h1", TagH1, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
286 { "h2", TagH2, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
287 { "h3", TagH3, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
288 { "h4", TagH4, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
289 { "h5", TagH5, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
290 { "h6", TagH6, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
291 { "head", TagHead, DisplayBlock, 0, 0, 0, 1, 0, 0, 0 },
292 { "header", TagHeader, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
293 { "hr", TagHr, DisplayBlock, 0, 0, 1, 0, 0, 0, 0 },
294 { "html", TagHtml, DisplayBlock, 0, 0, 0, 1, 0, 0, 0 },
295 { "i", TagI, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
296 { "iframe", TagIframe, DisplayInline, 0, 0, 0, 0, 0, 0, 0 },
297 { "img", TagImg, DisplayInline, MarkupUnderline, 0, 1, 0, 0, 0, 0 },
298 { "input", TagInput, DisplayInput, 0, 0, 1, 0, 0, 0, 0 },
299 { "ins", TagIns, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
300 { "label", TagLabel, DisplayInline, 0, 0, 0, 0, 0, 0, 0 },
301 { "legend", TagLegend, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
302 { "li", TagLi, DisplayListItem, 0, DisplayList, 0, 1, 0, 0, 0 },
303 { "link", TagLink, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
304 { "main", TagMain, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
305 { "mark", TagMark, DisplayInline, MarkupReverse, 0, 0, 0, 0, 0, 0 },
306 { "menu", TagMenu, DisplayList, 0, 0, 0, 0, 1, 1, 2 },
307 { "meta", TagMeta, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
308 { "nav", TagNav, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
309 { "object", TagObject, DisplayInline, 0, 0, 0, 0, 0, 0, 0 },
310 { "ol", TagOl, DisplayList | DisplayListOrdered, 0, 0, 0, 0, 1, 1, 0 },
311 { "option", TagOption, DisplayInline | DisplayOption, 0, 0, 0, 1, 0, 0, 0 },
312 { "p", TagP, DisplayBlock, 0, 0, 0, 1, 1, 1, 0 },
313 { "param", TagParam, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
314 { "pre", TagPre, DisplayPre, 0, 0, 0, 0, 1, 1, 4 },
315 { "s", TagS, DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 },
316 { "script", TagScript, DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
317 { "search", TagSearch, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
318 { "section", TagSection, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
319 { "select", TagSelect, DisplayInline | DisplaySelect, 0, 0, 0, 0, 0, 0, 0 },
320 { "source", TagSource, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
321 { "strike", TagStrike, DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 },
322 { "strong", TagStrong, DisplayInline, MarkupBold, 0, 0, 0, 0, 0, 0 },
323 { "style", TagStyle, DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
324 { "summary", TagSummary, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
325 { "svg", TagSvg, DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
326 { "table", TagTable, DisplayTable, 0, 0, 0, 0, 0, 0, 0 },
327 { "tbody", TagTbody, DisplayInline, 0, DisplayTable, 0, 1, 0, 0, 0 },
328 { "td", TagTd, DisplayTableCell, 0, DisplayTableRow, 0, 1, 0, 0, 0 },
329 { "template", TagTemplate, DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
330 { "textarea", TagTextarea, DisplayInline, 0, 0, 0, 0, 0, 0, 0 },
331 { "tfoot", TagTfoot, DisplayInline, 0, DisplayTable, 0, 1, 0, 0, 0 },
332 { "th", TagTh, DisplayTableCell, MarkupBold, DisplayTableRow, 0, 1, 0, 0, 0 },
333 { "thead", TagThead, DisplayInline, 0, DisplayTable, 0, 1, 0, 0, 0 },
334 { "title", TagTitle, DisplayBlock, 0, 0, 0, 0, 0, 1, -DEFAULT_INDENT },
335 { "tr", TagTr, DisplayTableRow, 0, DisplayTable, 0, 1, 0, 0, 0 },
336 { "track", TagTrack, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
337 { "u", TagU, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
338 { "ul", TagUl, DisplayList, 0, 0, 0, 0, 1, 1, 2 },
339 { "var", TagVar, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
340 { "video", TagVideo, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
341 { "wbr", TagWbr, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
342 { "xmp", TagXmp, DisplayPre, 0, 0, 0, 0, 1, 1, 4 }
343 };
344
/* hint for compilers and static analyzers that a function exits */
#ifndef __dead
#define __dead
#endif

/* Print "webdump: ", the optional printf-style message followed by ": ",
   and the error string of errno (as it was on entry) to stderr, then exit
   with `exitstatus`. */
__dead static void
err(int exitstatus, const char *fmt, ...)
{
	/* remember errno first: the stdio calls below may change it */
	int saved_errno = errno;
	va_list ap;

	fputs("webdump: ", stderr);
	if (fmt != NULL) {
		va_start(ap, fmt);
		vfprintf(stderr, fmt, ap);
		va_end(ap);
		fputs(": ", stderr);
	}
	fprintf(stderr, "%s\n", strerror(saved_errno));

	exit(exitstatus);
}
370
371 /* print to stderr and exit(). */
372 __dead static void
373 errx(int exitstatus, const char *fmt, ...)
374 {
375 va_list ap;
376
377 fputs("webdump: ", stderr);
378 if (fmt) {
379 va_start(ap, fmt);
380 vfprintf(stderr, fmt, ap);
381 va_end(ap);
382 }
383 fputs("\n", stderr);
384
385 exit(exitstatus);
386 }
387
388 static const char *ignorestate, *endtag;
389 static int (*getnext)(void);
390
391 /* return a space for all data until some case-insensitive string occurs. This
392 is used to parse incorrect HTML/XML that contains unescaped HTML in script
393 or style tags. If you see some </script> tag in a CDATA or comment
394 section then e-mail W3C and tell them the web is too complex. */
395 static inline int
396 getnext_ignore(void)
397 {
398 int c;
399
400 if ((c = getnext()) == EOF)
401 return EOF;
402
403 if (TOLOWER((unsigned char)c) == TOLOWER((unsigned char)*ignorestate)) {
404 ignorestate++;
405 if (*ignorestate == '\0') {
406 parser.getnext = getnext; /* restore */
407 return ' ';
408 }
409 } else {
410 ignorestate = endtag; /* no full match: reset to beginning */
411 }
412
413 return ' '; /* pretend there is just SPACEs */
414 }
415
416 /* Clear string only; don't free, prevents unnecessary reallocation. */
417 static void
418 string_clear(String *s)
419 {
420 if (s->data)
421 s->data[0] = '\0';
422 s->len = 0;
423 }
424
425 static void
426 string_buffer_realloc(String *s, size_t newlen)
427 {
428 size_t alloclen;
429
430 for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
431 ;
432 if (!(s->data = realloc(s->data, alloclen)))
433 err(1, "realloc");
434 s->bufsiz = alloclen;
435 }
436
437 static void
438 string_append(String *s, const char *data, size_t len)
439 {
440 if (!len)
441 return;
442 /* check if allocation is necesary, don't shrink buffer,
443 * should be more than bufsiz ofcourse. */
444 if (s->len + len >= s->bufsiz)
445 string_buffer_realloc(s, s->len + len + 1);
446 memcpy(s->data + s->len, data, len);
447 s->len += len;
448 s->data[s->len] = '\0';
449 }
450
/* strdup() that exits using err() when it fails */
static char *
estrdup(const char *s)
{
	char *copy = strdup(s);

	if (copy == NULL)
		err(1, "strdup");
	return copy;
}
460
/* strndup() that exits using err() when it fails */
static char *
estrndup(const char *s, size_t n)
{
	char *copy = strndup(s, n);

	if (copy == NULL)
		err(1, "strndup");
	return copy;
}
470
/* realloc() that exits using err() when it returns NULL */
static void *
erealloc(void *p, size_t siz)
{
	void *np = realloc(p, siz);

	if (np == NULL)
		err(1, "realloc");
	return np;
}
479
/* calloc() that exits using err() when it returns NULL */
static void *
ecalloc(size_t nmemb, size_t size)
{
	void *p = calloc(nmemb, size);

	if (p == NULL)
		err(1, "calloc");
	return p;
}
489
/* Check if the string has a non-empty scheme / protocol part: one or more of
   the characters a-z, A-Z, 0-9, '+', '-' or '.' followed by ':'.
   Returns 1 if it has, otherwise 0. */
static int
uri_hasscheme(const char *s)
{
	const char *p;
	unsigned char c;

	for (p = s; (c = (unsigned char)*p) != '\0'; p++) {
		if (!ISALPHA(c) && !ISDIGIT(c) &&
		    c != '+' && c != '-' && c != '.')
			break;
	}

	/* when it starts with ":" the scheme is empty: then it is a path */
	return (p != s && *p == ':');
}
502
503 static int
504 uri_parse(const char *s, struct uri *u)
505 {
506 const char *p = s;
507 char *endptr;
508 size_t i;
509 long l;
510
511 u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
512 u->path[0] = u->query[0] = u->fragment[0] = '\0';
513
514 /* protocol-relative */
515 if (*p == '/' && *(p + 1) == '/') {
516 p += 2; /* skip "//" */
517 goto parseauth;
518 }
519
520 /* scheme / protocol part */
521 for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) ||
522 *p == '+' || *p == '-' || *p == '.'; p++)
523 ;
524 /* scheme, except if empty and starts with ":" then it is a path */
525 if (*p == ':' && p != s) {
526 if (*(p + 1) == '/' && *(p + 2) == '/')
527 p += 3; /* skip "://" */
528 else
529 p++; /* skip ":" */
530
531 if ((size_t)(p - s) >= sizeof(u->proto))
532 return -1; /* protocol too long */
533 memcpy(u->proto, s, p - s);
534 u->proto[p - s] = '\0';
535
536 if (*(p - 1) != '/')
537 goto parsepath;
538 } else {
539 p = s; /* no scheme format, reset to start */
540 goto parsepath;
541 }
542
543 parseauth:
544 /* userinfo (username:password) */
545 i = strcspn(p, "@/?#");
546 if (p[i] == '@') {
547 if (i >= sizeof(u->userinfo))
548 return -1; /* userinfo too long */
549 memcpy(u->userinfo, p, i);
550 u->userinfo[i] = '\0';
551 p += i + 1;
552 }
553
554 /* IPv6 address */
555 if (*p == '[') {
556 /* bracket not found, host too short or too long */
557 i = strcspn(p, "]");
558 if (p[i] != ']' || i < 3)
559 return -1;
560 i++; /* including "]" */
561 } else {
562 /* domain / host part, skip until port, path or end. */
563 i = strcspn(p, ":/?#");
564 }
565 if (i >= sizeof(u->host))
566 return -1; /* host too long */
567 memcpy(u->host, p, i);
568 u->host[i] = '\0';
569 p += i;
570
571 /* port */
572 if (*p == ':') {
573 p++;
574 if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
575 return -1; /* port too long */
576 memcpy(u->port, p, i);
577 u->port[i] = '\0';
578 /* check for valid port: range 1 - 65535, may be empty */
579 errno = 0;
580 l = strtol(u->port, &endptr, 10);
581 if (i && (errno || *endptr || l <= 0 || l > 65535))
582 return -1;
583 p += i;
584 }
585
586 parsepath:
587 /* path */
588 if ((i = strcspn(p, "?#")) >= sizeof(u->path))
589 return -1; /* path too long */
590 memcpy(u->path, p, i);
591 u->path[i] = '\0';
592 p += i;
593
594 /* query */
595 if (*p == '?') {
596 p++;
597 if ((i = strcspn(p, "#")) >= sizeof(u->query))
598 return -1; /* query too long */
599 memcpy(u->query, p, i);
600 u->query[i] = '\0';
601 p += i;
602 }
603
604 /* fragment */
605 if (*p == '#') {
606 p++;
607 if ((i = strlen(p)) >= sizeof(u->fragment))
608 return -1; /* fragment too long */
609 memcpy(u->fragment, p, i);
610 u->fragment[i] = '\0';
611 }
612
613 return 0;
614 }
615
616 /* Transform and try to make the URI `u` absolute using base URI `b` into `a`.
617 Follows some of the logic from "RFC 3986 - 5.2.2. Transform References".
618 Returns 0 on success, -1 on error or truncation. */
619 static int
620 uri_makeabs(struct uri *a, struct uri *u, struct uri *b)
621 {
622 char *p;
623 int c;
624
625 strlcpy(a->fragment, u->fragment, sizeof(a->fragment));
626
627 if (u->proto[0] || u->host[0]) {
628 strlcpy(a->proto, u->proto[0] ? u->proto : b->proto, sizeof(a->proto));
629 strlcpy(a->host, u->host, sizeof(a->host));
630 strlcpy(a->userinfo, u->userinfo, sizeof(a->userinfo));
631 strlcpy(a->host, u->host, sizeof(a->host));
632 strlcpy(a->port, u->port, sizeof(a->port));
633 strlcpy(a->path, u->path, sizeof(a->path));
634 strlcpy(a->query, u->query, sizeof(a->query));
635 return 0;
636 }
637
638 strlcpy(a->proto, b->proto, sizeof(a->proto));
639 strlcpy(a->host, b->host, sizeof(a->host));
640 strlcpy(a->userinfo, b->userinfo, sizeof(a->userinfo));
641 strlcpy(a->host, b->host, sizeof(a->host));
642 strlcpy(a->port, b->port, sizeof(a->port));
643
644 if (!u->path[0]) {
645 strlcpy(a->path, b->path, sizeof(a->path));
646 } else if (u->path[0] == '/') {
647 strlcpy(a->path, u->path, sizeof(a->path));
648 } else {
649 a->path[0] = (b->host[0] && b->path[0] != '/') ? '/' : '\0';
650 a->path[1] = '\0';
651
652 if ((p = strrchr(b->path, '/'))) {
653 c = *(++p);
654 *p = '\0'; /* temporary NUL-terminate */
655 if (strlcat(a->path, b->path, sizeof(a->path)) >= sizeof(a->path))
656 return -1;
657 *p = c; /* restore */
658 }
659 if (strlcat(a->path, u->path, sizeof(a->path)) >= sizeof(a->path))
660 return -1;
661 }
662
663 if (u->path[0] || u->query[0])
664 strlcpy(a->query, u->query, sizeof(a->query));
665 else
666 strlcpy(a->query, b->query, sizeof(a->query));
667
668 return 0;
669 }
670
671 static int
672 uri_format(char *buf, size_t bufsiz, struct uri *u)
673 {
674 return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s",
675 u->proto,
676 u->userinfo[0] ? u->userinfo : "",
677 u->userinfo[0] ? "@" : "",
678 u->host,
679 u->port[0] ? ":" : "",
680 u->port,
681 u->host[0] && u->path[0] && u->path[0] != '/' ? "/" : "",
682 u->path,
683 u->query[0] ? "?" : "",
684 u->query,
685 u->fragment[0] ? "#" : "",
686 u->fragment);
687 }
688
/* Compare two tag names, ignoring case. Returns a value less than, equal to
   or greater than 0, like strcasecmp(). */
static int
tagcmp(const char *s1, const char *s2)
{
	return strcasecmp(s1, s2);
}
695
/* Compare two attribute names, ignoring case. Returns a value less than,
   equal to or greater than 0, like strcasecmp(). */
static int
attrcmp(const char *s1, const char *s2)
{
	return strcasecmp(s1, s2);
}
702
703 static void
704 rindent(void)
705 {
706 int i, total;
707
708 total = indent + defaultindent;
709 if (total < 0)
710 total = 0;
711 for (i = 0; i < total; i++)
712 putchar(' ');
713
714 nbytesline += total;
715 ncells += total;
716 }
717
718 static void
719 emitmarkup(int markuptype)
720 {
721 if (!allowansi)
722 return;
723
724 if (!markuptype)
725 fputs("\033[0m", stdout); /* reset all attributes */
726
727 /* set */
728 if (markuptype & MarkupBold)
729 fputs("\033[1m", stdout);
730 if (markuptype & MarkupItalic)
731 fputs("\033[3m", stdout);
732 if (markuptype & MarkupUnderline)
733 fputs("\033[4m", stdout);
734 if (markuptype & MarkupBlink)
735 fputs("\033[5m", stdout);
736 if (markuptype & MarkupReverse)
737 fputs("\033[7m", stdout);
738 if (markuptype & MarkupStrike)
739 fputs("\033[9m", stdout);
740 }
741
742 /* flush remaining buffer (containing a word): used for word-wrap handling */
743 static void
744 hflush(void)
745 {
746 int i;
747
748 if (!rbuflen)
749 return;
750
751 if (!nbytesline) {
752 if (curmarkup)
753 emitmarkup(0);
754 rindent();
755 /* emit code again per line, needed for GNU/less -R */
756 if (curmarkup)
757 emitmarkup(curmarkup);
758 }
759
760 for (i = 0; i < rbuflen; i++)
761 putchar(rbuf[i]);
762
763 nbytesline += rbuflen;
764 ncells += rnbufcells;
765 rbuflen = 0;
766 rnbufcells = 0;
767 }
768
769 static void
770 printansi(const char *s)
771 {
772 size_t len;
773
774 if (!allowansi)
775 return;
776
777 if (linewrap) {
778 len = strlen(s);
779 if (rbuflen + len + 1 >= sizeof(rbuf))
780 hflush();
781 if (rbuflen + len + 1 < sizeof(rbuf)) {
782 memcpy(rbuf + rbuflen, s, len);
783 rbuflen += len;
784 /* NOTE: nbytesline and ncells are not counted for markup */
785 }
786 } else {
787 fputs(s, stdout);
788 }
789 }
790
791 static void
792 setmarkup(int markuptype)
793 {
794 if (!allowansi)
795 return;
796
797 /* need change? */
798 if (curmarkup == markuptype)
799 return;
800
801 if (!markuptype) {
802 printansi("\033[0m"); /* reset all attributes */
803 curmarkup = markuptype;
804 return;
805 }
806
807 /* set */
808 if (!(curmarkup & MarkupBold) && (markuptype & MarkupBold))
809 printansi("\033[1m");
810 if (!(curmarkup & MarkupItalic) && (markuptype & MarkupItalic))
811 printansi("\033[3m");
812 if (!(curmarkup & MarkupUnderline) && (markuptype & MarkupUnderline))
813 printansi("\033[4m");
814 if (!(curmarkup & MarkupBlink) && (markuptype & MarkupBlink))
815 printansi("\033[5m");
816 if (!(curmarkup & MarkupReverse) && (markuptype & MarkupReverse))
817 printansi("\033[7m");
818 if (!(curmarkup & MarkupStrike) && (markuptype & MarkupStrike))
819 printansi("\033[9m");
820
821 /* unset */
822 if ((curmarkup & MarkupBold) && !(markuptype & MarkupBold))
823 printansi("\033[22m"); /* reset bold or faint */
824 if ((curmarkup & MarkupItalic) && !(markuptype & MarkupItalic))
825 printansi("\033[23m"); /* reset italic */
826 if ((curmarkup & MarkupUnderline) && !(markuptype & MarkupUnderline))
827 printansi("\033[24m"); /* reset underline */
828 if ((curmarkup & MarkupBlink) && !(markuptype & MarkupBlink))
829 printansi("\033[25m"); /* reset blink */
830 if ((curmarkup & MarkupReverse) && !(markuptype & MarkupReverse))
831 printansi("\033[27m"); /* reset reverse */
832 if ((curmarkup & MarkupStrike) && !(markuptype & MarkupStrike))
833 printansi("\033[29m"); /* reset strike */
834
835 curmarkup = markuptype;
836 }
837
838 static void
839 startmarkup(int markuptype)
840 {
841 setmarkup(curmarkup | markuptype);
842 }
843
844 static void
845 endmarkup(int markuptype)
846 {
847 setmarkup(curmarkup & ~markuptype);
848 }
849
/* Rough cell width of the UTF-8 byte `c`: 8 for a TAB, 0 for a continuation
   byte (10xxxxxx, not the start of a codepoint) and 1 for anything else, so
   each codepoint counts as 1 cell.
   NOTE: this is of course incorrect since characters can be 2 cells wide as
   well, in the future maybe replace this with wcwidth() or similar */
static int
utfwidth(int c)
{
	/* count TAB as 8 */
	if (c == '\t')
		return 8;
	/* only the start of a codepoint counts */
	return (c & 0xc0) != 0x80;
}
865
866 /* write a character, handling state of repeated newlines, some HTML
867 white-space rules, indentation and word-wrapping */
868 static void
869 hputchar(int c)
870 {
871 struct node *cur = &nodes[curnode];
872 cur->hasdata = 1;
873
874 if (c == '\n') {
875 /* previous line had characters, so not a repeated newline */
876 if (nbytesline > 0)
877 hadnewline = 0;
878
879 /* start a new line, no chars on this line yet */
880 whitespace_mode &= ~2; /* no chars on this line yet */
881 nbytesline = 0;
882 ncells = 0;
883
884 if (hadnewline)
885 currentnewlines++; /* repeating newlines */
886 hadnewline = 1;
887 } else {
888 hadnewline = 0;
889 currentnewlines = 0;
890 }
891
892 /* skip initial/leading white-space */
893 if (ISSPACE((unsigned char)c)) {
894 if (skipinitialws)
895 return;
896 } else {
897 skipinitialws = 0;
898 }
899
900 if (!(c == '\n' || c == '\t' || !ISCNTRL((unsigned char)c)))
901 return;
902
903 if (!linewrap) {
904 if (c == '\n') {
905 putchar('\n');
906 nbytesline = 0;
907 ncells = 0;
908 } else {
909 if (!nbytesline) {
910 if (curmarkup)
911 emitmarkup(0);
912 rindent();
913 /* emit code again per line, needed for GNU/less -R */
914 if (curmarkup)
915 emitmarkup(curmarkup);
916 }
917 putchar(c);
918 nbytesline++;
919 ncells += utfwidth(c);
920 }
921 return;
922 }
923
924 /* really too long: the whole word doesn't even fit, flush it */
925 if (ncells + rnbufcells >= termwidth || rbuflen >= sizeof(rbuf) - 1) {
926 putchar('\n');
927 nbytesline = 0;
928 ncells = 0;
929 hflush();
930 }
931
932 if (c == '\n') {
933 putchar('\n');
934 hflush();
935 return;
936 } else if (ISSPACE((unsigned char)c) || c == '-') {
937 if (ncells + rnbufcells >= termwidth) {
938 putchar('\n');
939 nbytesline = 0;
940 ncells = 0;
941 }
942 rbuf[rbuflen++] = c;
943 rnbufcells += utfwidth(c);
944 hflush();
945 return;
946 }
947
948 rbuf[rbuflen++] = c;
949 rnbufcells += utfwidth(c);
950 }
951
952 /* calculate indentation of current node depth, using the sum of each
953 indentation per node */
954 static int
955 calcindent(void)
956 {
957 int i, n = 0;
958
959 for (i = curnode; i >= 0; i--)
960 n += nodes[i].indent;
961
962 return n;
963 }
964
/* write each character of the string `s` using hputchar() */
static void
hprint(const char *s)
{
	while (*s)
		hputchar(*s++);
}
971
/* printf() that writes using the formatting logic of hprint().
   The formatted text is limited to a buffer of 256 bytes for now, longer
   text is truncated. */
static void
hprintf(const char *fmt, ...)
{
	char text[256];
	va_list ap;

	va_start(ap, fmt);
	vsnprintf(text, sizeof(text), fmt, ap);
	va_end(ap);

	hprint(text);
}
986
987 static void
988 newline(void)
989 {
990 if (skipinitialws)
991 return;
992 hputchar('\n');
993 }
994
995 static int
996 parentcontainerhasdata(int curtype, int n)
997 {
998 int i;
999
1000 for (i = n; i >= 0; i--) {
1001 if (nodes[i].tag.displaytype & (DisplayList|DisplayTable))
1002 break;
1003 if (nodes[i].hasdata)
1004 return 1;
1005 }
1006
1007 return 0;
1008 }
1009
1010 /* start on a newline for the start of a block element or not */
1011 static void
1012 startblock(void)
1013 {
1014 hflush();
1015 whitespace_mode &= ~2; /* no characters on this line yet */
1016 if (nbytesline <= 0)
1017 return;
1018 if (!hadnewline && curnode >= 0 && nodes[curnode - 1].hasdata)
1019 hputchar('\n');
1020 }
1021
1022 /* start on a newline for the end of a block element or not */
1023 static void
1024 endblock(void)
1025 {
1026 hflush();
1027 whitespace_mode &= ~2; /* no characters on this line yet */
1028 if (nbytesline <= 0)
1029 return;
1030 if (!hadnewline)
1031 hputchar('\n');
1032 }
1033
1034 /* print one character safely: no control characters,
1035 handle HTML white-space rules */
1036 static void
1037 printc(int c)
1038 {
1039 if (ISSPACE((unsigned char)c)) {
1040 if (whitespace_mode == 2)
1041 hputchar(' ');
1042 whitespace_mode |= 1;
1043 } else {
1044 whitespace_mode = 2;
1045 if (!ISCNTRL((unsigned char)c))
1046 hputchar(c);
1047 }
1048 }
1049
1050 static void
1051 printpre(const char *s, size_t len)
1052 {
1053 struct node *cur;
1054 size_t i;
1055
1056 /* reset state of newlines because this data is printed literally */
1057 hadnewline = 0;
1058 currentnewlines = 0;
1059
1060 /* skip leading newline */
1061 i = 0;
1062 if (skipinitialws) {
1063 if (*s == '\n' && i < len) {
1064 s++;
1065 i++;
1066 }
1067 }
1068
1069 hflush();
1070
1071 skipinitialws = 0;
1072
1073 if (*s) {
1074 cur = &nodes[curnode];
1075 cur->hasdata = 1;
1076 }
1077
1078 for (; *s && i < len; s++, i++) {
1079 switch (*s) {
1080 case '\n':
1081 putchar('\n');
1082 nbytesline = 0;
1083 ncells = 0;
1084 break;
1085 case '\t':
1086 hadnewline = 0;
1087 if (!nbytesline) {
1088 if (curmarkup)
1089 emitmarkup(0);
1090 rindent();
1091 /* emit code again per line, needed for GNU/less -R */
1092 if (curmarkup)
1093 emitmarkup(curmarkup);
1094 }
1095
1096 /* TAB to 8 spaces */
1097 fputs(" ", stdout);
1098 nbytesline += 8;
1099 ncells += 8;
1100 break;
1101 default:
1102 if (ISCNTRL((unsigned char)*s))
1103 continue;
1104
1105 if (!nbytesline) {
1106 if (curmarkup)
1107 emitmarkup(0);
1108 rindent();
1109 /* emit code again per line, needed for GNU/less -R */
1110 if (curmarkup)
1111 emitmarkup(curmarkup);
1112 }
1113
1114 putchar(*s);
1115 nbytesline++;
1116 /* start of rune: incorrectly assume 1 rune is 1 cell for now */
1117 ncells += utfwidth((unsigned char)*s);
1118 }
1119 }
1120 }
1121
1122 static struct node *
1123 findparenttype(int cur, int findtype)
1124 {
1125 int i;
1126
1127 for (i = cur; i >= 0; i--) {
1128 if ((nodes[i].tag.displaytype & findtype))
1129 return &nodes[i];
1130 }
1131 return NULL;
1132 }
1133
/* Check if the class name `needle` is one of the white-space separated class
   names in `haystack`.
   A class name only matches as a complete token: "bar" matches "foo bar" but
   not "foobar" or "barx". The comparison is case-sensitive.
   Returns 1 on a match, else 0. An empty needle never matches. */
static int
isclassmatch(const char *haystack, const char *needle)
{
	/* the same characters as the ISSPACE() macro accepts */
	static const char ws[] = " \t\n\v\f\r";
	const char *p;
	size_t needlelen, toklen;

	needlelen = strlen(needle);
	if (!needlelen)
		return 0;

	for (p = haystack; ; p += toklen) {
		p += strspn(p, ws); /* skip separators */
		if (!*p)
			break;
		toklen = strcspn(p, ws); /* length of this class name */
		if (toklen == needlelen && !memcmp(p, needle, needlelen))
			return 1;
	}

	return 0;
}
1159
1160 /* very limited CSS-like selector, supports: main, main#id, main.class,
1161 ".class", "#id", "ul li a" */
1162 static int
1163 compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
1164 {
1165 int depth = 0, len;
1166 long l;
1167 const char *s, *start;
1168 char tmp[256];
1169 int nameset = 0;
1170
1171 memset(&nodes[0], 0, sizeof(nodes[0]));
1172 nodes[0].index = -1;
1173
1174 s = sel;
1175 for (; *s && ISSPACE((unsigned char)*s); s++)
1176 ;
1177
1178 start = s;
1179 for (; ; s++) {
1180 /* end of tag */
1181 if (!nameset &&
1182 (*s == '#' || *s == '.' || *s == '@' ||
1183 *s == '\0' || ISSPACE((unsigned char)*s))) {
1184 nameset = 1;
1185 len = s - start; /* tag name */
1186 if (len >= sizeof(tmp))
1187 return 0;
1188 if (len)
1189 memcpy(tmp, start, len);
1190 tmp[len] = '\0';
1191
1192 memcpy(nodes[depth].tagname, tmp, len + 1);
1193 }
1194
1195 /* end */
1196 if (*s == '\0' || ISSPACE((unsigned char)*s)) {
1197 for (; ISSPACE((unsigned char)*s); s++)
1198 ;
1199 start = s; /* start of a new tag */
1200 depth++;
1201 if (depth >= maxnodes)
1202 return 0;
1203
1204 nameset = 0;
1205 memset(&nodes[depth], 0, sizeof(nodes[depth]));
1206 nodes[depth].index = -1;
1207
1208 /* end of selector */
1209 if (*s == '\0')
1210 break;
1211 }
1212
1213 /* index */
1214 if (*s == '@') {
1215 len = strcspn(s + 1, ".#@ \t\n");
1216 if (len >= sizeof(tmp))
1217 return 0;
1218 memcpy(tmp, s + 1, len);
1219 tmp[len] = '\0';
1220
1221 l = strtol(tmp, NULL, 10);
1222 if (l >= 0)
1223 nodes[depth].index = l;
1224 s += len;
1225 start = s + 1;
1226 continue;
1227 }
1228
1229 /* id */
1230 if (*s == '#') {
1231 len = strcspn(s + 1, ".#@ \t\n");
1232 if (len >= sizeof(tmp))
1233 return 0;
1234 memcpy(tmp, s + 1, len);
1235 tmp[len] = '\0';
1236 memcpy(nodes[depth].id, tmp, len + 1);
1237 s += len;
1238 start = s + 1;
1239 continue;
1240 }
1241
1242 /* class */
1243 if (*s == '.') {
1244 len = strcspn(s + 1, ".#@ \t\n");
1245 if (len >= sizeof(tmp))
1246 return 0;
1247 memcpy(tmp, s + 1, len);
1248 tmp[len] = '\0';
1249 /* allow only one classname for now */
1250 memcpy(nodes[depth].classnames, tmp, len + 1);
1251 s += len;
1252 start = s + 1;
1253 continue;
1254 }
1255 }
1256
1257 return depth;
1258 }
1259
1260 static struct selector *
1261 newselector(const char *q)
1262 {
1263 struct selector *sel;
1264 int r;
1265
1266 sel = ecalloc(1, sizeof(*sel));
1267 sel->text = estrdup(q);
1268
1269 r = compileselector(sel->text, sel->nodes, LEN(sel->nodes));
1270 if (r <= 0) {
1271 free(sel);
1272 return NULL;
1273 }
1274 sel->depth = r;
1275
1276 return sel;
1277 }
1278
1279 static struct selectors *
1280 compileselectors(const char *q)
1281 {
1282 struct selectors *sels = NULL;
1283 struct selector *sel;
1284 const char *start;
1285 char *qe;
1286 int count = 0;
1287 size_t siz;
1288
1289 sels = ecalloc(1, sizeof(*sels));
1290
1291 start = q;
1292 for (; ; q++) {
1293 if (*q == ',' || *q == '\0') {
1294 qe = estrndup(start, q - start);
1295 sel = newselector(qe);
1296 free(qe);
1297
1298 /* add new selector */
1299 siz = (count + 1) * sizeof(struct selector *);
1300 sels->selectors = erealloc(sels->selectors, siz);
1301 sels->selectors[count] = sel;
1302 count++;
1303
1304 if (*q == '\0')
1305 break;
1306 start = q + 1;
1307 }
1308 }
1309 sels->count = count;
1310
1311 return sels;
1312 }
1313
1314 /* very limited CSS-like matcher, supports: main, main#id, main.class,
1315 ".class", "#id", "ul li a" */
1316 static int
1317 iscssmatch(struct selector *sel, struct node *root, int maxdepth)
1318 {
1319 int d, md = 0;
1320
1321 for (d = 0; d <= maxdepth; d++) {
1322 /* tag matched? */
1323 if (sel->nodes[md].tagname[0] &&
1324 strcasecmp(sel->nodes[md].tagname, root[d].tagname))
1325 continue; /* no */
1326
1327 /* id matched? */
1328 if (sel->nodes[md].id[0] && strcmp(sel->nodes[md].id, root[d].id))
1329 continue; /* no */
1330
1331 /* class matched, for now allow only one classname in the selector,
1332 matching multiple classnames */
1333 if (sel->nodes[md].classnames[0] &&
1334 !isclassmatch(root[d].classnames, sel->nodes[md].classnames))
1335 continue; /* no */
1336
1337 /* index matched */
1338 if (sel->nodes[md].index != -1 &&
1339 (d == 0 ||
1340 root[d - 1].nchildren == 0 ||
1341 sel->nodes[md].index != root[d - 1].nchildren - 1))
1342 continue;
1343
1344 md++;
1345 /* all matched of one selector */
1346 if (md == sel->depth)
1347 return 1;
1348 }
1349
1350 return 0;
1351 }
1352
1353 static int
1354 iscssmatchany(struct selectors *sels, struct node *root, int maxdepth)
1355 {
1356 struct selector *sel;
1357 int i;
1358
1359 for (i = 0; i < sels->count; i++) {
1360 sel = sels->selectors[i];
1361 if (iscssmatch(sel, root, maxdepth))
1362 return 1;
1363 }
1364 return 0;
1365 }
1366
1367 static void
1368 handleinlinealt(void)
1369 {
1370 struct node *cur;
1371 char *start, *s, *e;
1372
1373 /* do not show the alt text if the element is hidden */
1374 cur = &nodes[curnode];
1375 if (cur->tag.displaytype & DisplayNone)
1376 return;
1377
1378 /* show img alt attribute as text. */
1379 if (attr_alt.len) {
1380 start = attr_alt.data;
1381 e = attr_alt.data + attr_alt.len;
1382
1383 for (s = start; s < e; s++)
1384 printc((unsigned char)*s);
1385 hflush();
1386 } else if (cur->tag.id == TagImg && !showurlinline) {
1387 /* if there is no alt text and no URL is shown inline, then
1388 show "[IMG]" to indicate there was an image there */
1389 hprint("[IMG]");
1390 }
1391 }
1392
1393 /* lookup a link reference by URL in the red-black tree */
1394 static struct linkref *
1395 findlinkref(const char *url)
1396 {
1397 struct linkref find;
1398
1399 find.url = (char *)url;
1400
1401 return RB_FIND(linkreftree, &linkrefhead, &find);
1402 }
1403
1404 /* add a link reference. Returns the added link reference, or the existing link
1405 reference if links are deduplicated */
1406 static struct linkref *
1407 addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden)
1408 {
1409 struct linkref *link;
1410 size_t linknr;
1411
1412 /* if links are deduplicates return the existing link */
1413 if (uniqrefs && (link = findlinkref(url)))
1414 return link;
1415
1416 if (tagid == TagA)
1417 _type = "link";
1418
1419 link = ecalloc(1, sizeof(*link));
1420
1421 if (!ishidden) {
1422 linknr = ++nvisrefs;
1423 if (nvisrefs >= ncapvisrefs) {
1424 ncapvisrefs += 256; /* greedy alloc */
1425 visrefs = erealloc(visrefs, sizeof(*visrefs) * ncapvisrefs);
1426 }
1427 visrefs[linknr - 1] = link; /* add pointer to list */
1428 } else {
1429 linknr = ++nhiddenrefs;
1430 if (nhiddenrefs >= ncaphiddenrefs) {
1431 ncaphiddenrefs += 256; /* greedy alloc */
1432 hiddenrefs = erealloc(hiddenrefs, sizeof(*hiddenrefs) * ncaphiddenrefs);
1433 }
1434 hiddenrefs[linknr - 1] = link; /* add pointer to list */
1435 }
1436
1437 link->url = estrdup(url);
1438 link->type = estrdup(_type);
1439 link->tagid = tagid;
1440 link->ishidden = ishidden;
1441 link->linknr = linknr;
1442
1443 /* add to tree: the tree is only used for checking unique link references */
1444 if (uniqrefs)
1445 RB_INSERT(linkreftree, &linkrefhead, link);
1446
1447 return link;
1448 }
1449
1450 static void
1451 handleinlinelink(void)
1452 {
1453 struct uri newuri, olduri;
1454 struct node *cur;
1455 char buf[4096], *url;
1456 int r;
1457
1458 if (!showrefbottom && !showrefinline && !showurlinline && !resources)
1459 return; /* there is no need to collect the reference */
1460
1461 if (!attr_href.len && !attr_src.len && !attr_data.len)
1462 return; /* there is no reference */
1463
1464 /* by default use the original URL */
1465 if (attr_src.len)
1466 url = attr_src.data;
1467 else if (attr_href.len)
1468 url = attr_href.data;
1469 else
1470 url = attr_data.data;
1471
1472 if (!url)
1473 return;
1474
1475 /* Not an absolute URL yet: try to make it absolute.
1476 If it is not possible use the relative URL */
1477 if (!uri_hasscheme(url) && basehrefset &&
1478 uri_parse(url, &olduri) != -1 &&
1479 uri_makeabs(&newuri, &olduri, &base) != -1 &&
1480 newuri.proto[0]) {
1481 r = uri_format(buf, sizeof(buf), &newuri);
1482 if (r >= 0 && (size_t)r < sizeof(buf))
1483 url = buf;
1484 }
1485
1486 if (!url[0])
1487 return;
1488
1489 cur = &nodes[curnode];
1490
1491 if (!(cur->tag.displaytype & DisplayNone)) {
1492 string_clear(&nodes_links[curnode]);
1493 string_append(&nodes_links[curnode], url, strlen(url));
1494 }
1495
1496 /* add hidden links directly to the reference,
1497 the order doesn't matter */
1498 if (cur->tag.displaytype & DisplayNone)
1499 addlinkref(url, cur->tag.name, cur->tag.id, 1);
1500 }
1501
1502 static void
1503 printlinkrefs(void)
1504 {
1505 struct linkref *ref;
1506 size_t i;
1507
1508 if (!nvisrefs && !nhiddenrefs)
1509 return;
1510
1511 if (resources) {
1512 for (i = 0; i < nvisrefs; i++) {
1513 ref = visrefs[i];
1514 dprintf(3, "%s\t%s\n", ref->type, ref->url);
1515 }
1516 for (i = 0; i < nhiddenrefs; i++) {
1517 ref = hiddenrefs[i];
1518 dprintf(3, "%s\t%s\n", ref->type, ref->url);
1519 }
1520 }
1521
1522 printf("\nReferences\n\n");
1523
1524 for (i = 0; i < nvisrefs; i++) {
1525 ref = visrefs[i];
1526 printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->type);
1527 }
1528
1529 if (nhiddenrefs > 0)
1530 printf("\n\nHidden references\n\n");
1531 /* hidden links don't have a link number, just count them */
1532 for (i = 0; i < nhiddenrefs; i++) {
1533 ref = hiddenrefs[i];
1534 printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->type);
1535 }
1536 }
1537
1538 /* size to grow node capacity (greedy) */
1539 #define NODE_CAP_INC 16
1540
1541 /* increase node depth, allocate space for nodes if needed */
1542 static void
1543 incnode(void)
1544 {
1545 size_t i;
1546
1547 curnode++;
1548
1549 if (curnode >= MAX_NODE_DEPTH)
1550 errx(1, "max node depth reached: %d", curnode);
1551
1552 if (curnode >= ncapnodes) {
1553 nodes = erealloc(nodes, sizeof(*nodes) * (ncapnodes + NODE_CAP_INC));
1554 nodes_links = erealloc(nodes_links, sizeof(*nodes_links) * (ncapnodes + NODE_CAP_INC));
1555
1556 /* clear new region */
1557 memset(&nodes[ncapnodes], 0, sizeof(*nodes) * NODE_CAP_INC);
1558 memset(&nodes_links[ncapnodes], 0, sizeof(*nodes_links) * NODE_CAP_INC);
1559
1560 for (i = 0; i < ncapnodes; i++)
1561 nodes[i].tag.name = nodes[i].tagname; /* assign to use fixed-size buffer */
1562
1563 for (i = ncapnodes; i < ncapnodes + NODE_CAP_INC; i++) {
1564 nodes[i].tag.displaytype = DisplayInline;
1565 nodes[i].tag.name = nodes[i].tagname; /* assign to use fixed-size buffer */
1566 }
1567
1568 ncapnodes += NODE_CAP_INC; /* greedy alloc */
1569 }
1570 }
1571
1572 static void
1573 xmldatastart(XMLParser *p)
1574 {
1575 }
1576
1577 static void
1578 xmldataend(XMLParser *p)
1579 {
1580 struct node *cur;
1581 char *start, *s, *e;
1582
1583 if (!htmldata.data || !htmldata.len)
1584 return;
1585
1586 cur = &nodes[curnode];
1587
1588 if (reader_ignore || (cur->tag.displaytype & DisplayNone)) {
1589 /* print nothing */
1590 } else if ((cur->tag.displaytype & DisplayPre) ||
1591 findparenttype(curnode - 1, DisplayPre)) {
1592 printpre(htmldata.data, htmldata.len);
1593 } else {
1594 start = htmldata.data;
1595 e = htmldata.data + htmldata.len;
1596
1597 for (s = start; s < e; s++)
1598 printc((unsigned char)*s);
1599 }
1600
1601 string_clear(&htmldata);
1602 }
1603
1604 static void
1605 xmldata(XMLParser *p, const char *data, size_t datalen)
1606 {
1607 struct node *cur;
1608
1609 if (reader_ignore)
1610 return;
1611
1612 cur = &nodes[curnode];
1613 if (cur->tag.displaytype & DisplayNone)
1614 return;
1615
1616 string_append(&htmldata, data, datalen);
1617 }
1618
1619 static void
1620 xmldataentity(XMLParser *p, const char *data, size_t datalen)
1621 {
1622 struct node *cur;
1623 char buf[8];
1624 int len;
1625
1626 if (reader_ignore)
1627 return;
1628
1629 cur = &nodes[curnode];
1630 if (cur->tag.displaytype & DisplayNone)
1631 return;
1632
1633 len = xml_entitytostr(data, buf, sizeof(buf));
1634 if (len > 0)
1635 xmldata(p, buf, (size_t)len);
1636 else
1637 xmldata(p, data, datalen);
1638 }
1639
1640 static void
1641 xmlcdatastart(XMLParser *p)
1642 {
1643 xmldatastart(p);
1644 }
1645
1646 static void
1647 xmlcdataend(XMLParser *p)
1648 {
1649 xmldataend(p); /* treat CDATA as data */
1650 }
1651
1652 static void
1653 xmlcdata(XMLParser *p, const char *data, size_t datalen)
1654 {
1655 xmldata(p, data, datalen); /* treat CDATA as data */
1656 }
1657
1658 /* lookup function to compare tag name (case-insensitive) for sort functions */
1659 static int
1660 findtagcmp(const void *v1, const void *v2)
1661 {
1662 struct tag *t1 = (struct tag *)v1;
1663 struct tag *t2 = (struct tag *)v2;
1664
1665 return strcasecmp(t1->name, t2->name);
1666 }
1667
1668 /* binary search tag by tag name */
1669 static struct tag *
1670 findtag(const char *t)
1671 {
1672 struct tag find = { 0 };
1673
1674 find.name = t;
1675
1676 return bsearch(&find, tags, LEN(tags), sizeof(*tags), findtagcmp);
1677 }
1678
1679 static void
1680 handleendtag(struct tag *tag)
1681 {
1682 int i, marginbottom;
1683
1684 if (tag->displaytype & DisplayNone)
1685 return;
1686 if (reader_ignore)
1687 return;
1688
1689 if (tag->displaytype & (DisplayButton | DisplayOption)) {
1690 hputchar(']');
1691 hflush();
1692 }
1693
1694 if (tag->displaytype & (DisplayBlock | DisplayHeader | DisplayTable | DisplayTableRow |
1695 DisplayList | DisplayListItem | DisplayPre)) {
1696 endblock(); /* break line if needed */
1697 }
1698
1699 /* when a list ends and its not inside a list add an extra bottom margin */
1700 marginbottom = tag->marginbottom;
1701
1702 if (marginbottom > 0) {
1703 if (tag->displaytype & DisplayList) {
1704 if (findparenttype(curnode - 1, DisplayList))
1705 marginbottom--;
1706 }
1707 }
1708
1709 if (marginbottom > 0) {
1710 hflush();
1711 for (i = currentnewlines; i < marginbottom; i++) {
1712 putchar('\n');
1713 nbytesline = 0;
1714 ncells = 0;
1715 currentnewlines++;
1716 }
1717 hadnewline = 1;
1718 }
1719 }
1720
1721 static void
1722 endnode(struct node *cur)
1723 {
1724 struct linkref *ref;
1725 int i, ishidden;
1726
1727 /* set a flag indicating the element and its parent containers have data.
1728 This is used for some formatting */
1729 if (cur->hasdata) {
1730 for (i = curnode; i >= 0; i--)
1731 nodes[i].hasdata = 1;
1732 }
1733
1734 endmarkup(cur->tag.markuptype);
1735
1736 ishidden = reader_ignore || (cur->tag.displaytype & DisplayNone);
1737
1738 /* add link and show the link number in the visible order */
1739 if (!ishidden && nodes_links[curnode].len > 0) {
1740 ref = addlinkref(nodes_links[curnode].data,
1741 cur->tag.name, cur->tag.id, ishidden);
1742
1743 if (showrefinline || showurlinline) {
1744 hflush();
1745 startmarkup(MarkupReverse);
1746 }
1747
1748 if (showrefinline)
1749 hprintf("[%zu]", ref->linknr);
1750 if (showurlinline) {
1751 if (ref->tagid == TagA)
1752 hprintf("[%s]", ref->url);
1753 else
1754 hprintf("[%s: %s]", ref->type, ref->url);
1755 }
1756 if (showrefinline || showurlinline) {
1757 endmarkup(MarkupReverse);
1758 hflush();
1759 }
1760 }
1761
1762 handleendtag(&(cur->tag));
1763 }
1764
1765 static void
1766 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
1767 {
1768 struct tag *found, *tag;
1769 enum TagId childs[16];
1770 size_t nchilds;
1771 int i, j, k, nchildfound, parenttype;
1772
1773 /* match tag and lookup metadata */
1774 /* ignore closing of void elements, like </br>, which is not allowed */
1775 if ((found = findtag(t))) {
1776 if (!isshort && found->isvoid)
1777 return;
1778 }
1779
1780 /* TODO: implement more complete optional tag handling.
1781 in reality the optional tag rules are more complex, see:
1782 https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
1783
1784 nchilds = 0;
1785 nchildfound = 0;
1786 parenttype = 0; /* by default, seek until the root */
1787
1788 if (found && found->displaytype & DisplayPre) {
1789 skipinitialws = 0; /* do not skip white-space, for margins */
1790 } else if (found && found->displaytype & DisplayList) {
1791 childs[0] = TagLi;
1792 nchilds = 1;
1793 parenttype = DisplayList;
1794 } else if (found && found->displaytype & DisplayTableRow) {
1795 childs[0] = TagTd;
1796 nchilds = 1;
1797 parenttype = DisplayTableRow;
1798 } else if (found && found->displaytype & DisplayTable) {
1799 childs[0] = TagTd;
1800 nchilds = 1;
1801 parenttype = DisplayTable;
1802 } else if (found && found->displaytype & DisplaySelect) {
1803 childs[0] = TagOption;
1804 nchilds = 1;
1805 parenttype = DisplaySelect;
1806 } else if (found && found->displaytype & DisplayDl) {
1807 childs[0] = TagP;
1808 childs[1] = TagDd;
1809 childs[2] = TagDt;
1810 nchilds = 3;
1811 parenttype = DisplayDl;
1812 } else if (found && found->displaytype & DisplayBlock) {
1813 childs[0] = TagP;
1814 nchilds = 1;
1815 parenttype = 0; /* seek until the root */
1816 }
1817
1818 if (nchilds > 0) {
1819 for (i = curnode; i >= 0; i--) {
1820 if (nchildfound)
1821 break;
1822 if ((nodes[i].tag.displaytype & parenttype))
1823 break;
1824 for (j = 0; j < nchilds; j++) {
1825 if (nodes[i].tag.id == childs[j]) {
1826 /* fake closing the previous tags */
1827 for (k = curnode; k >= i; k--)
1828 endnode(&nodes[k]);
1829 curnode = k;
1830 nchildfound = 1;
1831 break;
1832 }
1833 }
1834 }
1835 }
1836
1837 /* if the current closing tag matches the current open tag */
1838 if (nodes[curnode].tag.name &&
1839 !tagcmp(nodes[curnode].tag.name, t)) {
1840 endnode(&nodes[curnode]);
1841 if (curnode)
1842 curnode--;
1843 } else {
1844 /* ... else lookup the first matching start tag. This is also
1845 for handling optional closing tags */
1846 tag = NULL;
1847 for (i = curnode; i >= 0; i--) {
1848 if (nodes[i].tag.name &&
1849 !tagcmp(nodes[i].tag.name, t)) {
1850 endnode(&nodes[i]);
1851 curnode = i > 0 ? i - 1 : 0;
1852 tag = &nodes[i].tag;
1853 break;
1854 }
1855 }
1856 /* unmatched closing tag found */
1857 if (!tag && found)
1858 handleendtag(found);
1859 }
1860 indent = calcindent();
1861
1862 #if 0
1863 /* check if linewrap is enabled, but currently is disabled and needs to
1864 be restored */
1865 if (allowlinewrap && !linewrap) {
1866 tag = NULL;
1867 for (i = curnode; i >= 0; i--) {
1868 if (nodes[i].tag.id == TagTable) {
1869 tag = &nodes[i].tag;
1870 break;
1871 }
1872 }
1873 if (!tag)
1874 linewrap = allowlinewrap;
1875 }
1876 #endif
1877
1878 /* restore markup of the tag we are in now */
1879 startmarkup(nodes[curnode].tag.markuptype);
1880
1881 /* check if the current node still matches the visible selector */
1882 if (reader_mode && sel_show && !reader_ignore) {
1883 if (!iscssmatchany(sel_show, nodes, curnode)) {
1884 reader_ignore = 1;
1885 newline();
1886 }
1887 }
1888 }
1889
1890 static void
1891 xmltagstart(XMLParser *p, const char *t, size_t tl)
1892 {
1893 struct tag *found;
1894 struct node *cur;
1895 enum TagId tagid;
1896 enum TagId childs[16];
1897 size_t nchilds;
1898 char *s;
1899 int i, j, k, nchildfound, parenttype;
1900
1901 cur = &nodes[curnode];
1902
1903 string_clear(&attr_alt);
1904 string_clear(&attr_checked);
1905 string_clear(&attr_class);
1906 attr_class_set = 0;
1907 string_clear(&attr_data);
1908 string_clear(&attr_href);
1909 string_clear(&attr_id);
1910 attr_id_set = 0;
1911 string_clear(&attr_src);
1912 string_clear(&attr_type);
1913 string_clear(&attr_value);
1914
1915 /* match tag and lookup metadata */
1916 found = findtag(t);
1917
1918 /* TODO: implement more complete optional tag handling.
1919 in reality the optional tag rules are more complex, see:
1920 https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
1921
1922 nchilds = 0;
1923 nchildfound = 0;
1924 parenttype = 0; /* by default, seek until the root */
1925
1926 /* if optional tag <p> is open and a list element is found, close </p>. */
1927 if (found && found->displaytype & DisplayList) {
1928 /* not inside a list */
1929 childs[0] = TagP;
1930 nchilds = 1;
1931 parenttype = DisplayList;
1932 } else if (found && found->isoptional) {
1933 tagid = found->id;
1934 if (tagid == TagLi) {
1935 childs[0] = TagLi;
1936 nchilds = 1;
1937 parenttype = DisplayList;
1938 } else if (tagid == TagTd) {
1939 childs[0] = TagTd;
1940 nchilds = 1;
1941 parenttype = DisplayTableRow;
1942 } else if (tagid == TagTr) {
1943 childs[0] = TagTr;
1944 nchilds = 1;
1945 parenttype = DisplayTable;
1946 } else if (tagid == TagP) {
1947 childs[0] = TagP;
1948 nchilds = 1;
1949 parenttype = 0; /* seek until the root */
1950 } else if (tagid == TagOption) {
1951 childs[0] = TagOption;
1952 nchilds = 1;
1953 parenttype = DisplaySelect;
1954 } else if (tagid == TagDt) {
1955 childs[0] = TagDd;
1956 nchilds = 1;
1957 parenttype = DisplayDl;
1958 } else if (tagid == TagDd) {
1959 childs[0] = TagDd;
1960 childs[1] = TagDt;
1961 nchilds = 2;
1962 parenttype = DisplayDl;
1963 } else if (tagid == cur->tag.id) {
1964 /* fake closing the previous tag if it is the same and repeated */
1965 xmltagend(p, t, tl, 0);
1966 }
1967 } else if (found && found->displaytype & DisplayBlock) {
1968 /* check if we have an open "<p>" tag */
1969 childs[0] = TagP;
1970 childs[1] = TagDl;
1971 nchilds = 2;
1972 parenttype = DisplayDl;
1973 }
1974
1975 if (nchilds > 0) {
1976 for (i = curnode; i >= 0; i--) {
1977 if (nchildfound)
1978 break;
1979 if ((nodes[i].tag.displaytype & parenttype))
1980 break;
1981 for (j = 0; j < nchilds; j++) {
1982 if (nodes[i].tag.id == childs[j]) {
1983 /* fake closing the previous tags */
1984 for (k = curnode; k >= i; k--)
1985 xmltagend(p, nodes[k].tag.name, strlen(nodes[k].tag.name), 0);
1986 nchildfound = 1;
1987 break;
1988 }
1989 }
1990 }
1991 }
1992
1993 incnode();
1994 string_clear(&nodes_links[curnode]); /* clear possible link reference for this node */
1995 cur = &nodes[curnode];
1996 memset(cur, 0, sizeof(*cur)); /* clear / reset node */
1997 /* tag defaults */
1998 cur->tag.displaytype = DisplayInline;
1999 cur->tag.name = cur->tagname; /* assign fixed-size buffer */
2000 strlcpy(cur->tagname, t, sizeof(cur->tagname));
2001
2002 /* force to lowercase */
2003 for (s = cur->tagname; *s; s++)
2004 *s = TOLOWER((unsigned char)*s);
2005
2006 /* matched tag: copy tag information to current node */
2007 if (found)
2008 memcpy(&(cur->tag), found, sizeof(*found));
2009
2010 /* if parent tag is hidden then hide itself too */
2011 if (curnode > 0 && (nodes[curnode - 1].tag.displaytype & DisplayNone))
2012 cur->tag.displaytype |= DisplayNone;
2013 }
2014
2015 static void
2016 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
2017 {
2018 struct tag *found;
2019 enum TagId tagid;
2020 struct node *cur, *parent;
2021 int i, margintop;
2022
2023 /* match tag and lookup metadata */
2024 tagid = 0;
2025 if ((found = findtag(t)))
2026 tagid = found->id;
2027
2028 /* temporary replace the callback except the reader and end of tag
2029 restore the context once we receive the same ignored tag in the
2030 end tag handler */
2031 if (tagid == TagScript) {
2032 ignorestate = endtag = "</script>";
2033 getnext = p->getnext; /* for restore */
2034 p->getnext = getnext_ignore;
2035 xmltagend(p, t, tl, 0); /* fake the call the tag was ended */
2036 return;
2037 } else if (tagid == TagStyle) {
2038 ignorestate = endtag = "</style>";
2039 getnext = p->getnext; /* for restore */
2040 p->getnext = getnext_ignore;
2041 xmltagend(p, t, tl, 0); /* fake the call the tag was ended */
2042 return;
2043 }
2044
2045 #if 0
2046 /* disable line-wrapping inside tables */
2047 if (tagid == TagTable)
2048 linewrap = 0;
2049 #endif
2050
2051 cur = &nodes[curnode];
2052
2053 /* copy attributes if set */
2054 if (attr_id.len)
2055 strlcpy(cur->id, attr_id.data, sizeof(cur->id));
2056 else
2057 cur->id[0] = '\0';
2058 if (attr_class.len)
2059 strlcpy(cur->classnames, attr_class.data, sizeof(cur->classnames));
2060 else
2061 cur->classnames[0] = '\0';
2062
2063 /* parent node */
2064 if (curnode > 0) {
2065 parent = &nodes[curnode - 1];
2066 parent->nchildren++; /* increase child node count */
2067 /* count visible childnodes */
2068 if (!(cur->tag.displaytype & DisplayNone))
2069 parent->visnchildren++;
2070 } else {
2071 parent = NULL;
2072 }
2073
2074 if (reader_mode && sel_show && reader_ignore &&
2075 iscssmatchany(sel_show, nodes, curnode))
2076 reader_ignore = 0;
2077
2078 /* hide element */
2079 if (reader_mode && sel_hide &&
2080 iscssmatchany(sel_hide, nodes, curnode))
2081 cur->tag.displaytype |= DisplayNone;
2082
2083 /* indent for this tag */
2084 cur->indent = cur->tag.indent;
2085
2086 if (!reader_ignore) {
2087 /* add link reference, print links and alt text */
2088 handleinlinelink();
2089 handleinlinealt();
2090 }
2091
2092 /* <select><option> */
2093 if ((cur->tag.displaytype & DisplayOption) && parent) {
2094 /* <select multiple>: show all options */
2095 if (parent->tag.displaytype & DisplaySelectMulti)
2096 cur->tag.displaytype |= DisplayBlock;
2097 else if (parent->nchildren > 1) /* show the first item as selected */
2098 cur->tag.displaytype |= DisplayNone; /* else hide */
2099 }
2100
2101 if (cur->tag.displaytype & DisplayNone)
2102 return;
2103
2104 if (reader_ignore)
2105 return;
2106
2107 indent = calcindent();
2108
2109 if ((cur->tag.displaytype & (DisplayBlock | DisplayHeader | DisplayPre |
2110 DisplayTable | DisplayTableRow |
2111 DisplayList | DisplayListItem))) {
2112 startblock(); /* break line if needed */
2113 }
2114
2115 if (cur->tag.displaytype & (DisplayButton | DisplayOption)) {
2116 hflush();
2117 hputchar('[');
2118 }
2119
2120 margintop = cur->tag.margintop;
2121 if (cur->tag.displaytype & (DisplayList)) {
2122 for (i = curnode - 1; i >= 0; i--) {
2123 if (nodes[i].tag.displaytype & DisplayList)
2124 break;
2125 if (!(nodes[i].tag.displaytype & DisplayListItem))
2126 continue;
2127 if (nodes[i].hasdata && margintop > 0) {
2128 margintop--;
2129 break;
2130 }
2131 }
2132 } else if (cur->tag.displaytype & (DisplayBlock|DisplayTable)) {
2133 if (!parentcontainerhasdata(cur->tag.displaytype, curnode - 1)) {
2134 if (margintop > 0)
2135 margintop--;
2136 }
2137 }
2138
2139 if (margintop > 0) {
2140 hflush();
2141 for (i = currentnewlines; i < margintop; i++) {
2142 putchar('\n');
2143 nbytesline = 0;
2144 ncells = 0;
2145 currentnewlines++;
2146 }
2147 hadnewline = 1;
2148 }
2149
2150 if (cur->tag.displaytype & DisplayPre) {
2151 skipinitialws = 1;
2152 } else if (cur->tag.displaytype & DisplayTableCell) {
2153 if (parent && parent->visnchildren > 1)
2154 hputchar('\t');
2155 } else if (cur->tag.displaytype & DisplayListItem) {
2156 /* find first parent node and ordered numbers or unordered */
2157 if (parent) {
2158 skipinitialws = 0;
2159
2160 /* print bullet, add columns to indentation level */
2161 if (parent->tag.displaytype & DisplayListOrdered) {
2162 hprintf("%4zu. ", parent->nchildren);
2163 cur->indent = 6;
2164 indent += cur->indent; /* align to number */
2165 } else if (parent->tag.displaytype & DisplayList) {
2166 hprint(str_bullet_item);
2167 cur->indent = 2;
2168 indent += 2; /* align to bullet */
2169 }
2170 }
2171 skipinitialws = 0;
2172 } else if (cur->tag.displaytype & DisplayInput) {
2173 if (!attr_type.len) {
2174 hprintf("[%-15s]", attr_value.len ? attr_value.data : ""); /* default: text */
2175 } else if (!strcasecmp(attr_type.data, "button")) {
2176 hprintf("[%s]", attr_value.len ? attr_value.data : "");
2177 } else if (!strcasecmp(attr_type.data, "submit")) {
2178 hprintf("[%s]", attr_value.len ? attr_value.data : "Submit Query");
2179 } else if (!strcasecmp(attr_type.data, "reset")) {
2180 hprintf("[%s]", attr_value.len ? attr_value.data : "Reset");
2181 } else if (!strcasecmp(attr_type.data, "checkbox")) {
2182 hprintf("[%s]",
2183 attr_checked.len &&
2184 !strcasecmp(attr_checked.data, "checked") ? str_checkbox_checked : " ");
2185 } else if (!strcasecmp(attr_type.data, "radio")) {
2186 hprintf("[%s]",
2187 attr_checked.len &&
2188 !strcasecmp(attr_checked.data, "checked") ? str_radio_checked : " ");
2189 } else if (!strcasecmp(attr_type.data, "hidden")) {
2190 cur->tag.displaytype |= DisplayNone;
2191 } else {
2192 /* unrecognized / default case is text */
2193 hprintf("[%-15s]", attr_value.len ? attr_value.data : "");
2194 }
2195 }
2196
2197 startmarkup(cur->tag.markuptype);
2198
2199 /* do not count data such as an item bullet as part of the data for
2200 the node */
2201 cur->hasdata = 0;
2202
2203 if (tagid == TagHr) { /* ruler */
2204 i = termwidth - indent - defaultindent;
2205 for (; i > 0; i--)
2206 hprint(str_ruler);
2207 cur->hasdata = 1; /* treat <hr/> as data */
2208 } else if (tagid == TagBr) {
2209 hflush();
2210 hadnewline = 0; /* forced newline */
2211 hputchar('\n');
2212 cur->hasdata = 1; /* treat <br/> as data */
2213 }
2214
2215 /* autoclose tags, such as <br>, pretend we are <br/> */
2216 if (!isshort && cur->tag.isvoid)
2217 xmltagend(p, t, tl, 1); /* pretend close of short tag */
2218 }
2219
2220 static void
2221 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n,
2222 size_t nl, const char *v, size_t vl)
2223 {
2224 struct node *cur;
2225 enum TagId tagid;
2226
2227 cur = &nodes[curnode];
2228 tagid = cur->tag.id;
2229
2230 /* hide tags with attribute aria-hidden or hidden */
2231 if (!attrcmp(n, "aria-hidden") || !attrcmp(n, "hidden"))
2232 cur->tag.displaytype |= DisplayNone;
2233
2234 if (!attr_class_set && !attrcmp(n, "class")) /* use the first set attribute */
2235 string_append(&attr_class, v, vl);
2236 else if (!attr_id_set && !attrcmp(n, "id")) /* use the first set attribute */
2237 string_append(&attr_id, v, vl);
2238 else if (!attrcmp(n, "type"))
2239 string_append(&attr_type, v, vl);
2240 else if (!attrcmp(n, "value"))
2241 string_append(&attr_value, v, vl);
2242
2243 /* <base href="..." /> */
2244 if (!basehrefset && tagid == TagBase && !attrcmp(n, "href"))
2245 strlcat(basehrefdoc, v, sizeof(basehrefdoc));
2246
2247 if (tagid == TagA && !attrcmp(n, "href"))
2248 string_append(&attr_href, v, vl);
2249
2250 if (tagid == TagSelect && !attrcmp(n, "multiple"))
2251 cur->tag.displaytype |= DisplaySelectMulti;
2252
2253 if (tagid == TagObject && !attrcmp(n, "data"))
2254 string_append(&attr_data, v, vl);
2255
2256 /* show img alt attribute as text. */
2257 if (tagid == TagImg && !attrcmp(n, "alt"))
2258 string_append(&attr_alt, v, vl);
2259
2260 if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked"))
2261 string_append(&attr_checked, v, vl);
2262
2263 /* src attribute */
2264 switch (tagid) {
2265 case TagAudio:
2266 case TagEmbed:
2267 case TagFrame:
2268 case TagIframe:
2269 case TagImg:
2270 case TagSource:
2271 case TagTrack:
2272 case TagVideo:
2273 if (!attrcmp(n, "src"))
2274 string_append(&attr_src, v, vl);
2275 break;
2276 default:
2277 break;
2278 }
2279 }
2280
2281 static void
2282 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n,
2283 size_t nl, const char *v, size_t vl)
2284 {
2285 char buf[8];
2286 int len;
2287
2288 len = xml_entitytostr(v, buf, sizeof(buf));
2289 if (len > 0)
2290 xmlattr(p, t, tl, n, nl, buf, (size_t)len);
2291 else
2292 xmlattr(p, t, tl, n, nl, v, vl);
2293 }
2294
2295 static void
2296 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n,
2297 size_t nl)
2298 {
2299 struct node *cur;
2300 enum TagId tagid;
2301
2302 cur = &nodes[curnode];
2303 tagid = cur->tag.id;
2304
2305 if (!attr_class_set && !attrcmp(n, "class"))
2306 attr_class_set = 1;
2307 else if (!attr_id_set && !attrcmp(n, "id"))
2308 attr_id_set = 1;
2309
2310 /* set base URL, if it is set it cannot be overwritten again */
2311 if (!basehrefset && basehrefdoc[0] &&
2312 tagid == TagBase && !attrcmp(n, "href"))
2313 basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : 0;
2314
2315 /* if attribute checked is set but it has no value then set it to "checked" */
2316 if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked") && !attr_checked.len)
2317 string_append(&attr_checked, "checked", sizeof("checked") - 1);
2318 }
2319
2320 static void
2321 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
2322 size_t nl)
2323 {
2324 struct node *cur;
2325 enum TagId tagid;
2326
2327 cur = &nodes[curnode];
2328 tagid = cur->tag.id;
2329
2330 if (!attrcmp(n, "alt"))
2331 string_clear(&attr_alt);
2332 else if (!attrcmp(n, "checked"))
2333 string_clear(&attr_checked);
2334 else if (!attr_class_set && !attrcmp(n, "class"))
2335 string_clear(&attr_class);
2336 else if (!attrcmp(n, "data"))
2337 string_clear(&attr_data);
2338 else if (!attrcmp(n, "href"))
2339 string_clear(&attr_href);
2340 else if (!attr_id_set && !attrcmp(n, "id"))
2341 string_clear(&attr_id);
2342 else if (!attrcmp(n, "src"))
2343 string_clear(&attr_src);
2344 else if (!attrcmp(n, "type"))
2345 string_clear(&attr_type);
2346 else if (!attrcmp(n, "value"))
2347 string_clear(&attr_value);
2348
2349 if (basehrefdoc[0] && tagid == TagBase && !attrcmp(n, "href"))
2350 basehrefdoc[0] = '\0';
2351 }
2352
2353 static void
2354 usage(void)
2355 {
2356 fprintf(stderr, "%s [-8adiIlrx] [-b basehref] [-s selector] [-u selector] [-w termwidth]\n", argv0);
2357 exit(1);
2358 }
2359
2360 int
2361 main(int argc, char **argv)
2362 {
2363 char *basehref;
2364
2365 if (pledge("stdio", NULL) < 0)
2366 err(1, "pledge");
2367
2368 ARGBEGIN {
2369 case '8':
2370 str_bullet_item = "\xe2\x80\xa2 ";
2371 str_ruler = "\xe2\x94\x80"; /* symbol: "light horizontal" */
2372 break;
2373 case 'a':
2374 allowansi = !allowansi;
2375 break;
2376 case 'b':
2377 basehref = EARGF(usage());
2378 if (uri_parse(basehref, &base) == -1 ||
2379 !base.proto[0])
2380 usage();
2381 basehrefset = 1;
2382 break;
2383 case 'd':
2384 uniqrefs = !uniqrefs;
2385 break;
2386 case 'i':
2387 showrefinline = !showrefinline;
2388 break;
2389 case 'I':
2390 showurlinline = !showurlinline;
2391 break;
2392 case 'l':
2393 showrefbottom = !showrefbottom;
2394 break;
2395 case 'r':
2396 allowlinewrap = !allowlinewrap;
2397 break;
2398 case 's':
2399 sel_show = compileselectors(EARGF(usage()));
2400 /* switch to reader/selector mode, ignore all data except when matched */
2401 reader_mode = 1;
2402 reader_ignore = 1;
2403 break;
2404 case 'u':
2405 sel_hide = compileselectors(EARGF(usage()));
2406 /* switch to reader/selector mode */
2407 reader_mode = 1;
2408 break;
2409 case 'w':
2410 if ((termwidth = strtol(EARGF(usage()), NULL, 10)) < 1)
2411 usage();
2412 break;
2413 case 'x':
2414 resources = !resources;
2415 break;
2416 default:
2417 usage();
2418 } ARGEND
2419
2420 linewrap = allowlinewrap;
2421
2422 /* initial nodes */
2423 ncapnodes = NODE_CAP_INC;
2424 nodes = ecalloc(ncapnodes, sizeof(*nodes));
2425 nodes_links = ecalloc(ncapnodes, sizeof(*nodes_links));
2426
2427 parser.xmlattrstart = xmlattrstart;
2428 parser.xmlattr = xmlattr;
2429 parser.xmlattrentity = xmlattrentity;
2430 parser.xmlattrend = xmlattrend;
2431 parser.xmlcdatastart = xmlcdatastart;
2432 parser.xmlcdata = xmlcdata;
2433 parser.xmlcdataend = xmlcdataend;
2434 parser.xmldatastart = xmldatastart;
2435 parser.xmldata = xmldata;
2436 parser.xmldataentity = xmldataentity;
2437 parser.xmldataend = xmldataend;
2438 parser.xmltagstart = xmltagstart;
2439 parser.xmltagstartparsed = xmltagstartparsed;
2440 parser.xmltagend = xmltagend;
2441
2442 parser.getnext = getchar;
2443 xml_parse(&parser);
2444
2445 hflush();
2446 if (ncells > 0)
2447 newline();
2448
2449 if (showrefbottom || resources)
2450 printlinkrefs();
2451
2452 hflush();
2453 setmarkup(0);
2454
2455 return 0;
2456 }