feed.c - frontends - front-ends for some sites (experiment)
DIR Log
DIR Files
DIR Refs
DIR README
DIR LICENSE
---
feed.c (30210B)
---
1 #include <err.h>
2 #include <errno.h>
3 #include <stdint.h>
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
7 #include <strings.h>
8 #include <time.h>
9 #include <unistd.h>
10
11 #include "https.h"
12 #include "util.h"
13 #include "youtube.h"
14 #include "xml.h"
15
/* True when the content flag is set and the content-tag flag is not. */
#define ISINCONTENT(ctx)  ((ctx).iscontent && !(ctx).iscontenttag)
/* True when the content-tag flag is set and the content flag is not. */
#define ISCONTENTTAG(ctx) ((ctx).iscontenttag && !(ctx).iscontent)

/* Expands a string literal to two arguments: the string itself and its
 * length in bytes (the terminating NUL is not counted). */
#define STRP(s) s,sizeof(s)-1
21
/* Kind of feed currently being parsed. */
enum FeedType {
	FeedTypeNone = 0,	/* not (yet) inside a recognized entry */
	FeedTypeAtom = 2	/* inside an Atom <entry> */
};
26
/* Growable, NUL-terminated byte string that owns its buffer. */
typedef struct string {
	char   *data;   /* buffer holding the bytes */
	size_t  len;    /* bytes in use, terminating NUL not counted */
	size_t  bufsiz; /* bytes allocated for `data` */
} String;
33
/* Identifiers for the tags that are recognized.
 * NOTE: the numeric order is significant: when more than one tag maps to
 * the same field, a tag with a higher id takes priority over a lower one. */
enum TagId {
	TagUnknown = 0,
	/* Atom */
	/* creation date has higher priority */
	AtomTagPublished,
	AtomTagTitle,
	AtomTagMediaDescription,
	AtomTagId,
	AtomTagLink,
	AtomTagLinkAlternate,
	AtomTagAuthor,
	AtomTagAuthorName,
	TagYoutubeVideoId,
	TagLast	/* number of tag ids, not a tag itself */
};
50
51 typedef struct feedtag {
52 char *name; /* name of tag to match */
53 size_t len; /* len of `name` */
54 enum TagId id; /* unique ID */
55 } FeedTag;
56
57 typedef struct field {
58 String str;
59 enum TagId tagid; /* tagid set previously, used for tag priority */
60 } FeedField;
61
/* Indices into FeedContext.fields. */
enum {
	/* sfeed fields */
	FeedFieldTime = 0,
	FeedFieldTitle,
	FeedFieldLink,
	FeedFieldContent,
	FeedFieldId,
	FeedFieldAuthor,
	FeedFieldEnclosure,
	FeedFieldCategory,
	/* yt:videoId */
	FeedFieldYoutubeId,
	FeedFieldLast	/* number of fields */
};
69
70 typedef struct feedcontext {
71 String *field; /* current FeedItem field String */
72 FeedField fields[FeedFieldLast]; /* data for current item */
73 FeedTag tag; /* unique current parsed tag */
74 int iscontent; /* in content data */
75 int iscontenttag; /* in content tag */
76 enum FeedType feedtype;
77 } FeedContext;
78
79 static long long datetounix(long long, int, int, int, int, int);
80 static FeedTag * gettag(enum FeedType, const char *, size_t);
81 static long gettzoffset(const char *);
82 static int isattr(const char *, size_t, const char *, size_t);
83 static int istag(const char *, size_t, const char *, size_t);
84 static int parsetime(const char *, long long *);
85
86 static void atom_header(void);
87 static void atom_item(void);
88 static void atom_footer(void);
89 static void gph_header(void);
90 static void gph_footer(void);
91 static void html_header(void);
92 static void html_footer(void);
93 static void json_header(void);
94 static void json_item(void);
95 static void json_footer(void);
96 static void sfeed_item(void); /* TSV / sfeed */
97 static void twtxt_item(void);
98
99 static void string_append(String *, const char *, size_t);
100 static void string_buffer_realloc(String *, size_t);
101 static void string_clear(String *);
102 static void string_print_encoded(String *);
103 static void string_print_timestamp(String *);
104 static void string_print(String *);
105 static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t,
106 const char *, size_t);
107 static void xmlattrentity(XMLParser *, const char *, size_t, const char *,
108 size_t, const char *, size_t);
109 static void xmlattrstart(XMLParser *, const char *, size_t, const char *,
110 size_t);
111 static void xmldata(XMLParser *, const char *, size_t);
112 static void xmldataentity(XMLParser *, const char *, size_t);
113 static void xmltagend(XMLParser *, const char *, size_t, int);
114 static void xmltagstart(XMLParser *, const char *, size_t);
115 static void xmltagstartparsed(XMLParser *, const char *, size_t, int);
116
117 /* Atom, must be alphabetical order */
118 static const FeedTag atomtags[] = {
119 { STRP("author"), AtomTagAuthor },
120 { STRP("id"), AtomTagId },
121 /* Atom: <link href="" />, RSS has <link></link> */
122 { STRP("link"), AtomTagLink },
123 { STRP("media:description"), AtomTagMediaDescription },
124 { STRP("published"), AtomTagPublished },
125 { STRP("title"), AtomTagTitle },
126 { STRP("yt:videoId"), TagYoutubeVideoId }
127 };
128
129 /* special case: nested <author><name> */
130 static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
131 static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName };
132
133 /* reference to no / unknown tag */
134 static const FeedTag notag = { STRP(""), TagUnknown };
135
136 /* map TagId type to RSS/Atom field, all tags must be defined */
137 static const int fieldmap[TagLast] = {
138 [TagUnknown] = -1,
139 /* Atom */
140 [AtomTagPublished] = FeedFieldTime,
141 [AtomTagTitle] = FeedFieldTitle,
142 [AtomTagMediaDescription] = FeedFieldContent,
143 [AtomTagId] = FeedFieldId,
144 [AtomTagLink] = -1,
145 [AtomTagLinkAlternate] = FeedFieldLink,
146 [AtomTagAuthor] = -1,
147 [AtomTagAuthorName] = FeedFieldAuthor,
148 [TagYoutubeVideoId] = FeedFieldYoutubeId
149 };
150
151 static const int FieldSeparator = '\t';
152
153 static FeedContext ctx;
154 static XMLParser parser; /* XML parser state */
155 static String attrrel, tmpstr;
156
157 static struct search_response *search_res = NULL;
158 static void (*printfields)(void) = sfeed_item;
159 static int cgimode = 0, godmode = 0;
160 static const char *server_name = "127.0.0.1", *server_port = "70";
161
162 static int
163 tagcmp(const void *v1, const void *v2)
164 {
165 return strcasecmp(((FeedTag *)v1)->name, ((FeedTag *)v2)->name);
166 }
167
168 /* Unique tagid for parsed tag name. */
169 static FeedTag *
170 gettag(enum FeedType feedtype, const char *name, size_t namelen)
171 {
172 FeedTag f, *r = NULL;
173
174 f.name = (char *)name;
175
176 switch (feedtype) {
177 case FeedTypeAtom:
178 r = bsearch(&f, atomtags, sizeof(atomtags) / sizeof(atomtags[0]),
179 sizeof(atomtags[0]), tagcmp);
180 break;
181 default:
182 break;
183 }
184
185 return r;
186 }
187
188 /* Clear string only; don't free, prevents unnecessary reallocation. */
189 static void
190 string_clear(String *s)
191 {
192 if (s->data)
193 s->data[0] = '\0';
194 s->len = 0;
195 }
196
197 static void
198 string_buffer_realloc(String *s, size_t newlen)
199 {
200 size_t alloclen;
201
202 if (newlen > SIZE_MAX / 2) {
203 alloclen = SIZE_MAX;
204 } else {
205 for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
206 ;
207 }
208 if (!(s->data = realloc(s->data, alloclen)))
209 err(1, "realloc");
210 s->bufsiz = alloclen;
211 }
212
213 /* Append data to String, s->data and data may not overlap. */
214 static void
215 string_append(String *s, const char *data, size_t len)
216 {
217 if (!len)
218 return;
219
220 if (s->len >= SIZE_MAX - len) {
221 errno = ENOMEM;
222 err(1, "realloc");
223 }
224
225 /* check if allocation is necessary, never shrink the buffer. */
226 if (s->len + len >= s->bufsiz)
227 string_buffer_realloc(s, s->len + len + 1);
228 memcpy(s->data + s->len, data, len);
229 s->len += len;
230 s->data[s->len] = '\0';
231 }
232
233 /* Print text, encode TABs, newlines and '\', remove other whitespace.
234 * Remove leading and trailing whitespace. */
235 static void
236 string_print_encoded(String *s)
237 {
238 const char *p, *e;
239
240 if (!s->data || !s->len)
241 return;
242
243 p = s->data;
244 e = p + strlen(p);
245
246 for (; *p && p != e; p++) {
247 switch (*p) {
248 case '\n': putchar('\\'); putchar('n'); break;
249 case '\\': putchar('\\'); putchar('\\'); break;
250 case '\t': putchar('\\'); putchar('t'); break;
251 default:
252 /* ignore control chars */
253 if (!ISCNTRL((unsigned char)*p))
254 putchar(*p);
255 break;
256 }
257 }
258 }
259
260 /* Print text, replace TABs, carriage return and other whitespace with ' '.
261 * Other control chars are removed. Remove leading and trailing whitespace. */
262 static void
263 string_print(String *s)
264 {
265 char *p, *e;
266
267 if (!s->data || !s->len)
268 return;
269
270 p = s->data;
271 e = p + s->len;
272 for (; *p && p != e; p++) {
273 if (ISSPACE((unsigned char)*p))
274 putchar(' '); /* any whitespace to space */
275 else if (!ISCNTRL((unsigned char)*p))
276 /* ignore other control chars */
277 putchar(*p);
278 }
279 }
280
281 /* Print as UNIX timestamp, print nothing if the time is empty or invalid. */
282 static void
283 string_print_timestamp(String *s)
284 {
285 long long t;
286
287 if (!s->data || !s->len)
288 return;
289
290 if (parsetime(s->data, &t) != -1)
291 printf("%lld", t);
292 }
293
/* Convert broken-down time fields to a UNIX timestamp (signed, at least
   64-bit). The fields use the conventions of struct tm:
   year = year - 1900 and mon = month - 1 (0 up to and including 11). */
static long long
datetounix(long long year, int mon, int day, int hour, int min, int sec)
{
	/* seconds before the start of each month in a regular (non-leap) year */
	static const long secs_through_month[] = {
		0, 31 * 86400, 59 * 86400, 90 * 86400,
		120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
		243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
	int leapyear = 0, leapdays = 0, centuries = 0, cycles, rem;
	long long t;

	if (year - 2ULL <= 136) {
		/* fast path for the common range 1902 up to and including 2038:
		   a leap day every 4 years, counted relative to 1970 */
		leapdays = (year - 68) >> 2;
		if (((year - 68) & 3) == 0) {
			/* the leap day of this year is added per month below */
			leapyear = 1;
			leapdays--;
		}
		t = 31536000 * (year - 70) + (86400 * leapdays); /* 365 * 86400 = 31536000 */
	} else {
		/* general calculation relative to the year 2000: a leap year
		   every 4 years, except every 100 years, unless the year is
		   divisible by 400 */
		cycles = (year - 100) / 400;
		rem = (year - 100) % 400;
		if (rem < 0) {
			cycles--;
			rem += 400;
		}
		if (rem == 0) {
			leapyear = 1;
		} else {
			/* split the remainder in whole centuries and years */
			centuries = rem / 100;
			rem -= centuries * 100;
			if (rem != 0) {
				leapdays = rem / 4;
				leapyear = (rem % 4 == 0);
			}
		}
		leapdays += (97 * cycles) + (24 * centuries) - leapyear;

		/* adjust 8 leap days from 1970 up to and including 2000:
		   ((30 * 365) + 8) * 86400 = 946771200 */
		t = ((year - 100) * 31536000LL) + (leapdays * 86400LL) + 946771200LL;
	}

	t += secs_through_month[mon];
	if (leapyear && mon >= 2)
		t += 86400; /* past February in a leap year */
	t += (86400LL * (day - 1)) + (3600LL * hour) + (60LL * min) + sec;

	return t;
}
360
/* Parse a numeric timezone offset ("+HH:MM", "-HHMM", ...) at the start of
 * `s` and return it in seconds from UTC. Anything that does not start with
 * '+' or '-', such as a timezone name, gives 0. */
static long
gettzoffset(const char *s)
{
	const char *p;
	long hours = 0, mins = 0, secs;
	int n;

	if (*s != '+' && *s != '-')
		return 0; /* timezone name */

	/* at most two digits for the hours */
	p = s + 1;
	for (n = 0; n < 2 && ISDIGIT((unsigned char)*p); n++, p++)
		hours = (hours * 10) + (*p - '0');

	/* optional separator */
	if (*p == ':')
		p++;

	/* at most two digits for the minutes */
	for (n = 0; n < 2 && ISDIGIT((unsigned char)*p); n++, p++)
		mins = (mins * 10) + (*p - '0');

	secs = (hours * 3600) + (mins * 60);

	return (*s == '-') ? -secs : secs;
}
384
/* Parse the time string `s` and store it as a UNIX timestamp in `tp`.
   Accepted formats are "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or
   "%Y%m%d%H%M%S", what follows is passed to gettzoffset() as the timezone.
   Returns 0 on success or -1 on failure. */
static int
parsetime(const char *s, long long *tp)
{
	/* year, month, day, hour, minute, second */
	int part[6] = { 0 };
	int idx, ndigits, maxdigits, val;

	/* must start with a 4-digit year */
	for (idx = 0; idx < 4; idx++) {
		if (!ISDIGIT((unsigned char)s[idx]))
			return -1;
	}

	for (idx = 0; idx < 6 && *s; idx++) {
		maxdigits = (idx == 0) ? 4 : 2;
		val = 0;
		for (ndigits = 0; ndigits < maxdigits &&
		     ISDIGIT((unsigned char)*s); ndigits++, s++)
			val = (val * 10) + (*s - '0');
		part[idx] = val;

		/* skip the separator that is allowed after this part */
		if (idx < 2) {
			if (*s == '-')
				s++;
		} else if (idx == 2) {
			if (*s == 'T' || ISSPACE((unsigned char)*s))
				s++;
		} else if (*s == ':') {
			s++;
		}
	}

	/* invalid range */
	if (part[0] < 0 || part[0] > 9999)
		return -1;
	if (part[1] < 1 || part[1] > 12)
		return -1;
	if (part[2] < 1 || part[2] > 31)
		return -1;
	if (part[3] < 0 || part[3] > 23)
		return -1;
	if (part[4] < 0 || part[4] > 59)
		return -1;
	if (part[5] < 0 || part[5] > 60) /* allow leap second */
		return -1;

	*tp = datetounix(part[0] - 1900, part[1] - 1, part[2],
	                 part[3], part[4], part[5]) - gettzoffset(s);

	return 0;
}
427
/* Print the XML declaration and the opening part of the Atom feed. */
static void
atom_header(void)
{
	fputs("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n", stdout);
	fputs("<feed xmlns=\"http://www.w3.org/2005/Atom\">\n", stdout);
	fputs("\t<title>Newsfeed</title>\n", stdout);
}
435
/* Print the closing tag of the Atom feed. */
static void
atom_footer(void)
{
	puts("</feed>");
}
441
442 static void
443 atom_item(void)
444 {
445 struct item *v, *found = NULL;
446 size_t i;
447
448 /* must have a video id */
449 if (!ctx.fields[FeedFieldYoutubeId].str.len)
450 return;
451
452 for (i = 0; i < search_res->nitems; i++) {
453 v = &(search_res->items[i]);
454 if (!strcmp(ctx.fields[FeedFieldYoutubeId].str.data, v->id))
455 found = v;
456 }
457 /* Only print the video if it was found in the feed aswell.
458 This way it filters away shorts too. */
459 if (!found)
460 return;
461
462 fputs("<entry>\n\t<title>", stdout);
463 if (found->membersonly)
464 xmlencode(MEMBERS_ONLY);
465 xmlencode(ctx.fields[FeedFieldTitle].str.data);
466 if (found->duration[0]) {
467 fputs(" [", stdout);
468 xmlencode(found->duration);
469 fputs("]", stdout);
470 }
471 fputs("</title>\n", stdout);
472 if (ctx.fields[FeedFieldLink].str.len) {
473 fputs("\t<link rel=\"alternate\" href=\"", stdout);
474 xmlencode(ctx.fields[FeedFieldLink].str.data);
475 fputs("\" />\n", stdout);
476 }
477 /* prefer link over id for Atom <id>. */
478 fputs("\t<id>", stdout);
479 if (ctx.fields[FeedFieldLink].str.len)
480 xmlencode(ctx.fields[FeedFieldLink].str.data);
481 else if (ctx.fields[FeedFieldId].str.len)
482 xmlencode(ctx.fields[FeedFieldId].str.data);
483 fputs("</id>\n", stdout);
484
485 /* just print the original timestamp, it should conform */
486 fputs("\t<updated>", stdout);
487 string_print(&ctx.fields[FeedFieldTime].str);
488 fputs("</updated>\n", stdout);
489
490 if (ctx.fields[FeedFieldAuthor].str.len) {
491 fputs("\t<author><name>", stdout);
492 xmlencode(ctx.fields[FeedFieldAuthor].str.data);
493 fputs("</name></author>\n", stdout);
494 }
495 if (ctx.fields[FeedFieldContent].str.len) {
496 fputs("\t<content>", stdout);
497 xmlencode(ctx.fields[FeedFieldContent].str.data);
498 fputs("</content>\n", stdout);
499 }
500 fputs("</entry>\n", stdout);
501 }
502
503
/* Print the start of the HTML page, up to and including the opening <pre>. */
static void
html_header(void)
{
	fputs("<!DOCTYPE HTML>\n", stdout);
	fputs("<html>\n", stdout);
	fputs("<head>\n", stdout);
	fputs("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\n", stdout);
	fputs("</head>\n", stdout);
	fputs("<body><pre>\n", stdout);
}
514
/* Print the closing tags of the HTML page. */
static void
html_footer(void)
{
	fputs("</pre></body>\n", stdout);
	fputs("</html>\n", stdout);
}
520
521 static void
522 html_item(void)
523 {
524 struct item *v, *found = NULL;
525 size_t i;
526
527 /* must have a video id */
528 if (!ctx.fields[FeedFieldYoutubeId].str.len)
529 return;
530
531 for (i = 0; i < search_res->nitems; i++) {
532 v = &(search_res->items[i]);
533 if (!strcmp(ctx.fields[FeedFieldYoutubeId].str.data, v->id))
534 found = v;
535 }
536 /* Only print the video if it was found in the feed aswell.
537 This way it filters away shorts too. */
538 if (!found)
539 return;
540
541 /* just print the original timestamp, it should conform */
542 xmlencode(ctx.fields[FeedFieldTime].str.data);
543 fputs(" ", stdout);
544
545 if (ctx.fields[FeedFieldLink].str.len) {
546 fputs("<a href=\"", stdout);
547 xmlencode(ctx.fields[FeedFieldLink].str.data);
548 fputs("\">", stdout);
549 }
550
551 if (found->membersonly)
552 xmlencode(MEMBERS_ONLY);
553 xmlencode(ctx.fields[FeedFieldTitle].str.data);
554 if (found->duration[0]) {
555 fputs(" [", stdout);
556 xmlencode(found->duration);
557 fputs("]", stdout);
558 }
559 if (ctx.fields[FeedFieldLink].str.len) {
560 fputs("</a>", stdout);
561 }
562 fputs("\n", stdout);
563 }
564
/* Write the NUL-terminated string `s` to stdout through gophertext(). */
static void
gphencode(const char *s)
{
	size_t len = strlen(s);

	gophertext(stdout, s, len);
}
570
/* The Gopher output has no header: intentionally prints nothing. */
static void
gph_header(void)
{
	return;
}
575
/* Print the line with a single '.' that ends the Gopher listing. */
static void
gph_footer(void)
{
	putchar('.');
	fputs("\r\n", stdout);
}
581
582 static void
583 gph_item(void)
584 {
585 struct item *v, *found = NULL;
586 size_t i;
587
588 /* must have a video id */
589 if (!ctx.fields[FeedFieldYoutubeId].str.len)
590 return;
591
592 for (i = 0; i < search_res->nitems; i++) {
593 v = &(search_res->items[i]);
594 if (!strcmp(ctx.fields[FeedFieldYoutubeId].str.data, v->id))
595 found = v;
596 }
597 /* Only print the video if it was found in the feed aswell.
598 This way it filters away shorts too. */
599 if (!found)
600 return;
601
602 fputs("h", stdout);
603 /* just print the original timestamp, it should conform */
604 gphencode(ctx.fields[FeedFieldTime].str.data);
605 fputs(" ", stdout);
606 if (found->membersonly)
607 gphencode(MEMBERS_ONLY);
608 gphencode(ctx.fields[FeedFieldTitle].str.data);
609 if (found->duration[0]) {
610 fputs(" [", stdout);
611 gphencode(found->duration);
612 fputs("]", stdout);
613 }
614 fputs("\t", stdout);
615 if (ctx.fields[FeedFieldLink].str.len) {
616 fputs("URL:", stdout);
617 gphencode(ctx.fields[FeedFieldLink].str.data);
618 }
619 printf("\t%s\t%s\r\n", server_name, server_port);
620 }
621
/* Print the start of the JSON Feed document, up to the opening of the
 * "items" array. */
static void
json_header(void)
{
	fputs("{\n", stdout);
	fputs("\"version\": \"https://jsonfeed.org/version/1.1\",\n", stdout);
	fputs("\"title\": \"Newsfeed\",\n", stdout);
	fputs("\"items\": [\n", stdout);
}
630
/* Close the "items" array and the JSON Feed document. */
static void
json_footer(void)
{
	fputs("]\n", stdout);
	fputs("}\n", stdout);
}
636
/* Print `s` to stdout as the contents of a JSON string: backslash and
 * double quote are escaped with a backslash, control characters are
 * written as \u00XX and every other byte is printed as is. */
static void
json_printfield(const char *s)
{
	for (; *s; s++) {
		switch (*s) {
		case '\\':
			fputs("\\\\", stdout);
			break;
		case '"':
			fputs("\\\"", stdout);
			break;
		default:
			if (ISCNTRL((unsigned char)*s))
				printf("\\u00%02x", (unsigned char)*s);
			else
				putchar(*s);
			break;
		}
	}
}
651
652 static void
653 json_item(void)
654 {
655 static int json_firstitem = 1;
656 struct item *v, *found = NULL;
657 size_t i;
658
659 /* must have a video id */
660 if (!ctx.fields[FeedFieldYoutubeId].str.len)
661 return;
662
663 for (i = 0; i < search_res->nitems; i++) {
664 v = &(search_res->items[i]);
665 if (!strcmp(ctx.fields[FeedFieldYoutubeId].str.data, v->id))
666 found = v;
667 }
668 /* Only print the video if it was found in the feed aswell.
669 This way it filters away shorts too. */
670 if (!found)
671 return;
672
673 if (!json_firstitem)
674 fputs(",\n", stdout);
675 json_firstitem = 0;
676
677 fputs("{\n\t\"id\": \"", stdout);
678 json_printfield(ctx.fields[FeedFieldId].str.data);
679 fputs("\"", stdout);
680
681 /* just print the original timestamp, it should conform */
682 fputs(",\n\t\"date_published\": \"", stdout);
683 string_print(&ctx.fields[FeedFieldTime].str);
684 fputs("\"", stdout);
685
686 fputs(",\n\t\"title\": \"", stdout);
687 if (found->membersonly)
688 json_printfield(MEMBERS_ONLY);
689 json_printfield(ctx.fields[FeedFieldTitle].str.data);
690 if (found->duration[0]) {
691 fputs(" [", stdout);
692 json_printfield(found->duration);
693 fputs("]", stdout);
694 }
695 fputs("\"", stdout);
696
697 if (ctx.fields[FeedFieldLink].str.len) {
698 fputs(",\n\t\"url\": \"", stdout);
699 json_printfield(ctx.fields[FeedFieldLink].str.data);
700 fputs("\"", stdout);
701 }
702
703 if (ctx.fields[FeedFieldAuthor].str.len) {
704 fputs(",\n\t\"authors\": [{\"name\": \"", stdout);
705 json_printfield(ctx.fields[FeedFieldAuthor].str.data);
706 fputs("\"}]", stdout);
707 }
708
709 fputs(",\n\t\"content_text\": \"", stdout);
710 json_printfield(ctx.fields[FeedFieldContent].str.data);
711 fputs("\"\n}", stdout);
712 }
713
714 static void
715 sfeed_item(void)
716 {
717 struct item *v, *found = NULL;
718 size_t i;
719
720 /* must have a video id */
721 if (!ctx.fields[FeedFieldYoutubeId].str.len)
722 return;
723
724 for (i = 0; i < search_res->nitems; i++) {
725 v = &(search_res->items[i]);
726 if (!strcmp(ctx.fields[FeedFieldYoutubeId].str.data, v->id))
727 found = v;
728 }
729 /* Only print the video if it was found in the feed aswell.
730 This way it filters away shorts too. */
731 if (!found)
732 return;
733
734 string_print_timestamp(&ctx.fields[FeedFieldTime].str);
735 putchar(FieldSeparator);
736 if (found->membersonly)
737 fputs(MEMBERS_ONLY, stdout);
738 string_print(&ctx.fields[FeedFieldTitle].str);
739 if (found->duration[0]) {
740 fputs(" [", stdout);
741 fputs(found->duration, stdout);
742 fputs("]", stdout);
743 }
744 putchar(FieldSeparator);
745 string_print(&ctx.fields[FeedFieldLink].str);
746 putchar(FieldSeparator);
747 string_print_encoded(&ctx.fields[FeedFieldContent].str);
748 putchar(FieldSeparator);
749 fputs("plain", stdout);
750 putchar(FieldSeparator);
751 string_print(&ctx.fields[FeedFieldId].str);
752 putchar(FieldSeparator);
753 string_print(&ctx.fields[FeedFieldAuthor].str);
754 putchar(FieldSeparator);
755 /* no/empty enclosure */
756 putchar(FieldSeparator);
757 /* empty category */
758 putchar('\n');
759 }
760
761 static void
762 twtxt_item(void)
763 {
764 struct item *v, *found = NULL;
765 size_t i;
766
767 /* must have a video id */
768 if (!ctx.fields[FeedFieldYoutubeId].str.len)
769 return;
770
771 for (i = 0; i < search_res->nitems; i++) {
772 v = &(search_res->items[i]);
773 if (!strcmp(ctx.fields[FeedFieldYoutubeId].str.data, v->id))
774 found = v;
775 }
776 /* Only print the video if it was found in the feed aswell.
777 This way it filters away shorts too. */
778 if (!found)
779 return;
780
781 string_print(&ctx.fields[FeedFieldTime].str);
782 putchar(FieldSeparator);
783 if (found->membersonly)
784 fputs(MEMBERS_ONLY, stdout);
785 string_print(&ctx.fields[FeedFieldTitle].str);
786 if (found->duration[0]) {
787 fputs(" [", stdout);
788 fputs(found->duration, stdout);
789 fputs("]", stdout);
790 }
791 fputs(": ", stdout);
792 string_print(&ctx.fields[FeedFieldLink].str);
793 putchar('\n');
794 }
795
/* Returns non-zero when the two tag names have the same length and are
 * equal, ignoring case. Both names must be NUL-terminated. */
static int
istag(const char *name, size_t len, const char *name2, size_t len2)
{
	if (len != len2)
		return 0;

	return strcasecmp(name, name2) == 0;
}
801
/* Returns non-zero when the two attribute names have the same length and
 * are equal, ignoring case. Both names must be NUL-terminated. */
static int
isattr(const char *name, size_t len, const char *name2, size_t len2)
{
	if (len != len2)
		return 0;

	return strcasecmp(name, name2) == 0;
}
807
808 static void
809 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
810 const char *v, size_t vl)
811 {
812 if (ISINCONTENT(ctx))
813 return;
814
815 if (!ctx.tag.id)
816 return;
817
818 if (ISCONTENTTAG(ctx))
819 return;
820
821 if (ctx.tag.id == AtomTagLink) {
822 if (isattr(n, nl, STRP("rel"))) {
823 string_append(&attrrel, v, vl);
824 } else if (isattr(n, nl, STRP("href"))) {
825 string_append(&tmpstr, v, vl);
826 }
827 }
828 }
829
830 static void
831 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
832 const char *data, size_t datalen)
833 {
834 char buf[8];
835 int len;
836
837 if (ISINCONTENT(ctx))
838 return;
839
840 if (!ctx.tag.id)
841 return;
842
843 /* try to translate entity, else just pass as data to
844 * xmlattr handler. */
845 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
846 xmlattr(p, t, tl, n, nl, buf, (size_t)len);
847 else
848 xmlattr(p, t, tl, n, nl, data, datalen);
849 }
850
851 static void
852 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
853 {
854 if (ISINCONTENT(ctx))
855 return;
856
857 if (attrrel.len && isattr(n, nl, STRP("rel")))
858 string_clear(&attrrel);
859 else if (tmpstr.len &&
860 (isattr(n, nl, STRP("href")) ||
861 isattr(n, nl, STRP("url"))))
862 string_clear(&tmpstr); /* use the last value for multiple attribute values */
863 }
864
865 static void
866 xmldata(XMLParser *p, const char *s, size_t len)
867 {
868 if (!ctx.field)
869 return;
870
871 string_append(ctx.field, s, len);
872 }
873
874 static void
875 xmldataentity(XMLParser *p, const char *data, size_t datalen)
876 {
877 char buf[8];
878 int len;
879
880 if (!ctx.field)
881 return;
882
883 /* try to translate entity, else just pass as data to
884 * xmldata handler. */
885 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
886 xmldata(p, buf, (size_t)len);
887 else
888 xmldata(p, data, datalen);
889 }
890
891 static void
892 xmltagstart(XMLParser *p, const char *t, size_t tl)
893 {
894 const FeedTag *f;
895
896 if (ISINCONTENT(ctx))
897 return;
898
899 /* start of RSS or Atom item / entry */
900 if (ctx.feedtype == FeedTypeNone) {
901 if (istag(t, tl, STRP("entry")))
902 ctx.feedtype = FeedTypeAtom;
903 return;
904 }
905
906 /* field tagid already set or nested tags. */
907 if (ctx.tag.id) {
908 /* nested <author><name> for Atom */
909 if (ctx.tag.id == AtomTagAuthor &&
910 istag(t, tl, STRP("name"))) {
911 memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag));
912 } else {
913 return; /* other nested tags are not allowed: return */
914 }
915 }
916
917 /* in item */
918 if (ctx.tag.id == TagUnknown) {
919 if (!(f = gettag(ctx.feedtype, t, tl)))
920 f = ¬ag;
921 memcpy(&(ctx.tag), f, sizeof(ctx.tag));
922 }
923
924 ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent);
925 string_clear(&attrrel);
926 }
927
928 static void
929 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
930 {
931 enum TagId tagid;
932
933 if (ISINCONTENT(ctx))
934 return;
935
936 /* set tag type based on its attribute value */
937 if (ctx.tag.id == AtomTagLink) {
938 /* empty or "alternate": other types could be
939 "enclosure", "related", "self" or "via" */
940 if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate")))
941 ctx.tag.id = AtomTagLinkAlternate;
942 else
943 ctx.tag.id = AtomTagLink; /* unknown */
944 }
945
946 tagid = ctx.tag.id;
947
948 /* map tag type to field: unknown or lesser priority is ignored,
949 when tags of the same type are repeated only the first is used. */
950 if (fieldmap[tagid] == -1 ||
951 tagid <= ctx.fields[fieldmap[tagid]].tagid) {
952 return;
953 }
954
955 if (ctx.iscontenttag) {
956 ctx.iscontent = 1;
957 ctx.iscontenttag = 0;
958 }
959
960 ctx.field = &(ctx.fields[fieldmap[tagid]].str);
961 ctx.fields[fieldmap[tagid]].tagid = tagid;
962
963 /* clear field if it is overwritten (with a priority order) for the new
964 value, if the field can have multiple values then do not clear it. */
965 string_clear(ctx.field);
966 }
967
968 static void
969 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
970 {
971 size_t i;
972
973 if (ctx.feedtype == FeedTypeNone)
974 return;
975
976 if (ISINCONTENT(ctx)) {
977 /* not a closed content field */
978 if (!istag(ctx.tag.name, ctx.tag.len, t, tl))
979 return;
980 } else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) {
981 /* matched tag end: close it */
982 } else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom &&
983 istag(t, tl, STRP("entry"))))) /* Atom */
984 {
985 /* end of Atom entry */
986 printfields();
987
988 /* clear strings */
989 for (i = 0; i < FeedFieldLast; i++) {
990 string_clear(&ctx.fields[i].str);
991 ctx.fields[i].tagid = TagUnknown;
992 }
993 /* allow parsing of Atom and RSS concatenated in one XML stream. */
994 ctx.feedtype = FeedTypeNone;
995 } else {
996 return; /* not end of field */
997 }
998
999 /* temporary string: for fields that cannot be processed
1000 directly and need more context, for example by its tag
1001 attributes, like the Atom link rel="alternate|enclosure". */
1002 if (tmpstr.len && ctx.field) {
1003 string_clear(ctx.field);
1004 string_append(ctx.field, tmpstr.data, tmpstr.len);
1005 }
1006
1007 /* close field */
1008 string_clear(&tmpstr); /* reuse and clear temporary string */
1009
1010 if (ctx.tag.id == AtomTagAuthorName)
1011 memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */
1012 else
1013 memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag));
1014
1015 ctx.iscontent = 0;
1016 ctx.field = NULL;
1017 }
1018
/* Request the videos feed of the channel `channelid` from www.youtube.com.
 * Returns the result of request(), or NULL when the request path does not
 * fit in the path buffer. */
static char *
request_channel_feed(const char *channelid)
{
	char path[2048];
	int n;

	n = snprintf(path, sizeof(path), "/feeds/videos.xml?channel_id=%s",
	             channelid);
	/* error or truncated: the request path is too long */
	if (n < 0 || (size_t)n >= sizeof(path))
		return NULL;

	return request("www.youtube.com", path, "");
}
1032
/* Returns non-zero when `s` looks like a channel id: exactly 24 characters,
 * each of them a letter, a digit, '-' or '_'. */
int
isvalidchannel(const char *s)
{
	const char *p;

	for (p = s; *p; p++) {
		if (!(ISALPHA((unsigned char)*p) ||
		      ISDIGIT((unsigned char)*p) ||
		      *p == '-' || *p == '_'))
			return 0; /* character is not allowed */
	}

	return (size_t)(p - s) == 24;
}
1048
1049 void
1050 usage(void)
1051 {
1052 const char *line1 = "Bad Request, path should be the channel id + file extension, for example: UCrbvoMC0zUvPL8vjswhLOSw.json";
1053 const char *line2 = "Supported extensions are: [atom|gph|html|json|tsv|txt]";
1054
1055 if (cgimode) {
1056 if (godmode) {
1057 printf("3%s\tErr\t%s\t%s\r\n", line1, server_name, server_port);
1058 printf("3%s\tErr\t%s\t%s\r\n", line2, server_name, server_port);
1059 } else {
1060 fputs("Status: 400 Bad Request\r\n", stdout);
1061 fputs("Content-Type: text/plain; charset=utf-8\r\n\r\n", stdout);
1062 printf("400 %s\n", line1);
1063 printf("\n%s", line2);
1064 }
1065 exit(0);
1066 } else {
1067 fputs("usage: feed <channelid> [atom|gph|html|json|tsv|txt]\n", stderr);
1068 fputs("For example: feed UCrbvoMC0zUvPL8vjswhLOSw txt\n", stderr);
1069 exit(1);
1070 }
1071 }
1072
1073 int
1074 main(int argc, char *argv[])
1075 {
1076 char buf[256];
1077 const char *channelid = NULL;
1078 char *data, *format = "tsv", *p, *path = NULL, *tmp;
1079 size_t i;
1080
1081 if (pledge("stdio dns inet rpath unveil", NULL) == -1)
1082 err(1, "pledge");
1083
1084 if ((tmp = getenv("REQUEST_URI")))
1085 path = tmp;
1086 else if ((tmp = getenv("REQUEST")))
1087 path = tmp;
1088
1089 if (path) {
1090 cgimode = 1;
1091
1092 if ((tmp = getenv("SERVER_NAME")))
1093 server_name = tmp;
1094 if ((tmp = getenv("SERVER_PORT")))
1095 server_port = tmp;
1096 if ((tmp = getenv("SERVER_PROTOCOL")) && strstr(tmp, "gopher"))
1097 godmode = 1;
1098
1099 strlcpy(buf, path, sizeof(buf));
1100 path = buf;
1101
1102 if (!(p = strrchr(path, '/')))
1103 usage();
1104
1105 channelid = p + 1;
1106 if ((p = strrchr(channelid, '.'))) {
1107 *p = '\0'; /* NULL terminate */
1108 format = p + 1;
1109 }
1110 } else {
1111 if (argc <= 1)
1112 usage();
1113
1114 channelid = argv[1];
1115 if (argc > 2)
1116 format = argv[2];
1117 }
1118 if (!channelid || !isvalidchannel(channelid))
1119 usage();
1120
1121 if (!strcmp(format, "atom") || !strcmp(format, "xml"))
1122 printfields = atom_item;
1123 else if (!strcmp(format, "gph"))
1124 printfields = gph_item;
1125 else if (!strcmp(format, "html"))
1126 printfields = html_item;
1127 else if (!strcmp(format, "json"))
1128 printfields = json_item;
1129 else if (!strcmp(format, "tsv") || !strcmp(format, "sfeed"))
1130 printfields = sfeed_item;
1131 else if (!strcmp(format, "txt") || !strcmp(format, "twtxt"))
1132 printfields = twtxt_item;
1133 else
1134 usage();
1135
1136 search_res = youtube_channel_videos(channelid);
1137 if (!search_res || search_res->nitems == 0) {
1138 /* error or no videos found */
1139 return 0;
1140 }
1141
1142 if (!(data = request_channel_feed(channelid)))
1143 return 1; /* error, no data at all */
1144
1145 if (pledge("stdio", NULL) == -1)
1146 err(1, "pledge");
1147
1148 setxmldata(data, strlen(data));
1149
1150 memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag));
1151
1152 parser.xmlattr = xmlattr;
1153 parser.xmlattrentity = xmlattrentity;
1154 parser.xmlattrstart = xmlattrstart;
1155 parser.xmlcdata = xmldata;
1156 parser.xmldata = xmldata;
1157 parser.xmldataentity = xmldataentity;
1158 parser.xmltagend = xmltagend;
1159 parser.xmltagstart = xmltagstart;
1160 parser.xmltagstartparsed = xmltagstartparsed;
1161
1162 /* init all fields, make sure it has a value */
1163 for (i = 0; i < FeedFieldLast; i++) {
1164 string_append(&(ctx.fields[i].str), " ", 1);
1165 string_clear(&(ctx.fields[i].str));
1166 }
1167
1168 if (cgimode && !godmode) {
1169 fputs("Status: 200 OK\r\n", stdout);
1170 if (!strcmp(format, "atom") || !strcmp(format, "xml"))
1171 fputs("Content-Type: text/xml; charset=utf-8\r\n\r\n", stdout);
1172 else if (!strcmp(format, "html"))
1173 fputs("Content-Type: text/html; charset=utf-8\r\n\r\n", stdout);
1174 else if (!strcmp(format, "json"))
1175 fputs("Content-Type: application/json; charset=utf-8\r\n\r\n", stdout);
1176 else
1177 fputs("Content-Type: text/plain; charset=utf-8\r\n\r\n", stdout);
1178 }
1179
1180 if (!strcmp(format, "atom") || !strcmp(format, "xml"))
1181 atom_header();
1182 else if (!strcmp(format, "gph"))
1183 gph_header();
1184 else if (!strcmp(format, "html"))
1185 html_header();
1186 else if (!strcmp(format, "json"))
1187 json_header();
1188
1189 /* NOTE: getnext is defined in xml.h for inline optimization */
1190 xml_parse(&parser);
1191
1192 if (!strcmp(format, "atom") || !strcmp(format, "xml"))
1193 atom_footer();
1194 else if (!strcmp(format, "gph"))
1195 gph_footer();
1196 else if (!strcmp(format, "html"))
1197 html_footer();
1198 else if (!strcmp(format, "json"))
1199 json_footer();
1200
1201 return 0;
1202 }