URI:
       thtml.c - plan9port - [fork] Plan 9 from user space
  HTML git clone git://src.adamsgaard.dk/plan9port
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       thtml.c (5965B)
       ---
            1 #include <u.h>
            2 #include <libc.h>
            3 #include <bio.h>
            4 #include <draw.h>
            5 #include <regexp.h>
            6 #include <html.h>
            7 #include <ctype.h>
            8 #include "dat.h"
            9 
           10 char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
           11 Reprog        *urlprog;
           12 
           13 int inword = 0;
           14 int col = 0;
           15 int wordi = 0;
           16 
           17 char*
           18 loadhtml(int fd)
           19 {
           20         URLwin *u;
           21         Bytes *b;
           22         int n;
           23         char buf[4096];
           24 
           25         u = emalloc(sizeof(URLwin));
           26         u->infd = fd;
           27         u->outfd = 1;
           28         u->url = estrdup(url);
           29         u->type = TextHtml;
           30 
           31         b = emalloc(sizeof(Bytes));
           32         while((n = read(fd, buf, sizeof buf)) > 0)
           33                 growbytes(b, buf, n);
           34         if(b->b == nil)
           35                 return nil;        /* empty file */
           36         rendertext(u, b);
           37         freeurlwin(u);
           38         return nil;
           39 }
           40 
           41 char*
           42 runetobyte(Rune *r, int n)
           43 {
           44         char *s;
           45 
           46         if(n == 0)
           47                 return emalloc(1);
           48         s = smprint("%.*S", n, r);
           49         if(s == nil)
           50                 error("malloc failed");
           51         return s;
           52 }
           53 
           54 int
           55 closingpunct(int c)
           56 {
           57         return strchr(".,:;'\")]}>!?", c) != nil;
           58 }
           59 
           60 void
           61 emitword(Bytes *b, Rune *r, int nr)
           62 {
           63         char *s;
           64         int space;
           65 
           66         if(nr == 0)
           67                 return;
           68         s = smprint("%.*S", nr, r);
           69         space = (b->n>0) && !isspace(b->b[b->n-1]) && !closingpunct(r[0]);
           70         if(col>0 && col+space+nr > width){
           71                 growbytes(b, "\n", 1);
           72                 space = 0;
           73                 col = 0;
           74         }
           75         if(space && col>0){
           76                 growbytes(b, " ", 1);
           77                 col++;
           78         }
           79         growbytes(b, s, strlen(s));
           80         col += nr;
           81         free(s);
           82         inword = 0;
           83 }
           84 
           85 void
           86 renderrunes(Bytes *b, Rune *r)
           87 {
           88         int i, n;
           89 
           90         n = runestrlen(r);
           91         for(i=0; i<n; i++){
           92                 switch(r[i]){
           93                 case '\n':
           94                         if(inword)
           95                                 emitword(b, r+wordi, i-wordi);
           96                         col = 0;
           97                         if(b->n == 0)
           98                                 break;        /* don't start with blank lines */
           99                         if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n')
          100                                 growbytes(b, "\n", 1);
          101                         break;
          102                 case ' ':
          103                         if(inword)
          104                                 emitword(b, r+wordi, i-wordi);
          105                         break;
          106                 default:
          107                         if(!inword)
          108                                 wordi = i;
          109                         inword = 1;
          110                         break;
          111                 }
          112         }
          113         if(inword)
          114                 emitword(b, r+wordi, i-wordi);
          115 }
          116 
          117 void
          118 renderbytes(Bytes *b, char *fmt, ...)
          119 {
          120         Rune *r;
          121         va_list arg;
          122 
          123         va_start(arg, fmt);
          124         r = runevsmprint(fmt, arg);
          125         va_end(arg);
          126         renderrunes(b, r);
          127         free(r);
          128 }
          129 
          130 char*
          131 baseurl(char *url)
          132 {
          133         char *base, *slash;
          134         Resub rs[10];
          135 
          136         if(url == nil)
          137                 return nil;
          138         if(urlprog == nil){
          139                 urlprog = regcomp(urlexpr);
          140                 if(urlprog == nil)
          141                         error("can't compile URL regexp");
          142         }
          143         memset(rs, 0, sizeof rs);
          144         if(regexec(urlprog, url, rs, nelem(rs)) == 0)
          145                 return nil;
          146         base = estrdup(url);
          147         slash = strrchr(base, '/');
          148         if(slash!=nil && slash>=&base[rs[0].e.ep-rs[0].s.sp])
          149                 *slash = '\0';
          150         else
          151                 base[rs[0].e.ep-rs[0].s.sp] = '\0';
          152         return base;
          153 }
          154 
          155 char*
          156 fullurl(URLwin *u, Rune *rhref)
          157 {
          158         char *base, *href, *hrefbase;
          159         char *result;
          160 
          161         if(rhref == nil)
          162                 return estrdup("NULL URL");
          163         href = runetobyte(rhref, runestrlen(rhref));
          164         hrefbase = baseurl(href);
          165         result = nil;
          166         if(hrefbase==nil && (base = baseurl(u->url))!=nil){
          167                 result = estrdup(base);
          168                 if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
          169                         result = eappend(result, "/", "");
          170                 free(base);
          171         }
          172         if(href){
          173                 if(result)
          174                         result = eappend(result, "", href);
          175                 else
          176                         result = estrdup(href);
          177         }
          178         free(hrefbase);
          179         if(result == nil)
          180                 return estrdup("***unknown***");
          181         return result;
          182 }
          183 
          184 void
          185 render(URLwin *u, Bytes *t, Item *items, int curanchor)
          186 {
          187         Item *il;
          188         Itext *it;
          189         Ifloat *ifl;
          190         Ispacer *is;
          191         Itable *ita;
          192         Iimage *im;
          193         Anchor *a;
          194         Table *tab;
          195         Tablecell *cell;
          196         char *href;
          197 
          198         inword = 0;
          199         col = 0;
          200         wordi = 0;
          201 
          202         for(il=items; il!=nil; il=il->next){
          203                 if(il->state & IFbrk)
          204                         renderbytes(t, "\n");
          205                 if(il->state & IFbrksp)
          206                         renderbytes(t, "\n");
          207 
          208                 switch(il->tag){
          209                 case Itexttag:
          210                         it = (Itext*)il;
          211                         renderrunes(t, it->s);
          212                         break;
          213                 case Iruletag:
          214                         if(t->n>0 && t->b[t->n-1]!='\n')
          215                                 renderbytes(t, "\n");
          216                         renderbytes(t, "=======\n");
          217                         break;
          218                 case Iimagetag:
          219                         if(!aflag)
          220                                 break;
          221                         im = (Iimage*)il;
          222                         if(im->imsrc){
          223                                 href = fullurl(u, im->imsrc);
          224                                 renderbytes(t, "[image %s]", href);
          225                                 free(href);
          226                         }
          227                         break;
          228                 case Iformfieldtag:
          229                         if(aflag)
          230                                 renderbytes(t, "[formfield]");
          231                         break;
          232                 case Itabletag:
          233                         ita = (Itable*)il;
          234                         tab = ita->table;
          235                         for(cell=tab->cells; cell!=nil; cell=cell->next){
          236                                 render(u, t, cell->content, curanchor);
          237                         }
          238                         if(t->n>0 && t->b[t->n-1]!='\n')
          239                                 renderbytes(t, "\n");
          240                         break;
          241                 case Ifloattag:
          242                         ifl = (Ifloat*)il;
          243                         render(u, t, ifl->item, curanchor);
          244                         break;
          245                 case Ispacertag:
          246                         is = (Ispacer*)il;
          247                         if(is->spkind != ISPnull)
          248                                 renderbytes(t, " ");
          249                         break;
          250                 default:
          251                         error("unknown item tag %d\n", il->tag);
          252                 }
          253                 if(il->anchorid != 0 && il->anchorid!=curanchor){
          254                         for(a=u->docinfo->anchors; a!=nil; a=a->next)
          255                                 if(aflag && a->index == il->anchorid){
          256                                         href = fullurl(u, a->href);
          257                                         renderbytes(t, "[%s]", href);
          258                                         free(href);
          259                                         break;
          260                                 }
          261                         curanchor = il->anchorid;
          262                 }
          263         }
          264         if(t->n>0 && t->b[t->n-1]!='\n')
          265                 renderbytes(t, "\n");
          266 }
          267 
          268 void
          269 rerender(URLwin *u)
          270 {
          271         Bytes *t;
          272 
          273         t = emalloc(sizeof(Bytes));
          274 
          275         render(u, t, u->items, 0);
          276 
          277         if(t->n)
          278                 write(u->outfd, (char*)t->b, t->n);
          279         free(t->b);
          280         free(t);
          281 }
          282 
          283 /*
          284  * Somewhat of a hack.  Not a full parse, just looks for strings in the beginning
          285  * of the document (cistrstr only looks at first somewhat bytes).
          286  */
          287 int
          288 charset(char *s)
          289 {
          290         char *meta, *emeta, *charset;
          291 
          292         if(defcharset == 0)
          293                 defcharset = ISO_8859_1;
          294         meta = cistrstr(s, "<meta");
          295         if(meta == nil)
          296                 return defcharset;
          297         for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++)
          298                 ;
          299         charset = cistrstr(s, "charset=");
          300         if(charset == nil)
          301                 return defcharset;
          302         charset += 8;
          303         if(*charset == '"')
          304                 charset++;
          305         if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4))
          306                 return UTF_8;
          307         return defcharset;
          308 }
          309 
          310 void
          311 rendertext(URLwin *u, Bytes *b)
          312 {
          313         Rune *rurl;
          314 
          315         rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
          316         u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo);
          317 /*        free(rurl); */
          318 
          319         rerender(u);
          320 }
          321 
          322 
          323 void
          324 freeurlwin(URLwin *u)
          325 {
          326         freeitems(u->items);
          327         u->items = nil;
          328         freedocinfo(u->docinfo);
          329         u->docinfo = nil;
          330         free(u);
          331 }