URI:
       lex.c - bmf - bmf (Bayesian Mail Filter) 0.9.4 fork + patches
  HTML git clone git://git.codemadness.org/bmf
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       lex.c (11934B)
       ---
            1 /* $Id: lex.c,v 1.18 2002/10/20 20:29:15 tommy Exp $ */
            2 
            3 /*
            4  * Copyright (c) 2002 Tom Marshall <tommy@tig-grr.com>
            5  *
            6  * This program is free software.  It may be distributed under the terms
            7  * in the file LICENSE, found in the top level of the distribution.
            8  *
            9  * lex.c: generate token stream for bmf.
           10  */
           11 
           12 #include "config.h"
           13 #include "dbg.h"
           14 #include "str.h"
           15 #include "lex.h"
           16 
           17 static cpchar g_htmltags[] =
           18 {
           19         "abbr",
           20         "above",
           21         "accesskey",
           22         "acronym",
           23         "align",
           24         "alink",
           25         "all",
           26         "alt",
           27         "applet",
           28         "archive",
           29         "axis",
           30         "basefont",
           31         "baseline",
           32         "below",
           33         "bgcolor",
           34         "big",
           35         "body",
           36         "border",
           37         "bottom",
           38         "box",
           39         "button",
           40         "cellpadding",
           41         "cellspacing",
           42         "center",
           43         "char",
           44         "charoff",
           45         "charset",
           46         "circle",
           47         "cite",
           48         "class",
           49         "classid",
           50         "clear",
           51         "codebase",
           52         "codetype",
           53         "color",
           54         "cols",
           55         "colspan",
           56         "compact",
           57         "content",
           58         "coords",
           59         "data",
           60         "datetime",
           61         "declare",
           62         "default",
           63         "defer",
           64         "dfn",
           65         "dir",
           66         "disabled",
           67         "face",
           68         "font",
           69         "frameborder",
           70         "groups",
           71         "head",
           72         "headers",
           73         "height",
           74         "href",
           75         "hreflang",
           76         "hsides",
           77         "hspace",
           78         "http-equiv",
           79         "iframe",
           80         "img",
           81         "input",
           82         "ismap",
           83         "justify",
           84         "kbd",
           85         "label",
           86         "lang",
           87         "language",
           88         "left",
           89         "lhs",
           90         "link",
           91         "longdesc",
           92         "map",
           93         "marginheight",
           94         "marginwidth",
           95         "media",
           96         "meta",
           97         "middle",
           98         "multiple",
           99         "name",
          100         "nohref",
          101         "none",
          102         "noresize",
          103         "noshade",
          104         "nowrap",
          105         "object",
          106         "onblur",
          107         "onchange",
          108         "onclick",
          109         "ondblclick",
          110         "onfocus",
          111         "onkeydown",
          112         "onkeypress",
          113         "onkeyup",
          114         "onload",
          115         "onmousedown",
          116         "onmousemove",
          117         "onmouseout",
          118         "onmouseover",
          119         "onmouseup",
          120         "onselect",
          121         "onunload",
          122         "param",
          123         "poly",
          124         "profile",
          125         "prompt",
          126         "readonly",
          127         "rect",
          128         "rel",
          129         "rev",
          130         "rhs",
          131         "right",
          132         "rows",
          133         "rowspan",
          134         "rules",
          135         "samp",
          136         "scheme",
          137         "scope",
          138         "script",
          139         "scrolling",
          140         "select",
          141         "selected",
          142         "shape",
          143         "size",
          144         "small",
          145         "span",
          146         "src",
          147         "standby",
          148         "strike",
          149         "strong",
          150         "style",
          151         "sub",
          152         "summary",
          153         "sup",
          154         "tabindex",
          155         "table",
          156         "target",
          157         "textarea",
          158         "title",
          159         "top",
          160         "type",
          161         "usemap",
          162         "valign",
          163         "value",
          164         "valuetype",
          165         "var",
          166         "vlink",
          167         "void",
          168         "vsides",
          169         "vspace",
          170         "width"
          171 };
          172 static const uint g_nhtmltags = sizeof(g_htmltags) / sizeof(cpchar);
          173 
          174 static cpchar g_ignoredheaders[] =
          175 {
          176         "Date:",
          177         "Delivery-date:",
          178         "Message-ID:",
          179         "X-Sorted:",
          180         "X-Spam-"
          181 };
          182 static const uint g_nignoredheaders = sizeof(g_ignoredheaders) / sizeof(cpchar);
          183 
          184 static inline bool_t
          185 is_whitespace(int c)
          186 {
          187         return (c == ' ' || c == '\t' || c == '\r');
          188 }
          189 
          190 static inline bool_t
          191 is_wordmidchar(int c)
          192 {
          193         return (isalnum(c) || c == '$' || c == '\'' || c == '.' || c == '-');
          194 }
          195 
          196 static inline bool_t
          197 is_wordendchar(int c)
          198 {
          199         return (isalnum(c) || c == '$');
          200 }
          201 
          202 static inline bool_t
          203 is_htmltag(cpchar p, uint len, uint * ptoklen)
          204 {
          205         int lo, hi, mid, minlen, cmp;
          206 
          207         *ptoklen = 0;
          208 
          209         hi = g_nhtmltags - 1;
          210         lo = -1;
          211         while (hi - lo > 1) {
          212                 mid = (hi + lo) / 2;
          213                 minlen = min(strlen(g_htmltags[mid]), len);
          214                 cmp = strncmp(g_htmltags[mid], p, minlen);
          215                 if (cmp > 0 || (cmp == 0 && minlen < len && !islower(p[minlen])))
          216                         hi = mid;
          217                 else
          218                         lo = mid;
          219         }
          220         minlen = min(strlen(g_htmltags[hi]), len);
          221         if (len == minlen || strncmp(g_htmltags[hi], p, minlen) != 0) {
          222                 return false;
          223         }
          224         /* check if is_word() will have a longer match */
          225         if (is_wordendchar((unsigned char)p[minlen])) {
          226                 return false;
          227         }
          228         if (is_wordmidchar((unsigned char)p[minlen]) &&
          229             is_wordendchar((unsigned char)p[minlen + 1])) {
          230                 return false;
          231         }
          232         *ptoklen = strlen(g_htmltags[hi]);
          233 
          234         return true;
          235 }
          236 
          237 static inline bool_t
          238 is_htmlcomment(cpchar p, uint len, uint * ptoklen)
          239 {
          240         *ptoklen = 0;
          241 
          242         if (len >= 4 && memcmp(p, "<!--", 4) == 0) {
          243                 *ptoklen = 4;
          244                 return true;
          245         }
          246         if (len >= 3 && memcmp(p, "-->", 3) == 0) {
          247                 *ptoklen = 3;
          248                 return true;
          249         }
          250         return false;
          251 }
          252 
          253 static inline bool_t
          254 is_base64char(int c)
          255 {
          256         return (isalnum(c) || (c == '/' || c == '+'));
          257 }
          258 
          259 static inline bool_t
          260 is_base64(cpchar p, uint len, uint * ptoklen)
          261 {
          262         *ptoklen = 0;
          263         while (len > 0) {
          264                 if (*p != '\n' && *p != '\r' && !is_base64char((unsigned char)*p)) {
          265                         return false;
          266                 }
          267                 p++;
          268                 len--;
          269                 (*ptoklen)++;
          270         }
          271         return true;
          272 }
          273 
          274 static inline bool_t
          275 is_mimeboundary(cpchar p, uint len, uint * ptoklen)
          276 {
          277         *ptoklen = 0;
          278 
          279         if (len < 3 || p[0] != '-' || p[1] != '-') {
          280                 return false;
          281         }
          282         p += 2;
          283         len -= 2;
          284         *ptoklen += 2;
          285         while (len > 0) {
          286                 if (is_whitespace(*p)) {
          287                         return false;
          288                 }
          289                 if (*p == '\n' || *p == '\r') {
          290                         break;
          291                 }
          292                 p++;
          293                 len--;
          294                 (*ptoklen)++;
          295         }
          296         return true;
          297 }
          298 
          299 static inline bool_t
          300 is_ipaddr(cpchar p, uint len, uint * ptoklen)
          301 {
          302         uint noctets, ndigits;
          303 
          304         *ptoklen = 0;
          305 
          306         noctets = 0;
          307         while (len > 0 && noctets < 4) {
          308                 ndigits = 0;
          309                 while (len > 0 && isdigit((unsigned char)*p)) {
          310                         ndigits++;
          311                         p++;
          312                         len--;
          313                         (*ptoklen)++;
          314                 }
          315                 if (ndigits == 0 || ndigits > 3) {
          316                         return false;
          317                 }
          318                 noctets++;
          319                 if (noctets < 4) {
          320                         if (*p != '.') {
          321                                 return false;
          322                         }
          323                         p++;
          324                         len--;
          325                         (*ptoklen)++;
          326                 }
          327         }
          328         if (noctets < 4) {
          329                 return false;
          330         }
          331         return true;
          332 }
          333 
          334 static inline bool_t
          335 is_word(cpchar p, uint len, uint * ptoklen)
          336 {
          337         if (len < 3) {
          338                 return false;
          339         }
          340         if (!(isalpha((unsigned char)*p) || *p == '$')) {
          341                 return false;
          342         }
          343         *ptoklen = 1;
          344         p++;
          345         len--;
          346         while (len > 0) {
          347                 if (!is_wordmidchar((unsigned char)*p)) {
          348                         break;
          349                 }
          350                 (*ptoklen)++;
          351                 p++;
          352                 len--;
          353         }
          354         while (*ptoklen >= 3 && !is_wordendchar((unsigned char)*(p - 1))) {
          355                 (*ptoklen)--;
          356                 p--;
          357                 len++;
          358         }
          359         if (*ptoklen < 3) {
          360                 return false;
          361         }
          362         return true;
          363 }
          364 
          365 static inline bool_t
          366 is_ignoredheader(cpchar p, uint len, uint * ptoklen)
          367 {
          368         int lo, hi, mid, minlen, cmp;
          369 
          370         hi = g_nignoredheaders - 1;
          371         lo = -1;
          372         while (hi - lo > 1) {
          373                 mid = (hi + lo) / 2;
          374                 minlen = min(strlen(g_ignoredheaders[mid]), len);
          375                 cmp = strncasecmp(g_ignoredheaders[mid], p, minlen);
          376                 if (cmp >= 0)
          377                         hi = mid;
          378                 else
          379                         lo = mid;
          380         }
          381         minlen = min(strlen(g_ignoredheaders[hi]), len);
          382         if (len == minlen || strncasecmp(g_ignoredheaders[hi], p, minlen) != 0) {
          383                 return false;
          384         }
          385         *ptoklen = len;
          386         return true;
          387 }
          388 
          389 static inline bool_t
          390 is_mailerid(cpchar p, uint len, uint * ptoklen)
          391 {
          392         if (len < 4 || strncmp(p, "\tid ", 4) != 0) {
          393                 return false;
          394         }
          395         *ptoklen = len;
          396         return true;
          397 }
          398 
          399 static inline bool_t
          400 is_spamtext(cpchar p, uint len, uint * ptoklen)
          401 {
          402         if (len < 5 || strncmp(p, "SPAM:", 5) != 0) {
          403                 return false;
          404         }
          405         *ptoklen = len;
          406         return true;
          407 }
          408 
          409 static inline bool_t
          410 is_smtpid(cpchar p, uint len, uint * ptoklen)
          411 {
          412         if (len < 8 || strncmp(p, "SMTP id ", 8) != 0) {
          413                 return false;
          414         }
          415         *ptoklen = len;
          416         return true;
          417 }
          418 
          419 static inline bool_t
          420 is_boundaryequal(cpchar p, uint len, uint * ptoklen)
          421 {
          422         if (len < 9 || strncmp(p, "boundary=", 9) != 0) {
          423                 return false;
          424         }
          425         *ptoklen = len;
          426         return true;
          427 }
          428 
          429 static inline bool_t
          430 is_nameequal(cpchar p, uint len, uint * ptoklen)
          431 {
          432         if (len < 6 || strncmp(p, "name=\"", 6) != 0) {
          433                 return false;
          434         }
          435         *ptoklen = 6;
          436         return true;
          437 }
          438 
          439 static inline bool_t
          440 is_filenameequal(cpchar p, uint len, uint * ptoklen)
          441 {
          442         if (len < 10 || strncmp(p, "filename=\"", 10) != 0) {
          443                 return false;
          444         }
          445         *ptoklen = 10;
          446         return true;
          447 }
          448 
          449 static inline bool_t
          450 is_from(cpchar p, uint len, uint * ptoklen)
          451 {
          452         if (len < 5 || strncmp(p, "From ", 5) != 0) {
          453                 return false;
          454         }
          455         *ptoklen = 5;
          456         return true;
          457 }
          458 
          459 void
          460 lex_create(lex_t * pthis, mbox_t mboxtype)
          461 {
          462         pthis->mboxtype = mboxtype;
          463         pthis->section = envelope;
          464         pthis->pos = 0;
          465         pthis->bom = 0;
          466         pthis->eom = 0;
          467         pthis->lineend = 0;
          468         pthis->buflen = 0;
          469         pthis->pbuf = NULL;
          470 }
          471 
          472 void
          473 lex_destroy(lex_t * pthis)
          474 {
          475         free(pthis->pbuf);
          476 }
          477 
          478 bool_t
          479 lex_load(lex_t * pthis, int fd)
          480 {
          481         uint nalloc;
          482         ssize_t nread;
          483 
          484         nalloc = IOBUFSIZE;
          485         if ((pthis->pbuf = malloc(IOBUFSIZE)) == NULL)
          486                 return false;
          487 
          488         while ((nread = read(fd, pthis->pbuf + pthis->buflen, nalloc - pthis->buflen)) > 0) {
          489                 pthis->buflen += nread;
          490                 if (pthis->buflen == nalloc) {
          491                         char *pnewbuf;
          492 
          493                         nalloc += IOBUFSIZE;
          494                         pnewbuf = (char *) realloc(pthis->pbuf, nalloc);
          495                         if (pnewbuf == NULL) {
          496                                 free(pthis->pbuf);
          497                                 pthis->pbuf = NULL;
          498                                 return false;
          499                         }
          500                         pthis->pbuf = pnewbuf;
          501                 }
          502         }
          503         if (nread < 0) {
          504                 free(pthis->pbuf);
          505                 pthis->pbuf = NULL;
          506                 return false;
          507         }
          508         if (pthis->mboxtype == detect) {
          509                 if (pthis->buflen > 5 && memcmp(pthis->pbuf, "From ", 5) == 0) {
          510                         verbose(1, "Input looks like an mbox\n");
          511                         pthis->mboxtype = mbox;
          512                 } else {
          513                         verbose(1, "Input looks like a maildir\n");
          514                         pthis->mboxtype = maildir;
          515                 }
          516         }
          517         return true;
          518 }
          519 
          520 static bool_t
          521 lex_nextline(lex_t * pthis)
          522 {
          523         cpchar pbuf;
          524         uint len;
          525         uint toklen;
          526 
          527 again:
          528         /* XXX: use and update pthis->section */
          529         pthis->pos = pthis->lineend;
          530         if (pthis->lineend == pthis->buflen) {
          531                 return false;
          532         }
          533         pbuf = pthis->pbuf + pthis->pos;
          534         len = 0;
          535         while (pthis->pos + len < pthis->buflen && pbuf[len] != '\n') {
          536                 len++;
          537         }
          538         if (pthis->pos + len < pthis->buflen) {
          539                 len++;                /* bump past the LF */
          540         }
          541         pthis->lineend = pthis->pos + len;
          542 
          543         /* check beginning-of-line patterns */
          544         if (is_base64(pbuf, len, &toklen) ||
          545             is_ignoredheader(pbuf, len, &toklen) ||
          546             is_mailerid(pbuf, len, &toklen) ||
          547             is_mimeboundary(pbuf, len, &toklen) ||
          548             is_spamtext(pbuf, len, &toklen)) {
          549                 /* ignore line */
          550                 pthis->pos += toklen;
          551                 goto again;
          552         }
          553         return true;
          554 }
          555 
          556 void
          557 lex_nexttoken(lex_t * pthis, tok_t * ptok)
          558 {
          559         cpchar pbuf;
          560         uint len;
          561         uint toklen;
          562 
          563         if (pthis->pos == pthis->eom) {
          564                 pthis->bom = pthis->pos;
          565         }
          566 again:
          567         /* skip whitespace between tokens */
          568         while (pthis->pos != pthis->lineend && is_whitespace(pthis->pbuf[pthis->pos])) {
          569                 pthis->pos++;
          570         }
          571 
          572         pbuf = pthis->pbuf + pthis->pos;
          573         len = pthis->lineend - pthis->pos;
          574 
          575         /* possibilities: end-of-line, html-comment, ipaddr, word, junk */
          576 
          577         if (pthis->pos == pthis->lineend) {
          578                 if (!lex_nextline(pthis)) {
          579                         pthis->eom = pthis->pos;
          580                         ptok->tt = eof;
          581                         return;
          582                 }
          583                 pbuf = pthis->pbuf + pthis->pos;
          584                 len = pthis->lineend - pthis->pos;
          585 
          586                 if (pthis->mboxtype == mbox) {
          587                         if (is_from(pbuf, len, &toklen)) {
          588                                 pthis->eom = pthis->pos;
          589                                 ptok->tt = from;
          590                                 ptok->p = pthis->pbuf + pthis->pos;
          591                                 ptok->len = toklen;
          592                                 pthis->pos += toklen;
          593                                 return;
          594                         }
          595                 }
          596                 goto again;        /* skip lws */
          597         }
          598         if (is_htmltag(pbuf, len, &toklen) ||
          599             is_htmlcomment(pbuf, len, &toklen) ||
          600             is_smtpid(pbuf, len, &toklen) ||
          601             is_boundaryequal(pbuf, len, &toklen) ||
          602             is_nameequal(pbuf, len, &toklen) ||
          603             is_filenameequal(pbuf, len, &toklen)) {
          604                 /* ignore it */
          605                 pthis->pos += toklen;
          606                 goto again;
          607         }
          608         if (is_ipaddr(pbuf, len, &toklen)) {
          609                 ptok->tt = word;
          610                 ptok->p = pthis->pbuf + pthis->pos;
          611                 ptok->len = toklen;
          612                 pthis->pos += toklen;
          613                 return;
          614         }
          615         if (is_word(pbuf, len, &toklen)) {
          616                 ptok->tt = word;
          617                 ptok->p = pthis->pbuf + pthis->pos;
          618                 ptok->len = toklen;
          619                 pthis->pos += toklen;
          620                 if (toklen > MAXWORDLEN) {
          621                         goto again;
          622                 }
          623                 return;
          624         }
          625         /* junk */
          626         pthis->pos++;
          627         goto again;
          628 }
          629 
          630 /* SpamAssassin style passthru */
          631 void
          632 lex_passthru(lex_t * pthis, bool_t is_spam, double hits)
          633 {
          634         char szbuf[256];
          635         bool_t in_headers = true;
          636 
          637         pthis->pos = pthis->bom;
          638         if (is_spam) {
          639                 sprintf(szbuf, "X-Spam-Status: Yes, hits=%f required=%f, tests=bmf\n"
          640                         "X-Spam-Flag: YES\n",
          641                         hits, SPAM_CUTOFF);
          642         } else {
          643                 sprintf(szbuf, "X-Spam-Status: No, hits=%f required=%f\n",
          644                         hits, SPAM_CUTOFF);
          645         }
          646 
          647         /* existing headers */
          648         while (in_headers && pthis->pos < pthis->eom) {
          649                 cpchar pbuf = pthis->pbuf + pthis->pos;
          650                 uint len = 0;
          651 
          652                 while (pthis->pos + len < pthis->buflen && pbuf[len] != '\n') {
          653                         len++;
          654                 }
          655                 if (pthis->pos + len < pthis->buflen) {
          656                         len++;        /* bump past the LF */
          657                 }
          658                 /* check for end of headers */
          659                 if (pbuf[0] == '\n' || (pbuf[0] == '\r' && pbuf[1] == '\n')) {
          660                         /* end of headers */
          661                         break;
          662                 }
          663                 /* write header, ignoring existing spam headers */
          664                 if (strncasecmp(pbuf, "X-Spam-", 7) != 0) {
          665                         write(STDOUT_FILENO, pbuf, len);
          666                 }
          667                 pthis->pos += len;
          668         }
          669 
          670         /* new headers */
          671         write(STDOUT_FILENO, szbuf, strlen(szbuf));
          672 
          673         /* remainder */
          674         if (pthis->pos < pthis->eom) {
          675                 write(STDOUT_FILENO, pthis->pbuf + pthis->pos, pthis->eom - pthis->pos);
          676         }
          677         pthis->bom = pthis->eom;
          678 }