URI:
       tmsgtok.c - plan9port - [fork] Plan 9 from user space
  HTML git clone git://src.adamsgaard.dk/plan9port
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       tmsgtok.c (4209B)
       ---
            1 /*
            2  * RFC822 message tokenizer (really feature generator) for spam filter.
            3  *
            4  * See Paul Graham's musings on spam filtering for theory.
            5  */
            6 
            7 #include <u.h>
            8 #include <libc.h>
            9 #include <bio.h>
           10 #include <regexp.h>
           11 #include <ctype.h>
           12 #include "dfa.h"
           13 
           14 void buildre(Dreprog*[3]);
           15 int debug;
           16 char *refile = "#9/mail/lib/classify.re";
           17 int maxtoklen = 20;
           18 int trim(char*);
           19 
           20 void
           21 usage(void)
           22 {
           23         fprint(2, "usage: msgtok [-D] [-r /mail/lib/classify.re] [file]\n");
           24         exits("usage");
           25 }
           26 
           27 void
           28 main(int argc, char **argv)
           29 {
           30         int i, hdr, n, eof, off;
           31         Dreprog *re[3];
           32         int m[3];
           33         char *p, *ep, *tag;
           34         Biobuf bout, bin;
           35         char msg[1024+1];
           36         char buf[1024];
           37 
           38         refile = unsharp(refile);
           39         buildre(re);
           40         ARGBEGIN{
           41         case 'D':
           42                 debug = 1;
           43                 break;
           44         case 'n':
           45                 maxtoklen = atoi(EARGF(usage()));
           46                 break;
           47         case 'r':
           48                 refile = EARGF(usage());
           49                 break;
           50         default:
           51                 usage();
           52         }ARGEND;
           53 
           54         if(argc > 1)
           55                 usage();
           56         if(argc == 1){
           57                 close(0);
           58                 if(open(argv[0], OREAD) < 0)
           59                         sysfatal("open %s: %r", argv[0]);
           60         }
           61 
           62         tag = nil;
           63         Binit(&bin, 0, OREAD);
           64         Binit(&bout, 1, OWRITE);
           65         ep = msg;
           66         p = msg;
           67         eof = 0;
           68         off = 0;
           69         hdr = 1;
           70         for(;;){
           71                 /* replenish buffer */
           72                 if(ep - p < 512 && !eof){
           73                         if(p > msg + 1){
           74                                 n = ep - p;
           75                                 memmove(msg, p-1, ep-(p-1));
           76                                 off += (p-1) - msg;
           77                                 p = msg+1;
           78                                 ep = p + n;
           79                         }
           80                         n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);
           81                         if(n < 0)
           82                                 sysfatal("read error: %r");
           83                         if(n == 0)
           84                                 eof = 1;
           85                         ep += n;
           86                         *ep = 0;
           87                 }
           88                 if(p >= ep)
           89                         break;
           90 
           91                 if(*p == 0){
           92                         p++;
           93                         continue;
           94                 }
           95 
           96                 if(hdr && p[-1]=='\n'){
           97                         if(p[0]=='\n')
           98                                 hdr = 0;
           99                         else if(cistrncmp(p-1, "\nfrom:", 6) == 0)
          100                                 tag = "From*";
          101                         else if(cistrncmp(p-1, "\nto:", 4) == 0)
          102                                 tag = "To*";
          103                         else if(cistrncmp(p-1, "\nsubject:", 9) == 0)
          104                                 tag = "Subject*";
          105                         else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)
          106                                 tag = "Return-Path*";
          107                         else
          108                                 tag = nil;
          109                 }
          110                 m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n');
          111                 m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n');
          112                 m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n');
          113 
          114                 n = m[0];
          115                 if(n < m[1])
          116                         n = m[1];
          117                 if(n < m[2])
          118                         n = m[2];
          119                 if(n <= 0){
          120 fprint(2, "«%s» %.2ux", p, p[0]);
          121                         sysfatal("no regexps matched at %ld", off + (p-msg));
          122                 }
          123 
          124                 if(m[0] >= m[1] && m[0] >= m[2]){
          125                         /* "From " marks start of new message */
          126                         Bprint(&bout, "*From*\n");
          127                         n = m[0];
          128                         hdr = 1;
          129                 }else if(m[2] > 1){
          130                         /* ignore */
          131                         n = m[2];
          132                 }else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){
          133                         /* keyword */
          134                         /* should do UTF-aware lowercasing, too much bother */
          135 /*
          136                         for(i=0; i<n; i++)
          137                                 if('A' <= p[i] && p[i] <= 'Z')
          138                                         p[i] += 'a' - 'A';
          139 */
          140                         if(tag){
          141                                 i = strlen(tag);
          142                                 memmove(buf, tag, i);
          143                                 memmove(buf+i, p, m[1]);
          144                                 buf[i+m[1]] = 0;
          145                         }else{
          146                                 memmove(buf, p, m[1]);
          147                                 buf[m[1]] = 0;
          148                         }
          149                         Bprint(&bout, "%s\n", buf);
          150                         while(trim(buf) >= 0)
          151                                 Bprint(&bout, "stem*%s\n", buf);
          152                         n = m[1];
          153                 }else
          154                         n = m[2];
          155                 if(debug)
          156                         fprint(2, "%.*s¦", utfnlen(p, n), p);
          157                 p += n;
          158         }
          159         Bterm(&bout);
          160         exits(0);
          161 }
          162 
          163 void
          164 buildre(Dreprog *re[3])
          165 {
          166         Biobuf *b;
          167 
          168         if((b = Bopen(refile, OREAD)) == nil)
          169                 sysfatal("open %s: %r", refile);
          170 
          171         re[0] = Breaddfa(b);
          172         re[1] = Breaddfa(b);
          173         re[2] = Breaddfa(b);
          174 
          175         if(re[0]==nil || re[1]==nil || re[2]==nil)
          176                 sysfatal("Breaddfa: %r");
          177         Bterm(b);
          178 }
          179 
          180 /* perhaps this belongs in the tokenizer */
          181 int
          182 trim(char *s)
          183 {
          184         char *p, *op;
          185         int mix, mix1;
          186 
          187         if(*s == '*')
          188                 return -1;
          189 
          190         /* strip leading punctuation */
          191         p = strchr(s, '*');
          192         if(p == nil)
          193                 p = s;
          194         while(*p && !isalpha(*p))
          195                 p++;
          196         if(strlen(p) < 2)
          197 {
          198                 return -1;
          199 }
          200         memmove(s, p, strlen(p)+1);
          201 
          202         /* strip suffix of punctuation */
          203         p = s+strlen(s);
          204         op = p;
          205         while(p > s && (uchar)p[-1]<0x80 && !isalpha(p[-1]))
          206                 p--;
          207 
          208         /* chop punctuation */
          209         if(p > s){
          210                 /* free!!! -> free! */
          211                 if(p+1 < op){
          212                         p[1] = 0;
          213                         return 0;
          214                 }
          215                 /* free! -> free */
          216                 if(p < op){
          217                         p[0] = 0;
          218                         return 0;
          219                 }
          220         }
          221 
          222         mix = mix1 = 0;
          223         if(isupper(s[0]))
          224                 mix = 1;
          225         for(p=s+1; *p; p++)
          226                 if(isupper(*p)){
          227                         mix1 = 1;
          228                         break;
          229                 }
          230 
          231         /* turn FREE into Free */
          232         if(mix1){
          233                 for(p=s+1; *p; p++)
          234                         if(isupper(*p))
          235                                 *p += 'a'-'A';
          236                 return 0;
          237         }
          238 
          239         /* turn Free into free */
          240         if(mix){
          241                 *s += 'a'-'A';
          242                 return 0;
          243         }
          244         return -1;
          245 }