URI:
       tregen.c - plan9port - [fork] Plan 9 from user space
  HTML git clone git://src.adamsgaard.dk/plan9port
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       tregen.c (2446B)
       ---
            1 #include <u.h>
            2 #include <libc.h>
            3 #include <bio.h>
            4 #include <regexp.h>
            5 #include "dfa.h"
            6 
            7 /***
            8  * Regular expression for matching.
            9  */
           10 
           11 char *ignore[] =
           12 {
           13         /* HTML that isn't A, IMG, or FONT */
           14         /* Must have a space somewhere to avoid catching <email@address> */
           15         "<[         \n\r]*("
           16                 "[^aif]|"
           17                 "a[^> \t\r\n]|"
           18                 "i[^mM \t\r\n]|"
           19                 "im[^gG \t\r\n]|"
           20                 "img[^> \t\r\n]|"
           21                 "f[^oO \t\r\n]|"
           22                 "fo[^Nn \t\r\n]|"
           23                 "fon[^tT \t\r\n]|"
           24                 "font[^> \r\t\n]"
           25         ")[^>]*[ \t\n\r][^>]*>",
           26         "<[         \n\r]*("
           27                 "i|im|f|fo|fon"
           28         ")[ \t\r\n][^>]*>",
           29 
           30         /* ignore html comments */
           31         "<!--([^\\-]|-[^\\-]|--[^>]|\n)*-->",
           32 
           33         /* random mail strings */
           34         "^message-id:.*\n([         ].*\n)*",
           35         "^in-reply-to:.*\n([         ].*\n)*",
           36         "^references:.*\n([         ].*\n)*",
           37         "^date:.*\n([         ].*\n)*",
           38         "^delivery-date:.*\n([         ].*\n)*",
           39         "e?smtp id .*",
           40         "^        id.*",
           41         "boundary=.*",
           42         "name=\"",
           43         "filename=\"",
           44         "news:<[^>]+>",
           45         "^--[^         ]*$",
           46 
           47         /* base64 encoding */
           48         "^[0-9a-zA-Z+\\-=/]+$",
           49 
           50         /* uu encoding */
           51         "^[!-Z]+$",
           52 
           53         /* little things */
           54         ".",
           55         "\n"
           56 };
           57 
           58 char *keywords[] =
           59 {
           60         "([a-zA-Z'`$!¡-￿]|[0-9]([.,][0-9])*)+"
           61 };
           62 
           63 int debug;
           64 
           65 Dreprog*
           66 dregcomp(char *buf)
           67 {
           68         Reprog *r;
           69         Dreprog *d;
           70 
           71         if(debug)
           72                 print(">>> '%s'\n", buf);
           73 
           74         r = regcomp(buf);
           75         if(r == nil)
           76                 sysfatal("regcomp");
           77         d = dregcvt(r);
           78         if(d == nil)
           79                 sysfatal("dregcomp");
           80         free(r);
           81         return d;
           82 }
           83 
           84 char*
           85 strcpycase(char *d, char *s)
           86 {
           87         int cc, esc;
           88 
           89         cc = 0;
           90         esc = 0;
           91         while(*s){
           92                 if(*s == '[')
           93                         cc++;
           94                 if(*s == ']')
           95                         cc--;
           96                 if(!cc && 'a' <= *s && *s <= 'z'){
           97                         *d++ = '[';
           98                         *d++ = *s;
           99                         *d++ = *s+'A'-'a';
          100                         *d++ = ']';
          101                 }else
          102                         *d++ = *s;
          103                 if(*s == '\\')
          104                         esc++;
          105                 else if(esc)
          106                         esc--;
          107                 s++;
          108         }
          109         return d;
          110 }
          111 
          112 void
          113 regerror(char *msg)
          114 {
          115         sysfatal("regerror: %s", msg);
          116 }
          117 
          118 void
          119 buildre(Dreprog *re[3])
          120 {
          121         int i;
          122         static char buf[16384], *s;
          123 
          124         re[0] = dregcomp("^From ");
          125 
          126         s = buf;
          127         for(i=0; i<nelem(keywords); i++){
          128                 if(i != 0)
          129                         *s++ = '|';
          130                 s = strcpycase(s, keywords[i]);
          131         }
          132         *s = 0;
          133         re[1] = dregcomp(buf);
          134 
          135         s = buf;
          136         for(i=0; i<nelem(ignore); i++){
          137                 if(i != 0)
          138                         *s++ = '|';
          139                 s = strcpycase(s, ignore[i]);
          140         }
          141         *s = 0;
          142         re[2] = dregcomp(buf);
          143 }
          144 
          145 void
          146 usage(void)
          147 {
          148         fprint(2, "usage: regen [-d]\n");
          149         exits("usage");
          150 }
          151 
          152 void
          153 main(int argc, char **argv)
          154 {
          155         Dreprog *re[3];
          156         Biobuf b;
          157 
          158         ARGBEGIN{
          159         default:
          160                 usage();
          161         case 'd':
          162                 debug = 1;
          163         }ARGEND
          164 
          165         if(argc != 0)
          166                 usage();
          167 
          168         buildre(re);
          169         Binit(&b, 1, OWRITE);
          170         Bprintdfa(&b, re[0]);
          171         Bprintdfa(&b, re[1]);
          172         Bprintdfa(&b, re[2]);
          173         exits(0);
          174 }