URI:
       tmsgclass.c - plan9port - [fork] Plan 9 from user space
  HTML git clone git://src.adamsgaard.dk/plan9port
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       tmsgclass.c (4777B)
       ---
            1 #include <u.h>
            2 #include <libc.h>
            3 #include <bio.h>
            4 #include <ctype.h>
            5 #include "msgdb.h"
            6 
            /*
             * Write the command synopsis to standard error (fd 2) and end the
             * process with exit status "usage".
             */
            void
            usage(void)
            {
                    static char synopsis[] =
                            "usage: upas/msgclass [-a] [-d name dbfile]... [-l lockfile] [-m mul] [-t thresh] [tokenfile ...]\n";

                    fprint(2, "%s", synopsis);
                    exits("usage");
            }
           13 
           /* Fixed capacities of the tables declared in this file. */
           enum
           {
                   MAXBEST = 32,   /* slots in best[] */
                   MAXLEN  = 64,   /* bytes in Word.s, terminator included */
                   MAXTAB  = 256   /* entries in db[], Word.count[] and Word.p[] */
           };
           20 
           21 typedef struct Ndb Ndb;
           22 struct Ndb
           23 {
           24         char *name;
           25         char *file;
           26         Msgdb *db;
           27         double p;
           28         long nmsg;
           29 };
           30 
           31 typedef struct Word Word;
           32 struct Word
           33 {
           34         char s[MAXLEN];
           35         int count[MAXTAB];
           36         double p[MAXTAB];
           37         double mp;
           38         int mi; /* w.p[w.mi] = w.mp */
           39         int nmsg;
           40 };
           41 
           42 Ndb db[MAXTAB];
           43 int ndb;
           44 
           45 int add;
           46 int mul;
           47 Msgdb *indb;
           48 
           49 Word best[MAXBEST];
           50 int mbest = 15;
           51 int nbest;
           52 
           53 void process(Biobuf*, char*);
           54 void lockfile(char*);
           55 
           56 void
           57 noteword(Word *w, char *s)
           58 {
           59         int i;
           60 
           61         for(i=nbest-1; i>=0; i--)
           62                 if(w->mp < best[i].mp)
           63                         break;
           64         i++;
           65 
           66         if(i >= mbest)
           67                 return;
           68         if(nbest == mbest)
           69                 nbest--;
           70         if(i < nbest)
           71                 memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0]));
           72         best[i] = *w;
           73         strecpy(best[i].s, best[i].s+MAXLEN, s);
           74         nbest++;
           75 }
           76 
           77 void
           78 main(int argc, char **argv)
           79 {
           80         int i, bad, m, tot, nn, j;
           81         Biobuf bin, *b, bout;
           82         char *s, *lf;
           83         double totp, p, thresh;
           84         long n;
           85         Word w;
           86 
           87         lf = nil;
           88         thresh = 0;
           89         ARGBEGIN{
           90         case 'a':
           91                 add = 1;
           92                 break;
           93         case 'd':
           94                 if(ndb >= MAXTAB)
           95                         sysfatal("too many db classes");
           96                 db[ndb].name = EARGF(usage());
           97                 db[ndb].file = EARGF(usage());
           98                 ndb++;
           99                 break;
          100         case 'l':
          101                 lf = EARGF(usage());
          102                 break;
          103         case 'm':
          104                 mul = atoi(EARGF(usage()));
          105                 break;
          106         case 't':
          107                 thresh = atof(EARGF(usage()));
          108                 break;
          109         default:
          110                 usage();
          111         }ARGEND
          112 
          113         if(ndb == 0){
          114                 fprint(2, "must have at least one -d option\n");
          115                 usage();
          116         }
          117 
          118         indb = mdopen(nil, 1);
          119         if(argc == 0){
          120                 Binit(&bin, 0, OREAD);
          121                 process(&bin, "<stdin>");
          122                 Bterm(&bin);
          123         }else{
          124                 bad = 0;
          125                 for(i=0; i<argc; i++){
          126                         if((b = Bopen(argv[i], OREAD)) == nil){
          127                                 fprint(2, "opening %s: %r\n", argv[i]);
          128                                 bad = 1;
          129                                 continue;
          130                         }
          131                         process(b, argv[i]);
          132                         Bterm(b);
          133                 }
          134                 if(bad)
          135                         exits("open inputs");
          136         }
          137 
          138         lockfile(lf);
          139         bad = 0;
          140         for(i=0; i<ndb; i++){
          141                 if((db[i].db = mdopen(db[i].file, 0)) == nil){
          142                         fprint(2, "opendb %s: %r\n", db[i].file);
          143                         bad = 1;
          144                 }
          145                 db[i].nmsg = mdget(db[i].db, "*From*");
          146         }
          147         if(bad)
          148                 exits("open databases");
          149 
          150         /* run conditional probabilities of input words, getting 15 most specific */
          151         mdenum(indb);
          152         nbest = 0;
          153         while(mdnext(indb, &s, &n) >= 0){
          154                 tot = 0;
          155                 totp = 0.0;
          156                 for(i=0; i<ndb; i++){
          157                         nn = mdget(db[i].db, s)*(i==0 ? 3 : 1);
          158                         tot += nn;
          159                         w.count[i] = nn;
          160                         p = w.count[i]/(double)db[i].nmsg;
          161                         if(p >= 1.0)
          162                                 p = 1.0;
          163                         w.p[i] = p;
          164                         totp += p;
          165                 }
          166 /*fprint(2, "%s tot %d totp %g\n", s, tot, totp); */
          167                 if(tot < 2)
          168                         continue;
          169                 w.mp = 0.0;
          170                 for(i=0; i<ndb; i++){
          171                         p = w.p[i];
          172                         p /= totp;
          173                         if(p < 0.001)
          174                                 p = 0.001;
          175                         else if(p > 0.999)
          176                                 p = 0.999;
          177                         if(p > w.mp){
          178                                 w.mp = p;
          179                                 w.mi = i;
          180                         }
          181                         w.p[i] = p;
          182                 }
          183                 noteword(&w, s);
          184         }
          185 
          186         /* compute conditional probabilities of message classes using 15 most specific */
          187         totp = 0.0;
          188         for(i=0; i<ndb; i++){
          189                 p = 1.0;
          190                 for(j=0; j<nbest; j++)
          191                         p *= best[j].p[i];
          192                 db[i].p = p;
          193                 totp += p;
          194         }
          195         for(i=0; i<ndb; i++)
          196                 db[i].p /= totp;
          197         m = 0;
          198         for(i=1; i<ndb; i++)
          199                 if(db[i].p > db[m].p)
          200                         m = i;
          201 
          202         Binit(&bout, 1, OWRITE);
          203         if(db[m].p < thresh)
          204                 m = -1;
          205         if(m >= 0)
          206                 Bprint(&bout, "%s", db[m].name);
          207         else
          208                 Bprint(&bout, "inconclusive");
          209         for(j=0; j<ndb; j++)
          210                 Bprint(&bout, " %s=%g", db[j].name, db[j].p);
          211         Bprint(&bout, "\n");
          212         for(i=0; i<nbest; i++){
          213                 Bprint(&bout, "%s", best[i].s);
          214                 for(j=0; j<ndb; j++)
          215                         Bprint(&bout, " %s=%g", db[j].name, best[i].p[j]);
          216                 Bprint(&bout, "\n");
          217         }
          218                 Bprint(&bout, "%s %g\n", best[i].s, best[i].p[m]);
          219         Bterm(&bout);
          220 
          221         if(m >= 0 && add){
          222                 mdenum(indb);
          223                 while(mdnext(indb, &s, &n) >= 0)
          224                         mdput(db[m].db, s, mdget(db[m].db, s)+n*mul);
          225                 mdclose(db[m].db);
          226         }
          227         exits(nil);
          228 }
          229 
          230 void
          231 process(Biobuf *b, char*)
          232 {
          233         char *s;
          234         char *p;
          235         long n;
          236 
          237         while((s = Brdline(b, '\n')) != nil){
          238                 s[Blinelen(b)-1] = 0;
          239                 if((p = strrchr(s, ' ')) != nil){
          240                         *p++ = 0;
          241                         n = atoi(p);
          242                 }else
          243                         n = 1;
          244                 mdput(indb, s, mdget(indb, s)+n);
          245         }
          246 }
          247 
          248 int tpid;
          249 void
          250 killtickle(void)
          251 {
          252         postnote(PNPROC, tpid, "die");
          253 }
          254 
          255 void
          256 lockfile(char *s)
          257 {
          258         int fd, t, w;
          259         char err[ERRMAX];
          260 
          261         if(s == nil)
          262                 return;
          263         w = 50;
          264         t = 0;
          265         for(;;){
          266                 fd = open(s, OREAD);
          267                 if(fd >= 0)
          268                         break;
          269                 rerrstr(err, sizeof err);
          270                 if(strstr(err, "file is locked")==nil && strstr(err, "exclusive lock")==nil))
          271                         break;
          272                 sleep(w);
          273                 t += w;
          274                 if(w < 1000)
          275                         w = (w*3)/2;
          276                 if(t > 120*1000)
          277                         break;
          278         }
          279         if(fd < 0)
          280                 sysfatal("could not lock %s", s);
          281         switch(tpid = fork()){
          282         case -1:
          283                 sysfatal("fork: %r");
          284         case 0:
          285                 for(;;){
          286                         sleep(30*1000);
          287                         free(dirfstat(fd));
          288                 }
          289                 _exits(nil);
          290         default:
          291                 break;
          292         }
          293         close(fd);
          294         atexit(killtickle);
          295 }