URI:
       tbayes.c - plan9port - [fork] Plan 9 from user space
  HTML git clone git://src.adamsgaard.dk/plan9port
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       tbayes.c (3822B)
       ---
            1 #include <u.h>
            2 #include <libc.h>
            3 #include <bio.h>
            4 #include <regexp.h>
            5 #include "hash.h"
            6 
            7 enum
            8 {
            9         MAXTAB = 256,
           10         MAXBEST = 32
           11 };
           12 
           13 typedef struct Table Table;
           14 struct Table
           15 {
           16         char *file;
           17         Hash *hash;
           18         int nmsg;
           19 };
           20 
           21 typedef struct Word Word;
           22 struct Word
           23 {
           24         Stringtab *s;        /* from hmsg */
           25         int count[MAXTAB];        /* counts from each table */
           26         double p[MAXTAB];        /* probabilities from each table */
           27         double mp;        /* max probability */
           28         int mi;                /* w.p[w.mi] = w.mp */
           29 };
           30 
           31 Table tab[MAXTAB];
           32 int ntab;
           33 
           34 Word best[MAXBEST];
           35 int mbest;
           36 int nbest;
           37 
           38 int debug;
           39 
           40 void
           41 usage(void)
           42 {
           43         fprint(2, "usage: bayes [-D] [-m maxword] boxhash ... ~ msghash ...\n");
           44         exits("usage");
           45 }
           46 
           47 void*
           48 emalloc(int n)
           49 {
           50         void *v;
           51 
           52         v = mallocz(n, 1);
           53         if(v == nil)
           54                 sysfatal("out of memory");
           55         return v;
           56 }
           57 
           58 void
           59 noteword(Word *w)
           60 {
           61         int i;
           62 
           63         for(i=nbest-1; i>=0; i--)
           64                 if(w->mp < best[i].mp)
           65                         break;
           66         i++;
           67 
           68         if(i >= mbest)
           69                 return;
           70         if(nbest == mbest)
           71                 nbest--;
           72         if(i < nbest)
           73                 memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0]));
           74         best[i] = *w;
           75         nbest++;
           76 }
           77 
           78 Hash*
           79 hread(char *s)
           80 {
           81         Hash *h;
           82         Biobuf *b;
           83 
           84         if((b = Bopenlock(s, OREAD)) == nil)
           85                 sysfatal("open %s: %r", s);
           86 
           87         h = emalloc(sizeof(Hash));
           88         Breadhash(b, h, 1);
           89         Bterm(b);
           90         return h;
           91 }
           92 
           93 void
           94 main(int argc, char **argv)
           95 {
           96         int i, j, a, mi, oi, tot, keywords;
           97         double totp, p, xp[MAXTAB];
           98         Hash *hmsg;
           99         Word w;
          100         Stringtab *s, *t;
          101         Biobuf bout;
          102 
          103         mbest = 15;
          104         keywords = 0;
          105         ARGBEGIN{
          106         case 'D':
          107                 debug = 1;
          108                 break;
          109         case 'k':
          110                 keywords = 1;
          111                 break;
          112         case 'm':
          113                 mbest = atoi(EARGF(usage()));
          114                 if(mbest > MAXBEST)
          115                         sysfatal("cannot keep more than %d words", MAXBEST);
          116                 break;
          117         default:
          118                 usage();
          119         }ARGEND
          120 
          121         for(i=0; i<argc; i++)
          122                 if(strcmp(argv[i], "~") == 0)
          123                         break;
          124 
          125         if(i > MAXTAB)
          126                 sysfatal("cannot handle more than %d tables", MAXTAB);
          127 
          128         if(i+1 >= argc)
          129                 usage();
          130 
          131         for(i=0; i<argc; i++){
          132                 if(strcmp(argv[i], "~") == 0)
          133                         break;
          134                 tab[ntab].file = argv[i];
          135                 tab[ntab].hash = hread(argv[i]);
          136                 s = findstab(tab[ntab].hash, "*nmsg*", 6, 1);
          137                 if(s == nil || s->count == 0)
          138                         tab[ntab].nmsg = 1;
          139                 else
          140                         tab[ntab].nmsg = s->count;
          141                 ntab++;
          142         }
          143 
          144         Binit(&bout, 1, OWRITE);
          145 
          146         oi = ++i;
          147         for(a=i; a<argc; a++){
          148                 hmsg = hread(argv[a]);
          149                 nbest = 0;
          150                 for(s=hmsg->all; s; s=s->link){
          151                         w.s = s;
          152                         tot = 0;
          153                         totp = 0.0;
          154                         for(i=0; i<ntab; i++){
          155                                 t = findstab(tab[i].hash, s->str, s->n, 0);
          156                                 if(t == nil)
          157                                         w.count[i] = 0;
          158                                 else
          159                                         w.count[i] = t->count;
          160                                 tot += w.count[i];
          161                                 p = w.count[i]/(double)tab[i].nmsg;
          162                                 if(p >= 1.0)
          163                                         p = 1.0;
          164                                 w.p[i] = p;
          165                                 totp += p;
          166                         }
          167 
          168                         if(tot < 5){                /* word does not appear enough; give to box 0 */
          169                                 w.p[0] = 0.5;
          170                                 for(i=1; i<ntab; i++)
          171                                         w.p[i] = 0.1;
          172                                 w.mp = 0.5;
          173                                 w.mi = 0;
          174                                 noteword(&w);
          175                                 continue;
          176                         }
          177 
          178                         w.mp = 0.0;
          179                         for(i=0; i<ntab; i++){
          180                                 p = w.p[i];
          181                                 p /= totp;
          182                                 if(p < 0.01)
          183                                         p = 0.01;
          184                                 else if(p > 0.99)
          185                                         p = 0.99;
          186                                 if(p > w.mp){
          187                                         w.mp = p;
          188                                         w.mi = i;
          189                                 }
          190                                 w.p[i] = p;
          191                         }
          192                         noteword(&w);
          193                 }
          194 
          195                 totp = 0.0;
          196                 for(i=0; i<ntab; i++){
          197                         p = 1.0;
          198                         for(j=0; j<nbest; j++)
          199                                 p *= best[j].p[i];
          200                         xp[i] = p;
          201                         totp += p;
          202                 }
          203                 for(i=0; i<ntab; i++)
          204                         xp[i] /= totp;
          205                 mi = 0;
          206                 for(i=1; i<ntab; i++)
          207                         if(xp[i] > xp[mi])
          208                                 mi = i;
          209                 if(oi != argc-1)
          210                         Bprint(&bout, "%s: ", argv[a]);
          211                 Bprint(&bout, "%s %f", tab[mi].file, xp[mi]);
          212                 if(keywords){
          213                         for(i=0; i<nbest; i++){
          214                                 Bprint(&bout, " ");
          215                                 Bwrite(&bout, best[i].s->str, best[i].s->n);
          216                                 Bprint(&bout, " %f", best[i].p[mi]);
          217                         }
          218                 }
          219                 freehash(hmsg);
          220                 Bprint(&bout, "\n");
          221                 if(debug){
          222                         for(i=0; i<nbest; i++){
          223                                 Bwrite(&bout, best[i].s->str, best[i].s->n);
          224                                 Bprint(&bout, " %f", best[i].p[mi]);
          225                                 if(best[i].p[mi] < best[i].mp)
          226                                         Bprint(&bout, " (%f %s)", best[i].mp, tab[best[i].mi].file);
          227                                 Bprint(&bout, "\n");
          228                         }
          229                 }
          230         }
          231         Bterm(&bout);
          232 }