URI:
       filt.c - bmf - bmf (Bayesian Mail Filter) 0.9.4 fork + patches
  HTML git clone git://git.codemadness.org/bmf
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       filt.c (3692B)
       ---
            1 /* $Id: filt.c,v 1.1 2002/10/20 18:19:17 tommy Exp $ */
            2 
            3 /*
            4  * Copyright (c) 2002 Tom Marshall <tommy@tig-grr.com>
            5  *
            6  * This program is free software.  It may be distributed under the terms
            7  * in the file LICENSE, found in the top level of the distribution.
            8  *
            9  * filt.c: The Bayes filter implementation.
           10  *   See http://www.paulgraham.com/spam.html for discussion.
           11  */
           12 
           13 #include "config.h"
           14 #include "dbg.h"
           15 #include "str.h"
           16 #include "lex.h"
           17 #include "vec.h"
           18 #include "dbh.h"
           19 #include "filt.h"
           20 
           21 #define DEVIATION(n)    fabs((n)-0.5f)
           22 
           23 /* Dump the contents of a statistics structure */
           24 void
           25 statdump(stats_t * pstat, FILE *fp)
           26 {
           27         discrim_t *pp;
           28         size_t i;
           29 
           30         fprintf(fp, "# Spamicity: %f\n", pstat->spamicity);
           31 
           32         for (pp = pstat->extrema; pp < pstat->extrema + pstat->keepers; pp++) {
           33                 if (pp->key.len) {
           34                         fprintf(fp, "# '");
           35                         for (i = 0; i < pp->key.len; i++)
           36                                 fputc(tolower((unsigned char)pp->key.p[i]), fp);
           37                         fprintf(fp, "' -> %f\n", pp->prob);
           38                 }
           39         }
           40 }
           41 
           42 void
           43 bayesfilt(dbt_t * pglist, dbt_t * pblist, vec_t * pmlist, stats_t * pstats)
           44 {
           45         veciter_t iter;
           46         str_t *pword;
           47 
           48         double prob, product, invproduct, dev;
           49         double slotdev, hitdev;
           50 
           51 #ifdef NON_EQUIPROBABLE
           52         /* There is an argument that we should (go?) by number of *words*
           53          * here. */
           54         double msg_prob = ((double) pblist->nitems / (double) pglist->nitems);
           55 
           56 #endif
           57 
           58         discrim_t *pp, *hit;
           59 
           60         for (pp = pstats->extrema; pp < pstats->extrema + pstats->keepers; pp++) {
           61                 pp->key.p = NULL;
           62                 pp->key.len = 0;
           63                 pp->prob = 0.5f;
           64         }
           65 
           66         vec_first(pmlist, &iter);
           67         while ((pword = veciter_get(&iter)) != NULL) {
           68                 double goodness = pglist->getcount(pglist, pword);
           69                 double spamness = pblist->getcount(pblist, pword);
           70                 uint goodtotal = pglist->getmsgcount(pglist);
           71                 uint spamtotal = pblist->getmsgcount(pblist);
           72 
           73                 if (goodness + spamness < MINIMUM_FREQ) {
           74 #ifdef NON_EQUIPROBABLE
           75                         /*
           76                          * In the absence of evidence, the probability that a new word will
           77                          * be spam is the historical ratio of spam words to nonspam words.
           78                          */
           79                         prob = msg_prob;
           80 #else
           81                         prob = UNKNOWN_WORD;
           82 #endif
           83                 } else {
           84                         double goodprob = goodtotal ? min(1.0, (goodness / goodtotal)) : 0.0;
           85                         double spamprob = spamtotal ? min(1.0, (spamness / spamtotal)) : 0.0;
           86 
           87 #ifdef NON_EQUIPROBABLE
           88                         prob = (spamprob * msg_prob) / ((goodprob * (1 - msg_prob)) + (spamprob * msg_prob));
           89 #else
           90                         prob = spamprob / (goodprob + spamprob);
           91 #endif
           92 
           93                         prob = minmax(prob, 0.01, 0.99);
           94                 }
           95 
           96                 /* update the list of tokens with maximum deviation */
           97                 dev = DEVIATION(prob);
           98                 hit = NULL;
           99                 hitdev = 0;
          100                 for (pp = pstats->extrema; pp < pstats->extrema + pstats->keepers; pp++) {
          101                         /* don't allow duplicate tokens in the stats.extrema */
          102                         if (pp->key.len > 0 && str_casecmp(pword, &pp->key) == 0) {
          103                                 hit = NULL;
          104                                 break;
          105                         }
          106                         slotdev = DEVIATION(pp->prob);
          107                         if (dev > slotdev && dev > hitdev) {
          108                                 hit = pp;
          109                                 hitdev = slotdev;
          110                         }
          111                 }
          112                 if (hit) {
          113                         hit->prob = prob;
          114                         hit->key = *pword;
          115                 }
          116                 veciter_next(&iter);
          117         }
          118         veciter_destroy(&iter);
          119 
          120         /*
          121          * Bayes' theorem.
          122          * For discussion, see <http://www.mathpages.com/home/kmath267.htm>.
          123          */
          124         product = invproduct = 1.0f;
          125         for (pp = pstats->extrema; pp < pstats->extrema + pstats->keepers; pp++) {
          126                 if (pp->prob == 0) {
          127                         break;
          128                 } else {
          129                         product *= pp->prob;
          130                         invproduct *= (1 - pp->prob);
          131                 }
          132         }
          133         pstats->spamicity = product / (product + invproduct);
          134 }
          135 
          136 bool_t
          137 bvec_loadmsg(vec_t * pthis, lex_t * plex, tok_t * ptok)
          138 {
          139         str_t w;
          140 
          141         lex_nexttoken(plex, ptok);
          142         while (ptok->tt != eof && ptok->tt != from) {
          143                 w.p = ptok->p;
          144                 w.len = ptok->len;
          145                 vec_addtail(pthis, &w);
          146                 lex_nexttoken(plex, ptok);
          147         }
          148 
          149         return true;
          150 }