filt.c - bmf - bmf (Bayesian Mail Filter) 0.9.4 fork + patches
HTML git clone git://git.codemadness.org/bmf
DIR Log
DIR Files
DIR Refs
DIR README
DIR LICENSE
---
filt.c (3692B)
---
1 /* $Id: filt.c,v 1.1 2002/10/20 18:19:17 tommy Exp $ */
2
3 /*
4 * Copyright (c) 2002 Tom Marshall <tommy@tig-grr.com>
5 *
6 * This program is free software. It may be distributed under the terms
7 * in the file LICENSE, found in the top level of the distribution.
8 *
9 * filt.c: The Bayes filter implementation.
10 * See http://www.paulgraham.com/spam.html for discussion.
11 */
12
13 #include "config.h"
14 #include "dbg.h"
15 #include "str.h"
16 #include "lex.h"
17 #include "vec.h"
18 #include "dbh.h"
19 #include "filt.h"
20
21 #define DEVIATION(n) fabs((n)-0.5f)
22
23 /* Dump the contents of a statistics structure */
24 void
25 statdump(stats_t * pstat, FILE *fp)
26 {
27 discrim_t *pp;
28 size_t i;
29
30 fprintf(fp, "# Spamicity: %f\n", pstat->spamicity);
31
32 for (pp = pstat->extrema; pp < pstat->extrema + pstat->keepers; pp++) {
33 if (pp->key.len) {
34 fprintf(fp, "# '");
35 for (i = 0; i < pp->key.len; i++)
36 fputc(tolower((unsigned char)pp->key.p[i]), fp);
37 fprintf(fp, "' -> %f\n", pp->prob);
38 }
39 }
40 }
41
42 void
43 bayesfilt(dbt_t * pglist, dbt_t * pblist, vec_t * pmlist, stats_t * pstats)
44 {
45 veciter_t iter;
46 str_t *pword;
47
48 double prob, product, invproduct, dev;
49 double slotdev, hitdev;
50
51 #ifdef NON_EQUIPROBABLE
52 /* There is an argument that we should (go?) by number of *words*
53 * here. */
54 double msg_prob = ((double) pblist->nitems / (double) pglist->nitems);
55
56 #endif
57
58 discrim_t *pp, *hit;
59
60 for (pp = pstats->extrema; pp < pstats->extrema + pstats->keepers; pp++) {
61 pp->key.p = NULL;
62 pp->key.len = 0;
63 pp->prob = 0.5f;
64 }
65
66 vec_first(pmlist, &iter);
67 while ((pword = veciter_get(&iter)) != NULL) {
68 double goodness = pglist->getcount(pglist, pword);
69 double spamness = pblist->getcount(pblist, pword);
70 uint goodtotal = pglist->getmsgcount(pglist);
71 uint spamtotal = pblist->getmsgcount(pblist);
72
73 if (goodness + spamness < MINIMUM_FREQ) {
74 #ifdef NON_EQUIPROBABLE
75 /*
76 * In the absence of evidence, the probability that a new word will
77 * be spam is the historical ratio of spam words to nonspam words.
78 */
79 prob = msg_prob;
80 #else
81 prob = UNKNOWN_WORD;
82 #endif
83 } else {
84 double goodprob = goodtotal ? min(1.0, (goodness / goodtotal)) : 0.0;
85 double spamprob = spamtotal ? min(1.0, (spamness / spamtotal)) : 0.0;
86
87 #ifdef NON_EQUIPROBABLE
88 prob = (spamprob * msg_prob) / ((goodprob * (1 - msg_prob)) + (spamprob * msg_prob));
89 #else
90 prob = spamprob / (goodprob + spamprob);
91 #endif
92
93 prob = minmax(prob, 0.01, 0.99);
94 }
95
96 /* update the list of tokens with maximum deviation */
97 dev = DEVIATION(prob);
98 hit = NULL;
99 hitdev = 0;
100 for (pp = pstats->extrema; pp < pstats->extrema + pstats->keepers; pp++) {
101 /* don't allow duplicate tokens in the stats.extrema */
102 if (pp->key.len > 0 && str_casecmp(pword, &pp->key) == 0) {
103 hit = NULL;
104 break;
105 }
106 slotdev = DEVIATION(pp->prob);
107 if (dev > slotdev && dev > hitdev) {
108 hit = pp;
109 hitdev = slotdev;
110 }
111 }
112 if (hit) {
113 hit->prob = prob;
114 hit->key = *pword;
115 }
116 veciter_next(&iter);
117 }
118 veciter_destroy(&iter);
119
120 /*
121 * Bayes' theorem.
122 * For discussion, see <http://www.mathpages.com/home/kmath267.htm>.
123 */
124 product = invproduct = 1.0f;
125 for (pp = pstats->extrema; pp < pstats->extrema + pstats->keepers; pp++) {
126 if (pp->prob == 0) {
127 break;
128 } else {
129 product *= pp->prob;
130 invproduct *= (1 - pp->prob);
131 }
132 }
133 pstats->spamicity = product / (product + invproduct);
134 }
135
136 bool_t
137 bvec_loadmsg(vec_t * pthis, lex_t * plex, tok_t * ptok)
138 {
139 str_t w;
140
141 lex_nexttoken(plex, ptok);
142 while (ptok->tt != eof && ptok->tt != from) {
143 w.p = ptok->p;
144 w.len = ptok->len;
145 vec_addtail(pthis, &w);
146 lex_nexttoken(plex, ptok);
147 }
148
149 return true;
150 }