├── README
├── spamoracle.1
├── wordsplit.mlp
├── .gitignore
├── wordsplit.mli
├── configfile.mli
├── htmlscan.mli
├── refhosts.mli
├── attachments.mli
├── rankmsg.mli
├── processing.mli
├── Changes
├── .depend
├── database.mli
├── refhosts.ml
├── Makefile
├── mbox.mli
├── config.ml
├── mail.mli
├── config.mli
├── attachments.ml
├── configfile.ml
├── mbox.ml
├── processing.ml
├── rankmsg.ml
├── database.ml
├── spamoracle.conf.5
├── htmlscan.mll
├── mail.ml
└── main.ml


/README:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xavierleroy/spamoracle/HEAD/README


--------------------------------------------------------------------------------
/spamoracle.1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xavierleroy/spamoracle/HEAD/spamoracle.1


--------------------------------------------------------------------------------
/wordsplit.mlp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xavierleroy/spamoracle/HEAD/wordsplit.mlp


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.cm[iox]
2 | *.o
3 | spamoracle
4 | htmlscan.ml
5 | wordsplit.ml
6 | wordsplit.mll
7 | *~
8 | 


--------------------------------------------------------------------------------
/wordsplit.mli:
--------------------------------------------------------------------------------
1 | (** Decompose a string into words. *)
2 | 
3 | val iter: (string -> unit) -> (string -> bool) -> string -> unit
4 |   (** [iter fn db txt] applies [fn] to each word in [txt].
5 |       [db] is a predicate that returns [true] for known words.
6 |       It is used to recognize stretched-out words, e.g. [H.e.ll.o].
7 |  *)
8 | 


--------------------------------------------------------------------------------
/configfile.mli:
--------------------------------------------------------------------------------
 1 | (* Parsing configuration files *)
 2 | 
 3 | type value =
 4 |   | Bool of bool ref
 5 |   | String of string ref
 6 |   | Int of int ref
 7 |   | Float of float ref
 8 |   | Regexp of Str.regexp ref
 9 |   | OptBool of bool option ref
10 |   | OptString of string option ref
11 |   | OptInt of int option ref
12 |   | OptFloat of float option ref
13 |   | OptRegexp of Str.regexp option ref
14 | 
15 | val parse: (string * value) list -> string -> (int * string) list
16 | 


--------------------------------------------------------------------------------
/htmlscan.mli:
--------------------------------------------------------------------------------
 1 | (***********************************************************************)
 2 | (*                                                                     *)
 3 | (*                 SpamOracle -- a Bayesian spam filter                *)
 4 | (*                                                                     *)
 5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
 6 | (*                                                                     *)
 7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
 8 | (*  en Automatique.  This file is distributed under the terms of the   *)
 9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
10 | (*                                                                     *)
11 | (***********************************************************************)
12 | 
13 | (* $Id$ *)
14 | 
15 | (** Approximate HTML scanner.  Extracts words and certain parameters
16 |     of certain tags (e.g. URLs) from HTML text. *)
17 | 
18 | val extract_text: string -> string
19 | 


--------------------------------------------------------------------------------
/refhosts.mli:
--------------------------------------------------------------------------------
 1 | (***********************************************************************)
 2 | (*                                                                     *)
 3 | (*                 SpamOracle -- a Bayesian spam filter                *)
 4 | (*                                                                     *)
 5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
 6 | (*                                                                     *)
 7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
 8 | (*  en Automatique.  This file is distributed under the terms of the   *)
 9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
10 | (*                                                                     *)
11 | (***********************************************************************)
12 | 
13 | (* $Id$ *)
14 | 
15 | (* Extract hostnames or IP addresses referenced from http URLs
16 |    in message bodies. *)
17 | 
18 | val reset: unit -> unit
19 | val add: string -> unit
20 | val summarize: unit -> string
21 | 


--------------------------------------------------------------------------------
/attachments.mli:
--------------------------------------------------------------------------------
 1 | (***********************************************************************)
 2 | (*                                                                     *)
 3 | (*                 SpamOracle -- a Bayesian spam filter                *)
 4 | (*                                                                     *)
 5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
 6 | (*                                                                     *)
 7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
 8 | (*  en Automatique.  This file is distributed under the terms of the   *)
 9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
10 | (*                                                                     *)
11 | (***********************************************************************)
12 | 
13 | (* $Id$ *)
14 | 
15 | (* Summarize the attachments of a message as one line that can be
16 |    put in the header of the message.  Allows procmail to filter
17 |    suspicious attachments without looking at the message body. *)
18 | 
19 | val summarize: Mail.message -> string
20 | 


--------------------------------------------------------------------------------
/rankmsg.mli:
--------------------------------------------------------------------------------
 1 | (***********************************************************************)
 2 | (*                                                                     *)
 3 | (*                 SpamOracle -- a Bayesian spam filter                *)
 4 | (*                                                                     *)
 5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
 6 | (*                                                                     *)
 7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
 8 | (*  en Automatique.  This file is distributed under the terms of the   *)
 9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
10 | (*                                                                     *)
11 | (***********************************************************************)
12 | 
13 | (* $Id$ *)
14 | 
15 | (** Message ranking *)
16 | 
17 | type rank =
18 |   { spam_prob: float;
19 |     num_meaningful: int;
20 |     explanation: string }
21 | 
22 | val rank_message: Database.short -> Mail.message -> rank
23 | val word_proba: int -> int -> int -> int -> float
24 | 


--------------------------------------------------------------------------------
/processing.mli:
--------------------------------------------------------------------------------
 1 | (***********************************************************************)
 2 | (*                                                                     *)
 3 | (*                 SpamOracle -- a Bayesian spam filter                *)
 4 | (*                                                                     *)
 5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
 6 | (*                                                                     *)
 7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
 8 | (*  en Automatique.  This file is distributed under the terms of the   *)
 9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
10 | (*                                                                     *)
11 | (***********************************************************************)
12 | 
13 | (* $Id$ *)
14 | 
15 | (* Processing messages *)
16 | 
17 | val mark_message : Database.short -> string -> unit
18 | val record_words : Database.full -> bool -> string -> unit
19 | val add_message : Database.full -> bool -> bool -> string -> unit
20 | val test_message : Database.short -> float -> float -> string -> string -> unit
21 | type message_class = Msg_good | Msg_unknown | Msg_spam
22 | val stat_message : Database.short -> string -> message_class
23 | val wordsplit_message : Database.short -> string -> unit
24 | 


--------------------------------------------------------------------------------
/Changes:
--------------------------------------------------------------------------------
 1 | - Issue #2: correctly handle mails with CR-LF end-of-lines.
 2 | 
 3 | Release 1.6:
 4 | - Adapt to safe strings, using mutable byte arrays where needed.
 5 | 
 6 | Release 1.5:
 7 | - Be resilient to changes in OCaml's hash table implementation.
 8 | - Added command "spamoracle upgrade" to convert the database to
 9 |   the current hash table format.
10 | 
11 | Release 1.4:
12 | - More lenient rule for ignoring HTML comments.
13 | - Recognition of S P A C E D  O-U-T words.
14 | - Fixed a bug causing certain text-only e-mails to be ignored.
15 | - Improved the "spamoracle test" command.
16 | - Added "spamoracle words" command for troubleshooting the word-splitting
17 |   algorithm.
18 | 
19 | Release 1.3:
20 | - Better scanning of words in HTML documents.
21 | - Better handling of multipart documents; now scan both text and HTML
22 |   parts of a multipart/alternative.
23 | - Support for Portuguese and Japanese (JIS encoding)
24 | - Added configuration file (~/.spamoracle.conf) to allow twiddling
25 |   with filtering parameters.
26 | - Fixed database corruption problem with concurrent adds.
27 | - Handles gzipped databases.
28 | - Added manual pages.
29 | 
30 | Release 1.2:
31 | - When choosing part of a multipart/alternative, ignore pure text part
32 |   if much smaller than HTML part.
33 | - Added commands "backup" and "restore" to facilitate future evolution
34 |   towards other database formats.
35 | 
36 | Release 1.1:
37 | - Fixed "usage" message.
38 | 
39 | Release 1.0:
40 | - First public release.
41 | 


--------------------------------------------------------------------------------
/.depend:
--------------------------------------------------------------------------------
 1 | attachments.cmo: mail.cmi attachments.cmi 
 2 | attachments.cmx: mail.cmx attachments.cmi 
 3 | configfile.cmo: configfile.cmi 
 4 | configfile.cmx: configfile.cmi 
 5 | config.cmo: configfile.cmi config.cmi 
 6 | config.cmx: configfile.cmx config.cmi 
 7 | database.cmo: database.cmi 
 8 | database.cmx: database.cmi 
 9 | htmlscan.cmo: config.cmi htmlscan.cmi 
10 | htmlscan.cmx: config.cmx htmlscan.cmi 
11 | mail.cmo: config.cmi htmlscan.cmi mail.cmi 
12 | mail.cmx: config.cmx htmlscan.cmx mail.cmi 
13 | main.cmo: config.cmi configfile.cmi database.cmi mbox.cmi processing.cmi 
14 | main.cmx: config.cmx configfile.cmx database.cmx mbox.cmx processing.cmx 
15 | mbox.cmo: mbox.cmi 
16 | mbox.cmx: mbox.cmi 
17 | processing.cmo: attachments.cmi config.cmi database.cmi mail.cmi rankmsg.cmi \
18 |     refhosts.cmi wordsplit.cmi processing.cmi 
19 | processing.cmx: attachments.cmx config.cmx database.cmx mail.cmx rankmsg.cmx \
20 |     refhosts.cmx wordsplit.cmx processing.cmi 
21 | rankmsg.cmo: config.cmi database.cmi mail.cmi refhosts.cmi wordsplit.cmi \
22 |     rankmsg.cmi 
23 | rankmsg.cmx: config.cmx database.cmx mail.cmx refhosts.cmx wordsplit.cmx \
24 |     rankmsg.cmi 
25 | refhosts.cmo: refhosts.cmi 
26 | refhosts.cmx: refhosts.cmi 
27 | virus.cmo: mail.cmi mbox.cmi zip.cmo 
28 | virus.cmx: mail.cmx mbox.cmx zip.cmx 
29 | wordsplit.cmo: wordsplit.cmi 
30 | wordsplit.cmx: wordsplit.cmi 
31 | attachments.cmi: mail.cmi 
32 | config.cmi: configfile.cmi 
33 | processing.cmi: database.cmi 
34 | rankmsg.cmi: database.cmi mail.cmi 
35 | 


--------------------------------------------------------------------------------
/database.mli:
--------------------------------------------------------------------------------
 1 | (***********************************************************************)
 2 | (*                                                                     *)
 3 | (*                 SpamOracle -- a Bayesian spam filter                *)
 4 | (*                                                                     *)
 5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
 6 | (*                                                                     *)
 7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
 8 | (*  en Automatique.  This file is distributed under the terms of the   *)
 9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
10 | (*                                                                     *)
11 | (***********************************************************************)
12 | 
13 | (* $Id$ *)
14 | 
15 | (** Word frequency database *)
16 | 
17 | type short = {
18 |   s_num_good: int;
19 |   s_num_spam: int;
20 |   s_freq: (string, int * int) Hashtbl.t
21 | }
22 | 
23 | type full = {
24 |   mutable f_num_good: int;
25 |   mutable f_num_spam: int;
26 |   f_high_freq: (string, int * int) Hashtbl.t;
27 |   f_low_freq: (string, int * int) Hashtbl.t
28 | }
29 | 
30 | val read_short: string -> short
31 | val read_full: string -> full
32 | val write_full: string -> full -> unit
33 | val create: int -> full
34 | val add_good: full -> string -> unit
35 | val add_spam: full -> string -> unit
36 | val dump: full -> out_channel -> unit
37 | val restore: in_channel -> full
38 | val in_short: short -> string -> bool
39 | val in_full: full -> string -> bool
40 | val current_version: int
41 | 
42 | exception Error of string
43 | 


--------------------------------------------------------------------------------
/refhosts.ml:
--------------------------------------------------------------------------------
 1 | (***********************************************************************)
 2 | (*                                                                     *)
 3 | (*                 SpamOracle -- a Bayesian spam filter                *)
 4 | (*                                                                     *)
 5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
 6 | (*                                                                     *)
 7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
 8 | (*  en Automatique.  This file is distributed under the terms of the   *)
 9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
10 | (*                                                                     *)
11 | (***********************************************************************)
12 | 
13 | (* $Id$ *)
14 | 
15 | (* Extract hostnames or IP addresses referenced from http URLs
16 |    in message bodies. *)
17 | 
18 | let re_url =
19 |   Str.regexp_case_fold
20 |     "http://\\([^@]+@\\)?\\([a-z0-9-]+\\(\\.[a-z0-9-]+\\)+\\)"
21 | 
22 | module StringSet = Set.Make(String)
23 | 
24 | let hosts = ref StringSet.empty
25 | 
26 | let reset() = hosts := StringSet.empty
27 | 
28 | let rec add_urls txt pos =
29 |   let matched =
30 |     try ignore (Str.search_forward re_url txt pos); true
31 |     with Not_found -> false in
32 |   if matched then begin
33 |     hosts := StringSet.add (Str.matched_group 2 txt) !hosts;
34 |     add_urls txt (Str.match_end())
35 |   end
36 | 
37 | let add txt =
38 |   add_urls txt 0
39 | 
40 | let summarize () =
41 |   let lst = StringSet.elements !hosts in
42 |   hosts := StringSet.empty;
43 |   String.concat " " lst
44 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | ### Configuration section
 2 | 
 3 | # The laguages you're interested in, besides English
 4 | LANGUAGES=-DFRENCH #-DSPANISH -DITALIAN -DGERMAN -DPORTUGUESE -DJAPANESE
 5 | 
 6 | # How to invoke the C preprocessor
 7 | CPP=gcc -E -P $(LANGUAGES) -
 8 | 
 9 | # Where to install the binary
10 | BINDIR=/usr/local/bin
11 | 
12 | # Where to install the man pages
13 | MANDIR=/usr/local/man
14 | 
15 | ### End of configuration section
16 | 
17 | OCAMLC=ocamlc -g
18 | OCAMLLEX=ocamllex
19 | OCAMLDEP=ocamldep
20 | OCAMLOPT=ocamlopt
21 | 
22 | BYTEOBJS=configfile.cmo config.cmo \
23 |   htmlscan.cmo mail.cmo database.cmo mbox.cmo wordsplit.cmo \
24 |   refhosts.cmo rankmsg.cmo attachments.cmo processing.cmo main.cmo
25 | BYTELIBS=unix.cma str.cma
26 | 
27 | NATOBJS=$(BYTEOBJS:.cmo=.cmx)
28 | NATLIBS=$(BYTELIBS:.cma=.cmxa)
29 | 
30 | all: spamoracle
31 | 
32 | install:
33 | 	cp spamoracle $(BINDIR)/spamoracle
34 | 	cp spamoracle.1 $(MANDIR)/man1/spamoracle.1
35 | 	cp spamoracle.conf.5 $(MANDIR)/man5/spamoracle.conf.5
36 | 
37 | spamoracle: $(NATOBJS)
38 | 	$(OCAMLOPT) -o spamoracle $(NATLIBS) $(NATOBJS)
39 | 
40 | clean::
41 | 	rm -f spamoracle
42 | 
43 | spamoracle.byte: $(BYTEOBJS)
44 | 	$(OCAMLC) -o spamoracle.byte $(BYTELIBS) $(BYTEOBJS)
45 | 
46 | clean::
47 | 	rm -f spamoracle.byte
48 | 
49 | wordsplit.mll: wordsplit.mlp
50 | 	$(CPP) < wordsplit.mlp > wordsplit.mll \
51 |         || { rm -f wordsplit.mll; exit 2; }
52 | 
53 | clean::
54 | 	rm -f wordsplit.mll
55 | 
56 | wordsplit.ml: wordsplit.mll
57 | 	$(OCAMLLEX) wordsplit.mll
58 | 
59 | clean::
60 | 	rm -f wordsplit.ml
61 | 
62 | beforedepend:: wordsplit.ml
63 | 
64 | htmlscan.ml: htmlscan.mll
65 | 	$(OCAMLLEX) htmlscan.mll
66 | 
67 | clean::
68 | 	rm -f htmlscan.ml
69 | 
70 | beforedepend:: htmlscan.ml
71 | 
72 | clean::
73 | 	rm -f *.cm[iox] *.o
74 | 
75 | .SUFFIXES: .ml .mli .cmo .cmi .cmx
76 | 
77 | .mli.cmi:
78 | 	$(OCAMLC) -c $<
79 | .ml.cmo:
80 | 	$(OCAMLC) -c $<
81 | .ml.cmx:
82 | 	$(OCAMLOPT) -c $<
83 | 
84 | depend: beforedepend
85 | 	$(OCAMLDEP) *.ml *.mli > .depend
86 | 
87 | include .depend
88 | 


--------------------------------------------------------------------------------
/mbox.mli:
--------------------------------------------------------------------------------
 1 | (***********************************************************************)
 2 | (*                                                                     *)
 3 | (*                 SpamOracle -- a Bayesian spam filter                *)
 4 | (*                                                                     *)
 5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
 6 | (*                                                                     *)
 7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
 8 | (*  en Automatique.  This file is distributed under the terms of the   *)
 9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
10 | (*                                                                     *)
11 | (***********************************************************************)
12 | 
13 | (* $Id$ *)
14 | 
15 | (** Reading of a mailbox file and splitting into individual messages *)
16 | 
17 | type t
18 |   (** The type of channels opened on a mailbox *)
19 | 
20 | val open_mbox_file: string -> t
21 |   (** Open the given file name as a mailbox, and return an mbox channel
22 |       ready for reading.  If the file name ends in [.gz], arrange
23 |       for on-the-fly decompression with [zcat]. *)
24 | val open_mbox_channel: in_channel -> t
25 |   (** Open the given input channel as a mailbox. *)
26 | val read_msg: t -> string
27 |   (** Read the next message from the given channel, and return it
28 |       as a string.  Raise [End_of_file] if no message remains. *)
29 | val close_mbox: t -> unit
30 |   (** Close the given mbox channel. *)
31 | val mbox_file_iter: string -> (string -> unit) -> unit
32 |   (** [mbox_file_iter filename fn] reads messages from the file named
33 |       [filename], and applies [fn] in turn to each message. *)
34 | val mbox_channel_iter: in_channel -> (string -> unit) -> unit
35 |   (** [mbox_channel_iter ic fn] reads messages from the input channel
36 |       [ic], and applies [fn] in turn to each message. *)
37 | 
38 | val read_single_msg: in_channel -> string
39 |   (** Read one message from the given channel, up to end of file *)
40 | 


--------------------------------------------------------------------------------
/config.ml:
--------------------------------------------------------------------------------
 1 | (* Configurable parameters *)
 2 | 
 3 | let database_name =
 4 |   ref (try Filename.concat (Sys.getenv "HOME") ".spamoracle.db"
 5 |        with Not_found -> ".spamoracle.db")
 6 | 
 7 | let html_add_tags = ref false
 8 | 
 9 | let html_tag_attr = ref (Str.regexp_case_fold
10 |   "a/href\\|img/src\\|img/alt\\|frame/src\\|font/face\\|font/color")
11 | 
12 | let mail_headers = ref (Str.regexp_case_fold
13 |   "from:\\|subject:")
14 | 
15 | let alternative_favor_html = ref true
16 | 
17 | let spam_header = ref "X-Spam"
18 | 
19 | let attachments_header = ref "X-Attachments"
20 | 
21 | let summarize_attachments = ref true
22 | 
23 | let num_words_retained = ref 15
24 | 
25 | let max_repetitions = ref 2
26 | 
27 | let robinson_s = ref 0.0
28 | let robinson_x = ref 0.5
29 | 
30 | let low_freq_limit = ref 0.01
31 | let high_freq_limit = ref 0.99
32 | 
33 | let use_chi_square = ref false
34 | 
35 | let good_mail_prob = ref 0.2
36 | let spam_mail_prob = ref 0.8
37 | 
38 | let min_meaningful_words = ref 5
39 | 
40 | let summarize_referenced = ref false
41 | 
42 | let referenced_header = ref "X-Referenced-Hosts"
43 | 
44 | let reassemble_words = ref false
45 | 
46 | let external_converter = ref ""
47 | 
48 | open Configfile
49 | 
50 | let options = [
51 |   "database_file", String database_name;
52 |   "html_retain_tags", Bool html_add_tags;
53 |   "html_tag_attributes", Regexp html_tag_attr;
54 |   "mail_headers", Regexp mail_headers;
55 |   "alternative_favor_html", Bool alternative_favor_html;
56 |   "spam_header", String spam_header;
57 |   "attachments_header", String attachments_header;
58 |   "summarize_attachments", Bool summarize_attachments;
59 |   "referenced_header", String referenced_header;
60 |   "summarize_referenced", Bool summarize_referenced;
61 |   "num_meaningful_words", Int num_words_retained;
62 |   "max_repetitions", Int max_repetitions;
63 |   "low_freq_limit", Float low_freq_limit;
64 |   "high_freq_limit", Float high_freq_limit;
65 |   "min_meaningful_words", Int min_meaningful_words;
66 |   "good_mail_prob", Float good_mail_prob;
67 |   "spam_mail_prob", Float spam_mail_prob;
68 |   "robinson_s", Float robinson_s;
69 |   "robinson_x", Float robinson_x;
70 |   "use_chi_square", Bool use_chi_square;
71 |   "reassemble_words", Bool reassemble_words;
72 |   "external_converter", String external_converter
73 | ]
74 | 
75 | 


--------------------------------------------------------------------------------
/mail.mli:
--------------------------------------------------------------------------------
 1 | (***********************************************************************)
 2 | (*                                                                     *)
 3 | (*                 SpamOracle -- a Bayesian spam filter                *)
 4 | (*                                                                     *)
 5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
 6 | (*                                                                     *)
 7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
 8 | (*  en Automatique.  This file is distributed under the terms of the   *)
 9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
10 | (*                                                                     *)
11 | (***********************************************************************)
12 | 
13 | (* $Id$ *)
14 | 
15 | (** Parsing of e-mail messages, including attachments *)
16 | 
17 | type message =
18 |   { headers: (string * string) list;
19 |     body: string;
20 |     parts: message list }
21 |   (** The type of parsed e-mail messages.
22 |     - [headers] is an A-list of pairs [(header-name, header-content)].
23 |       [header-name] is lowercased and includes [:], e.g. [subject:].
24 |     - [body] is the body of the message.  Base64 and quoted-printable
25 |       encodings are already decoded.  For multipart messages, [body]
26 |       is the initial blurb before the first part.
27 |     - [parts] is empty except for multipart messages, in which case
28 |       it lists all parts, recursively represented as messages. *)
29 | 
30 | val parse_message: string -> message
31 |   (** Parse the given textual message and return its structure. *)
32 | 
33 | val header: string -> message -> string
34 |   (** [header h msg] returns the contents of header named [h]
35 |       in message [msg], or the empty string if this header is missing.
36 |       Remember that header names are lowercased and include the final [:],
37 |       e.g. [subject:]. *)
38 | 
39 | val iter_text_parts: (message -> unit) -> message -> unit
40 |   (** [iter_text_parts fn msg] applies [fn] to every (sub-)message 
41 |       contained in [msg] that is of type text. *)
42 | 
43 | val iter_message: (string -> unit) -> message -> unit
44 |   (** [iter_message fn msg] applies [fn] to the following parts of
45 |       message [msg]:
46 |       - the headers that match [!Config.mail_headers];
47 |       - the body of every sub-message of [msg] that is of type text.
48 |     *)
49 | 


--------------------------------------------------------------------------------
/config.mli:
--------------------------------------------------------------------------------
 1 | (* Configurable parameters *)
 2 | 
 3 | val database_name : string ref
 4 |   (** Name of database file *)
 5 | 
 6 | val html_add_tags : bool ref
 7 |   (** Whether to treat HTML tag names as words *)
 8 | 
 9 | val html_tag_attr : Str.regexp ref
10 |   (** Regexp matching [tag/attr] strings denoting pairs of HTML tag and
11 |       attribute names.  If a tag and attribute pair matches, the associated
12 |       value is added to the text. *)
13 | 
14 | val mail_headers : Str.regexp ref
15 |   (** Regexp matching names of e-mail headers that must be analyzed. *)
16 | 
17 | val alternative_favor_html : bool ref
18 |   (** If true, consider only the HTML part of a multipart/alternative.
19 |       Otherwise, consider all parts. *)
20 | 
21 | val spam_header : string ref
22 |   (** Name of header added with spam / not-spam info (default: "X-Spam") *)
23 | 
24 | val attachments_header : string ref
25 |   (** Name of header added with attachment summary (default: "X-Attachments") *)
26 | 
27 | val summarize_attachments : bool ref
28 |   (** Whether to generate the attachment summary *)
29 | 
30 | val num_words_retained : int ref
31 |   (** Number of meaningful words to retain for computing final prob. *)
32 | 
33 | val max_repetitions : int ref
34 |   (** Among the meaningful words, max number of time a given word
35 |       can appear. *)
36 | 
37 | val low_freq_limit : float ref
38 |   (** Lower limit for word frequencies.  Default is 0.001. *)
39 | 
40 | val high_freq_limit : float ref
41 |   (** Upper limit for word frequencies.  Default is 0.999. *)
42 | 
43 | val robinson_s : float ref
44 | val robinson_x : float ref
45 |   (** Robinson's parameters for taking word frequencies into account. *)
46 | 
47 | val use_chi_square : bool ref
48 |   (** Use Robinson's chi-square test *)
49 | 
50 | val min_meaningful_words : int ref
51 |   (** Number of meaningful words below which mails are classified as unknown *)
52 | val good_mail_prob : float ref
53 |   (** Spam probability below which mails are classified as good *)
54 | val spam_mail_prob : float ref
55 |   (** Spam probability below which mails are classified as spam *)
56 | 
57 | val summarize_referenced : bool ref
58 | 
59 | val referenced_header : string ref
60 | 
61 | val reassemble_words : bool ref
62 | 
63 | val external_converter : string ref
64 |   (** Program to be called on message parts that are not text.
65 |       The program receives the content-type as first argument
66 |       and the actual data on standard input.
67 |       It should output the corresponding text on standard output,
68 |       or exit with non-zero error code if it cannot extract text. *)
69 | 
70 | val options : (string * Configfile.value) list
71 |   (** List of configurable parameters *)
72 | 


--------------------------------------------------------------------------------
/attachments.ml:
--------------------------------------------------------------------------------
 1 | (***********************************************************************)
 2 | (*                                                                     *)
 3 | (*                 SpamOracle -- a Bayesian spam filter                *)
 4 | (*                                                                     *)
 5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
 6 | (*                                                                     *)
 7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
 8 | (*  en Automatique.  This file is distributed under the terms of the   *)
 9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
10 | (*                                                                     *)
11 | (***********************************************************************)
12 | 
13 | (* $Id$ *)
14 | 
15 | (* Summarize the attachments of a message as one line that can be
16 |    put in the header of the message.  Allows procmail to filter
17 |    suspicious attachments without looking at the message body. *)
18 | 
19 | open Printf
20 | open Mail
21 | 
22 | let re_content_type =
23 |   Str.regexp "\\([/a-zA-Z0-9-]+\\)"
24 | let re_innocuous_content_types =
25 |   Str.regexp_case_fold "text/plain\\|text/html\\|text/x-vcard\\|multipart/\\|message/rfc822\\|message/delivery-status"
26 | let re_charset =
27 |   Str.regexp_case_fold "charset=\\(\"\\([^\"]+\\)\"\\|[^ \t;]+\\)"
28 | let re_innocuous_charsets =
29 |   Str.regexp_case_fold "us-ascii\\|iso[-_]8859[-_]1$\\|iso[-_]8859[-_]15\\|windows-1252"
30 | let re_name =
31 |   Str.regexp_case_fold "name=\\(\"\\([^\"]+\\)\"\\|[^ \t;]+\\)"
32 | 
33 | let match_anchored re s =
34 |   Str.string_match re s 0
35 | let match_unanchored re s =
36 |   try ignore (Str.search_forward re s 0); true with Not_found -> false
37 | 
38 | let summarize msg =
39 |   let res = Buffer.create 200 in
40 |   let rec summ m =
41 |     let h = header "content-type:" m in
42 |     if match_anchored re_content_type h then begin
43 |       let c = Str.matched_group 1 h in
44 |       if not (Str.string_match re_innocuous_content_types c 0) then
45 |         bprintf res "type=\"%s\" " c
46 |     end;
47 |     if match_unanchored re_charset h then begin
48 |       let c =
49 |         try Str.matched_group 2 h with Not_found -> Str.matched_group 1 h in
50 |       if not (Str.string_match re_innocuous_charsets c 0) then
51 |         bprintf res "cset=\"%s\" " c
52 |     end;
53 |     if match_unanchored re_name h then begin
54 |       let c =
55 |         try Str.matched_group 2 h with Not_found -> Str.matched_group 1 h in
56 |       bprintf res "name=\"%s\" " c
57 |     end;
58 |     let h = header "content-disposition:" m in
59 |     if match_unanchored re_name h then begin
60 |       let c =
61 |         try Str.matched_group 2 h with Not_found -> Str.matched_group 1 h in
62 |       bprintf res "name=\"%s\" " c
63 |     end;
64 |     List.iter summ m.parts in
65 |   List.iter summ msg.parts;
66 |   Buffer.contents res
67 | 


--------------------------------------------------------------------------------
/configfile.ml:
--------------------------------------------------------------------------------
  1 | (* Parsing configuration files *)
  2 | 
  3 | type value =
  4 |   | Bool of bool ref
  5 |   | String of string ref
  6 |   | Int of int ref
  7 |   | Float of float ref
  8 |   | Regexp of Str.regexp ref
  9 |   | OptBool of bool option ref
 10 |   | OptString of string option ref
 11 |   | OptInt of int option ref
 12 |   | OptFloat of float option ref
 13 |   | OptRegexp of Str.regexp option ref
 14 | 
 15 | exception Error of string
 16 | 
 17 | let re_bool_yes = Str.regexp_case_fold "\\(on\\|yes\\|true\\|1\\)[ \t\r]*$"
 18 | let re_bool_no  = Str.regexp_case_fold "\\(off\\|no\\|false\\|0\\)[ \t\r]*$"
 19 | 
 20 | let parse_bool data =
 21 |   if Str.string_match re_bool_yes data 0 then true
 22 |   else if Str.string_match re_bool_no data 0 then false
 23 |   else raise (Error "invalid boolean value")
 24 | 
 25 | let trim_spaces s =
 26 |   let i = ref (String.length s - 1) in
 27 |   while !i >= 0 && (let c = s.[!i] in c = ' ' || c = '\t' || c = '\r')
 28 |   do decr i done;
 29 |   String.sub s 0 (!i + 1)
 30 | 
 31 | let parse_string data =
 32 |   try
 33 |     Scanf.sscanf data "%S" (fun s -> s)
 34 |   with Scanf.Scan_failure _ ->
 35 |     trim_spaces data
 36 | 
 37 | let parse_int data =
 38 |   try
 39 |     Scanf.sscanf data "%i" (fun n -> n)
 40 |   with Scanf.Scan_failure _ | Failure _ ->
 41 |     raise (Error ("invalid integer value"))
 42 | 
 43 | let parse_float data =
 44 |   try
 45 |     Scanf.sscanf data "%f" (fun n -> n)
 46 |   with Scanf.Scan_failure _ | Failure _ ->
 47 |     raise (Error ("invalid floating-point value"))
 48 | 
 49 | let parse_regexp data =
 50 |   try
 51 |     Str.regexp_case_fold (trim_spaces data)
 52 |   with Failure msg ->
 53 |     raise (Error ("invalid regular expression: " ^ msg))
 54 | 
 55 | let parse_data valuedesc data =
 56 |   match valuedesc with
 57 |   | Bool r      -> r := parse_bool data
 58 |   | String r    -> r := parse_string data
 59 |   | Int r       -> r := parse_int data
 60 |   | Float r     -> r := parse_float data
 61 |   | Regexp r    -> r := parse_regexp data
 62 |   | OptBool r   -> r := Some(parse_bool data)
 63 |   | OptString r -> r := Some(parse_string data)
 64 |   | OptInt r    -> r := Some(parse_int data)
 65 |   | OptFloat r  -> r := Some(parse_float data)
 66 |   | OptRegexp r -> r := Some(parse_regexp data)
 67 | 
 68 | let re_line =
 69 |   Str.regexp "[ \t]*\\([A-Za-z][A-Za-z0-9_]*\\)[ \t]*=[ \t]*\\(.*\\)"
 70 | let re_skip =
 71 |   Str.regexp "#\\|[ \t\r]*$"
 72 | 
 73 | let parse_line opts s =
 74 |   if Str.string_match re_line s 0 then begin
 75 |     let key = Str.matched_group 1 s and data = Str.matched_group 2 s in
 76 |     try
 77 |       parse_data (List.assoc key opts) data
 78 |     with Not_found ->
 79 |       raise (Error ("unknown variable " ^ key))
 80 |   end
 81 |   else if not (Str.string_match re_skip s 0) then
 82 |     raise (Error "ill-formed line")
 83 |     
 84 | let parse opts filename =
 85 |   let ic = open_in filename in
 86 |   let lineno = ref 1 in
 87 |   let errors = ref [] in
 88 |   begin try
 89 |     while true do
 90 |       let s = input_line ic in
 91 |       begin try
 92 |         parse_line opts s
 93 |       with Error msg ->
 94 |         errors := (!lineno, msg) :: !errors
 95 |       end;
 96 |       incr lineno
 97 |     done
 98 |   with End_of_file ->
 99 |     close_in ic
100 |   end;
101 |   List.rev !errors
102 | 


--------------------------------------------------------------------------------
/mbox.ml:
--------------------------------------------------------------------------------
  1 | (***********************************************************************)
  2 | (*                                                                     *)
  3 | (*                 SpamOracle -- a Bayesian spam filter                *)
  4 | (*                                                                     *)
  5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
  6 | (*                                                                     *)
  7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
  8 | (*  en Automatique.  This file is distributed under the terms of the   *)
  9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
 10 | (*                                                                     *)
 11 | (***********************************************************************)
 12 | 
 13 | (* $Id$ *)
 14 | 
 15 | (* Reading of a mailbox file and splitting into individual messages *)
 16 | 
 17 | type t =
 18 |   { ic: in_channel;
 19 |     zipped: bool;
 20 |     mutable start: string;
 21 |     buf: Buffer.t }
 22 | 
 23 | let open_mbox_file filename =
 24 |   if Filename.check_suffix filename ".gz" then
 25 |     { ic = Unix.open_process_in ("gunzip -c " ^filename);
 26 |       zipped = true;
 27 |       start = "";
 28 |       buf = Buffer.create 50000 }
 29 |   else
 30 |     { ic = open_in filename;
 31 |       zipped = false;
 32 |       start = "";
 33 |       buf = Buffer.create 50000 }
 34 | 
 35 | let open_mbox_channel ic =
 36 |     { ic = ic;
 37 |       zipped = false;
 38 |       start = "";
 39 |       buf = Buffer.create 50000 }
 40 | 
 41 | let re_crlf = Str.regexp "\r+\n"
 42 | 
 43 | let normalize_eol s =
 44 |   match String.index s '\r' with
 45 |   | exception Not_found -> s
 46 |   | _ -> Str.global_replace re_crlf "\n" s
 47 | 
 48 | let read_msg t =
 49 |   Buffer.clear t.buf;
 50 |   Buffer.add_string t.buf t.start;
 51 |   let rec read () =
 52 |     let line = input_line t.ic in
 53 |     if String.length line >= 5
 54 |     && String.sub line 0 5 = "From "
 55 |     && Buffer.length t.buf > 0 then begin
 56 |       t.start <- (line ^ "\n");
 57 |       normalize_eol (Buffer.contents t.buf)
 58 |     end else begin
 59 |       Buffer.add_string t.buf line;
 60 |       Buffer.add_char t.buf '\n';
 61 |       read ()
 62 |     end in
 63 |   try
 64 |     read()
 65 |   with End_of_file ->
 66 |     if Buffer.length t.buf > 0 then begin
 67 |       t.start <- "";
 68 |       normalize_eol (Buffer.contents t.buf)
 69 |     end else
 70 |       raise End_of_file
 71 | 
 72 | let close_mbox t =
 73 |   if t.zipped
 74 |   then ignore(Unix.close_process_in t.ic)
 75 |   else close_in t.ic
 76 | 
 77 | let mbox_file_iter filename fn =
 78 |   let ic = open_mbox_file filename in
 79 |   try
 80 |     while true do fn(read_msg ic) done
 81 |   with End_of_file ->
 82 |     close_mbox ic
 83 | 
 84 | let mbox_channel_iter inchan fn =
 85 |   let ic = open_mbox_channel inchan in
 86 |   try
 87 |     while true do fn(read_msg ic) done
 88 |   with End_of_file ->
 89 |     close_mbox ic
 90 | 
 91 | let read_single_msg inchan =
 92 |   let res = Buffer.create 10000 in
 93 |   let buf = Bytes.create 1024 in
 94 |   let rec read () =
 95 |     let n = input inchan buf 0 (Bytes.length buf) in
 96 |     if n > 0 then begin
 97 |       Buffer.add_subbytes res buf 0 n;
 98 |       read ()
 99 |     end in
100 |   read ();
101 |   normalize_eol (Buffer.contents res)
102 | 


--------------------------------------------------------------------------------
/processing.ml:
--------------------------------------------------------------------------------
  1 | (***********************************************************************)
  2 | (*                                                                     *)
  3 | (*                 SpamOracle -- a Bayesian spam filter                *)
  4 | (*                                                                     *)
  5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
  6 | (*                                                                     *)
  7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
  8 | (*  en Automatique.  This file is distributed under the terms of the   *)
  9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
 10 | (*                                                                     *)
 11 | (***********************************************************************)
 12 | 
 13 | (* $Id$ *)
 14 | 
 15 | (* Processing messages *)
 16 | 
 17 | open Printf
 18 | open Mail
 19 | open Database
 20 | open Rankmsg
 21 | 
 22 | (* Mark message with rank info *)
 23 | 
 24 | let re_nl_nl = Str.regexp "\n\n"
 25 | 
 26 | let mark_message db txt =
 27 |   let m = parse_message txt in
 28 |   let r = rank_message db m in
 29 |   try
 30 |     let pos_sep = Str.search_forward re_nl_nl txt 0 in
 31 |     output_substring stdout txt 0 pos_sep;
 32 |     let verdict =
 33 |       if r.spam_prob <= !Config.good_mail_prob
 34 |       && r.num_meaningful >= !Config.min_meaningful_words then "no"
 35 |       else if r.spam_prob >= !Config.spam_mail_prob
 36 |            && r.num_meaningful >= !Config.min_meaningful_words then "yes"
 37 |       else "unknown" in
 38 |     printf "\n%s: %s; %.2f; %s" 
 39 |       !Config.spam_header verdict r.spam_prob r.explanation;
 40 |     if !Config.summarize_attachments then begin 
 41 |       let att = Attachments.summarize m in
 42 |       if att <> "" then
 43 |         printf "\n%s: %s" !Config.attachments_header att;
 44 |     end;
 45 |     if !Config.summarize_referenced then begin 
 46 |       let refh = Refhosts.summarize () in
 47 |       if refh <> "" then
 48 |         printf "\n%s: %s" !Config.referenced_header refh;
 49 |     end;
 50 |     output_substring stdout txt pos_sep (String.length txt - pos_sep)
 51 |   with Not_found ->
 52 |     eprintf "spamoracle mark: ill-formed message (no header-body separation)\n";
 53 |     print_string txt
 54 | 
 55 | (* Add messages to database *)
 56 | 
 57 | let record_words db is_spam txt =
 58 |   Wordsplit.iter
 59 |     (fun w ->
 60 |       if is_spam then add_spam db w else add_good db w)
 61 |     (in_full db) txt
 62 | 
 63 | let add_message db verbose is_spam msg =
 64 |   if verbose then begin
 65 |     printf "\r%6d / %6d" db.f_num_good db.f_num_spam;
 66 |     flush stdout
 67 |   end;
 68 |   iter_message (record_words db is_spam) (parse_message msg);
 69 |   if is_spam
 70 |   then db.f_num_spam <- db.f_num_spam + 1
 71 |   else db.f_num_good <- db.f_num_good + 1
 72 | 
 73 | (* Test analysis on a message *)
 74 | 
 75 | let test_message db low high f txt =
 76 |   let msg = parse_message txt in
 77 |   let r = rank_message db msg in
 78 |   if r.spam_prob >= low && r.spam_prob <= high then begin
 79 |     printf "--------------------------------------------------\n";
 80 |     printf "From: %s\n" (header "from:" msg);
 81 |     printf "Subject: %s\n" (header "subject:" msg);
 82 |     printf "Score: %.2f -- %d\n" r.spam_prob r.num_meaningful;
 83 |     printf "Details: %s\n" r.explanation;
 84 |     if !Config.summarize_attachments then begin
 85 |       let att = Attachments.summarize msg in
 86 |       if att <> "" then printf "Attachments: %s\n" att
 87 |     end;
 88 |     if !Config.summarize_referenced then begin 
 89 |       let refh = Refhosts.summarize () in
 90 |       if refh <> "" then printf "Referenced hosts: %s\n" refh
 91 |     end;
 92 |     printf "File: %s\n" f;
 93 |   end
 94 | 
 95 | (* Statistics *)
 96 | 
 97 | type message_class = Msg_good | Msg_unknown | Msg_spam
 98 | 
 99 | let stat_message db txt =
100 |   let msg = parse_message txt in
101 |   let r = rank_message db msg in
102 |   if r.spam_prob <= 0.2 && r.num_meaningful >= 5 then Msg_good
103 |   else if r.spam_prob >= 0.8 && r.num_meaningful >= 5 then Msg_spam
104 |   else Msg_unknown
105 | 
106 | (* Word splitting *)
107 | 
108 | let wordsplit_message db txt =
109 |   Format.open_hovbox 0;
110 |   Mail.iter_message
111 |     (fun txt ->
112 |       Wordsplit.iter
113 |         (fun word -> Format.print_string word; Format.print_space())
114 |         (in_short db) txt)
115 |     (parse_message txt);
116 |   Format.close_box();
117 |   Format.print_newline()
118 | 


--------------------------------------------------------------------------------
/rankmsg.ml:
--------------------------------------------------------------------------------
  1 | (***********************************************************************)
  2 | (*                                                                     *)
  3 | (*                 SpamOracle -- a Bayesian spam filter                *)
  4 | (*                                                                     *)
  5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
  6 | (*                                                                     *)
  7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
  8 | (*  en Automatique.  This file is distributed under the terms of the   *)
  9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
 10 | (*                                                                     *)
 11 | (***********************************************************************)
 12 | 
 13 | (* $Id$ *)
 14 | 
 15 | (* Message ranking *)
 16 | 
 17 | open Mail
 18 | open Database
 19 | 
 20 | let word_count_in w res =
 21 |   let count = ref 0 in
 22 |   for i = 0 to Array.length res - 1 do
 23 |     if w = fst res.(i) then incr count
 24 |   done;
 25 |   !count
 26 | 
 27 | let add_word w p res =
 28 |   let i = ref 0 in
 29 |   while !i < Array.length res
 30 |      && abs_float (p -. 0.5) <= abs_float(snd res.(!i) -. 0.5)
 31 |   do
 32 |     incr i
 33 |   done;
 34 |   if !i < Array.length res then begin
 35 |     for j = Array.length res - 1 downto !i + 1 do
 36 |       res.(j) <- res.(j - 1)
 37 |     done;
 38 |     res.(!i) <- (w, p)
 39 |   end
 40 | 
 41 | let normalize (p : float) low high =
 42 |   if p > high then high else if p < low then low else p
 43 | 
 44 | let cap (p : float) =
 45 |   if p > 1.0 then 1.0 else p
 46 | 
 47 | let word_proba g b num_g num_b =
 48 |   let g = 2 * g in  (* Graham's magic factor to bias in favor of ham *)
 49 |   let pgood = cap (float g /. float num_g)
 50 |   and pbad = cap (float b /. float num_b) in
 51 |   let p = pbad /. (pgood +. pbad) in
 52 |   if !Config.robinson_s = 0.0 then
 53 |     normalize p !Config.low_freq_limit !Config.high_freq_limit
 54 |   else begin
 55 |     (* Robinson's adjustement *)
 56 |     let n = float (g + b) in
 57 |     let p =
 58 |       (!Config.robinson_s *. !Config.robinson_x +. n *. p)
 59 |       /. (!Config.robinson_s +. n) in
 60 |     (* Result normalization *)
 61 |     normalize p !Config.low_freq_limit !Config.high_freq_limit
 62 |   end
 63 | 
 64 | let process_word (db, res) w =
 65 |   try
 66 |     let (g, b) = Hashtbl.find db.s_freq w in
 67 |     if word_count_in w res < !Config.max_repetitions then begin
 68 |       let p = word_proba g b db.s_num_good db.s_num_spam in
 69 |       add_word w p res
 70 |     end
 71 |   with Not_found ->
 72 |     ()
 73 | 
 74 | let process_words ((db, res) as ctx) txt =
 75 |   Wordsplit.iter (process_word ctx) (in_short db) txt;
 76 |   if !Config.summarize_referenced then Refhosts.add txt
 77 | 
 78 | let process_msg ctx m =
 79 |   iter_message (process_words ctx) m
 80 | 
 81 | (* This is Graham's original approach *)
 82 | 
 83 | let spaminess_score_graham res =
 84 |   let p = ref 1.0 and pexp = ref 0
 85 |   and cp = ref 1.0 and cpexp = ref 0 in
 86 |   for i = 0 to Array.length res - 1 do
 87 |     let (_, x) = res.(i) in
 88 |     p := !p *. x;
 89 |     if !p <= 1e-100 then begin 
 90 |       let (m, e) = frexp !p in p := m; pexp := !pexp + e
 91 |     end;
 92 |     cp := !cp *. (1.0 -. x);
 93 |     if !cp <= 1e-100 then begin
 94 |       let (m, e) = frexp !cp in cp := m; cpexp := !cpexp + e
 95 |     end
 96 |   done;
 97 |   if !cpexp < !pexp then cp := ldexp !cp (!cpexp - !pexp)
 98 |   else if !cpexp > !pexp then p := ldexp !p (!pexp - !cpexp);
 99 |   !p /. (!p +. !cp)
100 | 
101 | (* This is Robinson's chi-square stuff *)
102 | 
103 | let chi2_inverse m n =   (* chi2 inverse of 2m with 2n degrees *)
104 |   let t = ref (exp (-. m)) in
105 |   let s = ref !t in
106 |   for i = 1 to n do
107 |     t := !t *. m /. float i;
108 |     s := !s +. !t
109 |   done;
110 |   if !s >= 1.0 then 1.0 else !s
111 | 
112 | let log2 = log 2.0
113 | 
114 | let chi2_hypothesis ps =
115 |   (* Compute -2 * ln (product ps).  Be careful with underflows. *)
116 |   let p = ref 1.0 and pexp = ref 0 in
117 |   for i = 0 to Array.length ps - 1 do
118 |     p := !p *. ps.(i);
119 |     if !p <= 1e-100 then begin
120 |       let (x, e) = frexp !p in p := x; pexp := !pexp + e
121 |     end
122 |   done;
123 |   chi2_inverse (-. (log !p +. log2 *. float !pexp)) (Array.length ps)
124 | 
125 | let spaminess_score_robinson res =
126 |   let probs = Array.map snd res in
127 |   let cprobs = Array.map (fun x -> 1.0 -. x) probs in
128 |   0.5 *. (1.0 +. chi2_hypothesis probs -. chi2_hypothesis cprobs)
129 | 
130 | type rank =
131 |   { spam_prob: float;
132 |     num_meaningful: int;
133 |     explanation: string }
134 | 
135 | let rank_message db msg =
136 |   Refhosts.reset();
137 |   let res = Array.make !Config.num_words_retained ("", 0.5) in
138 |   process_msg (db, res) msg;
139 |   let p =
140 |     if !Config.use_chi_square
141 |     then spaminess_score_robinson res
142 |     else spaminess_score_graham res in
143 |   let meaningful = ref 0 in
144 |   while !meaningful < Array.length res && fst res.(!meaningful) <> ""
145 |   do incr meaningful done;
146 |   let summary = Buffer.create 200 in
147 |   for i = 0 to !meaningful - 1 do
148 |     let (w, p) = res.(i) in
149 |     Printf.bprintf summary "%s:%02d " w (truncate (p *. 100.0))
150 |   done;
151 |   { spam_prob = p;
152 |     num_meaningful = !meaningful;
153 |     explanation = Buffer.contents summary }
154 | 
155 | 


--------------------------------------------------------------------------------
/database.ml:
--------------------------------------------------------------------------------
  1 | (***********************************************************************)
  2 | (*                                                                     *)
  3 | (*                 SpamOracle -- a Bayesian spam filter                *)
  4 | (*                                                                     *)
  5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
  6 | (*                                                                     *)
  7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
  8 | (*  en Automatique.  This file is distributed under the terms of the   *)
  9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
 10 | (*                                                                     *)
 11 | (***********************************************************************)
 12 | 
 13 | (* $Id$ *)
 14 | 
 15 | (* Word frequency database *)
 16 | 
 17 | exception Error of string
 18 | 
 19 | type short = {
 20 |   s_num_good: int;
 21 |   s_num_spam: int;
 22 |   s_freq: (string, int * int) Hashtbl.t
 23 | }
 24 | 
 25 | type full = {
 26 |   mutable f_num_good: int;
 27 |   mutable f_num_spam: int;
 28 |   f_high_freq: (string, int * int) Hashtbl.t;
 29 |   f_low_freq: (string, int * int) Hashtbl.t
 30 | }
 31 | 
 32 | let magic = "Mailscrubber"  (* + 4 digits for version number *)
 33 | 
 34 | let check_magic filename ic =
 35 |   let mlen = String.length magic in
 36 |   let buf = really_input_string ic (mlen + 4) in
 37 |   if String.sub buf 0 mlen <> magic then
 38 |     raise(Error(filename ^ ": bad magic number"));
 39 |   try
 40 |     int_of_string (String.sub buf mlen 4)
 41 |   with Failure _ ->
 42 |     raise(Error(filename ^ ": bad magic number"));
 43 | 
 44 | type db_chan = {zipped : bool ; ic : in_channel}
 45 | 
 46 | let open_db filename =
 47 |   if Filename.check_suffix filename ".gz" then
 48 |     { ic = Unix.open_process_in ("gunzip -c " ^ filename);
 49 |       zipped = true; }
 50 |   else
 51 |     { ic = open_in_bin filename ;
 52 |       zipped = false }
 53 | 
 54 | let close_db {zipped = zipped ; ic = ic } =
 55 |   if zipped 
 56 |   then ignore(Unix.close_process_in ic)
 57 |   else close_in ic
 58 | 
 59 | let current_version =
 60 |   if Sys.ocaml_version < "4.03" then 1
 61 |   else 2
 62 | 
 63 | let read_hashtbl filename ic version =
 64 |   try
 65 |     let tbl : ('a, 'b) Hashtbl.t = Marshal.from_channel ic in
 66 |     if version = current_version then tbl
 67 |     else if version > current_version then
 68 |       raise (Error(filename ^ ": database version not supported"))
 69 |     else begin
 70 |       Printf.eprintf "%s: converting from version %d to version %d\n\
 71 |                       Run 'spamoracle upgrade' to suppress this warning.\n%!"
 72 |                      filename version current_version;
 73 |       let tbl' = Hashtbl.create (Hashtbl.length tbl / 3) in
 74 |       Hashtbl.iter (fun k d -> Hashtbl.add tbl' k d) tbl;
 75 |       tbl'
 76 |     end
 77 |   with Failure _ ->
 78 |     raise (Error(filename ^ ": database is corrupted"))
 79 | 
 80 | let read_short filename =
 81 |   let {ic=ic ; zipped=zipped} as db_ic = open_db filename in
 82 |   let version = check_magic filename ic in
 83 |   let ng = input_binary_int ic in
 84 |   let ns = input_binary_int ic in
 85 |   let freq = read_hashtbl filename ic version in
 86 |   close_db db_ic;
 87 |   { s_num_good = ng; s_num_spam = ns; s_freq = freq }
 88 | 
 89 | let read_full filename =
 90 |   let {ic=ic ; zipped=zipped} as db_ic = open_db filename in
 91 |   let version = check_magic filename ic in
 92 |   let ng = input_binary_int ic in
 93 |   let ns = input_binary_int ic in
 94 |   let high_freq = read_hashtbl filename ic version in
 95 |   let low_freq = read_hashtbl filename ic version in
 96 |   close_db db_ic;
 97 |   { f_num_good = ng; f_num_spam = ns; 
 98 |     f_low_freq = low_freq; f_high_freq = high_freq }
 99 | 
100 | let temp_file basename =
101 |   let pid = Unix.getpid() in
102 |   let rec tmpfile counter =
103 |     if counter > 10000 then raise (Error "cannot create temporary database");
104 |     let filename = basename ^ string_of_int (pid + counter) in
105 |     try
106 |       (filename,
107 |        open_out_gen [Open_wronly; Open_creat; Open_excl; Open_binary] 0o600
108 |                     filename)
109 |     with Sys_error _ ->
110 |       tmpfile (counter + 1)
111 |   in tmpfile 0
112 | 
113 | let write_full filename db =
114 |   let basename, zip =
115 |     if Filename.check_suffix filename ".gz" then
116 |       Filename.chop_suffix filename ".gz", true
117 |     else
118 |       filename, false in
119 |   let (tempname, oc) = temp_file (basename ^ ".tmp") in
120 |   Printf.fprintf oc "%s%04d" magic current_version;
121 |   output_binary_int oc db.f_num_good;
122 |   output_binary_int oc db.f_num_spam;
123 |   Marshal.to_channel oc db.f_high_freq [Marshal.No_sharing];
124 |   Marshal.to_channel oc db.f_low_freq [Marshal.No_sharing];
125 |   close_out oc;
126 |   if zip then begin
127 |     let r = Sys.command ("gzip -best " ^ tempname) in
128 |     if r = 0 then
129 |       Sys.rename (tempname ^ ".gz") filename
130 |     else
131 |       Sys.rename tempname basename
132 |   end else
133 |     Sys.rename tempname filename 
134 | 
135 | 
136 | let create sz =
137 |   { f_num_good = 0;
138 |     f_num_spam = 0;
139 |     f_high_freq = Hashtbl.create sz;
140 |     f_low_freq = Hashtbl.create sz }
141 | 
142 | let add_good db w =
143 |   begin try
144 |     let (g, s as f) = Hashtbl.find db.f_high_freq w in
145 |     Hashtbl.replace db.f_high_freq w (g+1, s)
146 |   with Not_found ->
147 |   try
148 |     let (g, s as f) = Hashtbl.find db.f_low_freq w in
149 |     let g' = g + 1 in
150 |     if 2 * g' + s >= 5 then begin
151 |       Hashtbl.remove db.f_low_freq w;
152 |       Hashtbl.add db.f_high_freq w (g', s)
153 |     end else
154 |       Hashtbl.replace db.f_low_freq w (g', s)
155 |   with Not_found ->
156 |     Hashtbl.add db.f_low_freq w (1, 0)
157 |   end
158 | 
159 | let add_spam db w =
160 |   begin try
161 |     let (g, s) = Hashtbl.find db.f_high_freq w in
162 |     Hashtbl.replace db.f_high_freq w (g, s+1)
163 |   with Not_found ->
164 |   try
165 |     let (g, s) = Hashtbl.find db.f_low_freq w in
166 |     let s' = s + 1 in
167 |     if 2 * g + s' >= 5 then begin
168 |       Hashtbl.remove db.f_low_freq w;
169 |       Hashtbl.add db.f_high_freq w (g, s')
170 |     end else
171 |       Hashtbl.replace db.f_low_freq w (g, s')
172 |   with Not_found ->
173 |     Hashtbl.add db.f_low_freq w (0, 1)
174 |   end
175 | 
176 | open Printf
177 | 
178 | let dump db oc =
179 |   let dump_entry w (g, s) = fprintf oc "%s %d %d\n" w g s in
180 |   fprintf oc "SPAMORACLE/1 %d %d\n" db.f_num_good db.f_num_spam;
181 |   Hashtbl.iter dump_entry db.f_high_freq;
182 |   Hashtbl.iter dump_entry db.f_low_freq
183 | 
184 | let split s =
185 |   try
186 |     let i = String.index s ' ' in
187 |     let j = String.index_from s (i + 1) ' ' in
188 |     (String.sub s 0 i,
189 |      int_of_string (String.sub s (i + 1) (j - i - 1)),
190 |      int_of_string (String.sub s (j + 1) (String.length s - j - 1)))
191 |   with Not_found ->
192 |     raise(Error("Database restoration: ill-formed line `"
193 |                 ^ String.escaped s ^ "'"))
194 | 
195 | let restore ic =
196 |   let db = create 997 in
197 |   begin try
198 |     let (w, ng, ns) = split (input_line ic) in
199 |     if w <> "SPAMORACLE/1"
200 |     then raise (Error("Database restoration: wrong version"));
201 |     db.f_num_good <- ng;  
202 |     db.f_num_spam <- ns
203 |   with End_of_file ->
204 |     raise (Error("Database restoration: first line missing"));
205 |   end;
206 |   begin try
207 |     while true do
208 |       let (w, g, s) = split (input_line ic) in
209 |       if 2 * g + s >= 5
210 |       then Hashtbl.add db.f_high_freq w (g, s)
211 |       else Hashtbl.add db.f_low_freq w (g, s)
212 |     done
213 |   with End_of_file ->
214 |     ()
215 |   end;
216 |   db
217 | 
218 | let in_short db w = Hashtbl.mem db.s_freq w
219 | let in_full  db w = Hashtbl.mem db.f_high_freq w
220 | 


--------------------------------------------------------------------------------
/spamoracle.conf.5:
--------------------------------------------------------------------------------
  1 | .TH SPAMORACLE.CONF 5
  2 | 
  3 | .SH NAME
  4 | spamoracle.conf \- SpamOracle configuration file format
  5 | 
  6 | .SH DESCRIPTION
  7 | The
  8 | .B spamoracle.conf
  9 | file is a configuration file governing the operation of the
 10 | .BR spamoracle (1)
 11 | e-mail classification tool.  By default, the configuration file
 12 | is searched in
 13 | .IB $HOME /.spamoracle.conf
 14 | but an alternate location can be specified using the
 15 | .B -config
 16 | flag to
 17 | .BR spamoracle (1).
 18 | 
 19 | .B Important note:
 20 | most of the configuration parameters should not be modified lightly,
 21 | as this may result in completely wrong e-mail classification.  
 22 | Familiarity with Graham's filtering algorithm, as described in the
 23 | paper referenced at the end of this page, is recommended to fully
 24 | understand the effect of the parameters.
 25 | 
 26 | .SH SYNTAX
 27 | 
 28 | The
 29 | .B spamoracle.conf
 30 | file is composed of lines of the form
 31 | .I variable
 32 | .B =
 33 | .IR value .
 34 | Lines starting with a # sign are treated as comments and ignored.
 35 | Blank lines are ignored.
 36 | 
 37 | Depending on the type of the variable (see the list of variables below), the
 38 | .I value 
 39 | part takes one of the following forms:
 40 | .TP
 41 | .I string
 42 | A sequence of characters.  Blanks (spaces, tabs) at the beginning and the
 43 | end of the string are ignored.  Alternatively, the string can be
 44 | enclosed in double quotes ("), in which case spaces are not trimmed.
 45 | Inside quoted strings, blackslashes (\\) and double quotes (") must be
 46 | escaped with a backslash, as in \\\\ or \\\"
 47 | .TP
 48 | .I boolean
 49 | Either
 50 | .BR on,
 51 | .BR yes,
 52 | .BR true,
 53 | or
 54 | .B 1
 55 | to activate the boolean option, or
 56 | .BR off,
 57 | .BR no,
 58 | .BR false,
 59 | or
 60 | .B 0
 61 | to deactivate it.
 62 | .TP
 63 | .I integer
 64 | A decimal integer 
 65 | .TP
 66 | .I float
 67 | A decimal floating-point number.
 68 | .TP
 69 | .I regexp
 70 | A regular expression in
 71 | .BR emacs (1)
 72 | syntax.  The repetition operators are
 73 | .BR * ,
 74 | .BR + ,
 75 | and
 76 | .BR ? .
 77 | Alternation is written
 78 | .B \e|
 79 | and grouping is written
 80 | .BR \e( ... \e) .
 81 | Character classes are written between brackets
 82 | .BR [ ... ]
 83 | as usual.  A single dot denotes any character except newline.
 84 | Regular expressions are case-insensitive.
 85 | 
 86 | .SH CONFIGURABLE PARAMETERS
 87 | 
 88 | .TP
 89 | .B database_file
 90 | (type
 91 | .IR string,
 92 | default value
 93 | .IB $HOME /.spamoracle.db
 94 | )
 95 | .br
 96 | The location of the file that contains the database of word frequencies
 97 | used by
 98 | .BR spamoracle (1).
 99 | .TP
100 | .B html_retain_tags
101 | (type
102 | .IR boolean,
103 | default value
104 | .BR false )
105 | .br
106 | In HTML-formatted e-mails and attachments, the names of HTML tags are
107 | normally not treated as words and are ignored for the word frequency
108 | calculations. If the
109 | .B html_retain_tags
110 | parameter is set to
111 | .BR true ,
112 | HTML tags (such as
113 | .B img
114 | or
115 | .BR bold )
116 | are treated as words and included in the computation of word frequencies.
117 | .TP
118 | .B html_tag_attributes
119 | (type
120 | .IR regexp ,
121 | default value
122 | .br
123 | .BR a/href\e|img/src\e|img/alt\e|frame/src\e|font/face\e|font/color )
124 | .br
125 | This regular expression matches pairs of HTML tags and HTML attributes
126 | written as
127 | .IB tag / attribute.
128 | When scanning HTML-formatted e-mails and attachments, attributes to
129 | HTML tags are normally ignored, unless the tag/attribute pair matches
130 | the regular expression
131 | .BR html_tag_attributes .
132 | If the tag/attribute pair matches this regexp, the value of the attribute
133 | (for instance, the URL for the
134 | .BR a / href
135 | attribute) is scanned for words.
136 | .TP
137 | .B mail_headers
138 | (type
139 | .IR regexp ,
140 | default value
141 | .BR from:\e|subject: )
142 | .br
143 | A regular expression determining which headers of an e-mail message
144 | are scanned for words.
145 | .TP
146 | .B alternative_favor_html
147 | (type
148 | .IR bool ,
149 | default value
150 | .BR true )
151 | .br
152 | Determine how multipart/alternative messages are treated.  If this
153 | parameter is set, and one part of the alternative is of type text/html,
154 | this part is scanned and all other parts are ignored.  In all other
155 | cases, all parts of the alternative are scanned.
156 | .TP
157 | .B spam_header
158 | (type
159 | .IR string ,
160 | default value
161 | .BR X-Spam )
162 | .br
163 | The name of the header that
164 | .B spamoracle mark
165 | adds to incoming e-mail messages, with the results of the spam/non-spam 
166 | classification.
167 | .TP
168 | .B attachments_header
169 | (type
170 | .IR string ,
171 | default value
172 | .BR X-Attachments )
173 | .br
174 | The name of the header that
175 | .B spamoracle mark
176 | adds to incoming e-mail messages, with the one-line summary of attachment 
177 | types, names and character sets.  The generation of this header can
178 | be turned off with the
179 | .B summarize_attachment
180 | parameter.
181 | .TP
182 | .B summarize_attachment
183 | (type
184 | .IR boolean ,
185 | default value
186 | .BR true )
187 | .br
188 | If this parameter is set,
189 | .B spamoracle mark
190 | generates a one-line summary of the attachments of the incoming messages,
191 | and inserts this summary in the message headers.
192 | Setting this parameter to
193 | .B false
194 | disables the generation of this extra header.
195 | .TP
196 | .B num_meaningful_words
197 | (type
198 | .IR integer ,
199 | default value
200 | .BR 15 )
201 | .br
202 | Maximal number of "meaningful" words that are retained for computing
203 | the spam probability.  During mail analysis,
204 | .B spamoracle
205 | extracts all words of the message, and retains those whose spam frequency
206 | (frequency of occurrence in spam messages) is closest to 1 or to 0.  
207 | At most
208 | .B num_meaningful_words
209 | such "meaningful" words are retained.
210 | .TP
211 | .B max_repetitions
212 | (type
213 | .IR integer ,
214 | default value
215 | .BR 2 )
216 | .br
217 | Maximum number of times a given word can occur in the set of
218 | "meaningful" words retained for computing the spam probability.
219 | The default value of 2 means that at most 2 occurrences of the same
220 | word will be retained.
221 | .TP
222 | .B low_freq_limit
223 | (type
224 | .IR float ,
225 | default value
226 | .BR 0.01 )
227 | .TP
228 | .B high_freq_limit
229 | (type
230 | .IR float ,
231 | default value
232 | .BR 0.99 )
233 | .br
234 | The spam frequency of a word is computed as the number of occurrences
235 | in spam divided by number of occurrences in all messages.  This ratio
236 | is then clipped to the interval [
237 | .BR low_freq_limit ,
238 | .B high_freq_limit
239 | ], so that words that are extremely rare or extremely common in spam
240 | do not bias the probability computation too much.  The default values
241 | of 0.01 and 0.99 are adequate for a corpus of a few thousand e-mails.
242 | For larger corpora (e.g. 10000 e-mails), the values 0.001 and 0.999
243 | may give better results.
244 | .TP
245 | .B min_meaningful_words
246 | (type
247 | .IR integer ,
248 | default value
249 | .BR 5 )
250 | .br
251 | Minimum number of "meaningful" words below which 
252 | .B spamoracle mark
253 | refuses to classify the e-mail and outputs "unknown" status.  This
254 | happens with very short e-mails, or e-mails that consist exclusively of
255 | links and pictures.
256 | .TP
257 | .B good_mail_prob
258 | (type
259 | .IR float ,
260 | default value
261 | .BR 0.2 )
262 | .br
263 | Spam probability below which the e-mail is classified as non-spam.
264 | .TP
265 | .B spam_mail_prob
266 | (type
267 | .IR float ,
268 | default value
269 | .BR 0.8 )
270 | .br
271 | Spam probability above which the e-mail is classified as spam.
272 | Messages whose probability falls between
273 | .B good_mail_prob
274 | and
275 | .B spam_mail_prob
276 | are classified as "unknown".
277 | 
278 | .SH AUTHOR
279 | Xavier Leroy <Xavier.Leroy@inria.fr>
280 | 
281 | .SH "SEE ALSO"
282 | 
283 | .BR spamoracle (1)
284 | 
285 | .B http://www.paulgraham.com/spam.html
286 | (Paul Graham's seminal paper)
287 | 
288 | 
289 | 


--------------------------------------------------------------------------------
/htmlscan.mll:
--------------------------------------------------------------------------------
  1 | (***********************************************************************)
  2 | (*                                                                     *)
  3 | (*                 SpamOracle -- a Bayesian spam filter                *)
  4 | (*                                                                     *)
  5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
  6 | (*                                                                     *)
  7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
  8 | (*  en Automatique.  This file is distributed under the terms of the   *)
  9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
 10 | (*                                                                     *)
 11 | (***********************************************************************)
 12 | 
 13 | (* $Id$ *)
 14 | 
 15 | (** Approximate HTML scanner.  Extracts words from HTML text,
 16 |     as well as certain parameters of certain tags (e.g. URLs). *)
 17 | 
 18 | {
 19 | module StringSet = Set.Make(String)
 20 | module StringMap = Map.Make(String)
 21 | 
 22 | let re_url_encoding = Str.regexp "%\\([0-9A-Fa-f][0-9A-Fa-f]\\)"
 23 | 
 24 | let decode_url_percent s =
 25 |   let n = int_of_string ("0x" ^ Str.matched_group 1 s) in
 26 |   String.make 1 (Char.chr n)
 27 | 
 28 | let decode_url s =
 29 |   Str.global_substitute re_url_encoding decode_url_percent s
 30 | 
 31 | let entity_table =
 32 |   List.fold_left
 33 |     (fun t (s, c) -> StringMap.add s c t)
 34 |     StringMap.empty
 35 |     ["amp", '&';          "lt", '<';           "gt", '>';
 36 |      "nbsp", '\160';     
 37 |      "Agrave", '\192';    "Aacute", '\193';    "Acirc", '\194';
 38 |      "Atilde", '\195';    "Auml", '\196';      "Aring", '\197';
 39 |      "AElig", '\198';     "Ccedil", '\199';    "Egrave", '\200';
 40 |      "Eacute", '\201';    "Ecirc", '\202';     "Euml", '\203';
 41 |      "Igrave", '\204';    "Iacute", '\205';    "Icirc", '\206';
 42 |      "Iuml", '\207';      "ETH", '\208';       "Ntilde", '\209';
 43 |      "Ograve", '\210';    "Oacute", '\211';    "Ocirc", '\212';
 44 |      "Otilde", '\213';    "Ouml", '\214';      "times", '\215';
 45 |      "Oslash", '\216';    "Ugrave", '\217';    "Uacute", '\218';
 46 |      "Ucirc", '\219';     "Uuml", '\220';      "Yacute", '\221';
 47 |      "THORN", '\222';     "szlig", '\223';     "agrave", '\224';
 48 |      "aacute", '\225';    "acirc", '\226';     "atilde", '\227';
 49 |      "auml", '\228';      "aring", '\229';     "aelig", '\230';
 50 |      "ccedil", '\231';    "egrave", '\232';    "eacute", '\233';
 51 |      "ecirc", '\234';     "euml", '\235';      "igrave", '\236';
 52 |      "iacute", '\237';    "icirc", '\238';     "iuml", '\239';
 53 |      "eth", '\240';       "ntilde", '\241';    "ograve", '\242';
 54 |      "oacute", '\243';    "ocirc", '\244';     "otilde", '\245';
 55 |      "ouml", '\246';      "divide", '\247';    "oslash", '\248';
 56 |      "ugrave", '\249';    "uacute", '\250';    "ucirc", '\251';
 57 |      "uuml", '\252';      "yacute", '\253';    "thorn", '\254';
 58 |      "yuml", '\255']    
 59 | 
 60 | let word_breaking_tags =
 61 |   List.fold_right StringSet.add
 62 |     [ "p"; "br"; "ul"; "ol"; "dt"; "li"; "dd"; "table"; "tr"; "th"; "td";
 63 |       "img"; "div"; "blockquote"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6";
 64 |       "address" ]
 65 |     StringSet.empty
 66 | 
 67 | module Output = struct
 68 | 
 69 |   type t = { txt: Buffer.t; extra: Buffer.t }
 70 | 
 71 |   let create() = { txt = Buffer.create 2048; extra = Buffer.create 256 }
 72 | 
 73 |   let clear ob =
 74 |     Buffer.clear ob.txt;
 75 |     Buffer.clear ob.extra
 76 | 
 77 |   let contents ob =
 78 |     Buffer.add_char ob.txt '\n';
 79 |     Buffer.add_buffer ob.txt ob.extra;
 80 |     Buffer.contents ob.txt
 81 | 
 82 |   let char ob c =
 83 |     Buffer.add_char ob.txt c
 84 | 
 85 |   let string ob s =
 86 |     Buffer.add_string ob.txt s
 87 | 
 88 |   let add_extra ob s =
 89 |     Buffer.add_string ob.extra s; Buffer.add_char ob.extra '\n'
 90 | 
 91 |   let tag ob t =
 92 |     if StringSet.mem t word_breaking_tags then char ob ' ';
 93 |     if !Config.html_add_tags then add_extra ob t
 94 | 
 95 |   let tag_attr ob t n s =
 96 |     let n = String.lowercase_ascii n in
 97 |     if Str.string_match !Config.html_tag_attr (t ^ "/" ^ n) 0 then
 98 |       if n = "href" || n = "src"
 99 |       then add_extra ob (decode_url s)
100 |       else add_extra ob s
101 | end
102 | 
103 | let ob = Output.create()
104 | let tag = ref ""
105 | let attr_name = ref ""
106 | let attr_value = Buffer.create 128
107 | 
108 | }
109 | 
110 | let ws = [' ' '\n' '\r' '\t']
111 | let name = ['A'-'Z' 'a'-'z'] ['A'-'Z' 'a'-'z' '0'-'9' '.' '-']*
112 | let unquotedattrib = 
113 |   [^ '\'' '\"' '>' ' ' '\n' '\r' '\t'] [^ '>' ' ' '\n' '\r' '\t']*
114 | 
115 | rule main = parse
116 |     "<!"                        (* tolerance *)
117 |       { comment lexbuf; main lexbuf }
118 |   | "<" name
119 |       { let s = Lexing.lexeme lexbuf in
120 |         tag := String.lowercase_ascii (String.sub s 1 (String.length s - 1));
121 |         tagbody lexbuf;
122 |         main lexbuf }
123 |   | "</" name
124 |       { let s = Lexing.lexeme lexbuf in
125 |         tag := String.lowercase_ascii (String.sub s 2 (String.length s - 2));
126 |         tagbody lexbuf;
127 |         main lexbuf }
128 |   | "<"                         (* tolerance *)
129 |       { Output.char ob '<'; main lexbuf }
130 |   | "&"
131 |       { Output.char ob (entity lexbuf); main lexbuf }
132 |   (* This parses UTF-8 encodings of ISO Latin 1 characters in the
133 |      0x80 - 0xFF range *)
134 |   | ['\194'-'\195'] ['\128'-'\191']
135 |       { let a = Char.code (Lexing.lexeme_char lexbuf 0)
136 |         and b = Char.code (Lexing.lexeme_char lexbuf 1) in
137 |         Output.char ob (Char.chr (((a land 0x3) lsl 6) lor (b land 0x3F)));
138 |         main lexbuf }
139 |   | _
140 |       { Output.char ob (Lexing.lexeme_char lexbuf 0); main lexbuf }
141 |   | eof
142 |       { Output.contents ob }
143 | 
144 | and comment = parse
145 |     ">"                         (* tolerance *)
146 |       { () }
147 |   | _                           (* tolerance *)
148 |       { comment lexbuf }
149 |   | eof                         (* tolerance *)
150 |       { () }
151 | 
152 | and tagbody = parse
153 |     ">"
154 |       { Output.tag ob !tag }
155 |   | name
156 |       { attr_name := Lexing.lexeme lexbuf;
157 |         tagattrib lexbuf;
158 |         tagbody lexbuf }
159 |   | _                           (* tolerance -- should be ws *)
160 |       { tagbody lexbuf }
161 |   | eof                         (* tolerance *)
162 |       { Output.tag ob !tag }
163 | 
164 | and tagattrib = parse
165 |     ws* '=' ws* 
166 |       { tagvalue lexbuf }
167 |   | ""
168 |       { Output.tag_attr ob !tag !attr_name "" }
169 | 
170 | and tagvalue = parse
171 |     "'"
172 |       { Buffer.clear attr_value; singlequoted lexbuf }
173 |   | "\""
174 |       { Buffer.clear attr_value; doublequoted lexbuf }
175 |   | unquotedattrib
176 |       { Output.tag_attr ob !tag !attr_name (Lexing.lexeme lexbuf) }
177 |   | ""                          (* tolerance *)
178 |       { Output.tag_attr ob !tag !attr_name "" }
179 | 
180 | and singlequoted = parse
181 |     "'" | eof                   (* eof is tolerance *)
182 |       { Output.tag_attr ob !tag !attr_name (Buffer.contents attr_value) }
183 |   | "&"
184 |       { Buffer.add_char attr_value (entity lexbuf); singlequoted lexbuf }
185 |   | _
186 |       { Buffer.add_char attr_value (Lexing.lexeme_char lexbuf 0);
187 |         singlequoted lexbuf }
188 | 
189 | and doublequoted = parse
190 |     "\"" | eof                 (* eof is tolerance *)
191 |       { Output.tag_attr ob !tag !attr_name (Buffer.contents attr_value) }
192 |   | "&"
193 |       { Buffer.add_char attr_value (entity lexbuf); doublequoted lexbuf }
194 |   | _
195 |       { Buffer.add_char attr_value (Lexing.lexeme_char lexbuf 0);
196 |         doublequoted lexbuf }
197 | 
198 | and entity = parse
199 |     '#' ['0'-'9']+
200 |       { let s = Lexing.lexeme lexbuf in
201 |         let n = int_of_string (String.sub s 1 (String.length s - 1)) in
202 |         entity_end lexbuf;
203 |         if n >= 0 && n <= 255 then Char.chr n else '\255' }
204 |   | name
205 |       { let s = Lexing.lexeme lexbuf in
206 |         entity_end lexbuf;
207 |         try StringMap.find s entity_table with Not_found -> '\255' }
208 |   | ""                          (* tolerance *)
209 |       { '&' }
210 | 
211 | and entity_end = parse
212 |     ";" ?
213 |       { () }
214 | 
215 | {
216 | let extract_text s =
217 |   Output.clear ob; main (Lexing.from_string s)
218 | }
219 | 


--------------------------------------------------------------------------------
/mail.ml:
--------------------------------------------------------------------------------
  1 | (***********************************************************************)
  2 | (*                                                                     *)
  3 | (*                 SpamOracle -- a Bayesian spam filter                *)
  4 | (*                                                                     *)
  5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
  6 | (*                                                                     *)
  7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
  8 | (*  en Automatique.  This file is distributed under the terms of the   *)
  9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
 10 | (*                                                                     *)
 11 | (***********************************************************************)
 12 | 
 13 | (* $Id$ *)
 14 | 
 15 | (** Parsing of e-mail messages, including attachments *)
 16 | 
 17 | type message =
 18 |   { headers: (string * string) list;
 19 |     body: string;
 20 |     parts: message list }
 21 | 
 22 | let base64_decode_char c =
 23 |   match c with
 24 |     'A' .. 'Z' -> Char.code c - 65
 25 |   | 'a' .. 'z' -> Char.code c - 97 + 26
 26 |   | '0' .. '9' -> Char.code c - 48 + 52
 27 |   | '+' -> 62
 28 |   | '/' -> 63
 29 |   | _ -> -1
 30 | 
 31 | let decode_base64 s =
 32 |   let d = Buffer.create (String.length s * 3 / 4) in
 33 |   let buf = Array.make 4 0 in
 34 |   let pos = ref 0 in
 35 |   for i = 0 to String.length s - 1 do
 36 |     let n = base64_decode_char s.[i] in
 37 |     if n >= 0 then begin
 38 |       buf.(!pos) <- n;
 39 |       incr pos;
 40 |       if !pos = 4 then begin
 41 |         Buffer.add_char d (Char.chr(buf.(0) lsl 2 + buf.(1) lsr 4));
 42 |         Buffer.add_char d (Char.chr((buf.(1) land 15) lsl 4 + buf.(2) lsr 2));
 43 |         Buffer.add_char d (Char.chr((buf.(2) land 3) lsl 6 + buf.(3)));
 44 |         pos := 0
 45 |       end
 46 |     end
 47 |   done;
 48 |   begin match !pos with
 49 |     2 ->
 50 |       Buffer.add_char d (Char.chr(buf.(0) lsl 2 + buf.(1) lsr 4))
 51 |   | 3 ->
 52 |       Buffer.add_char d (Char.chr(buf.(0) lsl 2 + buf.(1) lsr 4));
 53 |       Buffer.add_char d (Char.chr((buf.(1) land 15) lsl 4 + buf.(2) lsr 2))
 54 |   | _ ->
 55 |       ()
 56 |   end;
 57 |   Buffer.contents d
 58 | 
 59 | let hexa_digit c =
 60 |   if c >= '0' && c <= '9' then Char.code c - 48
 61 |   else if c >= 'A' && c <= 'F' then Char.code c - 65 + 10
 62 |   else if c >= 'a' && c <= 'f' then Char.code c - 97 + 10
 63 |   else raise Not_found
 64 | 
 65 | let decode_qp s =
 66 |   let len = String.length s in
 67 |   let d = Buffer.create (String.length s) in
 68 |   let pos = ref 0 in
 69 |   while !pos < len do
 70 |     let c = s.[!pos] in
 71 |     if c = '=' && !pos + 1 < len && s.[!pos + 1] = '\n' then begin
 72 |       pos := !pos + 2
 73 |     end else if c = '=' && !pos + 2 < len then begin
 74 |       try
 75 |         let h1 = hexa_digit s.[!pos + 1]
 76 |         and h2 = hexa_digit s.[!pos + 2] in
 77 |         Buffer.add_char d (Char.chr(h1 lsl 4 + h2));
 78 |         pos := !pos + 3
 79 |       with Not_found ->
 80 |         Buffer.add_char d c;
 81 |         incr pos
 82 |     end else begin
 83 |       Buffer.add_char d c;
 84 |       incr pos
 85 |     end
 86 |   done;
 87 |   Buffer.contents d
 88 | 
 89 | let re_base64 = Str.regexp_case_fold "base64"
 90 | let re_qp = Str.regexp_case_fold "quoted-printable"
 91 | 
 92 | let decode encoding s =
 93 |   if Str.string_match re_base64 encoding 0 then
 94 |     decode_base64 s
 95 |   else if Str.string_match re_qp encoding 0 then
 96 |     decode_qp s
 97 |   else
 98 |     s
 99 | 
100 | let re_encoded_header =
101 |   Str.regexp "=\\?[_A-Za-z0-9-]+\\?\\([BbQq]\\)\\?\\([^?]*\\)\\?="
102 | 
103 | let decode_header s =
104 |   let decode_group s =
105 |     let enc = Str.matched_group 1 s
106 |     and txt = Str.matched_group 2 s in
107 |     match enc with
108 |       "B" | "b" -> decode_base64 txt
109 |     | "Q" | "q" -> decode_qp txt
110 |     | _ -> assert false in
111 |   Str.global_substitute re_encoded_header decode_group s
112 | 
113 | let re_continuation = Str.regexp "\n[ \t]+"
114 | let re_nl = Str.regexp "\n"
115 | let re_field = Str.regexp "\\([A-Za-z-]+[: ]\\)[ \t]*\\(.*\\)"
116 | 
117 | let parse_header s =
118 |   let rec parse_field accu = function
119 |     [] -> List.rev accu
120 |   | line :: rem ->
121 |       if Str.string_match re_field line 0 then begin
122 |         let field_name = String.lowercase_ascii (Str.matched_group 1 line)
123 |         and field_val  = Str.matched_group 2 line in
124 |         parse_field ((field_name, decode_header field_val) :: accu) rem
125 |       end else
126 |         parse_field accu rem in
127 |   parse_field [] (Str.split re_nl (Str.global_replace re_continuation " " s))
128 | 
129 | let find_header name headers =
130 |   try List.assoc name headers with Not_found -> ""
131 | 
132 | let re_nl_nl = Str.regexp "\n\n"
133 | let re_multipart =
134 |   Str.regexp_case_fold
135 |     "multipart/.*boundary *= *\\(\"\\([^\"]+\\)\"\\|\\([^ \t]+\\)\\)"
136 | 
137 | let rec parse_message s =
138 |   try
139 |     let pos_sep = Str.search_forward re_nl_nl s 0 in
140 |     let headers = parse_header (String.sub s 0 pos_sep) in
141 |     let body = String.sub s (pos_sep + 2) (String.length s - pos_sep - 2) in
142 |     let encoding = find_header "content-transfer-encoding:" headers in
143 |     let ctype = find_header "content-type:" headers in
144 |     if Str.string_match re_multipart ctype 0 then begin
145 |       let boundary =
146 |         try
147 |           Str.matched_group 2 ctype
148 |         with Not_found -> try
149 |           Str.matched_group 3 ctype
150 |         with Not_found ->
151 |           assert false in
152 |       let re_bound =
153 |         Str.regexp ("--" ^ Str.quote boundary ^ "[ \t\n]*") in
154 |       match Str.split_delim re_bound body with
155 |         [] ->
156 |           { headers = headers;
157 |             body = decode encoding body;
158 |             parts = [] }
159 |       | blurb :: parts ->
160 |           { headers = headers;
161 |             body = decode encoding blurb;
162 |             parts = List.map parse_message parts }
163 |     end else
164 |       { headers = headers;
165 |         body = decode encoding body;
166 |         parts = [] }
167 |   with Not_found ->
168 |     { headers = [];
169 |       body = s;
170 |       parts = [] }
171 | 
172 | let safe_remove fname = try Sys.remove fname with Sys_error _ -> ()
173 | 
174 | let run_body_through_external_converter cmd arg body =
175 |   let infile = Filename.temp_file "spamoracle" ".data" in
176 |   let outfile = Filename.temp_file "spamoracle" ".txt" in
177 |   let oc = open_out_bin infile in
178 |   let ic = open_in_bin outfile in
179 |   output_string oc body;
180 |   close_out oc;
181 |   let retcode =
182 |     Sys.command
183 |       (Printf.sprintf "%s %s < %s > %s" 
184 |                       cmd (Filename.quote arg) infile outfile) in
185 |   if retcode <> 0 then begin
186 |     close_in ic;
187 |     safe_remove infile; safe_remove outfile;
188 |     None
189 |   end else begin
190 |     let len = in_channel_length ic in
191 |     let res = really_input_string ic len in
192 |     close_in ic;
193 |     safe_remove infile; safe_remove outfile;
194 |     Some res
195 |   end
196 | 
197 | let header s msg =
198 |   let rec hdr = function
199 |     [] -> []
200 |   | (h,v) :: rem -> if h = s then v :: hdr rem else hdr rem in
201 |   String.concat "\n" (hdr msg.headers)
202 | 
203 | let header_matches s re msg =
204 |   let rec hmatch = function
205 |     [] -> false
206 |   | (h,v) :: rem -> (h = s && Str.string_match re v 0) || hmatch rem
207 |   in hmatch msg.headers
208 | 
209 | let re_content_text =
210 |   Str.regexp_case_fold "text/plain\\|text/enriched\\|text;\\|text$"
211 | let re_content_html =
212 |   Str.regexp_case_fold "text/html"
213 | let re_content_message_rfc822 =
214 |   Str.regexp_case_fold "message/rfc822"
215 | let re_content_alternative =
216 |   Str.regexp_case_fold "multipart/alternative"
217 | let re_content_multipart =
218 |   Str.regexp_case_fold "multipart/"
219 | let re_content_any_text =
220 |   Str.regexp_case_fold "text/"
221 | 
222 | let rec iter_text_parts fn m =
223 |   if header_matches "content-type:" re_content_text m 
224 |   || not(List.mem_assoc "content-type:" m.headers) then
225 |     fn m
226 |   else if header_matches "content-type:" re_content_html m then
227 |     fn {m with body = Htmlscan.extract_text m.body}
228 |   else if header_matches "content-type:" re_content_alternative m then begin
229 |     try
230 |       if not !Config.alternative_favor_html then raise Not_found;
231 |       iter_text_parts fn 
232 |         (List.find (header_matches "content-type:" re_content_html) m.parts)
233 |     with Not_found ->
234 |       fn m;
235 |       List.iter (iter_text_parts fn) m.parts
236 |   end else if header_matches "content-type:" re_content_multipart m then begin
237 |     fn m;
238 |     List.iter (iter_text_parts fn) m.parts
239 |   end else if header_matches "content-type:" re_content_message_rfc822 m then
240 |     iter_text_parts fn (parse_message m.body)
241 |   else if !Config.external_converter <> ""
242 |        && not (header_matches "content-type:" re_content_any_text m)
243 |        then begin
244 |     match run_body_through_external_converter
245 |               !Config.external_converter
246 |               (header "content-type:" m)
247 |               m.body with
248 |     | None -> ()
249 |     | Some txt -> fn {m with body = txt}
250 |   end else
251 |     ()
252 | 
253 | let iter_message fn msg =
254 |   List.iter
255 |     (fun (h, v) -> if Str.string_match !Config.mail_headers h 0 then fn v)
256 |     msg.headers;
257 |   iter_text_parts
258 |     (fun m -> fn m.body)
259 |     msg
260 | 


--------------------------------------------------------------------------------
/main.ml:
--------------------------------------------------------------------------------
  1 | (***********************************************************************)
  2 | (*                                                                     *)
  3 | (*                 SpamOracle -- a Bayesian spam filter                *)
  4 | (*                                                                     *)
  5 | (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
  6 | (*                                                                     *)
  7 | (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
  8 | (*  en Automatique.  This file is distributed under the terms of the   *)
  9 | (*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
 10 | (*                                                                     *)
 11 | (***********************************************************************)
 12 | 
 13 | (* $Id$ *)
 14 | 
 15 | (* Argument parsing and main program *)
 16 | 
 17 | open Printf
 18 | open Mbox
 19 | open Database
 20 | open Processing
 21 | 
 22 | exception Usage of string
 23 | 
 24 | let default_config_name =
 25 |   try Filename.concat (Sys.getenv "HOME") ".spamoracle.conf"
 26 |   with Not_found -> ".spamoracle.conf"
 27 | 
 28 | let parse_config_file file =
 29 |   try
 30 |     let errs = Configfile.parse Config.options file in
 31 |     if errs <> [] then begin
 32 |       eprintf "Error while reading configuration file %s:\n" file;
 33 |       List.iter (fun (line, msg) -> eprintf "Line %d: %s\n" line msg) errs;
 34 |       exit 2
 35 |     end
 36 |   with Sys_error msg ->
 37 |     eprintf "Cannot read configuration file %s:\n%s\n" file msg
 38 | 
 39 | let mark_command args =
 40 |   let db = Database.read_short !Config.database_name in
 41 |   if args = [] then
 42 |     mark_message db (read_single_msg stdin)
 43 |   else
 44 |     List.iter (fun f -> mbox_file_iter f (mark_message db)) args
 45 | 
 46 | let add_command args =
 47 |   let db =
 48 |     try
 49 |       Database.read_full !Config.database_name
 50 |     with Sys_error _ ->
 51 |       Database.create 997 in
 52 |   let processed = ref false
 53 |   and is_spam = ref true
 54 |   and verbose = ref false in
 55 |   let rec parse_args = function
 56 |     | "-v" :: rem ->
 57 |         verbose := true; parse_args rem
 58 |     | "-spam" :: rem ->
 59 |         is_spam := true; parse_args rem
 60 |     | "-good" :: rem ->
 61 |         is_spam := false; parse_args rem
 62 |     | f :: rem ->
 63 |         mbox_file_iter f (add_message db !verbose !is_spam);
 64 |         processed := true;
 65 |         parse_args rem
 66 |     | [] ->
 67 |         if not !processed then
 68 |           add_message db !verbose !is_spam (read_single_msg stdin);
 69 |         if !verbose then
 70 |           printf "\r%6d / %6d  good / spam messages\n"
 71 |                  db.f_num_good db.f_num_spam
 72 |   in parse_args args; Database.write_full !Config.database_name db
 73 | 
 74 | let list_command args =
 75 |   let db = Database.read_full !Config.database_name in
 76 |   let res = ref [] in
 77 |   List.iter
 78 |     (fun s ->
 79 |       let re = Str.regexp (s ^ "$") in
 80 |       let match_word w (g, s) =
 81 |         if Str.string_match re w 0 then begin
 82 |           let p =
 83 |             if 2 * g + s < 5
 84 |             then -1.0
 85 |             else Rankmsg.word_proba g s db.f_num_good db.f_num_spam in
 86 |           res := (w, p, g, s) :: !res
 87 |         end in
 88 |       Hashtbl.iter match_word db.f_high_freq;
 89 |       Hashtbl.iter match_word db.f_low_freq)
 90 |     args;
 91 |   if !res = [] then
 92 |     Printf.printf "No matching word found in database.\n"
 93 |   else begin
 94 |     Printf.printf "     Word         Proba   #good   #spam\n";
 95 |     List.iter
 96 |       (fun (w, p, g, s) ->
 97 |         if p >= 0.0 then
 98 |           Printf.printf "%-15s%8.2f%8d%8d\n" w p g s
 99 |         else
100 |           Printf.printf "%-15s    ----%8d%8d\n" w g s)
101 |       (List.sort
102 |         (fun (_, p1, _, _) (_, p2, _, _) -> compare p2 p1)
103 |         !res)
104 |   end
105 | 
106 | let test_command args =
107 |   let db = Database.read_short !Config.database_name in
108 |   let low = ref 0.0 and high = ref 1.0 in
109 |   let rec parse_args = function
110 |     | "-min" :: s :: rem ->
111 |         begin try low := float_of_string s
112 |         with Failure _ -> raise(Usage("bad argument to -min"))
113 |         end;
114 |         parse_args rem
115 |     | "-min" :: [] ->
116 |         raise(Usage("no argument to -min"))
117 |     | "-max" :: s :: rem ->
118 |         begin try high := float_of_string s
119 |         with Failure _ -> raise(Usage("bad argument to -max"))
120 |         end;
121 |         parse_args rem
122 |     | "-max" :: [] ->
123 |         raise(Usage("no argument to -max"))
124 |     | f :: rem ->
125 |         mbox_file_iter f (test_message db !low !high f);
126 |         parse_args rem
127 |     | [] -> ()
128 |   in parse_args args
129 | 
130 | let stat_command args =
131 |   let db = Database.read_short !Config.database_name in
132 |   let stat_mbox f =
133 |     let num_msgs = ref 0
134 |     and num_good = ref 0
135 |     and num_spam = ref 0
136 |     and num_unknown = ref 0 in
137 |     mbox_file_iter f
138 |       (fun s ->
139 |         incr num_msgs;
140 |         match stat_message db s with
141 |           Msg_good -> incr num_good
142 |         | Msg_spam -> incr num_spam
143 |         | Msg_unknown -> incr num_unknown);
144 |     let percentage a b =
145 |       100.0 *. float a /. float b in
146 |     if !num_msgs > 0 then
147 |       printf "%s: %d (%.2f%%) good, %d (%.2f%%) unknown, %d (%.2f%%) spam\n"
148 |              f 
149 |              !num_good (percentage !num_good !num_msgs)
150 |              !num_unknown (percentage !num_unknown !num_msgs)
151 |              !num_spam (percentage !num_spam !num_msgs)
152 |   in List.iter stat_mbox args
153 | 
154 | let words_command args =
155 |   let db = Database.read_short !Config.database_name in
156 |   if args = [] then
157 |     wordsplit_message db (read_single_msg stdin)
158 |   else
159 |     List.iter
160 |       (fun f ->
161 |         mbox_file_iter f
162 |           (fun msg ->
163 |             print_string "----------------------------------------\n";
164 |             wordsplit_message db msg))
165 |       args
166 | 
167 | let backup_command () =
168 |   Database.dump (Database.read_full !Config.database_name) stdout
169 | 
170 | let restore_command () =
171 |   Database.write_full !Config.database_name (Database.restore stdin)
172 | 
173 | let upgrade_command () =
174 |   let db = Database.read_full !Config.database_name in
175 |   Database.write_full !Config.database_name db;
176 |   printf "Converted %s to version %d.\n"
177 |          !Config.database_name
178 |          Database.current_version
179 | 
180 | let rec parse_args_1 = function
181 |     "-config" :: file :: rem ->
182 |       parse_config_file file; parse_args_2 rem
183 |   | "-config" :: [] ->
184 |       raise(Usage("Option -config requires an argument"))
185 |   | rem ->
186 |       if Sys.file_exists default_config_name
187 |       then parse_config_file default_config_name;
188 |       parse_args_2 rem
189 | 
190 | and parse_args_2 = function
191 |   | "-f" :: file :: rem ->
192 |       Config.database_name := file; parse_args_3 rem
193 |   | "-f" :: [] ->
194 |       raise(Usage("Option -f requires an argument"))
195 |   | rem ->
196 |       parse_args_3 rem
197 | 
198 | and parse_args_3 = function
199 |     "mark" :: rem ->
200 |       mark_command rem
201 |   | "add" :: rem ->
202 |       add_command rem
203 |   | "list" :: rem ->
204 |       list_command rem
205 |   | "test" :: rem ->
206 |       test_command rem
207 |   | "stat" :: rem ->
208 |       stat_command rem
209 |   | "backup" :: rem ->
210 |       backup_command ()
211 |   | "restore" :: rem ->
212 |       restore_command ()
213 |   | "words" :: rem ->
214 |       words_command rem
215 |   | "upgrade" :: rem ->
216 |       upgrade_command ()
217 |   | s :: rem ->
218 |       raise(Usage("Unknown command " ^ s))
219 |   | [] ->
220 |       raise(Usage "")
221 | 
222 | let usage_string = "\
223 | Usage:
224 |   spamoracle [-config conf] [-f db] mark {mailbox}*
225 |   Add 'X-Spam:' headers to messages with result of analysis
226 |     {mailbox}*   Mailboxes containing messages to analyze and mark
227 |                  If none given, read single msg from standard input
228 | 
229 |   spamoracle [-config conf] [-f db] add [-v] -spam {spambox}* -good {goodbox}*
230 |   Create or update database with known spam or non-spam messages
231 |     -v           Print progress bar
232 |     -spam        Indicate subsequent mailboxes contain spam
233 |     -good        Indicate subsequent mailboxes contain good msgs (not spam)
234 |     {spambox}*   Mailboxes containing spam
235 |     {goodbox}*   Mailboxes containing good messages (not spam)
236 |                  If no mailbox given, read single msg from standard input
237 | 
238 |   spamoracle [-config conf] [-f db] test [-min prob] [-max prob] {mailbox}*
239 |   Analyze messages and print summary of results for each message
240 |     -min <prob>  Don't print messages with result below <prob>   
241 |     -max <prob>  Don't print messages with result above <prob>   
242 |     {mailbox}*   Mailboxes containing messages to analyze
243 | 
244 |   spamoracle [-config conf] [-f db] stat {mailbox}*
245 |   Analyze messages and print percentages of spam/non-spam for each mailbox
246 |     {mailbox}*   Mailboxes containing messages to analyze
247 | 
248 |   spamoracle [-config conf] [-f db] list {regexp}*
249 |   Dump word statistics in database
250 |     {regexp}*    Regular expressions for words we are interested in
251 | 
252 |   spamoracle [-config conf] [-f db] backup > database.backup
253 |   Dump whole database in portable text format on standard output
254 | 
255 |   spamoracle [-config conf] [-f db] restore < database.backup
256 |   Restore database from text backup file read from standard input
257 | 
258 |   spamoracle [-config conf] [-f db] upgrade
259 |   Convert database to the latest format
260 | 
261 |   spamoracle [-config conf] [-f db] words {mailbox}*
262 |   Extract words from messages and print them
263 |     {mailbox}*   Mailboxes containing messages to scan
264 |                  If no mailbox given, read single msg from standard input
265 | 
266 |   Common options:
267 |     -config <conf> Configuration file (default $HOME/.spamoracle.conf)
268 |     -f <db>        Database to use (default $HOME/.spamoracle.db)"
269 | 
270 | let main () =
271 |   try
272 |     parse_args_1 (List.tl (Array.to_list Sys.argv))
273 |   with
274 |   | Usage msg ->
275 |       eprintf "%s\n%s\n" msg usage_string;
276 |       exit 2
277 |   | Sys_error msg ->
278 |       eprintf "System error: %s\n" msg;
279 |       exit 2
280 |   | Database.Error msg ->
281 |       eprintf "%s\n" msg;
282 |       exit 2
283 | 
284 | let _ = main()
285 | 
286 | 


--------------------------------------------------------------------------------