├── README ├── spamoracle.1 ├── wordsplit.mlp ├── .gitignore ├── wordsplit.mli ├── configfile.mli ├── htmlscan.mli ├── refhosts.mli ├── attachments.mli ├── rankmsg.mli ├── processing.mli ├── Changes ├── .depend ├── database.mli ├── refhosts.ml ├── Makefile ├── mbox.mli ├── config.ml ├── mail.mli ├── config.mli ├── attachments.ml ├── configfile.ml ├── mbox.ml ├── processing.ml ├── rankmsg.ml ├── database.ml ├── spamoracle.conf.5 ├── htmlscan.mll ├── mail.ml └── main.ml /README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavierleroy/spamoracle/HEAD/README -------------------------------------------------------------------------------- /spamoracle.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavierleroy/spamoracle/HEAD/spamoracle.1 -------------------------------------------------------------------------------- /wordsplit.mlp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavierleroy/spamoracle/HEAD/wordsplit.mlp -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.cm[iox] 2 | *.o 3 | spamoracle 4 | htmlscan.ml 5 | wordsplit.ml 6 | wordsplit.mll 7 | *~ 8 | -------------------------------------------------------------------------------- /wordsplit.mli: -------------------------------------------------------------------------------- 1 | (** Decompose a string into words. *) 2 | 3 | val iter: (string -> unit) -> (string -> bool) -> string -> unit 4 | (** [iter fn db txt] applies [fn] to each word in [txt]. 5 | [db] is a predicate that returns [true] for known words. 6 | It is used to recognize stretched-out words, e.g. [H.e.ll.o]. 7 | *) 8 | -------------------------------------------------------------------------------- /configfile.mli: -------------------------------------------------------------------------------- 1 | (* Parsing configuration files *) 2 | 3 | type value = 4 | | Bool of bool ref 5 | | String of string ref 6 | | Int of int ref 7 | | Float of float ref 8 | | Regexp of Str.regexp ref 9 | | OptBool of bool option ref 10 | | OptString of string option ref 11 | | OptInt of int option ref 12 | | OptFloat of float option ref 13 | | OptRegexp of Str.regexp option ref 14 | 15 | val parse: (string * value) list -> string -> (int * string) list 16 | -------------------------------------------------------------------------------- /htmlscan.mli: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (** Approximate HTML scanner. Extracts words and certain parameters 16 | of certain tags (e.g. URLs) from HTML text. *) 17 | 18 | val extract_text: string -> string 19 | -------------------------------------------------------------------------------- /refhosts.mli: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (* Extract hostnames or IP addresses referenced from http URLs 16 | in message bodies. *) 17 | 18 | val reset: unit -> unit 19 | val add: string -> unit 20 | val summarize: unit -> string 21 | -------------------------------------------------------------------------------- /attachments.mli: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (* Summarize the attachments of a message as one line that can be 16 | put in the header of the message. Allows procmail to filter 17 | suspicious attachments without looking at the message body. *) 18 | 19 | val summarize: Mail.message -> string 20 | -------------------------------------------------------------------------------- /rankmsg.mli: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (** Message ranking *) 16 | 17 | type rank = 18 | { spam_prob: float; 19 | num_meaningful: int; 20 | explanation: string } 21 | 22 | val rank_message: Database.short -> Mail.message -> rank 23 | val word_proba: int -> int -> int -> int -> float 24 | -------------------------------------------------------------------------------- /processing.mli: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (* Processing messages *) 16 | 17 | val mark_message : Database.short -> string -> unit 18 | val record_words : Database.full -> bool -> string -> unit 19 | val add_message : Database.full -> bool -> bool -> string -> unit 20 | val test_message : Database.short -> float -> float -> string -> string -> unit 21 | type message_class = Msg_good | Msg_unknown | Msg_spam 22 | val stat_message : Database.short -> string -> message_class 23 | val wordsplit_message : Database.short -> string -> unit 24 | -------------------------------------------------------------------------------- /Changes: -------------------------------------------------------------------------------- 1 | - Issue #2: correctly handle mails with CR-LF end-of-lines. 2 | 3 | Release 1.6: 4 | - Adapt to safe strings, using mutable byte arrays where needed. 5 | 6 | Release 1.5: 7 | - Be resilient to changes in OCaml's hash table implementation. 8 | - Added command "spamoracle upgrade" to convert the database to 9 | the current hash table format. 10 | 11 | Release 1.4: 12 | - More lenient rule for ignoring HTML comments. 13 | - Recognition of S P A C E D O-U-T words. 14 | - Fixed a bug causing certain text-only e-mails to be ignored. 15 | - Improved the "spamoracle test" command. 16 | - Added "spamoracle words" command for troubleshooting the word-splitting 17 | algorithm. 18 | 19 | Release 1.3: 20 | - Better scanning of words in HTML documents. 21 | - Better handling of multipart documents; now scan both text and HTML 22 | parts of a multipart/alternative. 23 | - Support for Portuguese and Japanese (JIS encoding) 24 | - Added configuration file (~/.spamoracle.conf) to allow twiddling 25 | with filtering parameters. 26 | - Fixed database corruption problem with concurrent adds. 27 | - Handles gzipped databases. 28 | - Added manual pages. 29 | 30 | Release 1.2: 31 | - When choosing part of a multipart/alternative, ignore pure text part 32 | if much smaller than HTML part. 33 | - Added commands "backup" and "restore" to facilitate future evolution 34 | towards other database formats. 35 | 36 | Release 1.1: 37 | - Fixed "usage" message. 38 | 39 | Release 1.0: 40 | - First public release. 41 | -------------------------------------------------------------------------------- /.depend: -------------------------------------------------------------------------------- 1 | attachments.cmo: mail.cmi attachments.cmi 2 | attachments.cmx: mail.cmx attachments.cmi 3 | configfile.cmo: configfile.cmi 4 | configfile.cmx: configfile.cmi 5 | config.cmo: configfile.cmi config.cmi 6 | config.cmx: configfile.cmx config.cmi 7 | database.cmo: database.cmi 8 | database.cmx: database.cmi 9 | htmlscan.cmo: config.cmi htmlscan.cmi 10 | htmlscan.cmx: config.cmx htmlscan.cmi 11 | mail.cmo: config.cmi htmlscan.cmi mail.cmi 12 | mail.cmx: config.cmx htmlscan.cmx mail.cmi 13 | main.cmo: config.cmi configfile.cmi database.cmi mbox.cmi processing.cmi 14 | main.cmx: config.cmx configfile.cmx database.cmx mbox.cmx processing.cmx 15 | mbox.cmo: mbox.cmi 16 | mbox.cmx: mbox.cmi 17 | processing.cmo: attachments.cmi config.cmi database.cmi mail.cmi rankmsg.cmi \ 18 | refhosts.cmi wordsplit.cmi processing.cmi 19 | processing.cmx: attachments.cmx config.cmx database.cmx mail.cmx rankmsg.cmx \ 20 | refhosts.cmx wordsplit.cmx processing.cmi 21 | rankmsg.cmo: config.cmi database.cmi mail.cmi refhosts.cmi wordsplit.cmi \ 22 | rankmsg.cmi 23 | rankmsg.cmx: config.cmx database.cmx mail.cmx refhosts.cmx wordsplit.cmx \ 24 | rankmsg.cmi 25 | refhosts.cmo: refhosts.cmi 26 | refhosts.cmx: refhosts.cmi 27 | virus.cmo: mail.cmi mbox.cmi zip.cmo 28 | virus.cmx: mail.cmx mbox.cmx zip.cmx 29 | wordsplit.cmo: wordsplit.cmi 30 | wordsplit.cmx: wordsplit.cmi 31 | attachments.cmi: mail.cmi 32 | config.cmi: configfile.cmi 33 | processing.cmi: database.cmi 34 | rankmsg.cmi: database.cmi mail.cmi 35 | -------------------------------------------------------------------------------- /database.mli: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (** Word frequency database *) 16 | 17 | type short = { 18 | s_num_good: int; 19 | s_num_spam: int; 20 | s_freq: (string, int * int) Hashtbl.t 21 | } 22 | 23 | type full = { 24 | mutable f_num_good: int; 25 | mutable f_num_spam: int; 26 | f_high_freq: (string, int * int) Hashtbl.t; 27 | f_low_freq: (string, int * int) Hashtbl.t 28 | } 29 | 30 | val read_short: string -> short 31 | val read_full: string -> full 32 | val write_full: string -> full -> unit 33 | val create: int -> full 34 | val add_good: full -> string -> unit 35 | val add_spam: full -> string -> unit 36 | val dump: full -> out_channel -> unit 37 | val restore: in_channel -> full 38 | val in_short: short -> string -> bool 39 | val in_full: full -> string -> bool 40 | val current_version: int 41 | 42 | exception Error of string 43 | -------------------------------------------------------------------------------- /refhosts.ml: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (* Extract hostnames or IP addresses referenced from http URLs 16 | in message bodies. *) 17 | 18 | let re_url = 19 | Str.regexp_case_fold 20 | "http://\\([^@]+@\\)?\\([a-z0-9-]+\\(\\.[a-z0-9-]+\\)+\\)" 21 | 22 | module StringSet = Set.Make(String) 23 | 24 | let hosts = ref StringSet.empty 25 | 26 | let reset() = hosts := StringSet.empty 27 | 28 | let rec add_urls txt pos = 29 | let matched = 30 | try ignore (Str.search_forward re_url txt pos); true 31 | with Not_found -> false in 32 | if matched then begin 33 | hosts := StringSet.add (Str.matched_group 2 txt) !hosts; 34 | add_urls txt (Str.match_end()) 35 | end 36 | 37 | let add txt = 38 | add_urls txt 0 39 | 40 | let summarize () = 41 | let lst = StringSet.elements !hosts in 42 | hosts := StringSet.empty; 43 | String.concat " " lst 44 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ### Configuration section 2 | 3 | # The laguages you're interested in, besides English 4 | LANGUAGES=-DFRENCH #-DSPANISH -DITALIAN -DGERMAN -DPORTUGUESE -DJAPANESE 5 | 6 | # How to invoke the C preprocessor 7 | CPP=gcc -E -P $(LANGUAGES) - 8 | 9 | # Where to install the binary 10 | BINDIR=/usr/local/bin 11 | 12 | # Where to install the man pages 13 | MANDIR=/usr/local/man 14 | 15 | ### End of configuration section 16 | 17 | OCAMLC=ocamlc -g 18 | OCAMLLEX=ocamllex 19 | OCAMLDEP=ocamldep 20 | OCAMLOPT=ocamlopt 21 | 22 | BYTEOBJS=configfile.cmo config.cmo \ 23 | htmlscan.cmo mail.cmo database.cmo mbox.cmo wordsplit.cmo \ 24 | refhosts.cmo rankmsg.cmo attachments.cmo processing.cmo main.cmo 25 | BYTELIBS=unix.cma str.cma 26 | 27 | NATOBJS=$(BYTEOBJS:.cmo=.cmx) 28 | NATLIBS=$(BYTELIBS:.cma=.cmxa) 29 | 30 | all: spamoracle 31 | 32 | install: 33 | cp spamoracle $(BINDIR)/spamoracle 34 | cp spamoracle.1 $(MANDIR)/man1/spamoracle.1 35 | cp spamoracle.conf.5 $(MANDIR)/man5/spamoracle.conf.5 36 | 37 | spamoracle: $(NATOBJS) 38 | $(OCAMLOPT) -o spamoracle $(NATLIBS) $(NATOBJS) 39 | 40 | clean:: 41 | rm -f spamoracle 42 | 43 | spamoracle.byte: $(BYTEOBJS) 44 | $(OCAMLC) -o spamoracle.byte $(BYTELIBS) $(BYTEOBJS) 45 | 46 | clean:: 47 | rm -f spamoracle.byte 48 | 49 | wordsplit.mll: wordsplit.mlp 50 | $(CPP) < wordsplit.mlp > wordsplit.mll \ 51 | || { rm -f wordsplit.mll; exit 2; } 52 | 53 | clean:: 54 | rm -f wordsplit.mll 55 | 56 | wordsplit.ml: wordsplit.mll 57 | $(OCAMLLEX) wordsplit.mll 58 | 59 | clean:: 60 | rm -f wordsplit.ml 61 | 62 | beforedepend:: wordsplit.ml 63 | 64 | htmlscan.ml: htmlscan.mll 65 | $(OCAMLLEX) htmlscan.mll 66 | 67 | clean:: 68 | rm -f htmlscan.ml 69 | 70 | beforedepend:: htmlscan.ml 71 | 72 | clean:: 73 | rm -f *.cm[iox] *.o 74 | 75 | .SUFFIXES: .ml .mli .cmo .cmi .cmx 76 | 77 | .mli.cmi: 78 | $(OCAMLC) -c $< 79 | .ml.cmo: 80 | $(OCAMLC) -c $< 81 | .ml.cmx: 82 | $(OCAMLOPT) -c $< 83 | 84 | depend: beforedepend 85 | $(OCAMLDEP) *.ml *.mli > .depend 86 | 87 | include .depend 88 | -------------------------------------------------------------------------------- /mbox.mli: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (** Reading of a mailbox file and splitting into individual messages *) 16 | 17 | type t 18 | (** The type of channels opened on a mailbox *) 19 | 20 | val open_mbox_file: string -> t 21 | (** Open the given file name as a mailbox, and return an mbox channel 22 | ready for reading. If the file name ends in [.gz], arrange 23 | for on-the-fly decompression with [zcat]. *) 24 | val open_mbox_channel: in_channel -> t 25 | (** Open the given input channel as a mailbox. *) 26 | val read_msg: t -> string 27 | (** Read the next message from the given channel, and return it 28 | as a string. Raise [End_of_file] if no message remains. *) 29 | val close_mbox: t -> unit 30 | (** Close the given mbox channel. *) 31 | val mbox_file_iter: string -> (string -> unit) -> unit 32 | (** [mbox_file_iter filename fn] reads messages from the file named 33 | [filename], and applies [fn] in turn to each message. *) 34 | val mbox_channel_iter: in_channel -> (string -> unit) -> unit 35 | (** [mbox_channel_iter ic fn] reads messages from the input channel 36 | [ic], and applies [fn] in turn to each message. *) 37 | 38 | val read_single_msg: in_channel -> string 39 | (** Read one message from the given channel, up to end of file *) 40 | -------------------------------------------------------------------------------- /config.ml: -------------------------------------------------------------------------------- 1 | (* Configurable parameters *) 2 | 3 | let database_name = 4 | ref (try Filename.concat (Sys.getenv "HOME") ".spamoracle.db" 5 | with Not_found -> ".spamoracle.db") 6 | 7 | let html_add_tags = ref false 8 | 9 | let html_tag_attr = ref (Str.regexp_case_fold 10 | "a/href\\|img/src\\|img/alt\\|frame/src\\|font/face\\|font/color") 11 | 12 | let mail_headers = ref (Str.regexp_case_fold 13 | "from:\\|subject:") 14 | 15 | let alternative_favor_html = ref true 16 | 17 | let spam_header = ref "X-Spam" 18 | 19 | let attachments_header = ref "X-Attachments" 20 | 21 | let summarize_attachments = ref true 22 | 23 | let num_words_retained = ref 15 24 | 25 | let max_repetitions = ref 2 26 | 27 | let robinson_s = ref 0.0 28 | let robinson_x = ref 0.5 29 | 30 | let low_freq_limit = ref 0.01 31 | let high_freq_limit = ref 0.99 32 | 33 | let use_chi_square = ref false 34 | 35 | let good_mail_prob = ref 0.2 36 | let spam_mail_prob = ref 0.8 37 | 38 | let min_meaningful_words = ref 5 39 | 40 | let summarize_referenced = ref false 41 | 42 | let referenced_header = ref "X-Referenced-Hosts" 43 | 44 | let reassemble_words = ref false 45 | 46 | let external_converter = ref "" 47 | 48 | open Configfile 49 | 50 | let options = [ 51 | "database_file", String database_name; 52 | "html_retain_tags", Bool html_add_tags; 53 | "html_tag_attributes", Regexp html_tag_attr; 54 | "mail_headers", Regexp mail_headers; 55 | "alternative_favor_html", Bool alternative_favor_html; 56 | "spam_header", String spam_header; 57 | "attachments_header", String attachments_header; 58 | "summarize_attachments", Bool summarize_attachments; 59 | "referenced_header", String referenced_header; 60 | "summarize_referenced", Bool summarize_referenced; 61 | "num_meaningful_words", Int num_words_retained; 62 | "max_repetitions", Int max_repetitions; 63 | "low_freq_limit", Float low_freq_limit; 64 | "high_freq_limit", Float high_freq_limit; 65 | "min_meaningful_words", Int min_meaningful_words; 66 | "good_mail_prob", Float good_mail_prob; 67 | "spam_mail_prob", Float spam_mail_prob; 68 | "robinson_s", Float robinson_s; 69 | "robinson_x", Float robinson_x; 70 | "use_chi_square", Bool use_chi_square; 71 | "reassemble_words", Bool reassemble_words; 72 | "external_converter", String external_converter 73 | ] 74 | 75 | -------------------------------------------------------------------------------- /mail.mli: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (** Parsing of e-mail messages, including attachments *) 16 | 17 | type message = 18 | { headers: (string * string) list; 19 | body: string; 20 | parts: message list } 21 | (** The type of parsed e-mail messages. 22 | - [headers] is an A-list of pairs [(header-name, header-content)]. 23 | [header-name] is lowercased and includes [:], e.g. [subject:]. 24 | - [body] is the body of the message. Base64 and quoted-printable 25 | encodings are already decoded. For multipart messages, [body] 26 | is the initial blurb before the first part. 27 | - [parts] is empty except for multipart messages, in which case 28 | it lists all parts, recursively represented as messages. *) 29 | 30 | val parse_message: string -> message 31 | (** Parse the given textual message and return its structure. *) 32 | 33 | val header: string -> message -> string 34 | (** [header h msg] returns the contents of header named [h] 35 | in message [msg], or the empty string if this header is missing. 36 | Remember that header names are lowercased and include the final [:], 37 | e.g. [subject:]. *) 38 | 39 | val iter_text_parts: (message -> unit) -> message -> unit 40 | (** [iter_text_parts fn msg] applies [fn] to every (sub-)message 41 | contained in [msg] that is of type text. *) 42 | 43 | val iter_message: (string -> unit) -> message -> unit 44 | (** [iter_message fn msg] applies [fn] to the following parts of 45 | message [msg]: 46 | - the headers that match [!Config.mail_headers]; 47 | - the body of every sub-message of [msg] that is of type text. 48 | *) 49 | -------------------------------------------------------------------------------- /config.mli: -------------------------------------------------------------------------------- 1 | (* Configurable parameters *) 2 | 3 | val database_name : string ref 4 | (** Name of database file *) 5 | 6 | val html_add_tags : bool ref 7 | (** Whether to treat HTML tag names as words *) 8 | 9 | val html_tag_attr : Str.regexp ref 10 | (** Regexp matching [tag/attr] strings denoting pairs of HTML tag and 11 | attribute names. If a tag and attribute pair matches, the associated 12 | value is added to the text. *) 13 | 14 | val mail_headers : Str.regexp ref 15 | (** Regexp matching names of e-mail headers that must be analyzed. *) 16 | 17 | val alternative_favor_html : bool ref 18 | (** If true, consider only the HTML part of a multipart/alternative. 19 | Otherwise, consider all parts. *) 20 | 21 | val spam_header : string ref 22 | (** Name of header added with spam / not-spam info (default: "X-Spam") *) 23 | 24 | val attachments_header : string ref 25 | (** Name of header added with attachment summary (default: "X-Attachments") *) 26 | 27 | val summarize_attachments : bool ref 28 | (** Whether to generate the attachment summary *) 29 | 30 | val num_words_retained : int ref 31 | (** Number of meaningful words to retain for computing final prob. *) 32 | 33 | val max_repetitions : int ref 34 | (** Among the meaningful words, max number of time a given word 35 | can appear. *) 36 | 37 | val low_freq_limit : float ref 38 | (** Lower limit for word frequencies. Default is 0.001. *) 39 | 40 | val high_freq_limit : float ref 41 | (** Upper limit for word frequencies. Default is 0.999. *) 42 | 43 | val robinson_s : float ref 44 | val robinson_x : float ref 45 | (** Robinson's parameters for taking word frequencies into account. *) 46 | 47 | val use_chi_square : bool ref 48 | (** Use Robinson's chi-square test *) 49 | 50 | val min_meaningful_words : int ref 51 | (** Number of meaningful words below which mails are classified as unknown *) 52 | val good_mail_prob : float ref 53 | (** Spam probability below which mails are classified as good *) 54 | val spam_mail_prob : float ref 55 | (** Spam probability below which mails are classified as spam *) 56 | 57 | val summarize_referenced : bool ref 58 | 59 | val referenced_header : string ref 60 | 61 | val reassemble_words : bool ref 62 | 63 | val external_converter : string ref 64 | (** Program to be called on message parts that are not text. 65 | The program receives the content-type as first argument 66 | and the actual data on standard input. 67 | It should output the corresponding text on standard output, 68 | or exit with non-zero error code if it cannot extract text. *) 69 | 70 | val options : (string * Configfile.value) list 71 | (** List of configurable parameters *) 72 | -------------------------------------------------------------------------------- /attachments.ml: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (* Summarize the attachments of a message as one line that can be 16 | put in the header of the message. Allows procmail to filter 17 | suspicious attachments without looking at the message body. *) 18 | 19 | open Printf 20 | open Mail 21 | 22 | let re_content_type = 23 | Str.regexp "\\([/a-zA-Z0-9-]+\\)" 24 | let re_innocuous_content_types = 25 | Str.regexp_case_fold "text/plain\\|text/html\\|text/x-vcard\\|multipart/\\|message/rfc822\\|message/delivery-status" 26 | let re_charset = 27 | Str.regexp_case_fold "charset=\\(\"\\([^\"]+\\)\"\\|[^ \t;]+\\)" 28 | let re_innocuous_charsets = 29 | Str.regexp_case_fold "us-ascii\\|iso[-_]8859[-_]1$\\|iso[-_]8859[-_]15\\|windows-1252" 30 | let re_name = 31 | Str.regexp_case_fold "name=\\(\"\\([^\"]+\\)\"\\|[^ \t;]+\\)" 32 | 33 | let match_anchored re s = 34 | Str.string_match re s 0 35 | let match_unanchored re s = 36 | try ignore (Str.search_forward re s 0); true with Not_found -> false 37 | 38 | let summarize msg = 39 | let res = Buffer.create 200 in 40 | let rec summ m = 41 | let h = header "content-type:" m in 42 | if match_anchored re_content_type h then begin 43 | let c = Str.matched_group 1 h in 44 | if not (Str.string_match re_innocuous_content_types c 0) then 45 | bprintf res "type=\"%s\" " c 46 | end; 47 | if match_unanchored re_charset h then begin 48 | let c = 49 | try Str.matched_group 2 h with Not_found -> Str.matched_group 1 h in 50 | if not (Str.string_match re_innocuous_charsets c 0) then 51 | bprintf res "cset=\"%s\" " c 52 | end; 53 | if match_unanchored re_name h then begin 54 | let c = 55 | try Str.matched_group 2 h with Not_found -> Str.matched_group 1 h in 56 | bprintf res "name=\"%s\" " c 57 | end; 58 | let h = header "content-disposition:" m in 59 | if match_unanchored re_name h then begin 60 | let c = 61 | try Str.matched_group 2 h with Not_found -> Str.matched_group 1 h in 62 | bprintf res "name=\"%s\" " c 63 | end; 64 | List.iter summ m.parts in 65 | List.iter summ msg.parts; 66 | Buffer.contents res 67 | -------------------------------------------------------------------------------- /configfile.ml: -------------------------------------------------------------------------------- 1 | (* Parsing configuration files *) 2 | 3 | type value = 4 | | Bool of bool ref 5 | | String of string ref 6 | | Int of int ref 7 | | Float of float ref 8 | | Regexp of Str.regexp ref 9 | | OptBool of bool option ref 10 | | OptString of string option ref 11 | | OptInt of int option ref 12 | | OptFloat of float option ref 13 | | OptRegexp of Str.regexp option ref 14 | 15 | exception Error of string 16 | 17 | let re_bool_yes = Str.regexp_case_fold "\\(on\\|yes\\|true\\|1\\)[ \t\r]*$" 18 | let re_bool_no = Str.regexp_case_fold "\\(off\\|no\\|false\\|0\\)[ \t\r]*$" 19 | 20 | let parse_bool data = 21 | if Str.string_match re_bool_yes data 0 then true 22 | else if Str.string_match re_bool_no data 0 then false 23 | else raise (Error "invalid boolean value") 24 | 25 | let trim_spaces s = 26 | let i = ref (String.length s - 1) in 27 | while !i >= 0 && (let c = s.[!i] in c = ' ' || c = '\t' || c = '\r') 28 | do decr i done; 29 | String.sub s 0 (!i + 1) 30 | 31 | let parse_string data = 32 | try 33 | Scanf.sscanf data "%S" (fun s -> s) 34 | with Scanf.Scan_failure _ -> 35 | trim_spaces data 36 | 37 | let parse_int data = 38 | try 39 | Scanf.sscanf data "%i" (fun n -> n) 40 | with Scanf.Scan_failure _ | Failure _ -> 41 | raise (Error ("invalid integer value")) 42 | 43 | let parse_float data = 44 | try 45 | Scanf.sscanf data "%f" (fun n -> n) 46 | with Scanf.Scan_failure _ | Failure _ -> 47 | raise (Error ("invalid floating-point value")) 48 | 49 | let parse_regexp data = 50 | try 51 | Str.regexp_case_fold (trim_spaces data) 52 | with Failure msg -> 53 | raise (Error ("invalid regular expression: " ^ msg)) 54 | 55 | let parse_data valuedesc data = 56 | match valuedesc with 57 | | Bool r -> r := parse_bool data 58 | | String r -> r := parse_string data 59 | | Int r -> r := parse_int data 60 | | Float r -> r := parse_float data 61 | | Regexp r -> r := parse_regexp data 62 | | OptBool r -> r := Some(parse_bool data) 63 | | OptString r -> r := Some(parse_string data) 64 | | OptInt r -> r := Some(parse_int data) 65 | | OptFloat r -> r := Some(parse_float data) 66 | | OptRegexp r -> r := Some(parse_regexp data) 67 | 68 | let re_line = 69 | Str.regexp "[ \t]*\\([A-Za-z][A-Za-z0-9_]*\\)[ \t]*=[ \t]*\\(.*\\)" 70 | let re_skip = 71 | Str.regexp "#\\|[ \t\r]*$" 72 | 73 | let parse_line opts s = 74 | if Str.string_match re_line s 0 then begin 75 | let key = Str.matched_group 1 s and data = Str.matched_group 2 s in 76 | try 77 | parse_data (List.assoc key opts) data 78 | with Not_found -> 79 | raise (Error ("unknown variable " ^ key)) 80 | end 81 | else if not (Str.string_match re_skip s 0) then 82 | raise (Error "ill-formed line") 83 | 84 | let parse opts filename = 85 | let ic = open_in filename in 86 | let lineno = ref 1 in 87 | let errors = ref [] in 88 | begin try 89 | while true do 90 | let s = input_line ic in 91 | begin try 92 | parse_line opts s 93 | with Error msg -> 94 | errors := (!lineno, msg) :: !errors 95 | end; 96 | incr lineno 97 | done 98 | with End_of_file -> 99 | close_in ic 100 | end; 101 | List.rev !errors 102 | -------------------------------------------------------------------------------- /mbox.ml: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (* Reading of a mailbox file and splitting into individual messages *) 16 | 17 | type t = 18 | { ic: in_channel; 19 | zipped: bool; 20 | mutable start: string; 21 | buf: Buffer.t } 22 | 23 | let open_mbox_file filename = 24 | if Filename.check_suffix filename ".gz" then 25 | { ic = Unix.open_process_in ("gunzip -c " ^filename); 26 | zipped = true; 27 | start = ""; 28 | buf = Buffer.create 50000 } 29 | else 30 | { ic = open_in filename; 31 | zipped = false; 32 | start = ""; 33 | buf = Buffer.create 50000 } 34 | 35 | let open_mbox_channel ic = 36 | { ic = ic; 37 | zipped = false; 38 | start = ""; 39 | buf = Buffer.create 50000 } 40 | 41 | let re_crlf = Str.regexp "\r+\n" 42 | 43 | let normalize_eol s = 44 | match String.index s '\r' with 45 | | exception Not_found -> s 46 | | _ -> Str.global_replace re_crlf "\n" s 47 | 48 | let read_msg t = 49 | Buffer.clear t.buf; 50 | Buffer.add_string t.buf t.start; 51 | let rec read () = 52 | let line = input_line t.ic in 53 | if String.length line >= 5 54 | && String.sub line 0 5 = "From " 55 | && Buffer.length t.buf > 0 then begin 56 | t.start <- (line ^ "\n"); 57 | normalize_eol (Buffer.contents t.buf) 58 | end else begin 59 | Buffer.add_string t.buf line; 60 | Buffer.add_char t.buf '\n'; 61 | read () 62 | end in 63 | try 64 | read() 65 | with End_of_file -> 66 | if Buffer.length t.buf > 0 then begin 67 | t.start <- ""; 68 | normalize_eol (Buffer.contents t.buf) 69 | end else 70 | raise End_of_file 71 | 72 | let close_mbox t = 73 | if t.zipped 74 | then ignore(Unix.close_process_in t.ic) 75 | else close_in t.ic 76 | 77 | let mbox_file_iter filename fn = 78 | let ic = open_mbox_file filename in 79 | try 80 | while true do fn(read_msg ic) done 81 | with End_of_file -> 82 | close_mbox ic 83 | 84 | let mbox_channel_iter inchan fn = 85 | let ic = open_mbox_channel inchan in 86 | try 87 | while true do fn(read_msg ic) done 88 | with End_of_file -> 89 | close_mbox ic 90 | 91 | let read_single_msg inchan = 92 | let res = Buffer.create 10000 in 93 | let buf = Bytes.create 1024 in 94 | let rec read () = 95 | let n = input inchan buf 0 (Bytes.length buf) in 96 | if n > 0 then begin 97 | Buffer.add_subbytes res buf 0 n; 98 | read () 99 | end in 100 | read (); 101 | normalize_eol (Buffer.contents res) 102 | -------------------------------------------------------------------------------- /processing.ml: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (* Processing messages *) 16 | 17 | open Printf 18 | open Mail 19 | open Database 20 | open Rankmsg 21 | 22 | (* Mark message with rank info *) 23 | 24 | let re_nl_nl = Str.regexp "\n\n" 25 | 26 | let mark_message db txt = 27 | let m = parse_message txt in 28 | let r = rank_message db m in 29 | try 30 | let pos_sep = Str.search_forward re_nl_nl txt 0 in 31 | output_substring stdout txt 0 pos_sep; 32 | let verdict = 33 | if r.spam_prob <= !Config.good_mail_prob 34 | && r.num_meaningful >= !Config.min_meaningful_words then "no" 35 | else if r.spam_prob >= !Config.spam_mail_prob 36 | && r.num_meaningful >= !Config.min_meaningful_words then "yes" 37 | else "unknown" in 38 | printf "\n%s: %s; %.2f; %s" 39 | !Config.spam_header verdict r.spam_prob r.explanation; 40 | if !Config.summarize_attachments then begin 41 | let att = Attachments.summarize m in 42 | if att <> "" then 43 | printf "\n%s: %s" !Config.attachments_header att; 44 | end; 45 | if !Config.summarize_referenced then begin 46 | let refh = Refhosts.summarize () in 47 | if refh <> "" then 48 | printf "\n%s: %s" !Config.referenced_header refh; 49 | end; 50 | output_substring stdout txt pos_sep (String.length txt - pos_sep) 51 | with Not_found -> 52 | eprintf "spamoracle mark: ill-formed message (no header-body separation)\n"; 53 | print_string txt 54 | 55 | (* Add messages to database *) 56 | 57 | let record_words db is_spam txt = 58 | Wordsplit.iter 59 | (fun w -> 60 | if is_spam then add_spam db w else add_good db w) 61 | (in_full db) txt 62 | 63 | let add_message db verbose is_spam msg = 64 | if verbose then begin 65 | printf "\r%6d / %6d" db.f_num_good db.f_num_spam; 66 | flush stdout 67 | end; 68 | iter_message (record_words db is_spam) (parse_message msg); 69 | if is_spam 70 | then db.f_num_spam <- db.f_num_spam + 1 71 | else db.f_num_good <- db.f_num_good + 1 72 | 73 | (* Test analysis on a message *) 74 | 75 | let test_message db low high f txt = 76 | let msg = parse_message txt in 77 | let r = rank_message db msg in 78 | if r.spam_prob >= low && r.spam_prob <= high then begin 79 | printf "--------------------------------------------------\n"; 80 | printf "From: %s\n" (header "from:" msg); 81 | printf "Subject: %s\n" (header "subject:" msg); 82 | printf "Score: %.2f -- %d\n" r.spam_prob r.num_meaningful; 83 | printf "Details: %s\n" r.explanation; 84 | if !Config.summarize_attachments then begin 85 | let att = Attachments.summarize msg in 86 | if att <> "" then printf "Attachments: %s\n" att 87 | end; 88 | if !Config.summarize_referenced then begin 89 | let refh = Refhosts.summarize () in 90 | if refh <> "" then printf "Referenced hosts: %s\n" refh 91 | end; 92 | printf "File: %s\n" f; 93 | end 94 | 95 | (* Statistics *) 96 | 97 | type message_class = Msg_good | Msg_unknown | Msg_spam 98 | 99 | let stat_message db txt = 100 | let msg = parse_message txt in 101 | let r = rank_message db msg in 102 | if r.spam_prob <= 0.2 && r.num_meaningful >= 5 then Msg_good 103 | else if r.spam_prob >= 0.8 && r.num_meaningful >= 5 then Msg_spam 104 | else Msg_unknown 105 | 106 | (* Word splitting *) 107 | 108 | let wordsplit_message db txt = 109 | Format.open_hovbox 0; 110 | Mail.iter_message 111 | (fun txt -> 112 | Wordsplit.iter 113 | (fun word -> Format.print_string word; Format.print_space()) 114 | (in_short db) txt) 115 | (parse_message txt); 116 | Format.close_box(); 117 | Format.print_newline() 118 | -------------------------------------------------------------------------------- /rankmsg.ml: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (* Message ranking *) 16 | 17 | open Mail 18 | open Database 19 | 20 | let word_count_in w res = 21 | let count = ref 0 in 22 | for i = 0 to Array.length res - 1 do 23 | if w = fst res.(i) then incr count 24 | done; 25 | !count 26 | 27 | let add_word w p res = 28 | let i = ref 0 in 29 | while !i < Array.length res 30 | && abs_float (p -. 0.5) <= abs_float(snd res.(!i) -. 0.5) 31 | do 32 | incr i 33 | done; 34 | if !i < Array.length res then begin 35 | for j = Array.length res - 1 downto !i + 1 do 36 | res.(j) <- res.(j - 1) 37 | done; 38 | res.(!i) <- (w, p) 39 | end 40 | 41 | let normalize (p : float) low high = 42 | if p > high then high else if p < low then low else p 43 | 44 | let cap (p : float) = 45 | if p > 1.0 then 1.0 else p 46 | 47 | let word_proba g b num_g num_b = 48 | let g = 2 * g in (* Graham's magic factor to bias in favor of ham *) 49 | let pgood = cap (float g /. float num_g) 50 | and pbad = cap (float b /. float num_b) in 51 | let p = pbad /. (pgood +. pbad) in 52 | if !Config.robinson_s = 0.0 then 53 | normalize p !Config.low_freq_limit !Config.high_freq_limit 54 | else begin 55 | (* Robinson's adjustement *) 56 | let n = float (g + b) in 57 | let p = 58 | (!Config.robinson_s *. !Config.robinson_x +. n *. p) 59 | /. (!Config.robinson_s +. n) in 60 | (* Result normalization *) 61 | normalize p !Config.low_freq_limit !Config.high_freq_limit 62 | end 63 | 64 | let process_word (db, res) w = 65 | try 66 | let (g, b) = Hashtbl.find db.s_freq w in 67 | if word_count_in w res < !Config.max_repetitions then begin 68 | let p = word_proba g b db.s_num_good db.s_num_spam in 69 | add_word w p res 70 | end 71 | with Not_found -> 72 | () 73 | 74 | let process_words ((db, res) as ctx) txt = 75 | Wordsplit.iter (process_word ctx) (in_short db) txt; 76 | if !Config.summarize_referenced then Refhosts.add txt 77 | 78 | let process_msg ctx m = 79 | iter_message (process_words ctx) m 80 | 81 | (* This is Graham's original approach *) 82 | 83 | let spaminess_score_graham res = 84 | let p = ref 1.0 and pexp = ref 0 85 | and cp = ref 1.0 and cpexp = ref 0 in 86 | for i = 0 to Array.length res - 1 do 87 | let (_, x) = res.(i) in 88 | p := !p *. x; 89 | if !p <= 1e-100 then begin 90 | let (m, e) = frexp !p in p := m; pexp := !pexp + e 91 | end; 92 | cp := !cp *. (1.0 -. x); 93 | if !cp <= 1e-100 then begin 94 | let (m, e) = frexp !cp in cp := m; cpexp := !cpexp + e 95 | end 96 | done; 97 | if !cpexp < !pexp then cp := ldexp !cp (!cpexp - !pexp) 98 | else if !cpexp > !pexp then p := ldexp !p (!pexp - !cpexp); 99 | !p /. (!p +. !cp) 100 | 101 | (* This is Robinson's chi-square stuff *) 102 | 103 | let chi2_inverse m n = (* chi2 inverse of 2m with 2n degrees *) 104 | let t = ref (exp (-. m)) in 105 | let s = ref !t in 106 | for i = 1 to n do 107 | t := !t *. m /. float i; 108 | s := !s +. !t 109 | done; 110 | if !s >= 1.0 then 1.0 else !s 111 | 112 | let log2 = log 2.0 113 | 114 | let chi2_hypothesis ps = 115 | (* Compute -2 * ln (product ps). Be careful with underflows. *) 116 | let p = ref 1.0 and pexp = ref 0 in 117 | for i = 0 to Array.length ps - 1 do 118 | p := !p *. ps.(i); 119 | if !p <= 1e-100 then begin 120 | let (x, e) = frexp !p in p := x; pexp := !pexp + e 121 | end 122 | done; 123 | chi2_inverse (-. (log !p +. log2 *. float !pexp)) (Array.length ps) 124 | 125 | let spaminess_score_robinson res = 126 | let probs = Array.map snd res in 127 | let cprobs = Array.map (fun x -> 1.0 -. x) probs in 128 | 0.5 *. (1.0 +. chi2_hypothesis probs -. chi2_hypothesis cprobs) 129 | 130 | type rank = 131 | { spam_prob: float; 132 | num_meaningful: int; 133 | explanation: string } 134 | 135 | let rank_message db msg = 136 | Refhosts.reset(); 137 | let res = Array.make !Config.num_words_retained ("", 0.5) in 138 | process_msg (db, res) msg; 139 | let p = 140 | if !Config.use_chi_square 141 | then spaminess_score_robinson res 142 | else spaminess_score_graham res in 143 | let meaningful = ref 0 in 144 | while !meaningful < Array.length res && fst res.(!meaningful) <> "" 145 | do incr meaningful done; 146 | let summary = Buffer.create 200 in 147 | for i = 0 to !meaningful - 1 do 148 | let (w, p) = res.(i) in 149 | Printf.bprintf summary "%s:%02d " w (truncate (p *. 100.0)) 150 | done; 151 | { spam_prob = p; 152 | num_meaningful = !meaningful; 153 | explanation = Buffer.contents summary } 154 | 155 | -------------------------------------------------------------------------------- /database.ml: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (* Word frequency database *) 16 | 17 | exception Error of string 18 | 19 | type short = { 20 | s_num_good: int; 21 | s_num_spam: int; 22 | s_freq: (string, int * int) Hashtbl.t 23 | } 24 | 25 | type full = { 26 | mutable f_num_good: int; 27 | mutable f_num_spam: int; 28 | f_high_freq: (string, int * int) Hashtbl.t; 29 | f_low_freq: (string, int * int) Hashtbl.t 30 | } 31 | 32 | let magic = "Mailscrubber" (* + 4 digits for version number *) 33 | 34 | let check_magic filename ic = 35 | let mlen = String.length magic in 36 | let buf = really_input_string ic (mlen + 4) in 37 | if String.sub buf 0 mlen <> magic then 38 | raise(Error(filename ^ ": bad magic number")); 39 | try 40 | int_of_string (String.sub buf mlen 4) 41 | with Failure _ -> 42 | raise(Error(filename ^ ": bad magic number")); 43 | 44 | type db_chan = {zipped : bool ; ic : in_channel} 45 | 46 | let open_db filename = 47 | if Filename.check_suffix filename ".gz" then 48 | { ic = Unix.open_process_in ("gunzip -c " ^ filename); 49 | zipped = true; } 50 | else 51 | { ic = open_in_bin filename ; 52 | zipped = false } 53 | 54 | let close_db {zipped = zipped ; ic = ic } = 55 | if zipped 56 | then ignore(Unix.close_process_in ic) 57 | else close_in ic 58 | 59 | let current_version = 60 | if Sys.ocaml_version < "4.03" then 1 61 | else 2 62 | 63 | let read_hashtbl filename ic version = 64 | try 65 | let tbl : ('a, 'b) Hashtbl.t = Marshal.from_channel ic in 66 | if version = current_version then tbl 67 | else if version > current_version then 68 | raise (Error(filename ^ ": database version not supported")) 69 | else begin 70 | Printf.eprintf "%s: converting from version %d to version %d\n\ 71 | Run 'spamoracle upgrade' to suppress this warning.\n%!" 72 | filename version current_version; 73 | let tbl' = Hashtbl.create (Hashtbl.length tbl / 3) in 74 | Hashtbl.iter (fun k d -> Hashtbl.add tbl' k d) tbl; 75 | tbl' 76 | end 77 | with Failure _ -> 78 | raise (Error(filename ^ ": database is corrupted")) 79 | 80 | let read_short filename = 81 | let {ic=ic ; zipped=zipped} as db_ic = open_db filename in 82 | let version = check_magic filename ic in 83 | let ng = input_binary_int ic in 84 | let ns = input_binary_int ic in 85 | let freq = read_hashtbl filename ic version in 86 | close_db db_ic; 87 | { s_num_good = ng; s_num_spam = ns; s_freq = freq } 88 | 89 | let read_full filename = 90 | let {ic=ic ; zipped=zipped} as db_ic = open_db filename in 91 | let version = check_magic filename ic in 92 | let ng = input_binary_int ic in 93 | let ns = input_binary_int ic in 94 | let high_freq = read_hashtbl filename ic version in 95 | let low_freq = read_hashtbl filename ic version in 96 | close_db db_ic; 97 | { f_num_good = ng; f_num_spam = ns; 98 | f_low_freq = low_freq; f_high_freq = high_freq } 99 | 100 | let temp_file basename = 101 | let pid = Unix.getpid() in 102 | let rec tmpfile counter = 103 | if counter > 10000 then raise (Error "cannot create temporary database"); 104 | let filename = basename ^ string_of_int (pid + counter) in 105 | try 106 | (filename, 107 | open_out_gen [Open_wronly; Open_creat; Open_excl; Open_binary] 0o600 108 | filename) 109 | with Sys_error _ -> 110 | tmpfile (counter + 1) 111 | in tmpfile 0 112 | 113 | let write_full filename db = 114 | let basename, zip = 115 | if Filename.check_suffix filename ".gz" then 116 | Filename.chop_suffix filename ".gz", true 117 | else 118 | filename, false in 119 | let (tempname, oc) = temp_file (basename ^ ".tmp") in 120 | Printf.fprintf oc "%s%04d" magic current_version; 121 | output_binary_int oc db.f_num_good; 122 | output_binary_int oc db.f_num_spam; 123 | Marshal.to_channel oc db.f_high_freq [Marshal.No_sharing]; 124 | Marshal.to_channel oc db.f_low_freq [Marshal.No_sharing]; 125 | close_out oc; 126 | if zip then begin 127 | let r = Sys.command ("gzip -best " ^ tempname) in 128 | if r = 0 then 129 | Sys.rename (tempname ^ ".gz") filename 130 | else 131 | Sys.rename tempname basename 132 | end else 133 | Sys.rename tempname filename 134 | 135 | 136 | let create sz = 137 | { f_num_good = 0; 138 | f_num_spam = 0; 139 | f_high_freq = Hashtbl.create sz; 140 | f_low_freq = Hashtbl.create sz } 141 | 142 | let add_good db w = 143 | begin try 144 | let (g, s as f) = Hashtbl.find db.f_high_freq w in 145 | Hashtbl.replace db.f_high_freq w (g+1, s) 146 | with Not_found -> 147 | try 148 | let (g, s as f) = Hashtbl.find db.f_low_freq w in 149 | let g' = g + 1 in 150 | if 2 * g' + s >= 5 then begin 151 | Hashtbl.remove db.f_low_freq w; 152 | Hashtbl.add db.f_high_freq w (g', s) 153 | end else 154 | Hashtbl.replace db.f_low_freq w (g', s) 155 | with Not_found -> 156 | Hashtbl.add db.f_low_freq w (1, 0) 157 | end 158 | 159 | let add_spam db w = 160 | begin try 161 | let (g, s) = Hashtbl.find db.f_high_freq w in 162 | Hashtbl.replace db.f_high_freq w (g, s+1) 163 | with Not_found -> 164 | try 165 | let (g, s) = Hashtbl.find db.f_low_freq w in 166 | let s' = s + 1 in 167 | if 2 * g + s' >= 5 then begin 168 | Hashtbl.remove db.f_low_freq w; 169 | Hashtbl.add db.f_high_freq w (g, s') 170 | end else 171 | Hashtbl.replace db.f_low_freq w (g, s') 172 | with Not_found -> 173 | Hashtbl.add db.f_low_freq w (0, 1) 174 | end 175 | 176 | open Printf 177 | 178 | let dump db oc = 179 | let dump_entry w (g, s) = fprintf oc "%s %d %d\n" w g s in 180 | fprintf oc "SPAMORACLE/1 %d %d\n" db.f_num_good db.f_num_spam; 181 | Hashtbl.iter dump_entry db.f_high_freq; 182 | Hashtbl.iter dump_entry db.f_low_freq 183 | 184 | let split s = 185 | try 186 | let i = String.index s ' ' in 187 | let j = String.index_from s (i + 1) ' ' in 188 | (String.sub s 0 i, 189 | int_of_string (String.sub s (i + 1) (j - i - 1)), 190 | int_of_string (String.sub s (j + 1) (String.length s - j - 1))) 191 | with Not_found -> 192 | raise(Error("Database restoration: ill-formed line `" 193 | ^ String.escaped s ^ "'")) 194 | 195 | let restore ic = 196 | let db = create 997 in 197 | begin try 198 | let (w, ng, ns) = split (input_line ic) in 199 | if w <> "SPAMORACLE/1" 200 | then raise (Error("Database restoration: wrong version")); 201 | db.f_num_good <- ng; 202 | db.f_num_spam <- ns 203 | with End_of_file -> 204 | raise (Error("Database restoration: first line missing")); 205 | end; 206 | begin try 207 | while true do 208 | let (w, g, s) = split (input_line ic) in 209 | if 2 * g + s >= 5 210 | then Hashtbl.add db.f_high_freq w (g, s) 211 | else Hashtbl.add db.f_low_freq w (g, s) 212 | done 213 | with End_of_file -> 214 | () 215 | end; 216 | db 217 | 218 | let in_short db w = Hashtbl.mem db.s_freq w 219 | let in_full db w = Hashtbl.mem db.f_high_freq w 220 | -------------------------------------------------------------------------------- /spamoracle.conf.5: -------------------------------------------------------------------------------- 1 | .TH SPAMORACLE.CONF 5 2 | 3 | .SH NAME 4 | spamoracle.conf \- SpamOracle configuration file format 5 | 6 | .SH DESCRIPTION 7 | The 8 | .B spamoracle.conf 9 | file is a configuration file governing the operation of the 10 | .BR spamoracle (1) 11 | e-mail classification tool. By default, the configuration file 12 | is searched in 13 | .IB $HOME /.spamoracle.conf 14 | but an alternate location can be specified using the 15 | .B -config 16 | flag to 17 | .BR spamoracle (1). 18 | 19 | .B Important note: 20 | most of the configuration parameters should not be modified lightly, 21 | as this may result in completely wrong e-mail classification. 22 | Familiarity with Graham's filtering algorithm, as described in the 23 | paper referenced at the end of this page, is recommended to fully 24 | understand the effect of the parameters. 25 | 26 | .SH SYNTAX 27 | 28 | The 29 | .B spamoracle.conf 30 | file is composed of lines of the form 31 | .I variable 32 | .B = 33 | .IR value . 34 | Lines starting with a # sign are treated as comments and ignored. 35 | Blank lines are ignored. 36 | 37 | Depending on the type of the variable (see the list of variables below), the 38 | .I value 39 | part takes one of the following forms: 40 | .TP 41 | .I string 42 | A sequence of characters. Blanks (spaces, tabs) at the beginning and the 43 | end of the string are ignored. Alternatively, the string can be 44 | enclosed in double quotes ("), in which case spaces are not trimmed. 45 | Inside quoted strings, blackslashes (\\) and double quotes (") must be 46 | escaped with a backslash, as in \\\\ or \\\" 47 | .TP 48 | .I boolean 49 | Either 50 | .BR on, 51 | .BR yes, 52 | .BR true, 53 | or 54 | .B 1 55 | to activate the boolean option, or 56 | .BR off, 57 | .BR no, 58 | .BR false, 59 | or 60 | .B 0 61 | to deactivate it. 62 | .TP 63 | .I integer 64 | A decimal integer 65 | .TP 66 | .I float 67 | A decimal floating-point number. 68 | .TP 69 | .I regexp 70 | A regular expression in 71 | .BR emacs (1) 72 | syntax. The repetition operators are 73 | .BR * , 74 | .BR + , 75 | and 76 | .BR ? . 77 | Alternation is written 78 | .B \e| 79 | and grouping is written 80 | .BR \e( ... \e) . 81 | Character classes are written between brackets 82 | .BR [ ... ] 83 | as usual. A single dot denotes any character except newline. 84 | Regular expressions are case-insensitive. 85 | 86 | .SH CONFIGURABLE PARAMETERS 87 | 88 | .TP 89 | .B database_file 90 | (type 91 | .IR string, 92 | default value 93 | .IB $HOME /.spamoracle.db 94 | ) 95 | .br 96 | The location of the file that contains the database of word frequencies 97 | used by 98 | .BR spamoracle (1). 99 | .TP 100 | .B html_retain_tags 101 | (type 102 | .IR boolean, 103 | default value 104 | .BR false ) 105 | .br 106 | In HTML-formatted e-mails and attachments, the names of HTML tags are 107 | normally not treated as words and are ignored for the word frequency 108 | calculations. If the 109 | .B html_retain_tags 110 | parameter is set to 111 | .BR true , 112 | HTML tags (such as 113 | .B img 114 | or 115 | .BR bold ) 116 | are treated as words and included in the computation of word frequencies. 117 | .TP 118 | .B html_tag_attributes 119 | (type 120 | .IR regexp , 121 | default value 122 | .br 123 | .BR a/href\e|img/src\e|img/alt\e|frame/src\e|font/face\e|font/color ) 124 | .br 125 | This regular expression matches pairs of HTML tags and HTML attributes 126 | written as 127 | .IB tag / attribute. 128 | When scanning HTML-formatted e-mails and attachments, attributes to 129 | HTML tags are normally ignored, unless the tag/attribute pair matches 130 | the regular expression 131 | .BR html_tag_attributes . 132 | If the tag/attribute pair matches this regexp, the value of the attribute 133 | (for instance, the URL for the 134 | .BR a / href 135 | attribute) is scanned for words. 136 | .TP 137 | .B mail_headers 138 | (type 139 | .IR regexp , 140 | default value 141 | .BR from:\e|subject: ) 142 | .br 143 | A regular expression determining which headers of an e-mail message 144 | are scanned for words. 145 | .TP 146 | .B alternative_favor_html 147 | (type 148 | .IR bool , 149 | default value 150 | .BR true ) 151 | .br 152 | Determine how multipart/alternative messages are treated. If this 153 | parameter is set, and one part of the alternative is of type text/html, 154 | this part is scanned and all other parts are ignored. In all other 155 | cases, all parts of the alternative are scanned. 156 | .TP 157 | .B spam_header 158 | (type 159 | .IR string , 160 | default value 161 | .BR X-Spam ) 162 | .br 163 | The name of the header that 164 | .B spamoracle mark 165 | adds to incoming e-mail messages, with the results of the spam/non-spam 166 | classification. 167 | .TP 168 | .B attachments_header 169 | (type 170 | .IR string , 171 | default value 172 | .BR X-Attachments ) 173 | .br 174 | The name of the header that 175 | .B spamoracle mark 176 | adds to incoming e-mail messages, with the one-line summary of attachment 177 | types, names and character sets. The generation of this header can 178 | be turned off with the 179 | .B summarize_attachment 180 | parameter. 181 | .TP 182 | .B summarize_attachment 183 | (type 184 | .IR boolean , 185 | default value 186 | .BR true ) 187 | .br 188 | If this parameter is set, 189 | .B spamoracle mark 190 | generates a one-line summary of the attachments of the incoming messages, 191 | and inserts this summary in the message headers. 192 | Setting this parameter to 193 | .B false 194 | disables the generation of this extra header. 195 | .TP 196 | .B num_meaningful_words 197 | (type 198 | .IR integer , 199 | default value 200 | .BR 15 ) 201 | .br 202 | Maximal number of "meaningful" words that are retained for computing 203 | the spam probability. During mail analysis, 204 | .B spamoracle 205 | extracts all words of the message, and retains those whose spam frequency 206 | (frequency of occurrence in spam messages) is closest to 1 or to 0. 207 | At most 208 | .B num_meaningful_words 209 | such "meaningful" words are retained. 210 | .TP 211 | .B max_repetitions 212 | (type 213 | .IR integer , 214 | default value 215 | .BR 2 ) 216 | .br 217 | Maximum number of times a given word can occur in the set of 218 | "meaningful" words retained for computing the spam probability. 219 | The default value of 2 means that at most 2 occurrences of the same 220 | word will be retained. 221 | .TP 222 | .B low_freq_limit 223 | (type 224 | .IR float , 225 | default value 226 | .BR 0.01 ) 227 | .TP 228 | .B high_freq_limit 229 | (type 230 | .IR float , 231 | default value 232 | .BR 0.99 ) 233 | .br 234 | The spam frequency of a word is computed as the number of occurrences 235 | in spam divided by number of occurrences in all messages. This ratio 236 | is then clipped to the interval [ 237 | .BR low_freq_limit , 238 | .B high_freq_limit 239 | ], so that words that are extremely rare or extremely common in spam 240 | do not bias the probability computation too much. The default values 241 | of 0.01 and 0.99 are adequate for a corpus of a few thousand e-mails. 242 | For larger corpora (e.g. 10000 e-mails), the values 0.001 and 0.999 243 | may give better results. 244 | .TP 245 | .B min_meaningful_words 246 | (type 247 | .IR integer , 248 | default value 249 | .BR 5 ) 250 | .br 251 | Minimum number of "meaningful" words below which 252 | .B spamoracle mark 253 | refuses to classify the e-mail and outputs "unknown" status. This 254 | happens with very short e-mails, or e-mails that consist exclusively of 255 | links and pictures. 256 | .TP 257 | .B good_mail_prob 258 | (type 259 | .IR float , 260 | default value 261 | .BR 0.2 ) 262 | .br 263 | Spam probability below which the e-mail is classified as non-spam. 264 | .TP 265 | .B spam_mail_prob 266 | (type 267 | .IR float , 268 | default value 269 | .BR 0.8 ) 270 | .br 271 | Spam probability above which the e-mail is classified as spam. 272 | Messages whose probability falls between 273 | .B good_mail_prob 274 | and 275 | .B spam_mail_prob 276 | are classified as "unknown". 277 | 278 | .SH AUTHOR 279 | Xavier Leroy 280 | 281 | .SH "SEE ALSO" 282 | 283 | .BR spamoracle (1) 284 | 285 | .B http://www.paulgraham.com/spam.html 286 | (Paul Graham's seminal paper) 287 | 288 | 289 | -------------------------------------------------------------------------------- /htmlscan.mll: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (** Approximate HTML scanner. Extracts words from HTML text, 16 | as well as certain parameters of certain tags (e.g. URLs). *) 17 | 18 | { 19 | module StringSet = Set.Make(String) 20 | module StringMap = Map.Make(String) 21 | 22 | let re_url_encoding = Str.regexp "%\\([0-9A-Fa-f][0-9A-Fa-f]\\)" 23 | 24 | let decode_url_percent s = 25 | let n = int_of_string ("0x" ^ Str.matched_group 1 s) in 26 | String.make 1 (Char.chr n) 27 | 28 | let decode_url s = 29 | Str.global_substitute re_url_encoding decode_url_percent s 30 | 31 | let entity_table = 32 | List.fold_left 33 | (fun t (s, c) -> StringMap.add s c t) 34 | StringMap.empty 35 | ["amp", '&'; "lt", '<'; "gt", '>'; 36 | "nbsp", '\160'; 37 | "Agrave", '\192'; "Aacute", '\193'; "Acirc", '\194'; 38 | "Atilde", '\195'; "Auml", '\196'; "Aring", '\197'; 39 | "AElig", '\198'; "Ccedil", '\199'; "Egrave", '\200'; 40 | "Eacute", '\201'; "Ecirc", '\202'; "Euml", '\203'; 41 | "Igrave", '\204'; "Iacute", '\205'; "Icirc", '\206'; 42 | "Iuml", '\207'; "ETH", '\208'; "Ntilde", '\209'; 43 | "Ograve", '\210'; "Oacute", '\211'; "Ocirc", '\212'; 44 | "Otilde", '\213'; "Ouml", '\214'; "times", '\215'; 45 | "Oslash", '\216'; "Ugrave", '\217'; "Uacute", '\218'; 46 | "Ucirc", '\219'; "Uuml", '\220'; "Yacute", '\221'; 47 | "THORN", '\222'; "szlig", '\223'; "agrave", '\224'; 48 | "aacute", '\225'; "acirc", '\226'; "atilde", '\227'; 49 | "auml", '\228'; "aring", '\229'; "aelig", '\230'; 50 | "ccedil", '\231'; "egrave", '\232'; "eacute", '\233'; 51 | "ecirc", '\234'; "euml", '\235'; "igrave", '\236'; 52 | "iacute", '\237'; "icirc", '\238'; "iuml", '\239'; 53 | "eth", '\240'; "ntilde", '\241'; "ograve", '\242'; 54 | "oacute", '\243'; "ocirc", '\244'; "otilde", '\245'; 55 | "ouml", '\246'; "divide", '\247'; "oslash", '\248'; 56 | "ugrave", '\249'; "uacute", '\250'; "ucirc", '\251'; 57 | "uuml", '\252'; "yacute", '\253'; "thorn", '\254'; 58 | "yuml", '\255'] 59 | 60 | let word_breaking_tags = 61 | List.fold_right StringSet.add 62 | [ "p"; "br"; "ul"; "ol"; "dt"; "li"; "dd"; "table"; "tr"; "th"; "td"; 63 | "img"; "div"; "blockquote"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; 64 | "address" ] 65 | StringSet.empty 66 | 67 | module Output = struct 68 | 69 | type t = { txt: Buffer.t; extra: Buffer.t } 70 | 71 | let create() = { txt = Buffer.create 2048; extra = Buffer.create 256 } 72 | 73 | let clear ob = 74 | Buffer.clear ob.txt; 75 | Buffer.clear ob.extra 76 | 77 | let contents ob = 78 | Buffer.add_char ob.txt '\n'; 79 | Buffer.add_buffer ob.txt ob.extra; 80 | Buffer.contents ob.txt 81 | 82 | let char ob c = 83 | Buffer.add_char ob.txt c 84 | 85 | let string ob s = 86 | Buffer.add_string ob.txt s 87 | 88 | let add_extra ob s = 89 | Buffer.add_string ob.extra s; Buffer.add_char ob.extra '\n' 90 | 91 | let tag ob t = 92 | if StringSet.mem t word_breaking_tags then char ob ' '; 93 | if !Config.html_add_tags then add_extra ob t 94 | 95 | let tag_attr ob t n s = 96 | let n = String.lowercase_ascii n in 97 | if Str.string_match !Config.html_tag_attr (t ^ "/" ^ n) 0 then 98 | if n = "href" || n = "src" 99 | then add_extra ob (decode_url s) 100 | else add_extra ob s 101 | end 102 | 103 | let ob = Output.create() 104 | let tag = ref "" 105 | let attr_name = ref "" 106 | let attr_value = Buffer.create 128 107 | 108 | } 109 | 110 | let ws = [' ' '\n' '\r' '\t'] 111 | let name = ['A'-'Z' 'a'-'z'] ['A'-'Z' 'a'-'z' '0'-'9' '.' '-']* 112 | let unquotedattrib = 113 | [^ '\'' '\"' '>' ' ' '\n' '\r' '\t'] [^ '>' ' ' '\n' '\r' '\t']* 114 | 115 | rule main = parse 116 | "" (* tolerance *) 146 | { () } 147 | | _ (* tolerance *) 148 | { comment lexbuf } 149 | | eof (* tolerance *) 150 | { () } 151 | 152 | and tagbody = parse 153 | ">" 154 | { Output.tag ob !tag } 155 | | name 156 | { attr_name := Lexing.lexeme lexbuf; 157 | tagattrib lexbuf; 158 | tagbody lexbuf } 159 | | _ (* tolerance -- should be ws *) 160 | { tagbody lexbuf } 161 | | eof (* tolerance *) 162 | { Output.tag ob !tag } 163 | 164 | and tagattrib = parse 165 | ws* '=' ws* 166 | { tagvalue lexbuf } 167 | | "" 168 | { Output.tag_attr ob !tag !attr_name "" } 169 | 170 | and tagvalue = parse 171 | "'" 172 | { Buffer.clear attr_value; singlequoted lexbuf } 173 | | "\"" 174 | { Buffer.clear attr_value; doublequoted lexbuf } 175 | | unquotedattrib 176 | { Output.tag_attr ob !tag !attr_name (Lexing.lexeme lexbuf) } 177 | | "" (* tolerance *) 178 | { Output.tag_attr ob !tag !attr_name "" } 179 | 180 | and singlequoted = parse 181 | "'" | eof (* eof is tolerance *) 182 | { Output.tag_attr ob !tag !attr_name (Buffer.contents attr_value) } 183 | | "&" 184 | { Buffer.add_char attr_value (entity lexbuf); singlequoted lexbuf } 185 | | _ 186 | { Buffer.add_char attr_value (Lexing.lexeme_char lexbuf 0); 187 | singlequoted lexbuf } 188 | 189 | and doublequoted = parse 190 | "\"" | eof (* eof is tolerance *) 191 | { Output.tag_attr ob !tag !attr_name (Buffer.contents attr_value) } 192 | | "&" 193 | { Buffer.add_char attr_value (entity lexbuf); doublequoted lexbuf } 194 | | _ 195 | { Buffer.add_char attr_value (Lexing.lexeme_char lexbuf 0); 196 | doublequoted lexbuf } 197 | 198 | and entity = parse 199 | '#' ['0'-'9']+ 200 | { let s = Lexing.lexeme lexbuf in 201 | let n = int_of_string (String.sub s 1 (String.length s - 1)) in 202 | entity_end lexbuf; 203 | if n >= 0 && n <= 255 then Char.chr n else '\255' } 204 | | name 205 | { let s = Lexing.lexeme lexbuf in 206 | entity_end lexbuf; 207 | try StringMap.find s entity_table with Not_found -> '\255' } 208 | | "" (* tolerance *) 209 | { '&' } 210 | 211 | and entity_end = parse 212 | ";" ? 213 | { () } 214 | 215 | { 216 | let extract_text s = 217 | Output.clear ob; main (Lexing.from_string s) 218 | } 219 | -------------------------------------------------------------------------------- /mail.ml: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (** Parsing of e-mail messages, including attachments *) 16 | 17 | type message = 18 | { headers: (string * string) list; 19 | body: string; 20 | parts: message list } 21 | 22 | let base64_decode_char c = 23 | match c with 24 | 'A' .. 'Z' -> Char.code c - 65 25 | | 'a' .. 'z' -> Char.code c - 97 + 26 26 | | '0' .. '9' -> Char.code c - 48 + 52 27 | | '+' -> 62 28 | | '/' -> 63 29 | | _ -> -1 30 | 31 | let decode_base64 s = 32 | let d = Buffer.create (String.length s * 3 / 4) in 33 | let buf = Array.make 4 0 in 34 | let pos = ref 0 in 35 | for i = 0 to String.length s - 1 do 36 | let n = base64_decode_char s.[i] in 37 | if n >= 0 then begin 38 | buf.(!pos) <- n; 39 | incr pos; 40 | if !pos = 4 then begin 41 | Buffer.add_char d (Char.chr(buf.(0) lsl 2 + buf.(1) lsr 4)); 42 | Buffer.add_char d (Char.chr((buf.(1) land 15) lsl 4 + buf.(2) lsr 2)); 43 | Buffer.add_char d (Char.chr((buf.(2) land 3) lsl 6 + buf.(3))); 44 | pos := 0 45 | end 46 | end 47 | done; 48 | begin match !pos with 49 | 2 -> 50 | Buffer.add_char d (Char.chr(buf.(0) lsl 2 + buf.(1) lsr 4)) 51 | | 3 -> 52 | Buffer.add_char d (Char.chr(buf.(0) lsl 2 + buf.(1) lsr 4)); 53 | Buffer.add_char d (Char.chr((buf.(1) land 15) lsl 4 + buf.(2) lsr 2)) 54 | | _ -> 55 | () 56 | end; 57 | Buffer.contents d 58 | 59 | let hexa_digit c = 60 | if c >= '0' && c <= '9' then Char.code c - 48 61 | else if c >= 'A' && c <= 'F' then Char.code c - 65 + 10 62 | else if c >= 'a' && c <= 'f' then Char.code c - 97 + 10 63 | else raise Not_found 64 | 65 | let decode_qp s = 66 | let len = String.length s in 67 | let d = Buffer.create (String.length s) in 68 | let pos = ref 0 in 69 | while !pos < len do 70 | let c = s.[!pos] in 71 | if c = '=' && !pos + 1 < len && s.[!pos + 1] = '\n' then begin 72 | pos := !pos + 2 73 | end else if c = '=' && !pos + 2 < len then begin 74 | try 75 | let h1 = hexa_digit s.[!pos + 1] 76 | and h2 = hexa_digit s.[!pos + 2] in 77 | Buffer.add_char d (Char.chr(h1 lsl 4 + h2)); 78 | pos := !pos + 3 79 | with Not_found -> 80 | Buffer.add_char d c; 81 | incr pos 82 | end else begin 83 | Buffer.add_char d c; 84 | incr pos 85 | end 86 | done; 87 | Buffer.contents d 88 | 89 | let re_base64 = Str.regexp_case_fold "base64" 90 | let re_qp = Str.regexp_case_fold "quoted-printable" 91 | 92 | let decode encoding s = 93 | if Str.string_match re_base64 encoding 0 then 94 | decode_base64 s 95 | else if Str.string_match re_qp encoding 0 then 96 | decode_qp s 97 | else 98 | s 99 | 100 | let re_encoded_header = 101 | Str.regexp "=\\?[_A-Za-z0-9-]+\\?\\([BbQq]\\)\\?\\([^?]*\\)\\?=" 102 | 103 | let decode_header s = 104 | let decode_group s = 105 | let enc = Str.matched_group 1 s 106 | and txt = Str.matched_group 2 s in 107 | match enc with 108 | "B" | "b" -> decode_base64 txt 109 | | "Q" | "q" -> decode_qp txt 110 | | _ -> assert false in 111 | Str.global_substitute re_encoded_header decode_group s 112 | 113 | let re_continuation = Str.regexp "\n[ \t]+" 114 | let re_nl = Str.regexp "\n" 115 | let re_field = Str.regexp "\\([A-Za-z-]+[: ]\\)[ \t]*\\(.*\\)" 116 | 117 | let parse_header s = 118 | let rec parse_field accu = function 119 | [] -> List.rev accu 120 | | line :: rem -> 121 | if Str.string_match re_field line 0 then begin 122 | let field_name = String.lowercase_ascii (Str.matched_group 1 line) 123 | and field_val = Str.matched_group 2 line in 124 | parse_field ((field_name, decode_header field_val) :: accu) rem 125 | end else 126 | parse_field accu rem in 127 | parse_field [] (Str.split re_nl (Str.global_replace re_continuation " " s)) 128 | 129 | let find_header name headers = 130 | try List.assoc name headers with Not_found -> "" 131 | 132 | let re_nl_nl = Str.regexp "\n\n" 133 | let re_multipart = 134 | Str.regexp_case_fold 135 | "multipart/.*boundary *= *\\(\"\\([^\"]+\\)\"\\|\\([^ \t]+\\)\\)" 136 | 137 | let rec parse_message s = 138 | try 139 | let pos_sep = Str.search_forward re_nl_nl s 0 in 140 | let headers = parse_header (String.sub s 0 pos_sep) in 141 | let body = String.sub s (pos_sep + 2) (String.length s - pos_sep - 2) in 142 | let encoding = find_header "content-transfer-encoding:" headers in 143 | let ctype = find_header "content-type:" headers in 144 | if Str.string_match re_multipart ctype 0 then begin 145 | let boundary = 146 | try 147 | Str.matched_group 2 ctype 148 | with Not_found -> try 149 | Str.matched_group 3 ctype 150 | with Not_found -> 151 | assert false in 152 | let re_bound = 153 | Str.regexp ("--" ^ Str.quote boundary ^ "[ \t\n]*") in 154 | match Str.split_delim re_bound body with 155 | [] -> 156 | { headers = headers; 157 | body = decode encoding body; 158 | parts = [] } 159 | | blurb :: parts -> 160 | { headers = headers; 161 | body = decode encoding blurb; 162 | parts = List.map parse_message parts } 163 | end else 164 | { headers = headers; 165 | body = decode encoding body; 166 | parts = [] } 167 | with Not_found -> 168 | { headers = []; 169 | body = s; 170 | parts = [] } 171 | 172 | let safe_remove fname = try Sys.remove fname with Sys_error _ -> () 173 | 174 | let run_body_through_external_converter cmd arg body = 175 | let infile = Filename.temp_file "spamoracle" ".data" in 176 | let outfile = Filename.temp_file "spamoracle" ".txt" in 177 | let oc = open_out_bin infile in 178 | let ic = open_in_bin outfile in 179 | output_string oc body; 180 | close_out oc; 181 | let retcode = 182 | Sys.command 183 | (Printf.sprintf "%s %s < %s > %s" 184 | cmd (Filename.quote arg) infile outfile) in 185 | if retcode <> 0 then begin 186 | close_in ic; 187 | safe_remove infile; safe_remove outfile; 188 | None 189 | end else begin 190 | let len = in_channel_length ic in 191 | let res = really_input_string ic len in 192 | close_in ic; 193 | safe_remove infile; safe_remove outfile; 194 | Some res 195 | end 196 | 197 | let header s msg = 198 | let rec hdr = function 199 | [] -> [] 200 | | (h,v) :: rem -> if h = s then v :: hdr rem else hdr rem in 201 | String.concat "\n" (hdr msg.headers) 202 | 203 | let header_matches s re msg = 204 | let rec hmatch = function 205 | [] -> false 206 | | (h,v) :: rem -> (h = s && Str.string_match re v 0) || hmatch rem 207 | in hmatch msg.headers 208 | 209 | let re_content_text = 210 | Str.regexp_case_fold "text/plain\\|text/enriched\\|text;\\|text$" 211 | let re_content_html = 212 | Str.regexp_case_fold "text/html" 213 | let re_content_message_rfc822 = 214 | Str.regexp_case_fold "message/rfc822" 215 | let re_content_alternative = 216 | Str.regexp_case_fold "multipart/alternative" 217 | let re_content_multipart = 218 | Str.regexp_case_fold "multipart/" 219 | let re_content_any_text = 220 | Str.regexp_case_fold "text/" 221 | 222 | let rec iter_text_parts fn m = 223 | if header_matches "content-type:" re_content_text m 224 | || not(List.mem_assoc "content-type:" m.headers) then 225 | fn m 226 | else if header_matches "content-type:" re_content_html m then 227 | fn {m with body = Htmlscan.extract_text m.body} 228 | else if header_matches "content-type:" re_content_alternative m then begin 229 | try 230 | if not !Config.alternative_favor_html then raise Not_found; 231 | iter_text_parts fn 232 | (List.find (header_matches "content-type:" re_content_html) m.parts) 233 | with Not_found -> 234 | fn m; 235 | List.iter (iter_text_parts fn) m.parts 236 | end else if header_matches "content-type:" re_content_multipart m then begin 237 | fn m; 238 | List.iter (iter_text_parts fn) m.parts 239 | end else if header_matches "content-type:" re_content_message_rfc822 m then 240 | iter_text_parts fn (parse_message m.body) 241 | else if !Config.external_converter <> "" 242 | && not (header_matches "content-type:" re_content_any_text m) 243 | then begin 244 | match run_body_through_external_converter 245 | !Config.external_converter 246 | (header "content-type:" m) 247 | m.body with 248 | | None -> () 249 | | Some txt -> fn {m with body = txt} 250 | end else 251 | () 252 | 253 | let iter_message fn msg = 254 | List.iter 255 | (fun (h, v) -> if Str.string_match !Config.mail_headers h 0 then fn v) 256 | msg.headers; 257 | iter_text_parts 258 | (fun m -> fn m.body) 259 | msg 260 | -------------------------------------------------------------------------------- /main.ml: -------------------------------------------------------------------------------- 1 | (***********************************************************************) 2 | (* *) 3 | (* SpamOracle -- a Bayesian spam filter *) 4 | (* *) 5 | (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6 | (* *) 7 | (* Copyright 2002 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. This file is distributed under the terms of the *) 9 | (* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *) 10 | (* *) 11 | (***********************************************************************) 12 | 13 | (* $Id$ *) 14 | 15 | (* Argument parsing and main program *) 16 | 17 | open Printf 18 | open Mbox 19 | open Database 20 | open Processing 21 | 22 | exception Usage of string 23 | 24 | let default_config_name = 25 | try Filename.concat (Sys.getenv "HOME") ".spamoracle.conf" 26 | with Not_found -> ".spamoracle.conf" 27 | 28 | let parse_config_file file = 29 | try 30 | let errs = Configfile.parse Config.options file in 31 | if errs <> [] then begin 32 | eprintf "Error while reading configuration file %s:\n" file; 33 | List.iter (fun (line, msg) -> eprintf "Line %d: %s\n" line msg) errs; 34 | exit 2 35 | end 36 | with Sys_error msg -> 37 | eprintf "Cannot read configuration file %s:\n%s\n" file msg 38 | 39 | let mark_command args = 40 | let db = Database.read_short !Config.database_name in 41 | if args = [] then 42 | mark_message db (read_single_msg stdin) 43 | else 44 | List.iter (fun f -> mbox_file_iter f (mark_message db)) args 45 | 46 | let add_command args = 47 | let db = 48 | try 49 | Database.read_full !Config.database_name 50 | with Sys_error _ -> 51 | Database.create 997 in 52 | let processed = ref false 53 | and is_spam = ref true 54 | and verbose = ref false in 55 | let rec parse_args = function 56 | | "-v" :: rem -> 57 | verbose := true; parse_args rem 58 | | "-spam" :: rem -> 59 | is_spam := true; parse_args rem 60 | | "-good" :: rem -> 61 | is_spam := false; parse_args rem 62 | | f :: rem -> 63 | mbox_file_iter f (add_message db !verbose !is_spam); 64 | processed := true; 65 | parse_args rem 66 | | [] -> 67 | if not !processed then 68 | add_message db !verbose !is_spam (read_single_msg stdin); 69 | if !verbose then 70 | printf "\r%6d / %6d good / spam messages\n" 71 | db.f_num_good db.f_num_spam 72 | in parse_args args; Database.write_full !Config.database_name db 73 | 74 | let list_command args = 75 | let db = Database.read_full !Config.database_name in 76 | let res = ref [] in 77 | List.iter 78 | (fun s -> 79 | let re = Str.regexp (s ^ "$") in 80 | let match_word w (g, s) = 81 | if Str.string_match re w 0 then begin 82 | let p = 83 | if 2 * g + s < 5 84 | then -1.0 85 | else Rankmsg.word_proba g s db.f_num_good db.f_num_spam in 86 | res := (w, p, g, s) :: !res 87 | end in 88 | Hashtbl.iter match_word db.f_high_freq; 89 | Hashtbl.iter match_word db.f_low_freq) 90 | args; 91 | if !res = [] then 92 | Printf.printf "No matching word found in database.\n" 93 | else begin 94 | Printf.printf " Word Proba #good #spam\n"; 95 | List.iter 96 | (fun (w, p, g, s) -> 97 | if p >= 0.0 then 98 | Printf.printf "%-15s%8.2f%8d%8d\n" w p g s 99 | else 100 | Printf.printf "%-15s ----%8d%8d\n" w g s) 101 | (List.sort 102 | (fun (_, p1, _, _) (_, p2, _, _) -> compare p2 p1) 103 | !res) 104 | end 105 | 106 | let test_command args = 107 | let db = Database.read_short !Config.database_name in 108 | let low = ref 0.0 and high = ref 1.0 in 109 | let rec parse_args = function 110 | | "-min" :: s :: rem -> 111 | begin try low := float_of_string s 112 | with Failure _ -> raise(Usage("bad argument to -min")) 113 | end; 114 | parse_args rem 115 | | "-min" :: [] -> 116 | raise(Usage("no argument to -min")) 117 | | "-max" :: s :: rem -> 118 | begin try high := float_of_string s 119 | with Failure _ -> raise(Usage("bad argument to -max")) 120 | end; 121 | parse_args rem 122 | | "-max" :: [] -> 123 | raise(Usage("no argument to -max")) 124 | | f :: rem -> 125 | mbox_file_iter f (test_message db !low !high f); 126 | parse_args rem 127 | | [] -> () 128 | in parse_args args 129 | 130 | let stat_command args = 131 | let db = Database.read_short !Config.database_name in 132 | let stat_mbox f = 133 | let num_msgs = ref 0 134 | and num_good = ref 0 135 | and num_spam = ref 0 136 | and num_unknown = ref 0 in 137 | mbox_file_iter f 138 | (fun s -> 139 | incr num_msgs; 140 | match stat_message db s with 141 | Msg_good -> incr num_good 142 | | Msg_spam -> incr num_spam 143 | | Msg_unknown -> incr num_unknown); 144 | let percentage a b = 145 | 100.0 *. float a /. float b in 146 | if !num_msgs > 0 then 147 | printf "%s: %d (%.2f%%) good, %d (%.2f%%) unknown, %d (%.2f%%) spam\n" 148 | f 149 | !num_good (percentage !num_good !num_msgs) 150 | !num_unknown (percentage !num_unknown !num_msgs) 151 | !num_spam (percentage !num_spam !num_msgs) 152 | in List.iter stat_mbox args 153 | 154 | let words_command args = 155 | let db = Database.read_short !Config.database_name in 156 | if args = [] then 157 | wordsplit_message db (read_single_msg stdin) 158 | else 159 | List.iter 160 | (fun f -> 161 | mbox_file_iter f 162 | (fun msg -> 163 | print_string "----------------------------------------\n"; 164 | wordsplit_message db msg)) 165 | args 166 | 167 | let backup_command () = 168 | Database.dump (Database.read_full !Config.database_name) stdout 169 | 170 | let restore_command () = 171 | Database.write_full !Config.database_name (Database.restore stdin) 172 | 173 | let upgrade_command () = 174 | let db = Database.read_full !Config.database_name in 175 | Database.write_full !Config.database_name db; 176 | printf "Converted %s to version %d.\n" 177 | !Config.database_name 178 | Database.current_version 179 | 180 | let rec parse_args_1 = function 181 | "-config" :: file :: rem -> 182 | parse_config_file file; parse_args_2 rem 183 | | "-config" :: [] -> 184 | raise(Usage("Option -config requires an argument")) 185 | | rem -> 186 | if Sys.file_exists default_config_name 187 | then parse_config_file default_config_name; 188 | parse_args_2 rem 189 | 190 | and parse_args_2 = function 191 | | "-f" :: file :: rem -> 192 | Config.database_name := file; parse_args_3 rem 193 | | "-f" :: [] -> 194 | raise(Usage("Option -f requires an argument")) 195 | | rem -> 196 | parse_args_3 rem 197 | 198 | and parse_args_3 = function 199 | "mark" :: rem -> 200 | mark_command rem 201 | | "add" :: rem -> 202 | add_command rem 203 | | "list" :: rem -> 204 | list_command rem 205 | | "test" :: rem -> 206 | test_command rem 207 | | "stat" :: rem -> 208 | stat_command rem 209 | | "backup" :: rem -> 210 | backup_command () 211 | | "restore" :: rem -> 212 | restore_command () 213 | | "words" :: rem -> 214 | words_command rem 215 | | "upgrade" :: rem -> 216 | upgrade_command () 217 | | s :: rem -> 218 | raise(Usage("Unknown command " ^ s)) 219 | | [] -> 220 | raise(Usage "") 221 | 222 | let usage_string = "\ 223 | Usage: 224 | spamoracle [-config conf] [-f db] mark {mailbox}* 225 | Add 'X-Spam:' headers to messages with result of analysis 226 | {mailbox}* Mailboxes containing messages to analyze and mark 227 | If none given, read single msg from standard input 228 | 229 | spamoracle [-config conf] [-f db] add [-v] -spam {spambox}* -good {goodbox}* 230 | Create or update database with known spam or non-spam messages 231 | -v Print progress bar 232 | -spam Indicate subsequent mailboxes contain spam 233 | -good Indicate subsequent mailboxes contain good msgs (not spam) 234 | {spambox}* Mailboxes containing spam 235 | {goodbox}* Mailboxes containing good messages (not spam) 236 | If no mailbox given, read single msg from standard input 237 | 238 | spamoracle [-config conf] [-f db] test [-min prob] [-max prob] {mailbox}* 239 | Analyze messages and print summary of results for each message 240 | -min Don't print messages with result below 241 | -max Don't print messages with result above 242 | {mailbox}* Mailboxes containing messages to analyze 243 | 244 | spamoracle [-config conf] [-f db] stat {mailbox}* 245 | Analyze messages and print percentages of spam/non-spam for each mailbox 246 | {mailbox}* Mailboxes containing messages to analyze 247 | 248 | spamoracle [-config conf] [-f db] list {regexp}* 249 | Dump word statistics in database 250 | {regexp}* Regular expressions for words we are interested in 251 | 252 | spamoracle [-config conf] [-f db] backup > database.backup 253 | Dump whole database in portable text format on standard output 254 | 255 | spamoracle [-config conf] [-f db] restore < database.backup 256 | Restore database from text backup file read from standard input 257 | 258 | spamoracle [-config conf] [-f db] upgrade 259 | Convert database to the latest format 260 | 261 | spamoracle [-config conf] [-f db] words {mailbox}* 262 | Extract words from messages and print them 263 | {mailbox}* Mailboxes containing messages to scan 264 | If no mailbox given, read single msg from standard input 265 | 266 | Common options: 267 | -config Configuration file (default $HOME/.spamoracle.conf) 268 | -f Database to use (default $HOME/.spamoracle.db)" 269 | 270 | let main () = 271 | try 272 | parse_args_1 (List.tl (Array.to_list Sys.argv)) 273 | with 274 | | Usage msg -> 275 | eprintf "%s\n%s\n" msg usage_string; 276 | exit 2 277 | | Sys_error msg -> 278 | eprintf "System error: %s\n" msg; 279 | exit 2 280 | | Database.Error msg -> 281 | eprintf "%s\n" msg; 282 | exit 2 283 | 284 | let _ = main() 285 | 286 | --------------------------------------------------------------------------------