├── README.md
├── nbsvm.py
└── oh_my_go.sh
/README.md:
--------------------------------------------------------------------------------
**Note:** I don't provide personal support for custom changes to the code, only
for the release. If you are just getting started, I recommend
[Treehouse](http://referrals.trhou.se/grgoiremesnil) for online learning.

Naive Bayes SVM (NB-SVM)
========================

This code reproduces the performance of the NB-SVM on the IMDB reviews from the
paper:

Sida Wang and Christopher D. Manning: Baselines and Bigrams: Simple, Good Sentiment and Topic Classification; ACL 2012.
http://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf
They obtain 91.22%, while this code obtains 91.55% with bigrams and 91.82% with
trigrams: small improvements (+0.33% with bigrams and +0.60% with trigrams)
over the paper.

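The model follows the paper's recipe: each review is represented by the n-grams
it contains, weighted by the naive Bayes log-count ratio, and a liblinear
classifier is trained on top. A minimal sketch of the ratio computation (toy
counts; it mirrors `compute_ratio` in nbsvm.py):

```
from collections import Counter
import numpy as np

pos = Counter({"great": 3, "movie": 2})  # toy n-gram counts, positive reviews
neg = Counter({"awful": 3, "movie": 2})  # toy n-gram counts, negative reviews
vocab = sorted(set(pos) | set(neg))
alpha = 1.0  # additive smoothing, as in nbsvm.py
p = np.array([pos[t] + alpha for t in vocab]); p /= p.sum()
q = np.array([neg[t] + alpha for t in vocab]); q /= q.sum()
r = np.log(p / q)  # "great" > 0, "awful" < 0, "movie" ~ 0
```
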
To reproduce the results:

```
git clone git@github.com:mesnilgr/nbsvm.git
cd nbsvm; chmod +x oh_my_go.sh
./oh_my_go.sh
```

End to end (downloading the data, tokenizing, training the models), this will
take 68 mins. Note that most of the time is spent downloading and tokenizing.
Once the data has been downloaded and tokenized, training an NB-SVM only takes
~2 mins for uni+bigrams and <5 mins for uni+bi+trigrams.

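If the `data/` folder and a compiled liblinear are already in place, you can
also invoke nbsvm.py directly (the `/PATH` parts are placeholders; this mirrors
the usage string inside nbsvm.py):

```
python nbsvm.py --liblinear /PATH/liblinear-1.96 \
    --ptrain /PATH/data/train-pos.txt --ntrain /PATH/data/train-neg.txt \
    --ptest /PATH/data/test-pos.txt --ntest /PATH/data/test-neg.txt \
    --ngram 123 --out TEST-SCORE
```
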
Naive Bayes SVM by Grégoire Mesnil is licensed under a [Creative Commons Attribution-NonCommercial 4.0 International License](http://creativecommons.org/licenses/by-nc/4.0/).
Based on a work at https://github.com/mesnilgr/nbsvm.
--------------------------------------------------------------------------------
/nbsvm.py:
--------------------------------------------------------------------------------
import os
import argparse
from collections import Counter

import numpy as np

def tokenize(sentence, grams):
    """Split a sentence into the n-gram tokens for each order in `grams`,
    joining the words of an n-gram with '_*_'."""
    words = sentence.split()
    tokens = []
    for gram in grams:
        for i in range(len(words) - gram + 1):
            tokens += ["_*_".join(words[i:i + gram])]
    return tokens

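# For instance, tokenize("this movie is great", [1, 2]) returns the unigrams
# followed by the bigrams:
# ['this', 'movie', 'is', 'great', 'this_*_movie', 'movie_*_is', 'is_*_great']
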
def build_dict(f, grams):
    """Count the n-gram occurrences over every line of the file `f`."""
    dic = Counter()
    with open(f) as fin:
        for sentence in fin:
            dic.update(tokenize(sentence, grams))
    return dic

def process_files(file_pos, file_neg, dic, r, outfn, grams):
    """Write the examples in liblinear format, 'label index:value ...',
    positives first, using the NB log-count ratio r as feature values."""
    output = []
    for beg_line, f in zip(["1", "-1"], [file_pos, file_neg]):
        with open(f) as fin:
            for l in fin:
                tokens = tokenize(l, grams)
                indexes = []
                for t in tokens:
                    try:
                        indexes += [dic[t]]
                    except KeyError:
                        # n-grams unseen at training time are skipped
                        pass
                indexes = sorted(set(indexes))
                line = [beg_line]
                for i in indexes:
                    # liblinear expects 1-based feature indices
                    line += ["%i:%f" % (i + 1, r[i])]
                output += [" ".join(line)]
    with open(outfn, "w") as fout:
        fout.write("\n".join(output))

def compute_ratio(poscounts, negcounts, alpha=1):
    """Compute the naive Bayes log-count ratio r over the combined vocabulary,
    with additive smoothing alpha."""
    alltokens = list(set(poscounts.keys()) | set(negcounts.keys()))
    dic = dict((t, i) for i, t in enumerate(alltokens))
    d = len(dic)
    print("computing r...")
    p, q = np.ones(d) * alpha, np.ones(d) * alpha
    for t in alltokens:
        p[dic[t]] += poscounts[t]
        q[dic[t]] += negcounts[t]
    p /= abs(p).sum()
    q /= abs(q).sum()
    r = np.log(p / q)
    return dic, r

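# For example, with poscounts = Counter({"great": 3}), negcounts =
# Counter({"awful": 3}) and alpha = 1, p and q come out as [4, 1]/5 and
# [1, 4]/5 (in the order ["great", "awful"]), so r = [log 4, -log 4]:
# positive for tokens that lean positive, negative for tokens that lean
# negative.
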
def main(ptrain, ntrain, ptest, ntest, out, liblinear, ngram):
    ngram = [int(i) for i in ngram]  # e.g. "123" -> [1, 2, 3]
    print("counting...")
    poscounts = build_dict(ptrain, ngram)
    negcounts = build_dict(ntrain, ngram)

    dic, r = compute_ratio(poscounts, negcounts)
    print("processing files...")
    process_files(ptrain, ntrain, dic, r, "train-nbsvm.txt", ngram)
    process_files(ptest, ntest, dic, r, "test-nbsvm.txt", ngram)

    trainsvm = os.path.join(liblinear, "train")
    predictsvm = os.path.join(liblinear, "predict")
    # -s 0 trains L2-regularized logistic regression; -b 1 outputs probability
    # estimates alongside the predicted labels.
    os.system(trainsvm + " -s 0 train-nbsvm.txt model.logreg")
    os.system(predictsvm + " -b 1 test-nbsvm.txt model.logreg " + out)
    os.system("rm model.logreg train-nbsvm.txt test-nbsvm.txt")

if __name__ == "__main__":
    """
    Usage:

    python nbsvm.py --liblinear /PATH/liblinear-1.96\
        --ptrain /PATH/data/full-train-pos.txt\
        --ntrain /PATH/data/full-train-neg.txt\
        --ptest /PATH/data/test-pos.txt\
        --ntest /PATH/data/test-neg.txt\
        --ngram 123 --out TEST-SCORE
    """

    parser = argparse.ArgumentParser(description='Run NB-SVM on some text files.')
    parser.add_argument('--liblinear', help='path of liblinear install e.g. */liblinear-1.96')
    parser.add_argument('--ptrain', help='path of the text file TRAIN POSITIVE')
    parser.add_argument('--ntrain', help='path of the text file TRAIN NEGATIVE')
    parser.add_argument('--ptest', help='path of the text file TEST POSITIVE')
    parser.add_argument('--ntest', help='path of the text file TEST NEGATIVE')
    parser.add_argument('--out', help='path and filename for the score output')
    parser.add_argument('--ngram', help='N-grams considered e.g. 123 is uni+bi+tri-grams')
    args = vars(parser.parse_args())

    main(**args)
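
# A hypothetical post-hoc helper, not part of the original release: score the
# file written by liblinear's `predict -b 1`, which starts with a "labels ..."
# header and then has one predicted label (plus two probabilities) per line.
# It assumes the test file was written positives first with 12500 reviews per
# class, as process_files does for the IMDB test set.
def score_predictions(pred_file, n_pos=12500, n_neg=12500):
    with open(pred_file) as f:
        f.readline()  # skip the header emitted by -b 1
        preds = [line.split()[0] for line in f]
    gold = ["1"] * n_pos + ["-1"] * n_neg
    return sum(p == g for p, g in zip(preds, gold)) / float(n_pos + n_neg)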
--------------------------------------------------------------------------------
/oh_my_go.sh:
--------------------------------------------------------------------------------
# This function converts text to lowercase and separates punctuation and
# special symbols from words (the IMDB reviews contain HTML <br /> line
# breaks, which are replaced by spaces).
function normalize_text {
  awk '{print tolower($0);}' < "$1" | sed -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/"/ " /g' \
  -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' -e 's/\?/ \? /g' \
  -e 's/\;/ \; /g' -e 's/\:/ \: /g' > "$1"-norm
}

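# Example (hypothetical input): normalize_text turns the review fragment
#   Great movie!<br />Loved it.
# into
#   great movie !  loved it .
# lowercased, punctuation split into separate tokens, <br /> markup removed.
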
cd ..
mkdir nbsvm_run; cd nbsvm_run

wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar -xvf aclImdb_v1.tar.gz
rm aclImdb_v1.tar.gz

# Concatenate the reviews of each split/class into a single file
# (awk 'BEGIN{print;}' appends a newline after each review so that every
# review ends up on its own line).
for j in train/pos train/neg test/pos test/neg; do
  for i in `ls aclImdb/$j`; do cat aclImdb/$j/$i >> temp; awk 'BEGIN{print;}' >> temp; done
  normalize_text temp
  mv temp-norm aclImdb/$j/norm.txt
  rm temp
done

mkdir data
mv aclImdb/train/pos/norm.txt data/train-pos.txt
mv aclImdb/train/neg/norm.txt data/train-neg.txt
mv aclImdb/test/pos/norm.txt data/test-pos.txt
mv aclImdb/test/neg/norm.txt data/test-neg.txt
rm -r aclImdb

# Fetch and build liblinear 1.96, which provides the train/predict binaries
# called by nbsvm.py.
wget https://www.csie.ntu.edu.tw/~cjlin/liblinear/oldfiles/liblinear-1.96.zip
unzip liblinear-1.96.zip
rm liblinear-1.96.zip
cd liblinear-1.96
make
cd ..

echo "BI-GRAM";
python ../nbsvm/nbsvm.py --liblinear liblinear-1.96 --ptrain data/train-pos.txt --ntrain data/train-neg.txt --ptest data/test-pos.txt --ntest data/test-neg.txt --ngram 12 --out NBSVM-TEST-BIGRAM
echo "TRI-GRAM";
python ../nbsvm/nbsvm.py --liblinear liblinear-1.96 --ptrain data/train-pos.txt --ntrain data/train-neg.txt --ptest data/test-pos.txt --ntest data/test-neg.txt --ngram 123 --out NBSVM-TEST-TRIGRAM
cd ../nbsvm
--------------------------------------------------------------------------------