├── README.md
├── nbsvm.py
└── oh_my_go.sh
/README.md:
--------------------------------------------------------------------------------
**Note:** I don't provide personal support for custom changes to the code, only
for the release. If you are just getting started, I recommend
[Treehouse](http://referrals.trhou.se/grgoiremesnil) for online learning.

Naive Bayes SVM (NB-SVM)
========================

This code reproduces the performance of the NB-SVM on the IMDB reviews from the
paper:

Sida Wang and Christopher D. Manning: Baselines and Bigrams: Simple, Good Sentiment and Topic Classification; ACL 2012.
http://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf
They obtain 91.22%, while this code obtains 91.55% with bigrams and 91.82% with
trigrams: small improvements (+0.33% with bigrams and +0.60% with trigrams)
over the paper.

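The model follows the paper's recipe: each review is represented by the n-grams
it contains, weighted by the naive Bayes log-count ratio, and a liblinear
classifier is trained on top. A minimal sketch of the ratio computation (toy
counts; it mirrors `compute_ratio` in nbsvm.py):

```
from collections import Counter
import numpy as np

pos = Counter({"great": 3, "movie": 2})  # toy n-gram counts, positive reviews
neg = Counter({"awful": 3, "movie": 2})  # toy n-gram counts, negative reviews
vocab = sorted(set(pos) | set(neg))
alpha = 1.0  # additive smoothing, as in nbsvm.py
p = np.array([pos[t] + alpha for t in vocab]); p /= p.sum()
q = np.array([neg[t] + alpha for t in vocab]); q /= q.sum()
r = np.log(p / q)  # "great" > 0, "awful" < 0, "movie" ~ 0
```
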
To reproduce the results:

```
git clone git@github.com:mesnilgr/nbsvm.git
cd nbsvm; chmod +x oh_my_go.sh
./oh_my_go.sh
```

End to end (downloading the data, tokenizing, training the models), this will
take 68 mins. Note that most of the time is spent downloading and tokenizing.
Once the data has been downloaded and tokenized, training an NB-SVM only takes
~2 mins for uni+bigrams and <5 mins for uni+bi+trigrams.

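If the `data/` folder and a compiled liblinear are already in place, you can
also invoke nbsvm.py directly (the `/PATH` parts are placeholders; this mirrors
the usage string inside nbsvm.py):

```
python nbsvm.py --liblinear /PATH/liblinear-1.96 \
    --ptrain /PATH/data/train-pos.txt --ntrain /PATH/data/train-neg.txt \
    --ptest /PATH/data/test-pos.txt --ntest /PATH/data/test-neg.txt \
    --ngram 123 --out TEST-SCORE
```
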
Naive Bayes SVM by Grégoire Mesnil is licensed under a [Creative Commons Attribution-NonCommercial 4.0 International License](http://creativecommons.org/licenses/by-nc/4.0/).
Based on a work at https://github.com/mesnilgr/nbsvm.
--------------------------------------------------------------------------------
/nbsvm.py:
--------------------------------------------------------------------------------
import os
import argparse
from collections import Counter

import numpy as np

def tokenize(sentence, grams):
    """Split a sentence into the n-gram tokens for each order in `grams`,
    joining the words of an n-gram with '_*_'."""
    words = sentence.split()
    tokens = []
    for gram in grams:
        for i in range(len(words) - gram + 1):
            tokens += ["_*_".join(words[i:i + gram])]
    return tokens

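# For instance, tokenize("this movie is great", [1, 2]) returns the unigrams
# followed by the bigrams:
# ['this', 'movie', 'is', 'great', 'this_*_movie', 'movie_*_is', 'is_*_great']
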
def build_dict(f, grams):
    """Count the n-gram occurrences over every line of the file `f`."""
    dic = Counter()
    with open(f) as fin:
        for sentence in fin:
            dic.update(tokenize(sentence, grams))
    return dic

def process_files(file_pos, file_neg, dic, r, outfn, grams):
    """Write the examples in liblinear format, 'label index:value ...',
    positives first, using the NB log-count ratio r as feature values."""
    output = []
    for beg_line, f in zip(["1", "-1"], [file_pos, file_neg]):
        with open(f) as fin:
            for l in fin:
                tokens = tokenize(l, grams)
                indexes = []
                for t in tokens:
                    try:
                        indexes += [dic[t]]
                    except KeyError:
                        # n-grams unseen at training time are skipped
                        pass
                indexes = sorted(set(indexes))
                line = [beg_line]
                for i in indexes:
                    # liblinear expects 1-based feature indices
                    line += ["%i:%f" % (i + 1, r[i])]
                output += [" ".join(line)]
    with open(outfn, "w") as fout:
        fout.write("\n".join(output))

def compute_ratio(poscounts, negcounts, alpha=1):
    """Compute the naive Bayes log-count ratio r over the combined vocabulary,
    with additive smoothing alpha."""
    alltokens = list(set(poscounts.keys()) | set(negcounts.keys()))
    dic = dict((t, i) for i, t in enumerate(alltokens))
    d = len(dic)
    print("computing r...")
    p, q = np.ones(d) * alpha, np.ones(d) * alpha
    for t in alltokens:
        p[dic[t]] += poscounts[t]
        q[dic[t]] += negcounts[t]
    p /= abs(p).sum()
    q /= abs(q).sum()
    r = np.log(p / q)
    return dic, r

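# For example, with poscounts = Counter({"great": 3}), negcounts =
# Counter({"awful": 3}) and alpha = 1, p and q come out as [4, 1]/5 and
# [1, 4]/5 (in the order ["great", "awful"]), so r = [log 4, -log 4]:
# positive for tokens that lean positive, negative for tokens that lean
# negative.
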
def main(ptrain, ntrain, ptest, ntest, out, liblinear, ngram):
    ngram = [int(i) for i in ngram]  # e.g. "123" -> [1, 2, 3]
    print("counting...")
    poscounts = build_dict(ptrain, ngram)
    negcounts = build_dict(ntrain, ngram)

    dic, r = compute_ratio(poscounts, negcounts)
    print("processing files...")
    process_files(ptrain, ntrain, dic, r, "train-nbsvm.txt", ngram)
    process_files(ptest, ntest, dic, r, "test-nbsvm.txt", ngram)

    trainsvm = os.path.join(liblinear, "train")
    predictsvm = os.path.join(liblinear, "predict")
    # -s 0 trains L2-regularized logistic regression; -b 1 outputs probability
    # estimates alongside the predicted labels.
    os.system(trainsvm + " -s 0 train-nbsvm.txt model.logreg")
    os.system(predictsvm + " -b 1 test-nbsvm.txt model.logreg " + out)
    os.system("rm model.logreg train-nbsvm.txt test-nbsvm.txt")

if __name__ == "__main__":
    """
    Usage:

    python nbsvm.py --liblinear /PATH/liblinear-1.96\
        --ptrain /PATH/data/full-train-pos.txt\
        --ntrain /PATH/data/full-train-neg.txt\
        --ptest /PATH/data/test-pos.txt\
        --ntest /PATH/data/test-neg.txt\
        --ngram 123 --out TEST-SCORE
    """

    parser = argparse.ArgumentParser(description='Run NB-SVM on some text files.')
    parser.add_argument('--liblinear', help='path of liblinear install e.g. */liblinear-1.96')
    parser.add_argument('--ptrain', help='path of the text file TRAIN POSITIVE')
    parser.add_argument('--ntrain', help='path of the text file TRAIN NEGATIVE')
    parser.add_argument('--ptest', help='path of the text file TEST POSITIVE')
    parser.add_argument('--ntest', help='path of the text file TEST NEGATIVE')
    parser.add_argument('--out', help='path and filename for the score output')
    parser.add_argument('--ngram', help='N-grams considered e.g. 123 is uni+bi+tri-grams')
    args = vars(parser.parse_args())

    main(**args)
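
# A hypothetical post-hoc helper, not part of the original release: score the
# file written by liblinear's `predict -b 1`, which starts with a "labels ..."
# header and then has one predicted label (plus two probabilities) per line.
# It assumes the test file was written positives first with 12500 reviews per
# class, as process_files does for the IMDB test set.
def score_predictions(pred_file, n_pos=12500, n_neg=12500):
    with open(pred_file) as f:
        f.readline()  # skip the header emitted by -b 1
        preds = [line.split()[0] for line in f]
    gold = ["1"] * n_pos + ["-1"] * n_neg
    return sum(p == g for p, g in zip(preds, gold)) / float(n_pos + n_neg)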
--------------------------------------------------------------------------------
/oh_my_go.sh:
--------------------------------------------------------------------------------
# This function converts text to lowercase and separates punctuation and
# special symbols from words (the IMDB reviews contain HTML <br /> line
# breaks, which are replaced by spaces).
function normalize_text {
  awk '{print tolower($0);}' < "$1" | sed -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/"/ " /g' \
  -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' -e 's/\?/ \? /g' \
  -e 's/\;/ \; /g' -e 's/\:/ \: /g' > "$1"-norm
}

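# Example (hypothetical input): normalize_text turns the review fragment
#   Great movie!<br />Loved it.
# into
#   great movie !  loved it .
# lowercased, punctuation split into separate tokens, <br /> markup removed.
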
cd ..
mkdir nbsvm_run; cd nbsvm_run

wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar -xvf aclImdb_v1.tar.gz
rm aclImdb_v1.tar.gz

# Concatenate the reviews of each split/class into a single file
# (awk 'BEGIN{print;}' appends a newline after each review so that every
# review ends up on its own line).
for j in train/pos train/neg test/pos test/neg; do
  for i in `ls aclImdb/$j`; do cat aclImdb/$j/$i >> temp; awk 'BEGIN{print;}' >> temp; done
  normalize_text temp
  mv temp-norm aclImdb/$j/norm.txt
  rm temp
done

mkdir data
mv aclImdb/train/pos/norm.txt data/train-pos.txt
mv aclImdb/train/neg/norm.txt data/train-neg.txt
mv aclImdb/test/pos/norm.txt data/test-pos.txt
mv aclImdb/test/neg/norm.txt data/test-neg.txt
rm -r aclImdb

# Fetch and build liblinear 1.96, which provides the train/predict binaries
# called by nbsvm.py.
wget https://www.csie.ntu.edu.tw/~cjlin/liblinear/oldfiles/liblinear-1.96.zip
unzip liblinear-1.96.zip
rm liblinear-1.96.zip
cd liblinear-1.96
make
cd ..

echo "BI-GRAM";
python ../nbsvm/nbsvm.py --liblinear liblinear-1.96 --ptrain data/train-pos.txt --ntrain data/train-neg.txt --ptest data/test-pos.txt --ntest data/test-neg.txt --ngram 12 --out NBSVM-TEST-BIGRAM
echo "TRI-GRAM";
python ../nbsvm/nbsvm.py --liblinear liblinear-1.96 --ptrain data/train-pos.txt --ntrain data/train-neg.txt --ptest data/test-pos.txt --ntest data/test-neg.txt --ngram 123 --out NBSVM-TEST-TRIGRAM
cd ../nbsvm
--------------------------------------------------------------------------------