├── README.md
├── nbsvm.py
└── oh_my_go.sh

/README.md:
--------------------------------------------------------------------------------
**Note:** I don't provide personal support for custom changes in the code, only
for the release. For people just starting out, I recommend
[Treehouse](http://referrals.trhou.se/grgoiremesnil) for online learning.

Naive Bayes SVM (NB-SVM)
========================

This code reproduces the performance of the NB-SVM on the IMDB reviews from the
paper:

Sida Wang and Christopher D. Manning: Baselines and Bigrams: Simple, Good Sentiment and Topic Classification; ACL 2012.
http://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf

The paper reports 91.22%, while this code obtains 91.55% with bigrams and
91.82% with trigrams: small improvements over the paper (+0.33% with bigrams
and +0.60% with trigrams).

To reproduce the results:

```
git clone git@github.com:mesnilgr/nbsvm.git
cd nbsvm; chmod +x oh_my_go.sh
./oh_my_go.sh
```

End to end (downloading the data, tokenizing, training the models), this will
take about 68 minutes. Note that most of the time is spent downloading and
tokenizing. Once the data has been downloaded and tokenized, training an NB-SVM
only takes ~2 minutes for uni+bigrams and <5 minutes for uni+bi+trigrams.
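For background, `nbsvm.py` builds the Naive Bayes log-count ratio from the
paper, r = log((p/||p||1) / (q/||q||1)), where p and q are the smoothed n-gram
count vectors of the positive and negative training sets, and then trains a
LIBLINEAR classifier on r-weighted indicator features. Here is a minimal
standalone sketch of that ratio on a made-up toy corpus (illustration only;
the real computation lives in `compute_ratio` in `nbsvm.py`):

```python
import numpy as np
from collections import Counter

def log_count_ratio(pos_counts, neg_counts, alpha=1.0):
    # Vocabulary over both classes; alpha is the smoothing prior.
    vocab = sorted(set(pos_counts) | set(neg_counts))
    p = np.array([alpha + pos_counts[t] for t in vocab])
    q = np.array([alpha + neg_counts[t] for t in vocab])
    # Normalize each class's count vector to sum to 1, then take the log ratio.
    r = np.log((p / p.sum()) / (q / q.sum()))
    return dict(zip(vocab, r))

# Toy corpus: "great" should get a positive weight, "awful" a negative one,
# and the class-neutral "movie" should land near zero.
pos = Counter("great movie great acting".split())
neg = Counter("awful movie awful plot".split())
r = log_count_ratio(pos, neg)
print(r["great"] > 0, r["awful"] < 0, abs(r["movie"]) < 1e-9)  # True True True
```

Tokens more frequent in positive reviews get positive weights, tokens more
frequent in negative reviews get negative weights, and tokens equally frequent
in both classes sit near zero.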
Naive Bayes SVM by Grégoire Mesnil is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.
Based on a work at https://github.com/mesnilgr/nbsvm.

--------------------------------------------------------------------------------
/nbsvm.py:
--------------------------------------------------------------------------------
import os
import argparse
from collections import Counter

import numpy as np


def tokenize(sentence, grams):
    """Return all n-grams of the requested orders, joined with "_*_"."""
    words = sentence.split()
    tokens = []
    for gram in grams:
        for i in range(len(words) - gram + 1):
            tokens += ["_*_".join(words[i:i + gram])]
    return tokens


def build_dict(f, grams):
    """Count n-gram occurrences over every sentence in the file."""
    dic = Counter()
    with open(f) as infile:
        for sentence in infile:
            dic.update(tokenize(sentence, grams))
    return dic


def process_files(file_pos, file_neg, dic, r, outfn, grams):
    """Write the corpus in LIBLINEAR format: label, then sorted index:r pairs."""
    output = []
    for beg_line, f in zip(["1", "-1"], [file_pos, file_neg]):
        with open(f) as infile:
            for l in infile:
                tokens = tokenize(l, grams)
                indexes = []
                for t in tokens:
                    try:
                        indexes += [dic[t]]
                    except KeyError:
                        # n-grams unseen at training time are skipped
                        pass
                indexes = sorted(set(indexes))
                line = [beg_line]
                for i in indexes:
                    line += ["%i:%f" % (i + 1, r[i])]
                output += [" ".join(line)]
    with open(outfn, "w") as outfile:
        outfile.write("\n".join(output))


def compute_ratio(poscounts, negcounts, alpha=1):
    """Compute the Naive Bayes log-count ratio r = log((p/|p|_1)/(q/|q|_1))."""
    alltokens = list(set(poscounts) | set(negcounts))
    dic = dict((t, i) for i, t in enumerate(alltokens))
    d = len(dic)
    print("computing r...")
    p, q = np.ones(d) * alpha, np.ones(d) * alpha
    for t in alltokens:
        p[dic[t]] += poscounts[t]
        q[dic[t]] += negcounts[t]
    p /= abs(p).sum()
    q /= abs(q).sum()
    r = np.log(p / q)
    return dic, r


def main(ptrain, ntrain, ptest, ntest, out, liblinear, ngram):
    ngram = [int(i) for i in ngram]
    print("counting...")
    poscounts = build_dict(ptrain, ngram)
    negcounts = build_dict(ntrain, ngram)

    dic, r = compute_ratio(poscounts, negcounts)
    print("processing files...")
    process_files(ptrain, ntrain, dic, r, "train-nbsvm.txt", ngram)
    process_files(ptest, ntest, dic, r, "test-nbsvm.txt", ngram)

    # Train L2-regularized logistic regression (-s 0) and predict with
    # probability estimates (-b 1) using the LIBLINEAR binaries.
    trainsvm = os.path.join(liblinear, "train")
    predictsvm = os.path.join(liblinear, "predict")
    os.system(trainsvm + " -s 0 train-nbsvm.txt model.logreg")
    os.system(predictsvm + " -b 1 test-nbsvm.txt model.logreg " + out)
    os.system("rm model.logreg train-nbsvm.txt test-nbsvm.txt")


if __name__ == "__main__":
    """
    Usage:

    python nbsvm.py --liblinear /PATH/liblinear-1.96\
        --ptrain /PATH/data/full-train-pos.txt\
        --ntrain /PATH/data/full-train-neg.txt\
        --ptest /PATH/data/test-pos.txt\
        --ntest /PATH/data/test-neg.txt\
        --ngram 123 --out TEST-SCORE
    """

    parser = argparse.ArgumentParser(description='Run NB-SVM on some text files.')
    parser.add_argument('--liblinear', help='path of liblinear install e.g. */liblinear-1.96')
    parser.add_argument('--ptrain', help='path of the text file TRAIN POSITIVE')
    parser.add_argument('--ntrain', help='path of the text file TRAIN NEGATIVE')
    parser.add_argument('--ptest', help='path of the text file TEST POSITIVE')
    parser.add_argument('--ntest', help='path of the text file TEST NEGATIVE')
    parser.add_argument('--out', help='path and filename for score output')
    parser.add_argument('--ngram', help='N-grams considered e.g. 123 is uni+bi+tri-grams')
    args = vars(parser.parse_args())

    main(**args)
--------------------------------------------------------------------------------
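As a quick sanity check of the functions above, here is a hypothetical
interactive session (not part of the release; it assumes the `nbsvm.py` above
is importable from the current directory):

```python
from collections import Counter
from nbsvm import tokenize, compute_ratio

print(tokenize("a great movie", [1, 2]))
# ['a', 'great', 'movie', 'a_*_great', 'great_*_movie']

dic, r = compute_ratio(Counter({"great": 2, "movie": 1}),
                       Counter({"awful": 2, "movie": 1}))
# One feature index per n-gram; each review then becomes a sparse LIBLINEAR
# line like "1 3:0.98 7:-0.42 ..." where each value is the token's ratio.
print(r[dic["great"]] > 0, r[dic["awful"]] < 0)  # True True
```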
/oh_my_go.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This function converts text to lowercase and separates punctuation and
# special symbols from words.
function normalize_text {
    awk '{print tolower($0);}' < $1 | sed -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/"/ " /g' \
    -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' -e 's/\?/ \? /g' \
    -e 's/\;/ \; /g' -e 's/\:/ \: /g' > $1-norm
}

cd ..
mkdir nbsvm_run; cd nbsvm_run

wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar -xvf aclImdb_v1.tar.gz
rm aclImdb_v1.tar.gz

# Concatenate every review into one file, one review per line (the awk call
# appends a newline after each review), then normalize it.
for j in train/pos train/neg test/pos test/neg; do
    for i in `ls aclImdb/$j`; do cat aclImdb/$j/$i >> temp; awk 'BEGIN{print;}' >> temp; done
    normalize_text temp
    mv temp-norm aclImdb/$j/norm.txt
    rm temp
done

mkdir data
mv aclImdb/train/pos/norm.txt data/train-pos.txt
mv aclImdb/train/neg/norm.txt data/train-neg.txt
mv aclImdb/test/pos/norm.txt data/test-pos.txt
mv aclImdb/test/neg/norm.txt data/test-neg.txt
rm -r aclImdb

wget https://www.csie.ntu.edu.tw/~cjlin/liblinear/oldfiles/liblinear-1.96.zip
unzip liblinear-1.96.zip
rm liblinear-1.96.zip
cd liblinear-1.96
make
cd ..

echo "BI-GRAM";
python ../nbsvm/nbsvm.py --liblinear liblinear-1.96 --ptrain data/train-pos.txt --ntrain data/train-neg.txt --ptest data/test-pos.txt --ntest data/test-neg.txt --ngram 12 --out NBSVM-TEST-BIGRAM
echo "TRI-GRAM";
python ../nbsvm/nbsvm.py --liblinear liblinear-1.96 --ptrain data/train-pos.txt --ntrain data/train-neg.txt --ptest data/test-pos.txt --ntest data/test-neg.txt --ngram 123 --out NBSVM-TEST-TRIGRAM
cd ../nbsvm
--------------------------------------------------------------------------------
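The script leaves the LIBLINEAR predictions in NBSVM-TEST-BIGRAM and
NBSVM-TEST-TRIGRAM. With `-b 1`, predict writes a `labels ...` header line
followed by one `predicted_label prob prob` line per review, and
`process_files` writes all positive reviews before the negative ones (12,500
of each in the IMDB test set). Under those assumptions, a short sketch like
this recovers the accuracy from a prediction file:

```python
# Sketch: compute accuracy from a LIBLINEAR prediction file produced with -b 1.
# Assumes positives come first, negatives second, 12,500 of each (IMDB test set).

def accuracy(pred_file, n_pos=12500, n_neg=12500):
    with open(pred_file) as f:
        lines = f.read().splitlines()
    preds = [int(line.split()[0]) for line in lines[1:]]  # skip "labels ..." header
    expected = [1] * n_pos + [-1] * n_neg
    correct = sum(p == e for p, e in zip(preds, expected))
    return correct / len(expected)

print("bigram accuracy: %.2f%%" % (100 * accuracy("NBSVM-TEST-BIGRAM")))
```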