├── LICENSE ├── README.md ├── data ├── default.json └── model2.json ├── docs └── setup │ ├── install.txt │ ├── requirements-dev.txt │ └── requirements.txt ├── samr ├── __init__.py ├── corpus.py ├── data.py ├── evaluation.py ├── inquirer_lex_transform.py ├── predictor.py ├── settings.py └── transformations.py ├── scripts ├── cross_validate_config.py ├── download_3rdparty_data.py └── generate_kaggle_submission.py ├── setup.py └── tests ├── data ├── badheader.tsv ├── test.tsv └── train.tsv ├── test_corpus.py ├── test_inquirer_lex_transform.py ├── test_predictor.py └── test_transformations.py /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Rafael Carrascosa 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of the Rafael Carrascosa nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL RAFAEL CARRASCOSA BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Sentiment Analysis on Movie Reviews 2 | =================================== 3 | 4 | This is an entry to [Kaggle](http://www.kaggle.com/)'s 5 | [Sentiment Analysis on Movie Reviews](http://www.kaggle.com/c/sentiment-analysis-on-movie-reviews) (SAMR) 6 | competition. 7 | 8 | It's written for Python 3.3 and it's based on [`scikit-learn`](http://scikit-learn.org/) 9 | and [`nltk`](http://www.nltk.org/). 10 | 11 | 12 | Problem description 13 | ----------------- 14 | 15 | Quoting from Kaggle's [description page](http://www.kaggle.com/c/sentiment-analysis-on-movie-reviews): 16 | 17 | This competition presents a chance to benchmark your sentiment-analysis ideas 18 | on the [Rotten Tomatoes](http://www.rottentomatoes.com/) dataset. You are asked 19 | to label phrases on a scale of five values: negative, somewhat negative, 20 | neutral, somewhat positive, positive. 21 | 22 | Some examples: 23 | 24 | - **4** (positive): _"They works spectacularly well... 
A shiver-inducing, nerve-rattling ride."_
25 | - **3** (somewhat positive): _"rooted in a sincere performance by the title character undergoing midlife crisis"_
26 | - **2** (neutral): _"Its everything you would expect -- but nothing more."_
27 | - **1** (somewhat negative): _"But it does not leave you with much."_
28 | - **0** (negative): _"The movies progression into rambling incoherence gives new meaning to the phrase fatal script error."_
29 | 
30 | So the goal of the competition is to produce an algorithm to classify phrases
31 | into these categories. And that's what `samr` does.
32 | 
33 | 
34 | How to use it
35 | -------------
36 | 
37 | After installing, just run:
38 | 
39 |     generate_kaggle_submission.py samr/data/model2.json > submission.csv
40 | 
41 | And that will generate a Kaggle submission file that scores near `0.65844` on the
42 | [leaderboard](http://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/leaderboard)
43 | (it should take about 3 minutes, and as of 2014-07-22 that score was in 2nd place).
44 | 
45 | The `model2.json` argument above is a configuration file for `samr` that
46 | determines how the `scikit-learn` pipeline is going to be built and other
47 | hyperparameters; here is how it looks:
48 | 
49 |     {
50 |     "classifier":"randomforest",
51 |     "classifier_args":{"n_estimators": 100, "min_samples_leaf":10, "n_jobs":-1},
52 |     "lowercase":"true",
53 |     "map_to_synsets":"true",
54 |     "map_to_lex":"true",
55 |     "duplicates":"true"
56 |     }
57 | 
58 | You can try `samr` with different configuration files you make (as long as the
59 | options are implemented), yielding
60 | different, and perhaps even better, scores.
61 | 
62 | ### Just tell me how it works
63 | 
64 | In particular `model2.json` feeds a [random forest classifier](http://en.wikipedia.org/wiki/Random_forest)
65 | with a concatenation of 3 kinds of features:
66 | 
67 | - The [decision functions](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier.decision_function)
68 | of a set of vanilla SGDClassifiers trained in a one-versus-one scheme using
69 | [bag-of-words](http://en.wikipedia.org/wiki/Bag-of-words_model) as features.
70 | It's a classifier inside a classifier, [yo dawg!](http://i.imgur.com/aueqLyL.png)
71 | - The decision functions of a set of vanilla SGDClassifiers trained in a one-versus-one scheme using bag-of-words
72 | on the [wordnet](http://wordnetweb.princeton.edu/perl/webwn?s=bank) synsets of the words in a phrase.
73 | - The amount of "positive" and "negative" words in a phrase as dictated by
74 | the [Harvard Inquirer sentiment lexicon](http://www.wjh.harvard.edu/~inquirer/spreadsheet_guide.htm).
75 | 
76 | During prediction, it also checks for duplicates between the training set and
77 | the test set (there are quite a few).
78 | 
79 | And that's it! Want more details? See the code! It's only 350 lines.
80 | 
81 | 
82 | Installation
83 | ------------
84 | 
85 | If you know the drill, this should be enough:
86 | 
87 |     git clone https://github.com/rafacarrascosa/samr.git
88 |     pip install -e samr -r samr/docs/setup/requirements-dev.txt
89 |     download_3rdparty_data.py
90 | 
91 | Then you will need to **manually download** `train.tsv` and `test.tsv` from the
92 | competition's [data folder](http://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data)
93 | and unzip them into the `samr/data` folder. You may be asked to join Kaggle and/or
94 | accept the competition rules before downloading the data.
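
Once the data files are in place you can also drive `samr` from Python rather
than through the scripts. The sketch below shows the intended API; the `config`
dict is just an inline copy of `model2.json` (with proper Python booleans), and
it assumes `train.tsv` and `test.tsv` are already in `samr/data`:

    from samr.corpus import iter_corpus, iter_test_corpus
    from samr.predictor import PhraseSentimentPredictor

    config = {"classifier": "randomforest",
              "classifier_args": {"n_estimators": 100, "min_samples_leaf": 10, "n_jobs": -1},
              "lowercase": True, "map_to_synsets": True, "map_to_lex": True,
              "duplicates": True}
    predictor = PhraseSentimentPredictor(**config)
    predictor.fit(list(iter_corpus()))  # trains on train.tsv
    predictions = predictor.predict(list(iter_test_corpus()))  # one label per test.tsv phrase

This is essentially what `generate_kaggle_submission.py` does before writing out
the submission CSV.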
95 | 
96 | Even though `samr` is written for Python 3.3, it may also work with Python 2.7
97 | (and the last time I checked, it did), but this is not supported and it may
98 | break in the future.
99 | 
100 | If the short instructions are not enough, read on.
101 | 
102 | 
103 | ### Full instructions for Ubuntu
104 | 
105 | These instructions will install the development version of `samr` inside a
106 | Python 3.3 virtualenv and were written for a blank, vanilla Ubuntu 14.04 and
107 | tested using [Docker](https://www.docker.com/) (awesome tool btw). They should
108 | work more or less unchanged with other Ubuntu versions and Debian-based OSs.
109 | 
110 | Open a console and 'cd' into an empty folder of your choice. Now, execute the
111 | following commands:
112 | 
113 | Install python 3.3 and compilation requirements for numpy and scipy:
114 | 
115 |     sudo apt-get update
116 |     sudo apt-get install -y software-properties-common
117 |     sudo add-apt-repository -y ppa:fkrull/deadsnakes
118 |     sudo apt-get update
119 |     sudo apt-get install -y python3.3 python3.3-dev python-scipy gfortran libopenblas-dev liblapack-dev git wget
120 | 
121 | Create a virtualenv, bootstrap pip and install numpy:
122 | 
123 |     python3.3 -m venv venv
124 |     source venv/bin/activate
125 |     wget https://bootstrap.pypa.io/get-pip.py
126 |     python3.3 get-pip.py
127 |     echo 'PATH="$VIRTUAL_ENV/local/bin:$PATH"; export PATH' >> venv/bin/activate
128 |     source venv/bin/activate
129 |     pip install numpy==1.8.1
130 | 
131 | Clone and install samr:
132 | 
133 |     git clone https://github.com/rafacarrascosa/samr.git
134 |     pip install -e samr -r samr/docs/setup/requirements-dev.txt
135 |     download_3rdparty_data.py
136 | 
137 | Optionally run the tests:
138 | 
139 |     nosetests samr/tests
140 | 
141 | Lastly, you will need to **manually download** `train.tsv` and `test.tsv` from the
142 | competition's [data folder](http://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data)
143 | and unzip them into the `samr/data` folder. You may be asked to join Kaggle and/or
144 | accept the competition rules before downloading the data.
145 | 
146 | The installation is self-contained (within the folder you chose at the start) with
147 | two exceptions:
148 | 
149 | - Lines starting with `sudo apt-get` make system-wide changes; to uninstall
150 | those you will need to use `sudo apt-get remove`.
151 | - `nltk` downloads data to `~/nltk_data`; once you no longer use `nltk` it's safe
152 | to erase that folder.
153 | 
154 | 
155 | Licensing
156 | ---------
157 | 
158 | This project is open source and BSD licensed; see the LICENSE file for details.
159 | 
160 | This license basically allows you to do anything, but in case you're wondering:
161 | I'm ok if you use `samr` to beat my score at the competition, just share back
162 | what you've learned!
163 | 
164 | 
165 | Credits
166 | ---------
167 | 
168 | This project was developed by Rafael Carrascosa, you can contact me at
169 | .
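
P.S.: if you want to gauge a configuration locally before submitting to Kaggle,
the package also installs `cross_validate_config.py`, which runs a 10-fold
cross validation of the model described by a configuration file:

    cross_validate_config.py samr/data/model2.json

The keys accepted in a configuration file mirror the constructor arguments of
`PhraseSentimentPredictor` (see `samr/predictor.py`). For instance, a
hypothetical configuration that uses bigrams and English stopwords instead of
the synset features could look like this (no claim about its score; you will
have to cross validate it yourself):

    {
    "classifier":"sgd",
    "lowercase":"true",
    "ngram":2,
    "stopwords":"english",
    "map_to_lex":"true",
    "duplicates":"true"
    }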
170 | 
171 | 
--------------------------------------------------------------------------------
/data/default.json:
--------------------------------------------------------------------------------
1 | {}
2 | 
--------------------------------------------------------------------------------
/data/model2.json:
--------------------------------------------------------------------------------
1 | {"classifier":"randomforest",
2 | "classifier_args":{"n_estimators": 100, "min_samples_leaf":10, "n_jobs":-1},
3 | "lowercase":"true",
4 | "map_to_synsets":"true",
5 | "map_to_lex":"true",
6 | "duplicates":"true"
7 | }
8 | 
--------------------------------------------------------------------------------
/docs/setup/install.txt:
--------------------------------------------------------------------------------
1 | Installation
2 | ------------
3 | 
4 | `samr` works with Python 3.3. If you know the drill, this should be enough:
5 | 
6 |     git clone https://github.com/rafacarrascosa/samr.git
7 |     pip install -e samr -r samr/docs/setup/requirements-dev.txt
8 |     download_3rdparty_data.py
9 | 
10 | Then you will need to **manually download** `train.tsv` and `test.tsv` from the
11 | competition's [data folder](http://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data)
12 | and unzip them into the `samr/data` folder. You may be asked to join Kaggle and/or
13 | accept the competition rules before downloading the data.
14 | 
15 | If the short instructions are not enough, read on.
16 | 
17 | ### Full instructions for Ubuntu
18 | 
19 | 
20 | These instructions will install the development version of samr inside a
21 | virtualenv and were written for a blank, vanilla Ubuntu 14.04 and tested using
22 | [Docker](https://www.docker.com/) (awesome tool btw). They should work
23 | more or less unchanged with other Ubuntu versions and Debian-based OSs.
24 | 
25 | Open a console and 'cd' into an empty folder of your choice. Now, execute the
26 | following commands:
27 | 
28 | Install python 3.3 and compilation requirements for numpy and scipy:
29 | 
30 |     sudo apt-get update
31 |     sudo apt-get install -y software-properties-common
32 |     sudo add-apt-repository -y ppa:fkrull/deadsnakes
33 |     sudo apt-get update
34 |     sudo apt-get install -y python3.3 python3.3-dev python-scipy gfortran libopenblas-dev liblapack-dev git wget
35 | 
36 | Create a virtualenv and bootstrap pip:
37 | 
38 |     python3.3 -m venv venv
39 |     source venv/bin/activate
40 |     wget https://bootstrap.pypa.io/get-pip.py
41 |     python3.3 get-pip.py
42 |     echo 'PATH="$VIRTUAL_ENV/local/bin:$PATH"; export PATH' >> venv/bin/activate
43 |     source venv/bin/activate
44 | 
45 | Clone and install samr:
46 | 
47 |     git clone https://github.com/rafacarrascosa/samr.git
48 |     pip install -e samr -r samr/docs/setup/requirements-dev.txt
49 |     download_3rdparty_data.py
50 | 
51 | Optionally run the tests:
52 | 
53 |     nosetests samr/tests
54 | 
55 | Lastly, you will need to **manually download** `train.tsv` and `test.tsv` from the
56 | competition's [data folder](http://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data)
57 | and unzip them into the `samr/data` folder. You may be asked to join Kaggle and/or
58 | accept the competition rules before downloading the data.
59 | 
60 | The installation is self-contained (within the folder you chose at the start) with
61 | two exceptions:
62 | 
63 | - Lines starting with `sudo apt-get` make system-wide changes; to uninstall
64 | those you will need to use `sudo apt-get remove`.
65 | - `nltk` downloads data to `~/nltk_data`, once you don't use `nltk` it's safe 66 | to erase that folder. 67 | -------------------------------------------------------------------------------- /docs/setup/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | nose 2 | -------------------------------------------------------------------------------- /docs/setup/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.8.1 2 | scipy==0.14.0 3 | scikit-learn==0.15.0 4 | nltk==3.0.1 5 | -------------------------------------------------------------------------------- /samr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rafacarrascosa/samr/ebab09d1e48727bf133a31fd7912a2d1edd6b404/samr/__init__.py -------------------------------------------------------------------------------- /samr/corpus.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import random 4 | 5 | from samr.data import Datapoint 6 | from samr.settings import DATA_PATH 7 | 8 | 9 | def _iter_data_file(filename): 10 | path = os.path.join(DATA_PATH, filename) 11 | it = csv.reader(open(path, "r"), delimiter="\t") 12 | row = next(it) # Drop column names 13 | if " ".join(row[:3]) != "PhraseId SentenceId Phrase": 14 | raise ValueError("Input file has wrong column names: {}".format(path)) 15 | for row in it: 16 | if len(row) == 3: 17 | row += (None,) 18 | yield Datapoint(*row) 19 | 20 | 21 | def iter_corpus(__cached=[]): 22 | """ 23 | Returns an iterable of `Datapoint`s with the contents of train.tsv. 24 | """ 25 | if not __cached: 26 | __cached.extend(_iter_data_file("train.tsv")) 27 | return __cached 28 | 29 | 30 | def iter_test_corpus(): 31 | """ 32 | Returns an iterable of `Datapoint`s with the contents of test.tsv. 33 | """ 34 | return list(_iter_data_file("test.tsv")) 35 | 36 | 37 | def make_train_test_split(seed, proportion=0.9): 38 | """ 39 | Makes a randomized train/test split of the train.tsv corpus with 40 | `proportion` fraction of the elements going to train and the rest to test. 41 | The `seed` argument controls a shuffling of the corpus prior to splitting. 42 | The same seed should always return the same train/test split and different 43 | seeds should always provide different train/test splits. 44 | 45 | Return value is a (train, test) tuple where train and test are lists of 46 | `Datapoint` instances. 
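
For example (an illustrative sketch; it requires train.tsv to be present and
the seed string is arbitrary):

    train, test = make_train_test_split("some seed", proportion=0.9)
    # Phrases belonging to one sentence never end up on both sides of the split:
    assert not (set(x.sentenceid for x in train) & set(x.sentenceid for x in test))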
47 | """ 48 | data = list(iter_corpus()) 49 | ids = list(sorted(set(x.sentenceid for x in data))) 50 | if len(ids) < 2: 51 | raise ValueError("Corpus too small to split") 52 | N = int(len(ids) * proportion) 53 | if N == 0: 54 | N += 1 55 | rng = random.Random(seed) 56 | rng.shuffle(ids) 57 | test_ids = set(ids[N:]) 58 | train = [] 59 | test = [] 60 | for x in data: 61 | if x.sentenceid in test_ids: 62 | test.append(x) 63 | else: 64 | train.append(x) 65 | return train, test 66 | -------------------------------------------------------------------------------- /samr/data.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | 4 | Datapoint = namedtuple("Datapoint", "phraseid sentenceid phrase sentiment") 5 | -------------------------------------------------------------------------------- /samr/evaluation.py: -------------------------------------------------------------------------------- 1 | from samr.corpus import make_train_test_split 2 | 3 | 4 | def cross_validation(factory, seed, K=10, callback=None): 5 | seed = str(seed) 6 | scores = [] 7 | for k in range(K): 8 | train, test = make_train_test_split(seed + str(k)) 9 | predictor = factory() 10 | predictor.fit(train) 11 | score = predictor.score(test) 12 | if callback: 13 | callback(score) 14 | scores.append(score) 15 | return sum(scores) / len(scores) 16 | -------------------------------------------------------------------------------- /samr/inquirer_lex_transform.py: -------------------------------------------------------------------------------- 1 | # Explain and drop some links here 2 | from collections import namedtuple, defaultdict 3 | import csv 4 | import os 5 | 6 | from samr.transformations import StatelessTransform 7 | from samr.settings import DATA_PATH 8 | 9 | 10 | FIELDS = ("Entry, Source, Positiv, Negativ, Pstv, Affil, Ngtv, Hostile, Strong," 11 | " Power, Weak, Submit, Active, Passive, Pleasur, Pain, Feel, Arousal," 12 | " EMOT, Virtue, Vice, Ovrst, Undrst, Academ, Doctrin, Econ, Exch, " 13 | "ECON, Exprsv, Legal, Milit, Polit, POLIT, Relig, Role, COLL, Work, " 14 | "Ritual, SocRel, Race, Kin, MALE, Female, Nonadlt, HU, ANI, PLACE, " 15 | "Social, Region, Route, Aquatic, Land, Sky, Object, Tool, Food, " 16 | "Vehicle, BldgPt, ComnObj, NatObj, BodyPt, ComForm, COM, Say, Need, " 17 | "Goal, Try, Means, Persist, Complet, Fail, NatrPro, Begin, Vary, " 18 | "Increas, Decreas, Finish, Stay, Rise, Exert, Fetch, Travel, Fall, " 19 | "Think, Know, Causal, Ought, Perceiv, Compare, Eval, EVAL, Solve, " 20 | "Abs, ABS, Quality, Quan, NUMB, ORD, CARD, FREQ, DIST, Time, TIME, " 21 | "Space, POS, DIM, Rel, COLOR, Self, Our, You, Name, Yes, No, Negate, " 22 | "Intrj, IAV, DAV, SV, IPadj, IndAdj, PowGain, PowLoss, PowEnds, " 23 | "PowAren, PowCon, PowCoop, PowAuPt, PowPt, PowDoct, PowAuth, PowOth, " 24 | "PowTot, RcEthic, RcRelig, RcGain, RcLoss, RcEnds, RcTot, RspGain, " 25 | "RspLoss, RspOth, RspTot, AffGain, AffLoss, AffPt, AffOth, AffTot, " 26 | "WltPt, WltTran, WltOth, WltTot, WlbGain, WlbLoss, WlbPhys, WlbPsyc, " 27 | "WlbPt, WlbTot, EnlGain, EnlLoss, EnlEnds, EnlPt, EnlOth, EnlTot, " 28 | "SklAsth, SklPt, SklOth, SklTot, TrnGain, TrnLoss, TranLw, MeansLw, " 29 | "EndsLw, ArenaLw, PtLw, Nation, Anomie, NegAff, PosAff, SureLw, If, " 30 | "NotLw, TimeSpc, FormLw, Othtags, Defined") 31 | 32 | InquirerLexEntry = namedtuple("InquirerLexEntry", FIELDS) 33 | FIELDS = InquirerLexEntry._fields 34 | 35 | 36 | class InquirerLexTransform(StatelessTransform): 37 | _corpus = 
[]
38 | _use_fields = [FIELDS.index(x) for x in "Positiv Negativ IAV Strong".split()]
39 | 
40 | def transform(self, X, y=None):
41 | """
42 | `X` is expected to be a list of `str` instances containing the phrases.
43 | Return value is a list of `str` containing different amounts of the
44 | words "Positiv_Positiv", "Negativ_Negativ", "IAV_IAV", "Strong_Strong"
45 | based on the sentiments given to the input words by the Harvard
46 | Inquirer lexicon.
47 | """
48 | corpus = self._get_corpus()
49 | result = []
50 | for phrase in X:
51 | newphrase = []
52 | for word in phrase.split():
53 | newphrase.extend(corpus.get(word.lower(), []))
54 | result.append(" ".join(newphrase))
55 | return result
56 | 
57 | def _get_corpus(self):
58 | """
59 | Private method used to cache a dictionary with the Harvard Inquirer
60 | corpus.
61 | """
62 | if not self._corpus:
63 | corpus = defaultdict(list)
64 | it = csv.reader(open(os.path.join(DATA_PATH, "inquirerbasicttabsclean")),
65 | delimiter="\t")
66 | next(it) # Drop header row
67 | for row in it:
68 | entry = InquirerLexEntry(*row)
69 | xs = []
70 | for i in self._use_fields:
71 | name, x = FIELDS[i], entry[i]
72 | if x:
73 | xs.append("{}_{}".format(name, x))
74 | name = entry.Entry.lower()
75 | if "#" in name:
76 | name = name[:name.index("#")]
77 | corpus[name].extend(xs)
78 | self._corpus.append(dict(corpus))
79 | return self._corpus[0]
80 | 
--------------------------------------------------------------------------------
/samr/predictor.py:
--------------------------------------------------------------------------------
1 | """
2 | SAMR main module; PhraseSentimentPredictor is the class that does the
3 | prediction and is therefore one of the main entry points to the library.
4 | """
5 | from collections import defaultdict
6 | 
7 | from sklearn.linear_model import SGDClassifier
8 | from sklearn.neighbors import KNeighborsClassifier
9 | from sklearn.svm import SVC
10 | from sklearn.ensemble import RandomForestClassifier
11 | from sklearn.feature_extraction.text import CountVectorizer
12 | from sklearn.pipeline import make_pipeline, make_union
13 | from sklearn.metrics import accuracy_score
14 | 
15 | from samr.transformations import (ExtractText, ReplaceText, MapToSynsets,
16 | Densifier, ClassifierOvOAsFeatures)
17 | from samr.inquirer_lex_transform import InquirerLexTransform
18 | 
19 | 
20 | _valid_classifiers = {
21 | "sgd": SGDClassifier,
22 | "knn": KNeighborsClassifier,
23 | "svc": SVC,
24 | "randomforest": RandomForestClassifier,
25 | }
26 | 
27 | 
28 | def target(phrases):
29 | return [datapoint.sentiment for datapoint in phrases]
30 | 
31 | 
32 | class PhraseSentimentPredictor:
33 | """
34 | Main `samr` class. It implements a trainable predictor for phrase
35 | sentiments. API is a la scikit-learn, where:
36 | - `__init__` configures the predictor
37 | - `fit` trains the predictor from data. After calling `fit` the instance
38 | methods should be free of side effects.
39 | - `predict` generates sentiment predictions.
40 | - `score` evaluates classification accuracy from a test set.
41 | 
42 | Outline of the predictor pipeline is as follows:
43 | A configurable main classifier is trained with a concatenation of 3 kinds of
44 | features:
45 | - The decision functions of a set of vanilla SGDClassifiers trained in a
46 | one-versus-one scheme using bag-of-words as features.
47 | - (Optionally) The decision functions of a set of vanilla SGDClassifiers
48 | trained in a one-versus-one scheme using bag-of-words on the
49 | wordnet synsets of the words in a phrase.
50 | - (Optionally) The amount of "positive" and "negative" words in a phrase
51 | as dictated by the Harvard Inquirer sentiment lexicon
52 | 
53 | Optionally, during prediction, it also checks for exact duplicates between
54 | the training set and the phrases being predicted. """
55 | def __init__(self, classifier="sgd", classifier_args=None, lowercase=True,
56 | text_replacements=None, map_to_synsets=False, binary=False,
57 | min_df=0, ngram=1, stopwords=None, limit_train=None,
58 | map_to_lex=False, duplicates=False):
59 | """
60 | Parameter description:
61 | - `classifier`: The type of classifier used as main classifier,
62 | valid values are "sgd", "knn", "svc", "randomforest".
63 | - `classifier_args`: A dict to be passed as arguments to the main
64 | classifier.
65 | - `lowercase`: whether or not all words are lowercased at the start of
66 | the pipeline.
67 | - `text_replacements`: A list of tuples `(from, to)` specifying
68 | string replacements to be made at the start of the pipeline (after
69 | lowercasing).
70 | - `map_to_synsets`: Whether or not to use the Wordnet synsets
71 | feature set.
72 | - `binary`: Whether or not words in the bag-of-words representation
73 | are counted as binary presence (0 or 1) instead of by frequency.
74 | - `min_df`: Minimum frequency a word needs to have to be included
75 | in the bag-of-words representation.
76 | - `ngram`: The maximum size of ngrams to be considered in the
77 | bag-of-words representation.
78 | - `stopwords`: A list of words to filter out of the bag-of-words
79 | representation. Can also be the string "english", in which case
80 | a default list of English stopwords will be used.
81 | - `limit_train`: The maximum amount of training samples to give to
82 | the main classifier. This can be useful for some slow main
83 | classifiers (ex: svc) that converge with fewer samples to an
84 | optimum.
85 | - `map_to_lex`: Whether or not to use the Harvard Inquirer lexicon
86 | features.
87 | - `duplicates`: Whether or not to check for identical phrases between
88 | train and prediction.
89 | """
90 | self.limit_train = limit_train
91 | self.duplicates = duplicates
92 | 
93 | # Build pre-processing common to every extraction
94 | pipeline = [ExtractText(lowercase)]
95 | if text_replacements:
96 | pipeline.append(ReplaceText(text_replacements))
97 | 
98 | # Build feature extraction schemes
99 | ext = [build_text_extraction(binary=binary, min_df=min_df,
100 | ngram=ngram, stopwords=stopwords)]
101 | if map_to_synsets:
102 | ext.append(build_synset_extraction(binary=binary, min_df=min_df,
103 | ngram=ngram))
104 | if map_to_lex:
105 | ext.append(build_lex_extraction(binary=binary, min_df=min_df,
106 | ngram=ngram))
107 | ext = make_union(*ext)
108 | pipeline.append(ext)
109 | 
110 | # Build classifier and put everything together
111 | if classifier_args is None:
112 | classifier_args = {}
113 | classifier = _valid_classifiers[classifier](**classifier_args)
114 | self.pipeline = make_pipeline(*pipeline)
115 | self.classifier = classifier
116 | 
117 | def fit(self, phrases, y=None):
118 | """
119 | `phrases` should be a list of `Datapoint` instances.
120 | `y` is ignored: the sentiments to be learnt are taken directly from the
121 | `Datapoint` instances (the argument exists only for API compatibility).
122 | """ 123 | y = target(phrases) 124 | if self.duplicates: 125 | self.dupes = DuplicatesHandler() 126 | self.dupes.fit(phrases, y) 127 | Z = self.pipeline.fit_transform(phrases, y) 128 | if self.limit_train: 129 | self.classifier.fit(Z[:self.limit_train], y[:self.limit_train]) 130 | else: 131 | self.classifier.fit(Z, y) 132 | return self 133 | 134 | def predict(self, phrases): 135 | """ 136 | `phrases` should be a list of `Datapoint` instances. 137 | Return value is a list of `str` instances with the predicted sentiments. 138 | """ 139 | Z = self.pipeline.transform(phrases) 140 | labels = self.classifier.predict(Z) 141 | if self.duplicates: 142 | for i, phrase in enumerate(phrases): 143 | label = self.dupes.get(phrase) 144 | if label is not None: 145 | labels[i] = label 146 | return labels 147 | 148 | def score(self, phrases): 149 | """ 150 | `phrases` should be a list of `Datapoint` instances. 151 | Return value is a `float` with the classification accuracy of the 152 | input. 153 | """ 154 | pred = self.predict(phrases) 155 | return accuracy_score(target(phrases), pred) 156 | 157 | def error_matrix(self, phrases): 158 | predictions = self.predict(phrases) 159 | matrix = defaultdict(list) 160 | for phrase, predicted in zip(phrases, predictions): 161 | if phrase.sentiment != predicted: 162 | matrix[(phrase.sentiment, predicted)].append(phrase) 163 | return matrix 164 | 165 | 166 | def build_text_extraction(binary, min_df, ngram, stopwords): 167 | return make_pipeline(CountVectorizer(binary=binary, 168 | tokenizer=lambda x: x.split(), 169 | min_df=min_df, 170 | ngram_range=(1, ngram), 171 | stop_words=stopwords), 172 | ClassifierOvOAsFeatures()) 173 | 174 | 175 | def build_synset_extraction(binary, min_df, ngram): 176 | return make_pipeline(MapToSynsets(), 177 | CountVectorizer(binary=binary, 178 | tokenizer=lambda x: x.split(), 179 | min_df=min_df, 180 | ngram_range=(1, ngram)), 181 | ClassifierOvOAsFeatures()) 182 | 183 | 184 | def build_lex_extraction(binary, min_df, ngram): 185 | return make_pipeline(InquirerLexTransform(), 186 | CountVectorizer(binary=binary, 187 | tokenizer=lambda x: x.split(), 188 | min_df=min_df, 189 | ngram_range=(1, ngram)), 190 | Densifier()) 191 | 192 | 193 | class DuplicatesHandler: 194 | def fit(self, phrases, target): 195 | self.dupes = {} 196 | for phrase, label in zip(phrases, target): 197 | self.dupes[self._key(phrase)] = label 198 | 199 | def get(self, phrase): 200 | key = self._key(phrase) 201 | return self.dupes.get(key) 202 | 203 | def _key(self, x): 204 | return " ".join(x.phrase.lower().split()) 205 | 206 | 207 | class _Baseline: 208 | def fit(self, X, y=None): 209 | return self 210 | 211 | def predict(self, X): 212 | return ["2" for _ in X] 213 | 214 | def score(self, X): 215 | gold = target(X) 216 | pred = self.predict(X) 217 | return accuracy_score(gold, pred) 218 | -------------------------------------------------------------------------------- /samr/settings.py: -------------------------------------------------------------------------------- 1 | from os.path import join, dirname, abspath 2 | 3 | 4 | DATA_PATH = abspath(join(dirname(__file__), "..", "data")) 5 | -------------------------------------------------------------------------------- /samr/transformations.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements several scikit-learn compatible transformers, see 3 | scikit-learn documentation for the convension fit/transform convensions. 
4 | """
5 | 
6 | import numpy
7 | import re
8 | 
9 | from sklearn.linear_model import SGDClassifier
10 | import sklearn
11 | if int(sklearn.__version__.split('.')[1]) > 16: # scikit-learn >= 0.17
12 | from sklearn.multiclass import OneVsOneClassifier
13 | else:
14 | from sklearn.multiclass import fit_ovo # scikit-learn <= 0.16
15 | import nltk
16 | 
17 | 
18 | class StatelessTransform:
19 | """
20 | Base class for all transformations that do not depend on training (i.e., are
21 | stateless).
22 | """
23 | def fit(self, X, y=None):
24 | return self
25 | 
26 | 
27 | class ExtractText(StatelessTransform):
28 | """
29 | This should be the first transformation in a samr pipeline: it extracts
30 | the phrase text from the richer `Datapoint` class.
31 | """
32 | def __init__(self, lowercase=False):
33 | self.lowercase = lowercase
34 | 
35 | def transform(self, X):
36 | """
37 | `X` is expected to be a list of `Datapoint` instances.
38 | Return value is a list of `str` instances in which words were tokenized
39 | and are separated by a single space " ". Optionally words are also
40 | lowercased depending on the argument given at __init__.
41 | """
42 | it = (" ".join(nltk.word_tokenize(datapoint.phrase)) for datapoint in X)
43 | if self.lowercase:
44 | return [x.lower() for x in it]
45 | return list(it)
46 | 
47 | 
48 | class ReplaceText(StatelessTransform):
49 | def __init__(self, replacements):
50 | """
51 | Replacements should be a list of `(from, to)` tuples of strings.
52 | """
53 | self.rdict = dict(replacements)
54 | self.pat = re.compile("|".join(re.escape(origin) for origin, _ in replacements))
55 | 
56 | def transform(self, X):
57 | """
58 | `X` is expected to be a list of `str` instances.
59 | Return value is also a list of `str` instances with the replacements
60 | applied.
61 | """
62 | if not self.rdict:
63 | return X
64 | return [self.pat.sub(self._repl_fun, x) for x in X]
65 | 
66 | def _repl_fun(self, match):
67 | return self.rdict[match.group()]
68 | 
69 | 
70 | class MapToSynsets(StatelessTransform):
71 | """
72 | This transformation replaces words in the input with their Wordnet
73 | synsets[0].
74 | The intuition behind it is that phrases represented by synset vectors
75 | should be "closer" to one another (not suffer the curse of dimensionality)
76 | than the sparser (often poetical) words used for the reviews.
77 | 
78 | [0] For example "bank": http://wordnetweb.princeton.edu/perl/webwn?s=bank
79 | """
80 | def transform(self, X):
81 | """
82 | `X` is expected to be a list of `str` instances.
83 | It returns a list of `str` instances such that the i-th element
84 | contains the names of the synsets of all the words in `X[i]`,
85 | excluding noun synsets.
86 | `X[i]` is internally tokenized using `str.split`, so it should be
87 | formatted accordingly.
88 | """
89 | return [self._text_to_synsets(x) for x in X]
90 | 
91 | def _text_to_synsets(self, text):
92 | result = []
93 | for word in text.split():
94 | ss = nltk.wordnet.wordnet.synsets(word)
95 | result.extend(str(s) for s in ss if ".n." not in str(s))
96 | return " ".join(result)
97 | 
98 | 
99 | class Densifier(StatelessTransform):
100 | """
101 | A transformation that densifies a scipy sparse matrix into a numpy ndarray.
102 | """
103 | def transform(self, X, y=None):
104 | """
105 | `X` is expected to be a scipy sparse matrix.
106 | It returns `X` as a (dense) numpy ndarray.
107 | """
108 | return X.todense()
109 | 
110 | 
111 | class ClassifierOvOAsFeatures:
112 | """
113 | A transformation that essentially implements a form of dimensionality
114 | reduction.
115 | This class uses a fast SGDClassifier configured like a linear SVM to produce
116 | a vector of decision functions separating pairs of target classes in a
117 | one-versus-one fashion.
118 | It's useful to reduce the dimension of the bag-of-words feature set into
119 | features that are richer in information.
120 | """
121 | def fit(self, X, y):
122 | """
123 | `X` is expected to be an array-like or a sparse matrix.
124 | `y` is expected to be an array-like containing the classes to learn.
125 | """
126 | if int(sklearn.__version__.split('.')[1]) > 16: # scikit-learn >= 0.17
127 | self.classifiers = OneVsOneClassifier(SGDClassifier(), n_jobs=-1).fit(X, numpy.array(y)).estimators_
128 | else:
129 | self.classifiers = fit_ovo(SGDClassifier(), X, numpy.array(y), n_jobs=-1)[0]
130 | return self
131 | 
132 | def transform(self, X, y=None):
133 | """
134 | `X` is expected to be an array-like or a sparse matrix.
135 | It returns a dense matrix of shape (n_samples, m_features) where
136 | m_features = (n_classes * (n_classes - 1)) / 2
137 | """
138 | xs = [clf.decision_function(X).reshape(-1, 1) for clf in self.classifiers]
139 | return numpy.hstack(xs)
140 | 
--------------------------------------------------------------------------------
/scripts/cross_validate_config.py:
--------------------------------------------------------------------------------
1 | """
2 | Run a 10-fold cross validation evaluation of a samr model given by a JSON
3 | configuration file.
4 | """
5 | import time
6 | 
7 | 
8 | def fix_json_dict(config):
9 | new = {}
10 | for key, value in config.items():
11 | if isinstance(value, dict):
12 | value = fix_json_dict(value)
13 | elif isinstance(value, str):
14 | if value == "true":
15 | value = True
16 | elif value == "false":
17 | value = False
18 | else:
19 | try:
20 | value = float(value)
21 | except ValueError:
22 | pass
23 | new[key] = value
24 | return new
25 | 
26 | 
27 | class PrintPartialCV:
28 | def __init__(self):
29 | self.last = time.time()
30 | self.i = 0
31 | 
32 | def report(self, score):
33 | new = time.time()
34 | self.i += 1
35 | print("individual {}-th fold score={}% took {} seconds".format(self.i, score * 100, new - self.last))
36 | self.last = new
37 | 
38 | 
39 | if __name__ == "__main__":
40 | import argparse
41 | import json
42 | 
43 | from samr.evaluation import cross_validation
44 | from samr.predictor import PhraseSentimentPredictor
45 | 
46 | parser = argparse.ArgumentParser(description=__doc__)
47 | parser.add_argument("filename")
48 | config = parser.parse_args()
49 | config = json.load(open(config.filename))
50 | 
51 | factory = lambda: PhraseSentimentPredictor(**config)
52 | factory() # Run once to check config is ok
53 | 
54 | report = PrintPartialCV()
55 | result = cross_validation(factory, seed="robot rock", callback=report.report)
56 | 
57 | print("10-fold cross validation score {}%".format(result * 100))
58 | 
--------------------------------------------------------------------------------
/scripts/download_3rdparty_data.py:
--------------------------------------------------------------------------------
1 | import os
2 | import urllib.request
3 | 
4 | import nltk
5 | 
6 | from samr.settings import DATA_PATH
7 | 
8 | # Create data folder if necessary
9 | if not os.path.isdir(DATA_PATH):
10 | print("Creating data folder at {}".format(DATA_PATH))
11 | os.makedirs(DATA_PATH)
12 | else:
13 | print("Data folder found at {}".format(DATA_PATH))
14 | 
15 | # Download inquirer data
16 | filename = os.path.join(DATA_PATH, "inquirerbasicttabsclean")
17 | url = 
"http://www.wjh.harvard.edu/~inquirer/inqtabs.txt" 18 | if not os.path.isfile(filename) or os.stat(filename).st_size != 2906024: 19 | print("Downloading {} into {}".format(url, filename)) 20 | urllib.request.urlretrieve(url, filename) 21 | else: 22 | print("Harvard Inquirer lexical data found at {}".format(filename)) 23 | 24 | # Download nltk data 25 | nltk.download("wordnet") 26 | nltk.download("punkt") 27 | 28 | print("\n3rd party data downloaded correctly.\n") 29 | -------------------------------------------------------------------------------- /scripts/generate_kaggle_submission.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate a tsv submission file to kaggle's 'Sentiment Analysis on Movie Reviews' (samr) 3 | competition using the samr module with a given json configuration file. 4 | """ 5 | 6 | 7 | def fix_json_dict(config): 8 | new = {} 9 | for key, value in config.items(): 10 | if isinstance(value, dict): 11 | value = fix_json_dict(value) 12 | elif isinstance(value, str): 13 | if value == "true": 14 | value = True 15 | elif value == "false": 16 | value = False 17 | else: 18 | try: 19 | value = float(value) 20 | except ValueError: 21 | pass 22 | new[key] = value 23 | return new 24 | 25 | 26 | if __name__ == "__main__": 27 | import argparse 28 | import json 29 | import csv 30 | import sys 31 | 32 | from samr.corpus import iter_corpus, iter_test_corpus 33 | from samr.predictor import PhraseSentimentPredictor 34 | 35 | parser = argparse.ArgumentParser(description=__doc__) 36 | parser.add_argument("filename") 37 | config = parser.parse_args() 38 | config = json.load(open(config.filename)) 39 | 40 | predictor = PhraseSentimentPredictor(**config) 41 | predictor.fit(list(iter_corpus())) 42 | test = list(iter_test_corpus()) 43 | prediction = predictor.predict(test) 44 | 45 | writer = csv.writer(sys.stdout) 46 | writer.writerow(("PhraseId", "Sentiment")) 47 | for datapoint, sentiment in zip(test, prediction): 48 | writer.writerow((datapoint.phraseid, sentiment)) 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from distutils.core import setup 4 | 5 | 6 | base_path = os.path.dirname(os.path.abspath(__file__)) 7 | requirements_path = os.path.join(base_path, "docs", "setup", "requirements.txt") 8 | reqs = [line.strip() for line in open(requirements_path)] 9 | 10 | 11 | setup( 12 | name="kaggle-sentiment-movie-reviews", 13 | version="0.1", 14 | description="An entry to kaggle's 'Sentiment Analysis on Movie Reviews' competition", 15 | author="Rafael Carrascosa", 16 | packages=["samr"], 17 | install_requires=reqs, 18 | scripts=["scripts/generate_kaggle_submission.py", 19 | "scripts/cross_validate_config.py", 20 | "scripts/download_3rdparty_data.py"] 21 | ) 22 | -------------------------------------------------------------------------------- /tests/data/badheader.tsv: -------------------------------------------------------------------------------- 1 | PhraseId Phrase SentenceId Sentiment 2 | 1 1 El pasillo del vecino 0 3 | -------------------------------------------------------------------------------- /tests/data/test.tsv: -------------------------------------------------------------------------------- 1 | PhraseId SentenceId Phrase 2 | 1 8 pollo con papas a la essen . 
3 | 100 7 yo mama so fat 4 | 99 9 digitalism 5 | 123 4 silenz 6 | -------------------------------------------------------------------------------- /tests/data/train.tsv: -------------------------------------------------------------------------------- 1 | PhraseId SentenceId Phrase Sentiment 2 | 1 1 El pasillo del vecino 0 3 | 2 2 The cat meows 1 4 | 7 3 A bird just passed my window 2 5 | 6 4 3 6 | 5 4 empty 4 7 | 4 3 single word 4 8 | 3 2 word play 3 9 | 8 1 chea 2 10 | 9 1 che 1 11 | -------------------------------------------------------------------------------- /tests/test_corpus.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | from samr import corpus 5 | 6 | 7 | TESTDATA_PATH = os.path.join(os.path.dirname(__file__), "data") 8 | 9 | 10 | class TestCorpus(TestCase): 11 | def setUp(self): 12 | self.__original_path = corpus.DATA_PATH 13 | corpus.DATA_PATH = TESTDATA_PATH 14 | 15 | def tearDown(self): 16 | corpus.DATA_PATH = self.__original_path 17 | 18 | def test_make_train_test_split_simple(self): 19 | train, test = corpus.make_train_test_split("blitz") 20 | self.assertIn("word play", [x.phrase for x in train + test]) 21 | self.assertEqual(len(set(x.sentenceid for x in test)), 1) 22 | self.assertEqual(len(set(x.sentenceid for x in test + train)), 4) 23 | 24 | def test_make_train_test_split_seed_works(self): 25 | a1, a2 = corpus.make_train_test_split("a") 26 | b1, b2 = corpus.make_train_test_split("b") 27 | c1, c2 = corpus.make_train_test_split("a") 28 | self.assertEqual(a1, c1) 29 | self.assertEqual(a2, c2) 30 | self.assertNotEqual(a1, b1) 31 | self.assertNotEqual(a2, b2) 32 | 33 | def test_make_train_test_split_no_shared_sentences(self): 34 | """ 35 | Test that train and test don't share sent ids. 
36 | """ 37 | train, test = corpus.make_train_test_split("semis") 38 | train_ids = set(x.sentenceid for x in train) 39 | test_ids = set(x.sentenceid for x in test) 40 | self.assertEqual(train_ids & test_ids, set()) 41 | 42 | def test_iter_test_corpus_simple(self): 43 | test = list(corpus.iter_test_corpus()) 44 | self.assertEqual(len(test), 4) 45 | self.assertEqual(set("1 99 100 123".split()), set(x.phraseid for x in test)) 46 | self.assertEqual(set("4 7 8 9".split()), set(x.sentenceid for x in test)) 47 | self.assertIn("yo mama so fat", [x.phrase for x in test]) 48 | self.assertEqual(set([None]), set(x.sentiment for x in test)) 49 | 50 | def test_iter_data_file_bad_header(self): 51 | with self.assertRaises(ValueError): 52 | list(corpus._iter_data_file("badheader.tsv")) 53 | -------------------------------------------------------------------------------- /tests/test_inquirer_lex_transform.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from samr.inquirer_lex_transform import InquirerLexTransform 4 | 5 | 6 | class TestInquirerLexTransform(TestCase): 7 | def test_empty(self): 8 | m = InquirerLexTransform() 9 | Z = m.transform([]) 10 | self.assertEqual(len(Z), 0) 11 | 12 | def test_fit_returns_self(self): 13 | m = InquirerLexTransform() 14 | s = m.fit([]) 15 | self.assertEqual(s, m) 16 | 17 | def test_simple(self): 18 | X = ["This was a good summer", "The food was awful"] 19 | m = InquirerLexTransform() 20 | Z = m.transform(X) 21 | self.assertEqual(len(Z), 2) 22 | self.assertTrue(isinstance(Z[0], str) and isinstance(Z[1], str)) 23 | self.assertIn("positiv", Z[0].lower()) 24 | self.assertIn("negativ", Z[1].lower()) 25 | self.assertNotIn("good", Z[0].lower()) 26 | self.assertNotIn("awful", Z[1].lower()) 27 | -------------------------------------------------------------------------------- /tests/test_predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | from samr import corpus 5 | from samr.predictor import PhraseSentimentPredictor 6 | from samr.data import Datapoint 7 | 8 | 9 | TESTDATA_PATH = os.path.join(os.path.dirname(__file__), "data") 10 | 11 | 12 | class TestPhraseSentimentPredictor(TestCase): 13 | def setUp(self): 14 | self.__original_path = corpus.DATA_PATH 15 | corpus.DATA_PATH = TESTDATA_PATH 16 | 17 | def tearDown(self): 18 | corpus.DATA_PATH = self.__original_path 19 | 20 | def test_fit_returns_self(self): 21 | train, _ = corpus.make_train_test_split("defiant order") 22 | predictor = PhraseSentimentPredictor() 23 | s = predictor.fit(train) 24 | self.assertEqual(predictor, s) 25 | 26 | def test_simple_predict(self): 27 | train, test = corpus.make_train_test_split("inhaler") 28 | predictor = PhraseSentimentPredictor() 29 | predictor.fit(train) 30 | predictions = predictor.predict(test) 31 | 32 | # Same amount of predictions than input values 33 | self.assertEqual(len(predictions), len(test)) 34 | 35 | # Predicted labels where seen during training 36 | train_labels = set(x.sentiment for x in train) 37 | predicted_labels = set(predictions) 38 | self.assertEqual(predicted_labels - train_labels, set()) 39 | 40 | def test_simple_error_matrix(self): 41 | train, test = corpus.make_train_test_split("reflektor", proportion=0.4) 42 | predictor = PhraseSentimentPredictor() 43 | predictor.fit(train) 44 | error = predictor.error_matrix(test) 45 | for real, predicted in error.keys(): 46 | self.assertNotEqual(real, predicted) 
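# Every misclassified phrase lands in exactly one error_matrix bucket, so the
# accuracy computed below should equal (N - wrong) / N.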
47 | 48 | score = predictor.score(test) 49 | assert score > 0, "Test is valid only if score is more than 0" 50 | N = float(len(test)) 51 | wrong = sum(len(xs) for xs in error.values()) 52 | self.assertEqual((N - wrong) / N, score) 53 | 54 | def test_simple_duplicates(self): 55 | dupe = Datapoint(phraseid="a", sentenceid="b", phrase="b a", sentiment="1") 56 | # Train has a lot of "2" sentiments 57 | train = [Datapoint(phraseid=str(i), 58 | sentenceid=str(i), 59 | phrase="a b", 60 | sentiment="2") for i in range(10)] 61 | train.append(dupe) 62 | test = [Datapoint(*dupe)] 63 | predictor = PhraseSentimentPredictor(duplicates=True) 64 | predictor.fit(train) 65 | predicted = predictor.predict(test)[0] 66 | self.assertEqual(predicted, "1") 67 | -------------------------------------------------------------------------------- /tests/test_transformations.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from samr.transformations import ReplaceText, MapToSynsets 4 | 5 | 6 | class TestReplaceText(TestCase): 7 | def test_empty(self): 8 | r = ReplaceText([]) 9 | Z = r.transform([]) 10 | self.assertEqual(len(Z), 0) 11 | X = ["Deadmau5 4x4 = 12"] 12 | r = ReplaceText([]) 13 | Z = r.transform(X) 14 | self.assertEqual(list(Z), X) 15 | 16 | def test_fit_returns_self(self): 17 | r = ReplaceText([]) 18 | s = r.fit([]) 19 | self.assertEqual(s, r) 20 | 21 | def test_simple(self): 22 | X = ["Sentence number one number two and so on .", 23 | "Old ubuntu version is 12.04, but it's still mantained"] 24 | Y = ["Sentence number one number two and so on ", 25 | "Old ubuntu version is 1204, but it is still mantained"] 26 | r = ReplaceText([ 27 | (".", ""), 28 | ("'s", " is"), 29 | ]) 30 | Z = r.transform(X) 31 | self.assertEqual(Z, Y) 32 | 33 | def test_priority_is_accounted(self): 34 | X = ["What ' is ' what should n't be and what ' will be '"] 35 | Y = ["What is what should not be and what will be "] 36 | r = ReplaceText([ 37 | ("n't", "not"), 38 | ("'", ""), 39 | ]) 40 | Z = r.transform(X) 41 | self.assertEqual(Z, Y) 42 | 43 | 44 | class TestMapToSynsets(TestCase): 45 | def test_empty(self): 46 | m = MapToSynsets() 47 | Z = m.transform([]) 48 | self.assertEqual(len(Z), 0) 49 | 50 | def test_fit_returns_self(self): 51 | m = MapToSynsets() 52 | s = m.fit([]) 53 | self.assertEqual(s, m) 54 | 55 | def test_simple(self): 56 | X = ["The light crashes"] 57 | m = MapToSynsets() 58 | Z = m.transform(X) 59 | self.assertEqual(len(Z), 1) 60 | self.assertTrue(isinstance(Z[0], str)) 61 | for word in ["light.a.01", "crash.v.01"]: 62 | self.assertIn(word, Z[0]) 63 | self.assertNotIn("crash.n.02", Z[0]) 64 | --------------------------------------------------------------------------------