├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── classifiers
│   └── __init__.py
├── data
│   ├── __init__.py
│   └── dataset.csv
├── models
│   ├── __init__.py
│   ├── classifier_model.py
│   ├── doc2vec_model.py
│   └── model.py
├── requirements.txt
└── text_classifier.py

/.gitignore:
--------------------------------------------------------------------------------
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
.static_storage/
.media/
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff:
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/dictionaries

# Sensitive or high-churn files:
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.xml
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml

# Gradle:
.idea/**/gradle.xml
.idea/**/libraries

# CMake
cmake-build-debug/

# Mongo Explorer plugin:
.idea/**/mongoSettings.xml

## File-based project format:
*.iws

## Plugin-specific files:

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# .idea
.idea/
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - "3.5"
install:
  - sudo rm -f /etc/boto.cfg
  - pip install -r requirements.txt
script:
  - python3 text_classifier.py
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Ibrahim Sharaf ElDen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Doc2Vec Text Classification [![Build Status](https://travis-ci.org/ibrahimsharaf/doc2vec.svg?branch=master)](https://travis-ci.org/ibrahimsharaf/doc2vec)

A text classification model that uses gensim Doc2Vec to generate paragraph embeddings and scikit-learn Logistic Regression for classification.


### Dataset

25,000 IMDB movie reviews, specially selected for sentiment analysis. The sentiment of each review is binary (1 for positive, 0 for negative).

The dataset was collected in association with the following publication:

```Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). "Learning Word Vectors for Sentiment Analysis." The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).```

### Usage
- Install the required packages

  ```pip install -r requirements.txt```
- Run the script

  ```python text_classifier.py```

### References
- Kaggle – Bag of Words Meets Bags of Popcorn (https://www.kaggle.com/c/word2vec-nlp-tutorial)
- Gensim – Deep learning with paragraph2vec (https://radimrehurek.com/gensim/models/doc2vec.html)
- Quoc Le and Tomas Mikolov. Distributed Representations of Sentences and Documents (https://arxiv.org/pdf/1405.4053v2.pdf)
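
### Example

A minimal sketch of the underlying idea (embed documents with Doc2Vec, then fit a Logistic Regression on the embeddings), using a hypothetical four-sentence toy corpus rather than this project's dataset:

```python
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression

# Hypothetical toy corpus; the project itself uses 25,000 IMDB reviews.
docs = ["a wonderful movie", "terrible acting", "great plot", "boring and slow"]
labels = [1, 0, 1, 0]

# Tag each document, build the vocabulary, and train the embedding model.
tagged = [TaggedDocument(d.split(), [str(i)]) for i, d in enumerate(docs)]
d2v = Doc2Vec(vector_size=50, min_count=1, epochs=40)
d2v.build_vocab(tagged)
d2v.train(tagged, total_examples=d2v.corpus_count, epochs=d2v.epochs)

# Fit a classifier on the learned document vectors, then score unseen text.
vectors = [d2v.docvecs[str(i)] for i in range(len(docs))]
clf = LogisticRegression(solver="liblinear").fit(vectors, labels)
print(clf.predict([d2v.infer_vector("an excellent film".split())]))
```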
--------------------------------------------------------------------------------
/classifiers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibrahimsharaf/doc2vec/488c605f1358e01620e4644c7a1fedbd27c84354/classifiers/__init__.py
--------------------------------------------------------------------------------
/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibrahimsharaf/doc2vec/488c605f1358e01620e4644c7a1fedbd27c84354/data/__init__.py
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibrahimsharaf/doc2vec/488c605f1358e01620e4644c7a1fedbd27c84354/models/__init__.py
--------------------------------------------------------------------------------
/models/classifier_model.py:
--------------------------------------------------------------------------------
from .model import Model
from .doc2vec_model import doc2VecModel

import logging
import os
import inspect

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
base_file_path = inspect.getframeinfo(inspect.currentframe()).filename
base_path = os.path.dirname(os.path.abspath(base_file_path))
project_dir_path = os.path.dirname(os.path.abspath(base_path))
classifiers_path = os.path.join(project_dir_path, 'classifiers')


class classifierModel(Model):
    def __init__(self):
        super().__init__()

    def initialize_model(self):
        self.model = LogisticRegression()

    def train_model(self, d2v, training_vectors, training_labels):
        logging.info("Classifier training")
        train_vectors = doc2VecModel.get_vectors(
            d2v, len(training_vectors), 300, 'Train')
        self.model.fit(train_vectors, np.array(training_labels))
        training_predictions = self.model.predict(train_vectors)
        logging.info(
            'Training predicted classes: {}'.format(np.unique(
                training_predictions)))
        logging.info(
            'Training accuracy: {}'.format(
                accuracy_score(training_labels, training_predictions)))
        logging.info(
            'Training precision: {}'.format(
                precision_score(
                    training_labels, training_predictions)))
        logging.info(
            'Training recall: {}'.format(
                recall_score(
                    training_labels, training_predictions)))
        logging.info(
            'Training F1 score: {}'.format(
                f1_score(
                    training_labels, training_predictions,
                    average='weighted')))

    def test_model(self, d2v, testing_vectors, testing_labels):
        logging.info("Classifier testing")
        test_vectors = doc2VecModel.get_vectors(
            d2v, len(testing_vectors), 300, 'Test')
        testing_predictions = self.model.predict(test_vectors)
        logging.info(
            'Testing predicted classes: {}'.format(
                np.unique(testing_predictions)))
        logging.info(
            'Testing accuracy: {}'.format(
                accuracy_score(testing_labels, testing_predictions)))
        logging.info(
            'Testing precision: {}'.format(
                precision_score(
                    testing_labels, testing_predictions)))
        logging.info(
            'Testing recall: {}'.format(
                recall_score(
                    testing_labels, testing_predictions)))
        logging.info(
            'Testing F1 score: {}'.format(
                f1_score(
                    testing_labels, testing_predictions,
                    average='weighted')))

    def predict(self, d2v, testing_vectors):
        logging.info("Classifier predicting")
        test_vectors = doc2VecModel.get_vectors(
            d2v, len(testing_vectors), 300, 'Test')
        testing_predictions = self.model.predict(test_vectors)
        logging.info(testing_predictions)
--------------------------------------------------------------------------------
/models/doc2vec_model.py:
--------------------------------------------------------------------------------
from .model import Model

import logging
import random
import os
import inspect

import numpy as np
from gensim.models import doc2vec


logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
base_file_path = inspect.getframeinfo(inspect.currentframe()).filename
base_path = os.path.dirname(os.path.abspath(base_file_path))
project_dir_path = os.path.dirname(os.path.abspath(base_path))
classifiers_path = os.path.join(project_dir_path, 'classifiers')


class doc2VecModel(Model):

    def __init__(self):
        super().__init__()

    def initialize_model(self, corpus):
        logging.info("Building Doc2Vec vocabulary")
        self.corpus = corpus
        self.model = doc2vec.Doc2Vec(
            min_count=1,        # ignore words with total frequency below this
            window=10,          # max distance between current and predicted word
            vector_size=300,    # dimensionality of the generated feature vectors
            workers=5,          # number of worker threads used for training
            alpha=0.025,        # initial learning rate
            min_alpha=0.00025,  # learning rate drops linearly to this value
            dm=1)               # training algorithm: dm=1 is 'distributed memory'
                                # (PV-DM); dm=0 is 'distributed bag of words'
                                # (PV-DBOW)
        self.model.build_vocab(self.corpus)

    def train_model(self):
        logging.info("Training Doc2Vec model")
        # 10 epochs take around 10 minutes on my machine (i7);
        # if you have more time/computational power, make it 20.
        for epoch in range(10):
            logging.info('Training iteration #{0}'.format(epoch))
            self.model.train(
                self.corpus, total_examples=self.model.corpus_count,
                epochs=self.model.epochs)
            # shuffle the corpus
            random.shuffle(self.corpus)
            # decrease the learning rate
            self.model.alpha -= 0.0002
            # fix the learning rate, no decay
            self.model.min_alpha = self.model.alpha

    def get_vectors(self, corpus_size, vectors_size, vectors_type):
        """
        Get vectors from the trained Doc2Vec model.
        :param corpus_size: size of the data
        :param vectors_size: size of the embedding vectors
        :param vectors_type: 'Train' or 'Test', matching the tag prefix
        :return: array of vectors
        """
        vectors = np.zeros((corpus_size, vectors_size))
        for i in range(0, corpus_size):
            prefix = vectors_type + '_' + str(i)
            vectors[i] = self.model.docvecs[prefix]
        return vectors

    @staticmethod
    def label_sentences(corpus, label_type):
        """
        Gensim's Doc2Vec implementation requires each
        document/paragraph to have a tag associated with it.
        We do this by using the TaggedDocument class.
        The tag will be "Train_i" or "Test_i", where "i" is
        a dummy index of the review.
        """
        labeled = []
        for i, v in enumerate(corpus):
            label = label_type + '_' + str(i)
            # TaggedDocument replaces the deprecated LabeledSentence alias
            labeled.append(doc2vec.TaggedDocument(v.split(), [label]))
        return labeled
--------------------------------------------------------------------------------
/models/model.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod


class Model(ABC):

    def __init__(self):
        self.model = None
        super().__init__()

    @abstractmethod
    def initialize_model(self):
        pass

    @abstractmethod
    def train_model(self):
        pass
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pandas==0.23.4
gensim==3.6.0
numpy==1.16.0
scikit_learn==0.20.2
--------------------------------------------------------------------------------
/text_classifier.py:
--------------------------------------------------------------------------------
from models.doc2vec_model import doc2VecModel
from models.classifier_model import classifierModel

import os
import logging
import inspect

import pandas as pd
from sklearn.model_selection import train_test_split


logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
base_file_path = inspect.getframeinfo(inspect.currentframe()).filename
project_dir_path = os.path.dirname(os.path.abspath(base_file_path))
data_path = os.path.join(project_dir_path, 'data')
default_classifier = os.path.join(
    project_dir_path, 'classifiers', 'logreg_model.pkl')
default_doc2vec = os.path.join(project_dir_path, 'classifiers', 'd2v.model')
default_dataset = os.path.join(data_path, 'dataset.csv')

class TextClassifier():

    def __init__(self):
        super().__init__()
        self.d2v = doc2VecModel()
        self.classifier = classifierModel()
        self.dataset = None

    def read_data(self, filename):
        filename = os.path.join(data_path, filename)
        self.dataset = pd.read_csv(filename, header=0, delimiter="\t")

    def prepare_all_data(self):
        x_train, x_test, y_train, y_test = train_test_split(
            self.dataset.review, self.dataset.sentiment, random_state=0,
            test_size=0.1)
        x_train = doc2VecModel.label_sentences(x_train, 'Train')
        x_test = doc2VecModel.label_sentences(x_test, 'Test')
        all_data = x_train + x_test
        return x_train, x_test, y_train, y_test, all_data

    def prepare_test_data(self, sentence):
        x_test = doc2VecModel.label_sentences(sentence, 'Test')
        return x_test

    def train_classifier(self):
        x_train, x_test, y_train, y_test, all_data = self.prepare_all_data()
        self.d2v.initialize_model(all_data)
        self.d2v.train_model()
        self.classifier.initialize_model()
        self.classifier.train_model(self.d2v, x_train, y_train)
        self.classifier.test_model(self.d2v, x_test, y_test)
        return self.d2v, self.classifier

    def test_classifier(self):
        _, x_test, _, y_test, _ = self.prepare_all_data()
        if self.d2v.model is None or self.classifier.model is None:
            logging.info(
                "Models not found; train first or use correct model names")
        else:
            self.classifier.test_model(self.d2v, x_test, y_test)


def run(dataset_file):
    tc = TextClassifier()
    tc.read_data(dataset_file)
    tc.train_classifier()
    tc.test_classifier()


if __name__ == "__main__":
    run("dataset.csv")
--------------------------------------------------------------------------------
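
Note: `default_classifier` and `default_doc2vec` in text_classifier.py define save paths that the script never actually writes to. A minimal sketch of how persistence could be wired in; the wiring itself is hypothetical and not part of the repository, but it relies only on real APIs (gensim's `save()`/`load()` and scikit-learn 0.20's bundled `joblib`):

```python
# Illustrative sketch, not part of the repository.
from sklearn.externals import joblib  # bundled with the pinned sklearn 0.20.2

from gensim.models import doc2vec
from text_classifier import TextClassifier, default_classifier, default_doc2vec

# Train once, then persist both models to the (otherwise unused) default paths.
tc = TextClassifier()
tc.read_data("dataset.csv")
d2v, clf = tc.train_classifier()
d2v.model.save(default_doc2vec)             # gensim models provide save()/load()
joblib.dump(clf.model, default_classifier)  # pickle the LogisticRegression

# Later: reload instead of retraining. read_data() is still needed because
# test_classifier() re-splits the dataset (random_state=0 keeps the split stable).
tc2 = TextClassifier()
tc2.read_data("dataset.csv")
tc2.d2v.model = doc2vec.Doc2Vec.load(default_doc2vec)
tc2.classifier.model = joblib.load(default_classifier)
tc2.test_classifier()
```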