├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── classifiers
│   └── __init__.py
├── data
│   ├── __init__.py
│   └── dataset.csv
├── models
│   ├── __init__.py
│   ├── classifier_model.py
│   ├── doc2vec_model.py
│   └── model.py
├── requirements.txt
└── text_classifier.py

/.gitignore:
--------------------------------------------------------------------------------
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
.static_storage/
.media/
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff:
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/dictionaries

# Sensitive or high-churn files:
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.xml
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml

# Gradle:
.idea/**/gradle.xml
.idea/**/libraries

# CMake
cmake-build-debug/

# Mongo Explorer plugin:
.idea/**/mongoSettings.xml

## File-based project format:
*.iws

## Plugin-specific files:

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# .idea
.idea/
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - "3.5"
install:
  - sudo rm -f /etc/boto.cfg
  - pip install -r requirements.txt
script:
  - python3 text_classifier.py
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Ibrahim Sharaf ElDen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Doc2Vec Text Classification [![Build Status](https://travis-ci.org/ibrahimsharaf/doc2vec.svg?branch=master)](https://travis-ci.org/ibrahimsharaf/doc2vec)

A text classification model that uses gensim Doc2Vec to generate paragraph embeddings and scikit-learn Logistic Regression for classification.


### Dataset

25,000 IMDB movie reviews, specially selected for sentiment analysis. The sentiment of each review is binary (1 for positive, 0 for negative).

The dataset was collected in association with the following publication:

```Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). "Learning Word Vectors for Sentiment Analysis." The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).```

### Usage
- Install the required packages

  ```pip install -r requirements.txt```
- Run the script

  ```python text_classifier.py```

### References
- Kaggle – Bag of Words Meets Bags of Popcorn (https://www.kaggle.com/c/word2vec-nlp-tutorial)
- Gensim – Deep learning with paragraph2vec (https://radimrehurek.com/gensim/models/doc2vec.html)
- Quoc Le and Tomas Mikolov. Distributed Representations of Sentences and Documents (https://arxiv.org/pdf/1405.4053v2.pdf)
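
### Example

A minimal sketch of the underlying idea (embed documents with Doc2Vec, then fit a Logistic Regression on the embeddings), using a hypothetical four-sentence toy corpus rather than this project's dataset:

```python
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression

# Hypothetical toy corpus; the project itself uses 25,000 IMDB reviews.
docs = ["a wonderful movie", "terrible acting", "great plot", "boring and slow"]
labels = [1, 0, 1, 0]

# Tag each document, build the vocabulary, and train the embedding model.
tagged = [TaggedDocument(d.split(), [str(i)]) for i, d in enumerate(docs)]
d2v = Doc2Vec(vector_size=50, min_count=1, epochs=40)
d2v.build_vocab(tagged)
d2v.train(tagged, total_examples=d2v.corpus_count, epochs=d2v.epochs)

# Fit a classifier on the learned document vectors, then score unseen text.
vectors = [d2v.docvecs[str(i)] for i in range(len(docs))]
clf = LogisticRegression(solver="liblinear").fit(vectors, labels)
print(clf.predict([d2v.infer_vector("an excellent film".split())]))
```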
--------------------------------------------------------------------------------
/classifiers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibrahimsharaf/doc2vec/488c605f1358e01620e4644c7a1fedbd27c84354/classifiers/__init__.py
--------------------------------------------------------------------------------
/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibrahimsharaf/doc2vec/488c605f1358e01620e4644c7a1fedbd27c84354/data/__init__.py
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibrahimsharaf/doc2vec/488c605f1358e01620e4644c7a1fedbd27c84354/models/__init__.py
--------------------------------------------------------------------------------
/models/classifier_model.py:
--------------------------------------------------------------------------------
from .model import Model
from .doc2vec_model import doc2VecModel

import logging
import os
import inspect

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
base_file_path = inspect.getframeinfo(inspect.currentframe()).filename
base_path = os.path.dirname(os.path.abspath(base_file_path))
project_dir_path = os.path.dirname(os.path.abspath(base_path))
classifiers_path = os.path.join(project_dir_path, 'classifiers')


class classifierModel(Model):
    def __init__(self):
        super().__init__()

    def initialize_model(self):
        self.model = LogisticRegression()

    def train_model(self, d2v, training_vectors, training_labels):
        logging.info("Classifier training")
        train_vectors = doc2VecModel.get_vectors(
            d2v, len(training_vectors), 300, 'Train')
        self.model.fit(train_vectors, np.array(training_labels))
        training_predictions = self.model.predict(train_vectors)
        logging.info(
            'Training predicted classes: {}'.format(np.unique(
                training_predictions)))
        logging.info(
            'Training accuracy: {}'.format(
                accuracy_score(training_labels, training_predictions)))
        logging.info(
            'Training precision: {}'.format(
                precision_score(
                    training_labels, training_predictions)))
        logging.info(
            'Training recall: {}'.format(
                recall_score(
                    training_labels, training_predictions)))
        logging.info(
            'Training F1 score: {}'.format(
                f1_score(
                    training_labels, training_predictions,
                    average='weighted')))

    def test_model(self, d2v, testing_vectors, testing_labels):
        logging.info("Classifier testing")
        test_vectors = doc2VecModel.get_vectors(
            d2v, len(testing_vectors), 300, 'Test')
        testing_predictions = self.model.predict(test_vectors)
        logging.info(
            'Testing predicted classes: {}'.format(
                np.unique(testing_predictions)))
        logging.info(
            'Testing accuracy: {}'.format(
                accuracy_score(testing_labels, testing_predictions)))
        logging.info(
            'Testing precision: {}'.format(
                precision_score(
                    testing_labels, testing_predictions)))
        logging.info(
            'Testing recall: {}'.format(
                recall_score(
                    testing_labels, testing_predictions)))
        logging.info(
            'Testing F1 score: {}'.format(
                f1_score(
                    testing_labels, testing_predictions,
                    average='weighted')))

    def predict(self, d2v, testing_vectors):
        logging.info("Classifier predicting")
        test_vectors = doc2VecModel.get_vectors(
            d2v, len(testing_vectors), 300, 'Test')
        testing_predictions = self.model.predict(test_vectors)
        logging.info(testing_predictions)
--------------------------------------------------------------------------------
/models/doc2vec_model.py:
--------------------------------------------------------------------------------
from .model import Model

import logging
import random
import os
import inspect

import numpy as np
from gensim.models import doc2vec


logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
base_file_path = inspect.getframeinfo(inspect.currentframe()).filename
base_path = os.path.dirname(os.path.abspath(base_file_path))
project_dir_path = os.path.dirname(os.path.abspath(base_path))
classifiers_path = os.path.join(project_dir_path, 'classifiers')


class doc2VecModel(Model):

    def __init__(self):
        super().__init__()

    def initialize_model(self, corpus):
        logging.info("Building Doc2Vec vocabulary")
        self.corpus = corpus
        self.model = doc2vec.Doc2Vec(
            min_count=1,        # ignore words with total frequency below this
            window=10,          # max distance between current and predicted word
            vector_size=300,    # dimensionality of the generated feature vectors
            workers=5,          # number of worker threads used for training
            alpha=0.025,        # initial learning rate
            min_alpha=0.00025,  # learning rate drops linearly to this value
            dm=1)               # training algorithm: dm=1 is 'distributed memory'
                                # (PV-DM); dm=0 is 'distributed bag of words'
                                # (PV-DBOW)
        self.model.build_vocab(self.corpus)

    def train_model(self):
        logging.info("Training Doc2Vec model")
        # 10 epochs take around 10 minutes on my machine (i7);
        # if you have more time/computational power, make it 20.
        for epoch in range(10):
            logging.info('Training iteration #{0}'.format(epoch))
            self.model.train(
                self.corpus, total_examples=self.model.corpus_count,
                epochs=self.model.epochs)
            # shuffle the corpus
            random.shuffle(self.corpus)
            # decrease the learning rate
            self.model.alpha -= 0.0002
            # fix the learning rate, no decay
            self.model.min_alpha = self.model.alpha

    def get_vectors(self, corpus_size, vectors_size, vectors_type):
        """
        Get vectors from the trained Doc2Vec model.
        :param corpus_size: size of the data
        :param vectors_size: size of the embedding vectors
        :param vectors_type: 'Train' or 'Test', matching the tag prefix
        :return: array of vectors
        """
        vectors = np.zeros((corpus_size, vectors_size))
        for i in range(0, corpus_size):
            prefix = vectors_type + '_' + str(i)
            vectors[i] = self.model.docvecs[prefix]
        return vectors

    @staticmethod
    def label_sentences(corpus, label_type):
        """
        Gensim's Doc2Vec implementation requires each
        document/paragraph to have a tag associated with it.
        We do this by using the TaggedDocument class.
        The tag will be "Train_i" or "Test_i", where "i" is
        a dummy index of the review.
        """
        labeled = []
        for i, v in enumerate(corpus):
            label = label_type + '_' + str(i)
            # TaggedDocument replaces the deprecated LabeledSentence alias
            labeled.append(doc2vec.TaggedDocument(v.split(), [label]))
        return labeled
--------------------------------------------------------------------------------
/models/model.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod


class Model(ABC):

    def __init__(self):
        self.model = None
        super().__init__()

    @abstractmethod
    def initialize_model(self):
        pass

    @abstractmethod
    def train_model(self):
        pass
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pandas==0.23.4
gensim==3.6.0
numpy==1.16.0
scikit_learn==0.20.2
--------------------------------------------------------------------------------
/text_classifier.py:
--------------------------------------------------------------------------------
from models.doc2vec_model import doc2VecModel
from models.classifier_model import classifierModel

import os
import logging
import inspect

import pandas as pd
from sklearn.model_selection import train_test_split


logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
base_file_path = inspect.getframeinfo(inspect.currentframe()).filename
project_dir_path = os.path.dirname(os.path.abspath(base_file_path))
data_path = os.path.join(project_dir_path, 'data')
default_classifier = os.path.join(
    project_dir_path, 'classifiers', 'logreg_model.pkl')
default_doc2vec = os.path.join(project_dir_path, 'classifiers', 'd2v.model')
default_dataset = os.path.join(data_path, 'dataset.csv')

class TextClassifier():

    def __init__(self):
        super().__init__()
        self.d2v = doc2VecModel()
        self.classifier = classifierModel()
        self.dataset = None

    def read_data(self, filename):
        filename = os.path.join(data_path, filename)
        self.dataset = pd.read_csv(filename, header=0, delimiter="\t")

    def prepare_all_data(self):
        x_train, x_test, y_train, y_test = train_test_split(
            self.dataset.review, self.dataset.sentiment, random_state=0,
            test_size=0.1)
        x_train = doc2VecModel.label_sentences(x_train, 'Train')
        x_test = doc2VecModel.label_sentences(x_test, 'Test')
        all_data = x_train + x_test
        return x_train, x_test, y_train, y_test, all_data

    def prepare_test_data(self, sentence):
        x_test = doc2VecModel.label_sentences(sentence, 'Test')
        return x_test

    def train_classifier(self):
        x_train, x_test, y_train, y_test, all_data = self.prepare_all_data()
        self.d2v.initialize_model(all_data)
        self.d2v.train_model()
        self.classifier.initialize_model()
        self.classifier.train_model(self.d2v, x_train, y_train)
        self.classifier.test_model(self.d2v, x_test, y_test)
        return self.d2v, self.classifier

    def test_classifier(self):
        _, x_test, _, y_test, _ = self.prepare_all_data()
        if self.d2v.model is None or self.classifier.model is None:
            logging.info(
                "Models not found; train first or use correct model names")
        else:
            self.classifier.test_model(self.d2v, x_test, y_test)


def run(dataset_file):
    tc = TextClassifier()
    tc.read_data(dataset_file)
    tc.train_classifier()
    tc.test_classifier()


if __name__ == "__main__":
    run("dataset.csv")
--------------------------------------------------------------------------------
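
Note: `default_classifier` and `default_doc2vec` in text_classifier.py define save paths that the script never actually writes to. A minimal sketch of how persistence could be wired in; the wiring itself is hypothetical and not part of the repository, but it relies only on real APIs (gensim's `save()`/`load()` and scikit-learn 0.20's bundled `joblib`):

```python
# Illustrative sketch, not part of the repository.
from sklearn.externals import joblib  # bundled with the pinned sklearn 0.20.2

from gensim.models import doc2vec
from text_classifier import TextClassifier, default_classifier, default_doc2vec

# Train once, then persist both models to the (otherwise unused) default paths.
tc = TextClassifier()
tc.read_data("dataset.csv")
d2v, clf = tc.train_classifier()
d2v.model.save(default_doc2vec)             # gensim models provide save()/load()
joblib.dump(clf.model, default_classifier)  # pickle the LogisticRegression

# Later: reload instead of retraining. read_data() is still needed because
# test_classifier() re-splits the dataset (random_state=0 keeps the split stable).
tc2 = TextClassifier()
tc2.read_data("dataset.csv")
tc2.d2v.model = doc2vec.Doc2Vec.load(default_doc2vec)
tc2.classifier.model = joblib.load(default_classifier)
tc2.test_classifier()
```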