├── mika-baumeister-Wpnoqo2plFA-unsplash.jpg ├── .idea └── .gitignore ├── README.md ├── embeddings ├── wordsim_clothing.csv └── references.md ├── .gitignore ├── archive └── README2.md ├── text-analytics └── Text_Analytics.ipynb └── ml-projects └── Using_Embeddings_and_NLP_For_Machine_Learning.ipynb /mika-baumeister-Wpnoqo2plFA-unsplash.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjahanshahi/intermediate-nlp/HEAD/mika-baumeister-Wpnoqo2plFA-unsplash.jpg -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Intermediate Natural Language Processing 2 | 3 | There are three classes in this series: 4 | 1. [Building Text Analytics Pipelines using NLP](/text-analytics) 5 | 2. [Extracting Insights from Text Data using NLP and Word Embeddings](/embeddings) 6 | 3. [Leveraging NLP and Word Embeddings in Machine Learning Projects](/ml-projects) 7 | 8 | -------------------------------------------------------------------------------- /embeddings/wordsim_clothing.csv: -------------------------------------------------------------------------------- 1 | gorgeous,attractive,1 2 | perfect,ugly,0 3 | ugly,terrible,1 4 | cheap,flimsy,1 5 | comfortable,good,1 6 | comfortable,wonderful,1 7 | comfortable,tight,0 8 | cheap,fancy,0 9 | dress,skirt,1 10 | suit,yellow,0 11 | cheap,polyester,1 12 | orange,swimsuit,0 13 | silky,lovely,1 14 | silky,soft,1 15 | silky,pretty,1 16 | capris,shorts,1 17 | capris,crops,1 18 | tacky,sloppy,1 19 | tacky,avoid,1 20 | shiny,purple,0 21 | tacky,orange,0 22 | pregnant,maternity,1 23 | petite,small,1 24 | petite,large,0 25 | petite,tall,0 26 | coat,trench,1 27 | coat,scarf,1 28 | coat,polyester,0 29 | coat,wool,1 30 | yoga,lounge,1 31 | yoga,sweats,1 32 | yoga,gym,1 33 | waist,yoga,0 34 | hem,yoga,0 35 | lounge,band,0 36 | waist,waistband,1 37 | waist,hip,1 38 | boots,sandals,1 39 | boots,heels,1 40 | lines,tailored,1 41 | lines,defined,1 42 | faded,dull,1 43 | boring,plain,1 44 | boring,cheap,1 45 | necklace,earrings,1 46 | necklace,coat,0 47 | necklace,boots,0 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /archive/README2.md: -------------------------------------------------------------------------------- 1 | # Intermediate Natural Language Processing: 2 | ## Real World Applications of Word Embeddings 3 | 4 | ### Notebook Setup 5 | 6 | A hosted notebook is available on [Kaggle Kernel](https://www.kaggle.com/jahanshahi/intermediate-nlp-word-embeddings). 7 | 8 | The chief benefit of using the Kaggle Kernel is to enable everyone to quickly and efficiently use the same environment. These notebooks are hosted on GPUs in the cloud, and a similar setup can be created on Google Colab. I chose Kaggle because they also host the datasets we will be using (as well as many more!), which makes it easy to quickly import them and get models going! 
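If you prefer to follow along locally rather than on Kaggle, the demo notebooks read the Women's Clothing E-Commerce Reviews dataset with pandas. A minimal sketch of loading it is shown below; the local file path is an assumption, so point it at wherever you downloaded the CSV (on a Kaggle Kernel the attached dataset lives under Kaggle's input directory instead).

```python
import pandas as pd

# Assumed local path to the Kaggle "Women's Clothing E-Commerce Reviews" CSV.
df = pd.read_csv("Womens Clothing E-Commerce Reviews.csv", index_col=0)

# Quick sanity check of shape and columns (review text, title, rating, etc.)
print(df.shape)
print(df.columns.tolist())
```
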
9 | 10 | ### Schedule 11 | 12 | #### Segment 1: Introduction to Language Models (Length: 30 min) 13 | 14 | We will discuss: 15 | 16 | - Complexity of natural language requires specific techniques: 17 | - Language models are probability distributions over a sequence of words 18 | - Key uses are in machine learning and unsupervised learning (search/IR and clustering/topic modeling) 19 | - The intuition behind vector space modeling 20 | - Description of some of the similarities and differences between different word embedding algorithms (word2vec, GloVe, PPMI) 21 | 22 | #### Q&A / Break (Length: 10 min) 23 | 24 | #### Segment 2: Using Pretrained Word Embeddings (Length: 30 min) 25 | 26 | We will demonstrate (using a Notebook): 27 | 28 | - Ease of using pretrained embeddings 29 | - Design considerations in using pretrained models including: noise, sentiments, generalization 30 | - Some specific examples using different models (occupy in Twitter / Wikipedia / Common Crawl) 31 | - Limitations using pretrained models: 32 | + Inputs: Implications of design decisions made during preprocessing on casing / stopwords / frequently occurring phrases 33 | + Output: Goal to learn similarity (example of word similarity tests) 34 | 35 | #### Q&A / Break (Length: 10 min) 36 | 37 | #### Segment 3: Training your own Word Embeddings (Length: 30 min) 38 | 39 | We will discuss: 40 | 41 | - Optimizing for different outputs (semantic relations vs semantic similarity): 42 | - Preprocessing for outputs 43 | - Testing word embedding models (visual inspection, similarity pairs) 44 | 45 | We will demonstrate: 46 | 47 | - Training a custom embedding model using spaCy to preprocess and the Gensim and scikit-learn API to train models 48 | 49 | Note: Training an embedding can take many hours, so this notebook will focus on how to do it, and participants can continue to train or experiment in their own time. 50 | 51 | #### Q&A / Break (Length: 10 min) 52 | 53 | #### Segment 4: Applying Word Embeddings (Length: 40 min) 54 | 55 | We will demonstrate: 56 | 57 | - Using word embeddings as inputs to understand documents 58 | + Supervised Machine Learning including Document Classification 59 | + Unsupervised Models including Document Clustering 60 | 61 | We will discuss: 62 | 63 | - Using word embeddings to extract insights from texts 64 | + Static vs Dynamic Embeddings on a high level 65 | + Hacking dynamic embeddings for other types of ordinal structure (grouping by reviews stars) 66 | 67 | #### Q&A / Break (Length: 10 min) 68 | -------------------------------------------------------------------------------- /embeddings/references.md: -------------------------------------------------------------------------------- 1 | # Selected References: 2 | ## Embeddings 3 | - Ruder, Sebastian. "On word embeddings - Part 1". http://ruder.io/word-embeddings-1/, 2016. 4 | - Ruder, Sebastian. "On word embeddings - Part 3: The secret ingredients of word2vec". http://ruder.io/secret-word2vec/, 2016. 5 | 6 | ## Parameter Tuning 7 | - Hardt, Moritz. “Word Embedding: Explaining Their Properties.” Off the Convex Path, http://offconvex.github.io/2016/02/14/word-embeddings-2/. 8 | - Komiya, Kanako, and Hiroyuki Shinnou. “Investigating Effective Parameters for Fine-Tuning of Word Embeddings Using Only a Small Corpus.” Proceedings of the Workshop on Deep Learning Approaches for Low-Resource NLP, Association for Computational Linguistics, 2018, pp. 60–67. ACLWeb, doi:10.18653/v1/W18-3408. 9 | - Landauer, Thomas and Dumais, Susan. 
LSA: A Solution to Plato’s Problem. http://lsa.colorado.edu/papers/plato/plato.annote.html. 10 | - Yin, Zi, and Yuanyuan Shen. “On the Dimensionality of Word Embedding.” ArXiv:1812.04224 [Cs, Stat], Dec. 2018. arXiv.org, http://arxiv.org/abs/1812.04224. 11 | 12 | ## Testing Embeddings 13 | - Bakarov, Amir. “A Survey of Word Embeddings Evaluation Methods.” ArXiv:1801.09536 [Cs], Jan. 2018. arXiv.org, http://arxiv.org/abs/1801.09536. 14 | - Schnabel, Tobias, et al. “Evaluation Methods for Unsupervised Word Embeddings.” Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics, 2015, pp. 298–307. DOI.org (Crossref), doi:10.18653/v1/D15-1036. 15 | - Wang, Bin, et al. “Evaluating Word Embedding Models: Methods and Experimental Results.” APSIPA Transactions on Signal and Information Processing, vol. 8, 2019, p. e19. arXiv.org, doi:10.1017/ATSIP.2019.12. 16 | 17 | 18 | ## Debiasing Embeddings 19 | - Bolukbasi, Tolga, et al. “Man Is to Computer Programmer as Woman Is to Homemaker? Debiasing Word Embeddings.” Proceedings of the 30th International Conference on Neural Information Processing Systems, Curran Associates Inc., 2016, pp. 4356–64. 20 | - Garg, Nikhil, et al. “Word Embeddings Quantify 100 Years of Gender and Ethnic Stereotypes.” Proceedings of the National Academy of Sciences, vol. 115, no. 16, Apr. 2018, pp. E3635–44. DOI.org (Crossref), doi:10.1073/pnas.1720347115. 21 | - Gonen, Hila, and Goldberg, Yoav. “Lipstick on a Pig: Debiasing Methods Cover up Systematic Gender Biases in Word Embeddings But Do Not Remove Them.” ArXiv:1903.03862 [Cs], Sept. 2019. arXiv.org, http://arxiv.org/abs/1903.03862. 22 | - Ethayarajh, Kawin, et al. “Understanding Undesirable Word Embedding Associations.” Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics, 2019, pp. 1696–705. DOI.org (Crossref), doi:10.18653/v1/P19-1166. 23 | - Nissim, Malvina, et al. “Fair Is Better than Sensational: Man Is to Doctor as Woman Is to Doctor.” Computational Linguistics, vol. 46, no. 2, June 2020, pp. 487–97. DOI.org (Crossref), doi:10.1162/coli_a_00379. 24 | - Papakyriakopoulos, Orestis, et al. “Bias in Word Embeddings.” Proceedings of the 2020 Conference on Fairness, Accountability, and Transparency, ACM, 2020, pp. 446–57. DOI.org (Crossref), doi:10.1145/3351095.3372843. 25 | - Zhao, Jieyu, et al. “Gender Bias in Contextualized Word Embeddings.” Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), Association for Computational Linguistics, 2019, pp. 629–34. ACLWeb, doi:10.18653/v1/N19-1064. 
26 | -------------------------------------------------------------------------------- /text-analytics/Text_Analytics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Text Analytics.ipynb", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyMQgNb4AtMt0C9GaQKU6Y3i", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "YQV5BO4qX-uY" 31 | }, 32 | "source": [ 33 | "# Introduction to Text Analysis\n", 34 | "\n", 35 | "Welcome to this colab notebook that I will use for demonstrative purposes. " 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "id": "aYrFID9Zz4k9" 42 | }, 43 | "source": [ 44 | "## Comparing NLTK vs spaCy" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "J0Y53Pe-0AaV" 51 | }, 52 | "source": [ 53 | "import spacy\n", 54 | "from spacy.lang.en import English\n", 55 | "import nltk\n", 56 | "from nltk.tokenize import word_tokenize" 57 | ], 58 | "execution_count": 2, 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "metadata": { 64 | "id": "QkuFGuZyZfek", 65 | "outputId": "02427d2e-105d-491a-b4fe-73d605ac0e3c", 66 | "colab": { 67 | "base_uri": "https://localhost:8080/", 68 | "height": 71 69 | } 70 | }, 71 | "source": [ 72 | "en = English()\n", 73 | "text = 'We are doing Text Analysis.'\n", 74 | "doc = en(text)\n", 75 | "print(type(doc))\n", 76 | "print([(x, type(x)) for x in doc])" 77 | ], 78 | "execution_count": 3, 79 | "outputs": [ 80 | { 81 | "output_type": "stream", 82 | "text": [ 83 | "\n", 84 | "[(We, ), (are, ), (doing, ), (Text, ), (Analysis, ), (., )]\n" 85 | ], 86 | "name": "stdout" 87 | } 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "metadata": { 93 | "id": "4vuUgFcweMVc", 94 | "outputId": "06652f5d-0afa-426c-90ed-0dfb13c76c48", 95 | "colab": { 96 | "base_uri": "https://localhost:8080/", 97 | "height": 105 98 | } 99 | }, 100 | "source": [ 101 | "nltk.download('punkt')\n", 102 | "doc = word_tokenize(text)\n", 103 | "print(type(doc))\n", 104 | "print([(x, type(x)) for x in doc])" 105 | ], 106 | "execution_count": 4, 107 | "outputs": [ 108 | { 109 | "output_type": "stream", 110 | "text": [ 111 | "[nltk_data] Downloading package punkt to /root/nltk_data...\n", 112 | "[nltk_data] Unzipping tokenizers/punkt.zip.\n", 113 | "\n", 114 | "[('We', ), ('are', ), ('doing', ), ('Text', ), ('Analysis', ), ('.', )]\n" 115 | ], 116 | "name": "stdout" 117 | } 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "metadata": { 123 | "id": "sT5_LiR_0SS7", 124 | "outputId": "985ff655-e755-4c00-8d88-c2f79c5ae991", 125 | "colab": { 126 | "base_uri": "https://localhost:8080/", 127 | "height": 105 128 | } 129 | }, 130 | "source": [ 131 | "%timeit en(text)\n", 132 | "%timeit nltk.tokenize.casual_tokenize(text)" 133 | ], 134 | "execution_count": 5, 135 | "outputs": [ 136 | { 137 | "output_type": "stream", 138 | "text": [ 139 | "The slowest run took 26.97 times longer than the fastest. This could mean that an intermediate result is being cached.\n", 140 | "100000 loops, best of 3: 8 µs per loop\n", 141 | "The slowest run took 7.34 times longer than the fastest. 
This could mean that an intermediate result is being cached.\n", 142 | "100000 loops, best of 3: 16.2 µs per loop\n" 143 | ], 144 | "name": "stdout" 145 | } 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "id": "P8lwZEHT01zF" 152 | }, 153 | "source": [ 154 | "## spaCy's Language Models" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "metadata": { 160 | "id": "Ti-ns7PW0dWj", 161 | "outputId": "92594493-67c7-4293-fb21-4cdb0b5d23d0", 162 | "colab": { 163 | "base_uri": "https://localhost:8080/", 164 | "height": 51 165 | } 166 | }, 167 | "source": [ 168 | "from spacy.lang.en import English\n", 169 | "en = English()\n", 170 | "print(en.tokenizer)\n", 171 | "print(en.pipe_names)\n" 172 | ], 173 | "execution_count": 8, 174 | "outputs": [ 175 | { 176 | "output_type": "stream", 177 | "text": [ 178 | "\n", 179 | "[]\n" 180 | ], 181 | "name": "stdout" 182 | } 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "metadata": { 188 | "id": "JpknyeLs1Bmo", 189 | "outputId": "2f47ce7d-9081-44f3-a36e-be9a7a38519d", 190 | "colab": { 191 | "base_uri": "https://localhost:8080/", 192 | "height": 34 193 | } 194 | }, 195 | "source": [ 196 | "nlp = spacy.load('en_core_web_sm')\n", 197 | "print(nlp.pipe_names)" 198 | ], 199 | "execution_count": 9, 200 | "outputs": [ 201 | { 202 | "output_type": "stream", 203 | "text": [ 204 | "['tagger', 'parser', 'ner']\n" 205 | ], 206 | "name": "stdout" 207 | } 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "metadata": { 213 | "id": "OEyN0UUM1MoE", 214 | "outputId": "f90b30f4-4a23-4b73-d136-5f6f80e8d7a6", 215 | "colab": { 216 | "base_uri": "https://localhost:8080/", 217 | "height": 88 218 | } 219 | }, 220 | "source": [ 221 | "doc = en('Text analysis is so much fun!')\n", 222 | "print(doc)\n", 223 | "print(type(doc))\n", 224 | "doc_attrs = set(dir(doc))\n", 225 | "print(doc_attrs)" 226 | ], 227 | "execution_count": 10, 228 | "outputs": [ 229 | { 230 | "output_type": "stream", 231 | "text": [ 232 | "Text analysis is so much fun!\n", 233 | "\n", 234 | "{'lang_', '__ne__', '__sizeof__', 'to_utf8_array', '_realloc', '__init_subclass__', 'sentiment', '__format__', '__lt__', '_vector', 'ents', 'to_disk', 'vector_norm', 'is_parsed', 'get_lca_matrix', '__str__', 'to_bytes', '__unicode__', 'is_sentenced', '__init__', '__iter__', 'doc', 'noun_chunks_iterator', 'remove_extension', '__new__', '__class__', '__reduce_ex__', '_py_tokens', '__setattr__', '_', 'is_nered', 'to_json', '__bytes__', '__delattr__', 'retokenize', 'char_span', '__repr__', '__len__', 'from_array', 'text_with_ws', '__dir__', 'to_array', 'similarity', 'mem', 'count_by', 'from_disk', 'get_extension', 'has_vector', 'noun_chunks', '__getattribute__', '__pyx_vtable__', '_bulk_merge', '__setstate__', 'print_tree', 'sents', 'lang', '__doc__', '__ge__', 'has_extension', 'text', 'tensor', '_vector_norm', 'user_token_hooks', 'cats', '__subclasshook__', 'set_extension', 'user_data', 'extend_tensor', 'user_span_hooks', 'from_bytes', 'vector', 'vocab', '__getitem__', 'user_hooks', '__le__', 'merge', '__hash__', '__gt__', '__eq__', '__reduce__', 'is_tagged'}\n" 235 | ], 236 | "name": "stdout" 237 | } 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": { 243 | "id": "swU1MNr_1ZL1" 244 | }, 245 | "source": [ 246 | "Tokens are units of documents" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "metadata": { 252 | "id": "4lDyworS1Wvz", 253 | "outputId": "14006b2a-3f6f-4d31-b564-7acd9c65d24b", 254 | "colab": { 255 | "base_uri": 
"https://localhost:8080/", 256 | "height": 88 257 | } 258 | }, 259 | "source": [ 260 | "print(doc[0])\n", 261 | "print(type(doc[0]))\n", 262 | "print(dir(doc[0]))" 263 | ], 264 | "execution_count": 11, 265 | "outputs": [ 266 | { 267 | "output_type": "stream", 268 | "text": [ 269 | "Text\n", 270 | "\n", 271 | "['_', '__bytes__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', 'ancestors', 'check_flag', 'children', 'cluster', 'conjuncts', 'dep', 'dep_', 'doc', 'ent_id', 'ent_id_', 'ent_iob', 'ent_iob_', 'ent_kb_id', 'ent_kb_id_', 'ent_type', 'ent_type_', 'get_extension', 'has_extension', 'has_vector', 'head', 'i', 'idx', 'is_alpha', 'is_ancestor', 'is_ascii', 'is_bracket', 'is_currency', 'is_digit', 'is_left_punct', 'is_lower', 'is_oov', 'is_punct', 'is_quote', 'is_right_punct', 'is_sent_start', 'is_space', 'is_stop', 'is_title', 'is_upper', 'lang', 'lang_', 'left_edge', 'lefts', 'lemma', 'lemma_', 'lex_id', 'like_email', 'like_num', 'like_url', 'lower', 'lower_', 'morph', 'n_lefts', 'n_rights', 'nbor', 'norm', 'norm_', 'orth', 'orth_', 'pos', 'pos_', 'prefix', 'prefix_', 'prob', 'rank', 'remove_extension', 'right_edge', 'rights', 'sent', 'sent_start', 'sentiment', 'set_extension', 'shape', 'shape_', 'similarity', 'string', 'subtree', 'suffix', 'suffix_', 'tag', 'tag_', 'tensor', 'text', 'text_with_ws', 'vector', 'vector_norm', 'vocab', 'whitespace_']\n" 272 | ], 273 | "name": "stdout" 274 | } 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "q6m4nX6p1rwv", 281 | "outputId": "80732600-a3ea-4f50-8aae-971bdf16f21d", 282 | "colab": { 283 | "base_uri": "https://localhost:8080/", 284 | "height": 51 285 | } 286 | }, 287 | "source": [ 288 | "print(doc[0])\n", 289 | "print(doc[0].lower_)" 290 | ], 291 | "execution_count": 15, 292 | "outputs": [ 293 | { 294 | "output_type": "stream", 295 | "text": [ 296 | "Text\n", 297 | "text\n" 298 | ], 299 | "name": "stdout" 300 | } 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": { 306 | "id": "QIZFGzoO2Y6-" 307 | }, 308 | "source": [ 309 | "### Text Preprocessing\n", 310 | "\n", 311 | "#### Normalizing case" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "metadata": { 317 | "id": "HZCn1uA72dmV", 318 | "outputId": "56fd66ce-46c7-4592-b6f7-9b0c68533f10", 319 | "colab": { 320 | "base_uri": "https://localhost:8080/", 321 | "height": 34 322 | } 323 | }, 324 | "source": [ 325 | "[x.lower_ for x in en(text)]" 326 | ], 327 | "execution_count": 16, 328 | "outputs": [ 329 | { 330 | "output_type": "execute_result", 331 | "data": { 332 | "text/plain": [ 333 | "['we', 'are', 'doing', 'text', 'analysis', '.']" 334 | ] 335 | }, 336 | "metadata": { 337 | "tags": [] 338 | }, 339 | "execution_count": 16 340 | } 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": { 346 | "id": "loZEO5Zy3Q9W" 347 | }, 348 | "source": [ 349 | "#### Stripping punctuation" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "metadata": { 355 | "id": "I6mAOXms3RWh", 356 | "outputId": "65eead87-7bbb-444b-bcfc-184b6c1a07be", 357 | "colab": { 358 | "base_uri": "https://localhost:8080/", 359 | "height": 34 360 | } 361 | }, 362 | "source": [ 363 | "[x.text for x in en(text) if x.is_alpha]" 364 | ], 365 | 
"execution_count": 17, 366 | "outputs": [ 367 | { 368 | "output_type": "execute_result", 369 | "data": { 370 | "text/plain": [ 371 | "['We', 'are', 'doing', 'text', 'analysis']" 372 | ] 373 | }, 374 | "metadata": { 375 | "tags": [] 376 | }, 377 | "execution_count": 17 378 | } 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "metadata": { 384 | "id": "5Gt8IUVK3RrT", 385 | "outputId": "15dc4c77-bbf1-4221-9f26-6668a7570da2", 386 | "colab": { 387 | "base_uri": "https://localhost:8080/", 388 | "height": 34 389 | } 390 | }, 391 | "source": [ 392 | "text = \"We're doing text analysis and it's fun!\"\n", 393 | "[x.text for x in en(text) if x.is_alpha]" 394 | ], 395 | "execution_count": 19, 396 | "outputs": [ 397 | { 398 | "output_type": "stream", 399 | "text": [ 400 | "Removing non-alpha ['We', 'doing', 'text', 'analysis', 'and', 'it', 'fun']\n" 401 | ], 402 | "name": "stdout" 403 | } 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": { 409 | "id": "bm65jFIE4o8G" 410 | }, 411 | "source": [ 412 | "#### Lemmatizing" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "metadata": { 418 | "id": "YmWC7hNg4oTD", 419 | "outputId": "e230b58d-03c4-459c-bd15-a4070f505e55", 420 | "colab": { 421 | "base_uri": "https://localhost:8080/", 422 | "height": 34 423 | } 424 | }, 425 | "source": [ 426 | "[x.lemma_ for x in nlp(text)]" 427 | ], 428 | "execution_count": 23, 429 | "outputs": [ 430 | { 431 | "output_type": "execute_result", 432 | "data": { 433 | "text/plain": [ 434 | "['-PRON-', 'be', 'do', 'text', 'analysis', 'and', '-PRON-', 'be', 'fun', '!']" 435 | ] 436 | }, 437 | "metadata": { 438 | "tags": [] 439 | }, 440 | "execution_count": 23 441 | } 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": { 447 | "id": "6QkZze7E5CtA" 448 | }, 449 | "source": [ 450 | "#### Stop Words" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "metadata": { 456 | "id": "UQOs2wRi5C4w", 457 | "outputId": "c5d6709e-36f3-41e7-88b1-96db7c117f8b", 458 | "colab": { 459 | "base_uri": "https://localhost:8080/", 460 | "height": 34 461 | } 462 | }, 463 | "source": [ 464 | "[x.text for x in en(text) if not x.is_stop]" 465 | ], 466 | "execution_count": 24, 467 | "outputs": [ 468 | { 469 | "output_type": "execute_result", 470 | "data": { 471 | "text/plain": [ 472 | "['text', 'analysis', 'fun', '!']" 473 | ] 474 | }, 475 | "metadata": { 476 | "tags": [] 477 | }, 478 | "execution_count": 24 479 | } 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "metadata": { 485 | "id": "bWiFwTZ35U-f" 486 | }, 487 | "source": [ 488 | "#### Named Entities\n", 489 | "\n", 490 | "First URLs" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "metadata": { 496 | "id": "Vq7oKqcy5UfL", 497 | "outputId": "30ba53d7-f6f8-4e36-a11c-1c5a68ae8063", 498 | "colab": { 499 | "base_uri": "https://localhost:8080/", 500 | "height": 51 501 | } 502 | }, 503 | "source": [ 504 | "text = \"Check out the course on Github: https://github.com/mjahanshahi/intermediate-nlp\"\n", 505 | "print([x for x in en(text) if not x.like_url])\n", 506 | "print(['-URL-' if x.like_url else x for x in en(text)])" 507 | ], 508 | "execution_count": 25, 509 | "outputs": [ 510 | { 511 | "output_type": "stream", 512 | "text": [ 513 | "[Check, out, the, course, on, Github, :]\n", 514 | "[Check, out, the, course, on, Github, :, '-URL-']\n" 515 | ], 516 | "name": "stdout" 517 | } 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "metadata": { 523 | "id": "NQCUAnL25ld-", 524 | "outputId": 
"862fadb2-41ee-4774-8d94-27bf5ee776b0", 525 | "colab": { 526 | "base_uri": "https://localhost:8080/", 527 | "height": 51 528 | } 529 | }, 530 | "source": [ 531 | "parsed = nlp(text)\n", 532 | "# look at the individual tokens\n", 533 | "tokens = [t for t in parsed]\n", 534 | "print(tokens)\n", 535 | "# look at the identified named-entities and their types\n", 536 | "for e in parsed.ents:\n", 537 | " print(e, type(e), e.label_, spacy.explain(e.label_))" 538 | ], 539 | "execution_count": 26, 540 | "outputs": [ 541 | { 542 | "output_type": "stream", 543 | "text": [ 544 | "[Check, out, the, course, on, Github, :, https://github.com/mjahanshahi/intermediate-nlp]\n", 545 | "Github ORG Companies, agencies, institutions, etc.\n" 546 | ], 547 | "name": "stdout" 548 | } 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": { 554 | "id": "ypPdecxU3Rgx" 555 | }, 556 | "source": [ 557 | "### Putting it all together" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "metadata": { 563 | "id": "ZnFbRR-O3-8r" 564 | }, 565 | "source": [ 566 | "text_data = [\"I'm taking a course on Safari.\",\n", 567 | " \"I'm learning about Text Analysis.\",\n", 568 | " \"We are studying preprocessing text and then analysing it\",\n", 569 | " \"Check out the course on Github: https://github.com/mjahanshahi/intermediate-nlp\"]" 570 | ], 571 | "execution_count": 20, 572 | "outputs": [] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "metadata": { 577 | "id": "FHulb9Nn4YQs" 578 | }, 579 | "source": [ 580 | "def tokenize_full(docs, model=nlp, \n", 581 | " entities=False, \n", 582 | " stop_words=False, \n", 583 | " lowercase=True, \n", 584 | " alpha_only=True, \n", 585 | " lemma=True):\n", 586 | " \"\"\"Full tokenizer with flags for processing steps\n", 587 | " entities: If False, replaces with entity type\n", 588 | " stop_words: If False, removes stop words\n", 589 | " lowercase: If True, lowercases all tokens\n", 590 | " alpha_only: If True, removes all non-alpha characters\n", 591 | " lemma: If True, lemmatizes words\n", 592 | " \"\"\"\n", 593 | " tokenized_docs = []\n", 594 | " for d in docs:\n", 595 | " parsed = model(d)\n", 596 | " # token collector\n", 597 | " tokens = []\n", 598 | " # index pointer\n", 599 | " i = 0\n", 600 | " # entity collector\n", 601 | " ent = ''\n", 602 | " for t in parsed:\n", 603 | " # only need this if we're replacing entities\n", 604 | " if not entities:\n", 605 | " # replace URLs\n", 606 | " if t.like_url:\n", 607 | " tokens.append('URL')\n", 608 | " continue\n", 609 | " # if there's entities collected and current token is non-entity\n", 610 | " if (t.ent_iob_=='O')&(ent!=''):\n", 611 | " tokens.append(ent)\n", 612 | " ent = ''\n", 613 | " continue\n", 614 | " elif t.ent_iob_!='O':\n", 615 | " ent = t.ent_type_\n", 616 | " continue\n", 617 | " # only include stop words if stop words==True\n", 618 | " if (t.is_stop)&(not stop_words):\n", 619 | " continue\n", 620 | " # only include non-alpha is alpha_only==False\n", 621 | " if (not t.is_alpha)&(alpha_only):\n", 622 | " continue\n", 623 | " if lemma:\n", 624 | " t = t.lemma_\n", 625 | " else:\n", 626 | " t = t.text\n", 627 | " if lowercase:\n", 628 | " t.lower()\n", 629 | " tokens.append(t)\n", 630 | " tokenized_docs.append(tokens)\n", 631 | " return(tokenized_docs)" 632 | ], 633 | "execution_count": 21, 634 | "outputs": [] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "metadata": { 639 | "id": "d0xbabiX5vXL", 640 | "outputId": "d0b1b248-45b3-4606-cc6f-2cd06838b8a7", 641 | "colab": { 642 | "base_uri": 
"https://localhost:8080/", 643 | "height": 340 644 | } 645 | }, 646 | "source": [ 647 | "tokenize_full(text_data, stop_words=True, alpha_only=False, entities=True)" 648 | ], 649 | "execution_count": 27, 650 | "outputs": [ 651 | { 652 | "output_type": "execute_result", 653 | "data": { 654 | "text/plain": [ 655 | "[['-PRON-', 'be', 'take', 'a', 'course', 'on', 'Safari', '.'],\n", 656 | " ['-PRON-', 'be', 'learn', 'about', 'Text', 'Analysis', '.'],\n", 657 | " ['-PRON-',\n", 658 | " 'be',\n", 659 | " 'study',\n", 660 | " 'preprocesse',\n", 661 | " 'text',\n", 662 | " 'and',\n", 663 | " 'then',\n", 664 | " 'analyse',\n", 665 | " '-PRON-'],\n", 666 | " ['check',\n", 667 | " 'out',\n", 668 | " 'the',\n", 669 | " 'course',\n", 670 | " 'on',\n", 671 | " 'Github',\n", 672 | " ':',\n", 673 | " 'https://github.com/mjahanshahi/intermediate-nlp']]" 674 | ] 675 | }, 676 | "metadata": { 677 | "tags": [] 678 | }, 679 | "execution_count": 27 680 | } 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "metadata": { 686 | "id": "xer1e8C-6DBI", 687 | "outputId": "ce424f85-d580-4c9c-db9b-9a60ea62a80a", 688 | "colab": { 689 | "base_uri": "https://localhost:8080/", 690 | "height": 193 691 | } 692 | }, 693 | "source": [ 694 | "from sklearn.feature_extraction.text import CountVectorizer\n", 695 | "import pandas as pd\n", 696 | "cv = CountVectorizer()\n", 697 | "v = cv.fit_transform(text_data).toarray()\n", 698 | "pd.DataFrame(v, columns=cv.get_feature_names())" 699 | ], 700 | "execution_count": 31, 701 | "outputs": [ 702 | { 703 | "output_type": "execute_result", 704 | "data": { 705 | "text/html": [ 706 | "
\n", 707 | "\n", 720 | "\n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | "
aboutanalysinganalysisandarecheckcomcoursegithubhttpsintermediateitlearningmjahanshahinlponoutpreprocessingsafaristudyingtakingtextthethenwe
00000000100000001001010000
11010000000001000000001000
20101100000010000010101011
30000011121100111100000100
\n", 866 | "
" 867 | ], 868 | "text/plain": [ 869 | " about analysing analysis and are ... taking text the then we\n", 870 | "0 0 0 0 0 0 ... 1 0 0 0 0\n", 871 | "1 1 0 1 0 0 ... 0 1 0 0 0\n", 872 | "2 0 1 0 1 1 ... 0 1 0 1 1\n", 873 | "3 0 0 0 0 0 ... 0 0 1 0 0\n", 874 | "\n", 875 | "[4 rows x 25 columns]" 876 | ] 877 | }, 878 | "metadata": { 879 | "tags": [] 880 | }, 881 | "execution_count": 31 882 | } 883 | ] 884 | }, 885 | { 886 | "cell_type": "code", 887 | "metadata": { 888 | "id": "3x9uut7r6fKz", 889 | "outputId": "3a3fe346-48cb-4e3d-e15e-972bb2c3dd82", 890 | "colab": { 891 | "base_uri": "https://localhost:8080/", 892 | "height": 173 893 | } 894 | }, 895 | "source": [ 896 | "cv = CountVectorizer(vocabulary=['text', 'analysis', 'preprocessing', 'safari'])\n", 897 | "v = cv.fit_transform(text_data).toarray()\n", 898 | "pd.DataFrame(v, columns=cv.get_feature_names())" 899 | ], 900 | "execution_count": 33, 901 | "outputs": [ 902 | { 903 | "output_type": "execute_result", 904 | "data": { 905 | "text/html": [ 906 | "
\n", 907 | "\n", 920 | "\n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | "
textanalysispreprocessingsafari
00001
11100
21010
30000
\n", 961 | 
" 962 | ], 963 | "text/plain": [ 964 | " text analysis preprocessing safari\n", 965 | "0 0 0 0 1\n", 966 | "1 1 1 0 0\n", 967 | "2 1 0 1 0\n", 968 | "3 0 0 0 0" 969 | ] 970 | }, 971 | "metadata": { 972 | "tags": [] 973 | }, 974 | "execution_count": 33 975 | } 976 | ] 977 | }, 978 | { 979 | "cell_type": "code", 980 | "metadata": { 981 | "id": "6tWo1FTq47qZ" 982 | }, 983 | "source": [ 984 | "\n" 985 | ], 986 | "execution_count": null, 987 | "outputs": [] 988 | }, 989 | { 990 | "cell_type": "markdown", 991 | "metadata": { 992 | "id": "aM05Ku_TZgdy" 993 | }, 994 | "source": [ 995 | "## Using a Dictionary to Analyse Review Sentiment\n", 996 | "\n", 997 | "A traditional technique to analyse the sentiment of texts is to use dictionaries of positive and negative connotations and count the incidences of words that are represented in these dictionaries, considering their polarity and valence. \n", 998 | "\n", 999 | "In this section, we will use the Afinn package, which has 2.5k words coded by polarity and valence. " 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "metadata": { 1005 | "id": "R1FButi2-SYZ", 1006 | "outputId": "bd607287-69a2-4e7e-aea1-fa4c3e018d57", 1007 | "colab": { 1008 | "base_uri": "https://localhost:8080/", 1009 | "height": 207 1010 | } 1011 | }, 1012 | "source": [ 1013 | "!pip install afinn" 1014 | ], 1015 | "execution_count": 2, 1016 | "outputs": [ 1017 | { 1018 | "output_type": "stream", 1019 | "text": [ 1020 | "Collecting afinn\n", 1021 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/86/e5/ffbb7ee3cca21ac6d310ac01944fb163c20030b45bda25421d725d8a859a/afinn-0.1.tar.gz (52kB)\n", 1022 | "\r\u001b[K |██████▎ | 10kB 17.1MB/s eta 0:00:01\r\u001b[K |████████████▌ | 20kB 1.7MB/s eta 0:00:01\r\u001b[K |██████████████████▊ | 30kB 2.2MB/s eta 0:00:01\r\u001b[K |█████████████████████████ | 40kB 2.5MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▏| 51kB 2.0MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 61kB 1.8MB/s \n", 1023 | "\u001b[?25hBuilding wheels for collected packages: afinn\n", 1024 | " Building wheel for afinn (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 1025 | " Created wheel for afinn: filename=afinn-0.1-cp36-none-any.whl size=53453 sha256=693b118b7381dc265be177ae816b0ac4923bd634c0e3d1a309d10533dcafecd3\n", 1026 | " Stored in directory: /root/.cache/pip/wheels/b5/1c/de/428301f3333ca509dcf20ff358690eb23a1388fbcbbde008b2\n", 1027 | "Successfully built afinn\n", 1028 | "Installing collected packages: afinn\n", 1029 | "Successfully installed afinn-0.1\n" 1030 | ], 1031 | "name": "stdout" 1032 | } 1033 | ] 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "metadata": { 1038 | "id": "iRMuZzAd-J_k" 1039 | }, 1040 | "source": [ 1041 | "from afinn import Afinn\n", 1042 | "afinn = Afinn(language='en')" 1043 | ], 1044 | "execution_count": 13, 1045 | "outputs": [] 1046 | }, 1047 | { 1048 | "cell_type": "code", 1049 | "metadata": { 1050 | "id": "M_ceBooO-hk_", 1051 | "outputId": "d49bba4d-4ca3-4d7b-fa1a-03eb1c5431c6", 1052 | "colab": { 1053 | "base_uri": "https://localhost:8080/", 1054 | "height": 34 1055 | } 1056 | }, 1057 | "source": [ 1058 | "afinn.score('Great')" 1059 | ], 1060 | "execution_count": 8, 1061 | "outputs": [ 1062 | { 1063 | "output_type": "execute_result", 1064 | "data": { 1065 | "text/plain": [ 1066 | "3.0" 1067 | ] 1068 | }, 1069 | "metadata": { 1070 | "tags": [] 1071 | }, 1072 | "execution_count": 8 1073 | } 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "code", 1078 | "metadata": { 1079 | "id": "-xQ1yVkc-iyf", 1080 | "outputId": "37601631-06ff-4f63-fb74-69e5d95aec91", 1081 | "colab": { 1082 | "base_uri": "https://localhost:8080/", 1083 | "height": 34 1084 | } 1085 | }, 1086 | "source": [ 1087 | "afinn.score('Good')" 1088 | ], 1089 | "execution_count": 7, 1090 | "outputs": [ 1091 | { 1092 | "output_type": "execute_result", 1093 | "data": { 1094 | "text/plain": [ 1095 | "3.0" 1096 | ] 1097 | }, 1098 | "metadata": { 1099 | "tags": [] 1100 | }, 1101 | "execution_count": 7 1102 | } 1103 | ] 1104 | }, 1105 | { 1106 | "cell_type": "code", 1107 | "metadata": { 1108 | "id": "uCzCtFsp-rQj", 1109 | "outputId": "24acb631-6f8a-40bc-90d4-3d214e81cb19", 1110 | "colab": { 1111 | "base_uri": "https://localhost:8080/", 1112 | "height": 34 1113 | } 1114 | }, 1115 | "source": [ 1116 | "afinn.score('Terrible')" 1117 | ], 1118 | "execution_count": 9, 1119 | "outputs": [ 1120 | { 1121 | "output_type": "execute_result", 1122 | "data": { 1123 | "text/plain": [ 1124 | "-3.0" 1125 | ] 1126 | }, 1127 | "metadata": { 1128 | "tags": [] 1129 | }, 1130 | "execution_count": 9 1131 | } 1132 | ] 1133 | }, 1134 | { 1135 | "cell_type": "code", 1136 | "metadata": { 1137 | "id": "e8hSr8lL-u0V", 1138 | "outputId": "91c3a047-7635-49a5-fa64-7cad8ac2543c", 1139 | "colab": { 1140 | "base_uri": "https://localhost:8080/", 1141 | "height": 34 1142 | } 1143 | }, 1144 | "source": [ 1145 | "afinn.score('I feel great! :)')" 1146 | ], 1147 | "execution_count": 16, 1148 | "outputs": [ 1149 | { 1150 | "output_type": "execute_result", 1151 | "data": { 1152 | "text/plain": [ 1153 | "5.0" 1154 | ] 1155 | }, 1156 | "metadata": { 1157 | "tags": [] 1158 | }, 1159 | "execution_count": 16 1160 | } 1161 | ] 1162 | }, 1163 | { 1164 | "cell_type": "markdown", 1165 | "metadata": { 1166 | "id": "MXMzguH5_toC" 1167 | }, 1168 | "source": [ 1169 | "Let's apply this to an actual dataset! This is the Women's Clothing E-Commerce Reviews from [this Kaggle Challenge](https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews). 
Bi-directional LSTMs have [reached an F1 score of 0.93.](https://github.com/AFAgarap/ecommerce-reviews-analysis)" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "code", 1174 | "metadata": { 1175 | "id": "JWnWrRWY_rFi" 1176 | }, 1177 | "source": [ 1178 | "import pandas as pd" 1179 | ], 1180 | "execution_count": 18, 1181 | "outputs": [] 1182 | }, 1183 | { 1184 | "cell_type": "code", 1185 | "metadata": { 1186 | "id": "kL1MO06S-0bi" 1187 | }, 1188 | "source": [ 1189 | "df = pd.read_csv('/Womens Clothing E-Commerce Reviews.csv', index_col=0)" 1190 | ], 1191 | "execution_count": 30, 1192 | "outputs": [] 1193 | }, 1194 | { 1195 | "cell_type": "code", 1196 | "metadata": { 1197 | "id": "owFapvdiBqmu", 1198 | "outputId": "81908281-712e-4ec5-b593-df4bf49e7524", 1199 | "colab": { 1200 | "base_uri": "https://localhost:8080/", 1201 | "height": 615 1202 | } 1203 | }, 1204 | "source": [ 1205 | "df.head()" 1206 | ], 1207 | "execution_count": 31, 1208 | "outputs": [ 1209 | { 1210 | "output_type": "execute_result", 1211 | "data": { 1212 | "text/html": [ 1213 | "
\n", 1214 | "\n", 1227 | "\n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | "
Clothing IDAgeTitleReview TextRatingRecommended INDPositive Feedback CountDivision NameDepartment NameClass Name
076733NaNAbsolutely wonderful - silky and sexy and comf...410InitmatesIntimateIntimates
1108034NaNLove this dress! it's sooo pretty. i happene...514GeneralDressesDresses
2107760Some major design flawsI had such high hopes for this dress and reall...300GeneralDressesDresses
3104950My favorite buy!I love, love, love this jumpsuit. it's fun, fl...510General PetiteBottomsPants
484747Flattering shirtThis shirt is very flattering to all due to th...516GeneralTopsBlouses
\n", 1311 | 
" 1312 | ], 1313 | "text/plain": [ 1314 | " Clothing ID Age ... Department Name Class Name\n", 1315 | "0 767 33 ... Intimate Intimates\n", 1316 | "1 1080 34 ... Dresses Dresses\n", 1317 | "2 1077 60 ... Dresses Dresses\n", 1318 | "3 1049 50 ... Bottoms Pants\n", 1319 | "4 847 47 ... Tops Blouses\n", 1320 | "\n", 1321 | "[5 rows x 10 columns]" 1322 | ] 1323 | }, 1324 | "metadata": { 1325 | "tags": [] 1326 | }, 1327 | "execution_count": 31 1328 | } 1329 | ] 1330 | }, 1331 | { 1332 | "cell_type": "markdown", 1333 | "metadata": { 1334 | "id": "-OE42CRqTGGo" 1335 | }, 1336 | "source": [ 1337 | "We can see that some titles are null. It's also possible that some reviews do not contain any text. " 1338 | ] 1339 | }, 1340 | { 1341 | "cell_type": "code", 1342 | "metadata": { 1343 | "id": "XclWIqTbTBfV", 1344 | "outputId": "97c50bbe-073e-4f2f-a0d6-14e87276a30e", 1345 | "colab": { 1346 | "base_uri": "https://localhost:8080/", 1347 | "height": 34 1348 | } 1349 | }, 1350 | "source": [ 1351 | "df[(df[\"Review Text\"].isnull()) & (df[\"Title\"].isnull())].shape[0]" 1352 | ], 1353 | "execution_count": 151, 1354 | "outputs": [ 1355 | { 1356 | "output_type": "execute_result", 1357 | "data": { 1358 | "text/plain": [ 1359 | "844" 1360 | ] 1361 | }, 1362 | "metadata": { 1363 | "tags": [] 1364 | }, 1365 | "execution_count": 151 1366 | } 1367 | ] 1368 | }, 1369 | { 1370 | "cell_type": "markdown", 1371 | "metadata": { 1372 | "id": "TSgJsei_TT5R" 1373 | }, 1374 | "source": [ 1375 | "We should remove these from the dataframe since this analysis aims to infer sentiment from text. " 1376 | ] 1377 | }, 1378 | { 1379 | "cell_type": "code", 1380 | "metadata": { 1381 | "id": "hR3HtKyLTS38" 1382 | }, 1383 | "source": [ 1384 | "df.drop(df[(df[\"Review Text\"].isnull()) & (df[\"Title\"].isnull())].index, inplace=True)" 1385 | ], 1386 | "execution_count": 159, 1387 | "outputs": [] 1388 | }, 1389 | { 1390 | "cell_type": "markdown", 1391 | "metadata": { 1392 | "id": "LJwAt-_uD2sk" 1393 | }, 1394 | "source": [ 1395 | "There are two columns that may convey sentiment:\n", 1396 | "- `Review Text`\n", 1397 | "- `Title`\n", 1398 | "\n", 1399 | "To calculate the Afinn sentiment score for all of the responses in the dataframe, we can apply the scorer to the `Review Text` column and create a new column `text_score`. We do the same to generate a `title_score` column. 
\n", 1400 | "\n" 1401 | ] 1402 | }, 1403 | { 1404 | "cell_type": "code", 1405 | "metadata": { 1406 | "id": "4KByiMWNEObu" 1407 | }, 1408 | "source": [ 1409 | "\n", 1410 | "#df['text_score'] = df[df[\"Review Text\"].notnull()].loc[\"Review Text\"].apply(afinn.score)\n", 1411 | "for index, row in df.iterrows():\n", 1412 | " if pd.notna(row['Review Text']):\n", 1413 | " df.at[index, \"text_score\"] = afinn.score(row['Review Text'])\n" 1414 | ], 1415 | "execution_count": 82, 1416 | "outputs": [] 1417 | }, 1418 | { 1419 | "cell_type": "code", 1420 | "metadata": { 1421 | "id": "koSaS8YEKEYW" 1422 | }, 1423 | "source": [ 1424 | "for index, row in df.iterrows():\n", 1425 | " if pd.notna(row['Title']):\n", 1426 | " df.at[index, \"title_score\"] = afinn.score(row['Title'])" 1427 | ], 1428 | "execution_count": 87, 1429 | "outputs": [] 1430 | }, 1431 | { 1432 | "cell_type": "code", 1433 | "metadata": { 1434 | "id": "TJlQuxhMKHnx" 1435 | }, 1436 | "source": [ 1437 | "df[\"total_score\"] = 2 * df[\"title_score\"] + df[\"text_score\"]" 1438 | ], 1439 | "execution_count": 88, 1440 | "outputs": [] 1441 | }, 1442 | { 1443 | "cell_type": "code", 1444 | "metadata": { 1445 | "id": "THGeTLn6Mm02", 1446 | "outputId": "efc0eae7-39e8-4428-80bb-f7f0265723de", 1447 | "colab": { 1448 | "base_uri": "https://localhost:8080/", 1449 | "height": 170 1450 | } 1451 | }, 1452 | "source": [ 1453 | "df['total_score'].describe()" 1454 | ], 1455 | "execution_count": 107, 1456 | "outputs": [ 1457 | { 1458 | "output_type": "execute_result", 1459 | "data": { 1460 | "text/plain": [ 1461 | "count 23486.000000\n", 1462 | "mean 11.325172\n", 1463 | "std 7.615414\n", 1464 | "min -20.000000\n", 1465 | "25% 6.000000\n", 1466 | "50% 11.000000\n", 1467 | "75% 16.000000\n", 1468 | "max 52.000000\n", 1469 | "Name: total_score, dtype: float64" 1470 | ] 1471 | }, 1472 | "metadata": { 1473 | "tags": [] 1474 | }, 1475 | "execution_count": 107 1476 | } 1477 | ] 1478 | }, 1479 | { 1480 | "cell_type": "code", 1481 | "metadata": { 1482 | "id": "IqmjIXItJPnl", 1483 | "outputId": "b0da71fc-7e2f-4c4f-d2bb-683538a875ed", 1484 | "colab": { 1485 | "base_uri": "https://localhost:8080/", 1486 | "height": 252 1487 | } 1488 | }, 1489 | "source": [ 1490 | "df.groupby(\"Rating\").median()" 1491 | ], 1492 | "execution_count": 100, 1493 | "outputs": [ 1494 | { 1495 | "output_type": "execute_result", 1496 | "data": { 1497 | "text/html": [ 1498 | "
\n", 1499 | "\n", 1512 | "\n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | "
Clothing IDAgeRecommended INDPositive Feedback Counttext_scoretitle_scoretotal_score
Rating
19364201303
29364101405
39364001607
492841117211
593641119213
\n", 1588 | "
" 1589 | ], 1590 | "text/plain": [ 1591 | " Clothing ID Age Recommended IND ... text_score title_score total_score\n", 1592 | "Rating ... \n", 1593 | "1 936 42 0 ... 3 0 3\n", 1594 | "2 936 41 0 ... 4 0 5\n", 1595 | "3 936 40 0 ... 6 0 7\n", 1596 | "4 928 41 1 ... 7 2 11\n", 1597 | "5 936 41 1 ... 9 2 13\n", 1598 | "\n", 1599 | "[5 rows x 7 columns]" 1600 | ] 1601 | }, 1602 | "metadata": { 1603 | "tags": [] 1604 | }, 1605 | "execution_count": 100 1606 | } 1607 | ] 1608 | }, 1609 | { 1610 | "cell_type": "code", 1611 | "metadata": { 1612 | "id": "xy-mQTkSKiCJ", 1613 | "outputId": "5656bb02-e4a9-4d3b-c49c-aa2855873844", 1614 | "colab": { 1615 | "base_uri": "https://localhost:8080/", 1616 | "height": 1000 1617 | } 1618 | }, 1619 | "source": [ 1620 | "df[(df[\"total_score\"]<10) & (df[\"Rating\"]==5)]" 1621 | ], 1622 | "execution_count": 160, 1623 | "outputs": [ 1624 | { 1625 | "output_type": "execute_result", 1626 | "data": { 1627 | "text/html": [ 1628 | "
\n", 1629 | "\n", 1642 | "\n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | " \n", 1740 | " \n", 1741 | " \n", 1742 | " \n", 1743 | " \n", 1744 | " \n", 1745 | " \n", 1746 | " \n", 1747 | " \n", 1748 | " \n", 1749 | " \n", 1750 | " \n", 1751 | " \n", 1752 | " \n", 1753 | " \n", 1754 | " \n", 1755 | " \n", 1756 | " \n", 1757 | " \n", 1758 | " \n", 1759 | " \n", 1760 | " \n", 1761 | " \n", 1762 | " \n", 1763 | " \n", 1764 | " \n", 1765 | " \n", 1766 | " \n", 1767 | " \n", 1768 | " \n", 1769 | " \n", 1770 | " \n", 1771 | " \n", 1772 | " \n", 1773 | " \n", 1774 | " \n", 1775 | " \n", 1776 | " \n", 1777 | " \n", 1778 | " \n", 1779 | " \n", 1780 | " \n", 1781 | " \n", 1782 | " \n", 1783 | " \n", 1784 | " \n", 1785 | " \n", 1786 | " \n", 1787 | " \n", 1788 | " \n", 1789 | " \n", 1790 | " \n", 1791 | " \n", 1792 | " \n", 1793 | " \n", 1794 | " \n", 1795 | " \n", 1796 | " \n", 1797 | " \n", 1798 | " \n", 1799 | " \n", 1800 | " \n", 1801 | " \n", 1802 | " \n", 1803 | " \n", 1804 | " \n", 1805 | " \n", 1806 | " \n", 1807 | " \n", 1808 | " \n", 1809 | " \n", 1810 | " \n", 1811 | " \n", 1812 | " \n", 1813 | " \n", 1814 | " \n", 1815 | " \n", 1816 | " \n", 1817 | " \n", 1818 | " \n", 1819 | " \n", 1820 | " \n", 1821 | " \n", 1822 | " \n", 1823 | " \n", 1824 | " \n", 1825 | " \n", 1826 | " \n", 1827 | " \n", 1828 | " \n", 1829 | " \n", 1830 | " \n", 1831 | " \n", 1832 | " \n", 1833 | " \n", 1834 | " \n", 1835 | " \n", 1836 | " \n", 1837 | " \n", 1838 | " \n", 1839 | " \n", 1840 | " \n", 1841 | " \n", 1842 | " \n", 1843 | " \n", 1844 | " \n", 1845 | " \n", 1846 | " \n", 1847 | " \n", 1848 | " \n", 1849 | " \n", 1850 | " \n", 1851 | " \n", 1852 | " \n", 1853 | " \n", 1854 | " \n", 1855 | " \n", 1856 | " \n", 1857 | " \n", 1858 | " \n", 1859 | " \n", 1860 | " \n", 1861 | " \n", 1862 | " \n", 1863 | "
Clothing IDAgeTitleReview TextRatingRecommended INDPositive Feedback CountDivision NameDepartment NameClass Nametext_scoretitle_scoretotal_scoreword_countnormalized_score
484747Flattering shirtThis shirt is very flattering to all due to th...516GeneralTopsBlouses606360.166667
685839Cagrcoal shimmer funI aded this in my basket at hte last mintue to...511General PetiteTopsKnits-14710111.990099
8107724FlatteringI love this dress. i usually get an xs but it ...510GeneralDressesDresses303340.088235
11109539NaNThis dress is perfection! so pretty and flatte...512General PetiteDressesDresses40480.500000
1376744Runs bigBought the black xs to go under the larkspur m...510InitmatesIntimateIntimates113693.014493
................................................
2341785039Get it quick!Can i tell you this top is amazing?! get it qu...510GeneralTopsBlouses606150.400000
2343818168Just rightI feel like snagging a pair of these was the e...510InitmatesIntimateLegwear404620.064516
23441110463Sweet surpriseDon't know why but i didn't have high expectat...5125General PetiteDressesDresses226886.022727
23442110439Flattering dressLove this dress, very flattering fit and the f...510General PetiteDressesDresses606410.146341
2345886263NaNThis is my new favorite sweater. it is lightwe...510General PetiteTopsKnits202180.111111
\n", 1864 | "

3537 rows × 15 columns

\n", 1865 | "
" 1866 | ], 1867 | "text/plain": [ 1868 | " Clothing ID Age ... word_count normalized_score\n", 1869 | "4 847 47 ... 36 0.166667\n", 1870 | "6 858 39 ... 101 11.990099\n", 1871 | "8 1077 24 ... 34 0.088235\n", 1872 | "11 1095 39 ... 8 0.500000\n", 1873 | "13 767 44 ... 69 3.014493\n", 1874 | "... ... ... ... ... ...\n", 1875 | "23417 850 39 ... 15 0.400000\n", 1876 | "23438 181 68 ... 62 0.064516\n", 1877 | "23441 1104 63 ... 88 6.022727\n", 1878 | "23442 1104 39 ... 41 0.146341\n", 1879 | "23458 862 63 ... 18 0.111111\n", 1880 | "\n", 1881 | "[3537 rows x 15 columns]" 1882 | ] 1883 | }, 1884 | "metadata": { 1885 | "tags": [] 1886 | }, 1887 | "execution_count": 160 1888 | } 1889 | ] 1890 | }, 1891 | { 1892 | "cell_type": "markdown", 1893 | "metadata": { 1894 | "id": "10ZNusm3M1bK" 1895 | }, 1896 | "source": [ 1897 | "One of the drawbacks to using the raw Afinn score is that longer texts may yield higher values simply because they contain more words. To adjust for that, we can divide the score by the number of words in the text." 1898 | ] 1899 | }, 1900 | { 1901 | "cell_type": "code", 1902 | "metadata": { 1903 | "id": "SBG3pn3gL8tx" 1904 | }, 1905 | "source": [ 1906 | "df['word_count'] = 0\n", 1907 | "for index, row in df.iterrows():\n", 1908 | " if pd.notna(row['Review Text']):\n", 1909 | " df.at[index, \"word_count\"] = len(row['Review Text'].split())\n", 1910 | "df[\"normalized_score\"] = (df[\"text_score\"] / df[\"word_count\"]) + (2* df[\"title_score\"])" 1911 | ], 1912 | "execution_count": 161, 1913 | "outputs": [] 1914 | }, 1915 | { 1916 | "cell_type": "code", 1917 | "metadata": { 1918 | "id": "djCBJnEINpaq", 1919 | "outputId": "1bc6581e-15d9-44a7-b5a5-7719e344029f", 1920 | "colab": { 1921 | "base_uri": "https://localhost:8080/", 1922 | "height": 170 1923 | } 1924 | }, 1925 | "source": [ 1926 | "df[\"normalized_score\"].describe()" 1927 | ], 1928 | "execution_count": 162, 1929 | "outputs": [ 1930 | { 1931 | "output_type": "execute_result", 1932 | "data": { 1933 | "text/plain": [ 1934 | "count 22641.000000\n", 1935 | "mean 3.552963\n", 1936 | "std 3.909686\n", 1937 | "min -11.865979\n", 1938 | "25% 0.146341\n", 1939 | "50% 4.070175\n", 1940 | "75% 6.183099\n", 1941 | "max 24.164706\n", 1942 | "Name: normalized_score, dtype: float64" 1943 | ] 1944 | }, 1945 | "metadata": { 1946 | "tags": [] 1947 | }, 1948 | "execution_count": 162 1949 | } 1950 | ] 1951 | }, 1952 | { 1953 | "cell_type": "code", 1954 | "metadata": { 1955 | "id": "wQd3067IVOWu" 1956 | }, 1957 | "source": [ 1958 | "def generate_confusion_matrix(df, score_column, threshold):\n", 1959 | " total = df[df[\"Rating\"]!=3].shape[0]\n", 1960 | " tp = df[(df[score_column]>=threshold) & (df[\"Rating\"]>3)].shape[0]\n", 1961 | " fp = df[(df[score_column]>=threshold) & (df[\"Rating\"]<3)].shape[0]\n", 1962 | " tn = df[(df[score_column]<threshold) & (df[\"Rating\"]<3)].shape[0]\n", 1963 | " fn = df[(df[score_column]<threshold) & (df[\"Rating\"]>3)].shape[0]\n", 1964 | " return tp / (tp + 0.5*(fp + fn)) # F1 score for the positive (Rating > 3) class" 1965 | ], 1966 | "execution_count": 213, 1967 | "outputs": [] 1968 | }, 1969 | { 1970 | "cell_type": "code", 1971 | "metadata": { 1972 | "id": "g2-9A8ksSsHp", 1973 | "outputId": "e05859dd-27f8-418b-e55f-715fbd38c60f", 1974 | "colab": { 1975 | "base_uri": "https://localhost:8080/", 1976 | "height": 34 1977 | } 1978 | }, 1979 | "source": [ 1980 | "generate_confusion_matrix(df, \"normalized_score\", -1)" 1981 | ], 1982 | "execution_count": 214, 1983 | "outputs": [ 1984 | { 1985 | "output_type": "execute_result", 1986 | "data": { 1987 | "text/plain": [ 1988 | "0.9426901223776224" 1989 | ] 1990 | }, 1991 | "metadata": { 1992 | "tags": [] 1993 | 
}, 1994 | "execution_count": 214 1995 | } 1996 | ] 1997 | }, 1998 | { 1999 | "cell_type": "code", 2000 | "metadata": { 2001 | "id": "0krSzOa3UO7w", 2002 | "outputId": "818a330a-150b-4f15-a549-34fbf19cc1fc", 2003 | "colab": { 2004 | "base_uri": "https://localhost:8080/", 2005 | "height": 34 2006 | } 2007 | }, 2008 | "source": [ 2009 | "generate_confusion_matrix(df, \"total_score\", 2)" 2010 | ], 2011 | "execution_count": 215, 2012 | "outputs": [ 2013 | { 2014 | "output_type": "execute_result", 2015 | "data": { 2016 | "text/plain": [ 2017 | "0.9424460431654677" 2018 | ] 2019 | }, 2020 | "metadata": { 2021 | "tags": [] 2022 | }, 2023 | "execution_count": 215 2024 | } 2025 | ] 2026 | }, 2027 | { 2028 | "cell_type": "markdown", 2029 | "metadata": { 2030 | "id": "KHZ1ocfvZNt4" 2031 | }, 2032 | "source": [ 2033 | "Feature engineering with an out-of-the-box dictionary gives us some pretty good results!" 2034 | ] 2035 | }, 2036 | { 2037 | "cell_type": "markdown", 2038 | "metadata": { 2039 | "id": "bsUsr140adsJ" 2040 | }, 2041 | "source": [ 2042 | "## Creating your own classifier\n", 2043 | "\n", 2044 | "It's possible that you may want to create a new set of words that relates to your specific use case." 2045 | ] 2046 | }, 2047 | { 2048 | "cell_type": "code", 2049 | "metadata": { 2050 | "id": "pSvlE_LiadTu" 2051 | }, 2052 | "source": [ 2053 | "def get_score(text, custom_set):\n", 2054 | "  # First we lowercase the text, strip punctuation and tokenize\n", 2055 | "  text = text.lower()\n", 2056 | "  punctuation = '\"!#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'\n", 2057 | "  tokenized_text = \"\".join([ch for ch in text if ch not in punctuation]).split()\n", 2058 | "  tokenized_set = set(tokenized_text)\n", 2059 | "  \n", 2060 | "  return len(tokenized_set.intersection(custom_set)) * 2  # each matched word adds 2 points\n" 2061 | ], 2062 | "execution_count": 223, 2063 | "outputs": [] 2064 | }, 2065 | { 2066 | "cell_type": "code", 2067 | "metadata": { 2068 | "id": "iM7Srh8TcjQj" 2069 | }, 2070 | "source": [ 2071 | "custom_set = set([\"flattering\", \"quick\", \"well\", \"right\", \"comfortable\", \"slimming\", \"confident\"])" 2072 | ], 2073 | "execution_count": 242, 2074 | "outputs": [] 2075 | }, 2076 | { 2077 | "cell_type": "code", 2078 | "metadata": { 2079 | "id": "90UYTAm5dVbR", 2080 | "outputId": "3630c9b1-93a7-40b9-8643-d94bb7815f8f", 2081 | "colab": { 2082 | "base_uri": "https://localhost:8080/", 2083 | "height": 71 2084 | } 2085 | }, 2086 | "source": [ 2087 | "df.at[4, \"Review Text\"]" 2088 | ], 2089 | "execution_count": 235, 2090 | "outputs": [ 2091 | { 2092 | "output_type": "execute_result", 2093 | "data": { 2094 | "application/vnd.google.colaboratory.intrinsic+json": { 2095 | "type": "string" 2096 | }, 2097 | "text/plain": [ 2098 | "'This shirt is very flattering to all due to the adjustable front tie. it is the perfect length to wear with leggings and it is sleeveless so it pairs well with any cardigan. 
love this shirt!!!'" 2099 | ] 2100 | }, 2101 | "metadata": { 2102 | "tags": [] 2103 | }, 2104 | "execution_count": 235 2105 | } 2106 | ] 2107 | }, 2108 | { 2109 | "cell_type": "code", 2110 | "metadata": { 2111 | "id": "tcou-woRcpjd", 2112 | "outputId": "2ee15852-9aca-44f5-ca1b-66369f484d4e", 2113 | "colab": { 2114 | "base_uri": "https://localhost:8080/", 2115 | "height": 34 2116 | } 2117 | }, 2118 | "source": [ 2119 | "get_score(df.at[4, \"Review Text\"], custom_set)" 2120 | ], 2121 | "execution_count": 243, 2122 | "outputs": [ 2123 | { 2124 | "output_type": "execute_result", 2125 | "data": { 2126 | "text/plain": [ 2127 | "4" 2128 | ] 2129 | }, 2130 | "metadata": { 2131 | "tags": [] 2132 | }, 2133 | "execution_count": 243 2134 | } 2135 | ] 2136 | }, 2137 | { 2138 | "cell_type": "code", 2139 | "metadata": { 2140 | "id": "fYcjhV85cyYz", 2141 | "outputId": "b18ee760-bed3-4907-bba5-e2e4423b6d4e", 2142 | "colab": { 2143 | "base_uri": "https://localhost:8080/", 2144 | "height": 71 2145 | } 2146 | }, 2147 | "source": [ 2148 | "df.at[23442, \"Review Text\"]" 2149 | ], 2150 | "execution_count": 237, 2151 | "outputs": [ 2152 | { 2153 | "output_type": "execute_result", 2154 | "data": { 2155 | "application/vnd.google.colaboratory.intrinsic+json": { 2156 | "type": "string" 2157 | }, 2158 | "text/plain": [ 2159 | "'Love this dress, very flattering fit and the fabric does not feel heavy but is sturdy - i wore it for first dinner out with my husband after losing most of my baby weight and felt great and confident in it.'" 2160 | ] 2161 | }, 2162 | "metadata": { 2163 | "tags": [] 2164 | }, 2165 | "execution_count": 237 2166 | } 2167 | ] 2168 | }, 2169 | { 2170 | "cell_type": "code", 2171 | "metadata": { 2172 | "id": "1D30axVaczGh", 2173 | "outputId": "f2b2504f-2caa-4de7-9e3d-d99201d186cb", 2174 | "colab": { 2175 | "base_uri": "https://localhost:8080/", 2176 | "height": 34 2177 | } 2178 | }, 2179 | "source": [ 2180 | "get_score(df.at[23442, \"Review Text\"], custom_set)" 2181 | ], 2182 | "execution_count": 244, 2183 | "outputs": [ 2184 | { 2185 | "output_type": "execute_result", 2186 | "data": { 2187 | "text/plain": [ 2188 | "4" 2189 | ] 2190 | }, 2191 | "metadata": { 2192 | "tags": [] 2193 | }, 2194 | "execution_count": 244 2195 | } 2196 | ] 2197 | }, 2198 | { 2199 | "cell_type": "code", 2200 | "metadata": { 2201 | "id": "sOoO5DWodL6a", 2202 | "outputId": "e5961457-3c22-4528-b7a0-4fb4d8521b93", 2203 | "colab": { 2204 | "base_uri": "https://localhost:8080/", 2205 | "height": 34 2206 | } 2207 | }, 2208 | "source": [ 2209 | "get_score(df.at[23438, \"Review Text\"], custom_set)" 2210 | ], 2211 | "execution_count": 245, 2212 | "outputs": [ 2213 | { 2214 | "output_type": "execute_result", 2215 | "data": { 2216 | "text/plain": [ 2217 | "4" 2218 | ] 2219 | }, 2220 | "metadata": { 2221 | "tags": [] 2222 | }, 2223 | "execution_count": 245 2224 | } 2225 | ] 2226 | }, 2227 | { 2228 | "cell_type": "code", 2229 | "metadata": { 2230 | "id": "TQmTY7DLdnuT", 2231 | "outputId": "f536eea3-a4e5-4087-c75b-c1175bb36953", 2232 | "colab": { 2233 | "base_uri": "https://localhost:8080/", 2234 | "height": 88 2235 | } 2236 | }, 2237 | "source": [ 2238 | "df.at[23438, \"Review Text\"]" 2239 | ], 2240 | "execution_count": 241, 2241 | "outputs": [ 2242 | { 2243 | "output_type": "execute_result", 2244 | "data": { 2245 | "application/vnd.google.colaboratory.intrinsic+json": { 2246 | "type": "string" 2247 | }, 2248 | "text/plain": [ 2249 | "\"I feel like snagging a pair of these was the equivalent to standing in line for black friday, as 
they always seem to be out of stock. now i know why. these are soft, comfortable, and slimming. they're somewhere between the hold of control top pantyhose and spans--they don't fall and sag throughout the day and are nicely slimming without being pain-inducing.\"" 2250 | ] 2251 | }, 2252 | "metadata": { 2253 | "tags": [] 2254 | }, 2255 | "execution_count": 241 2256 | } 2257 | ] 2258 | } 2259 | ] 2260 | } -------------------------------------------------------------------------------- /ml-projects/Using_Embeddings_and_NLP_For_Machine_Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Using Embeddings and NLP For Machine Learning.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "authorship_tag": "ABX9TyMdnG2F37FVgXskY8ZIm00B", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "xE90bjnUlEYa" 32 | }, 33 | "source": [ 34 | "# Data processing" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "metadata": { 40 | "id": "EKtOpxlZq1Xe" 41 | }, 42 | "source": [ 43 | "import spacy\n", 44 | "import numpy as np\n", 45 | "import pandas as pd\n", 46 | "from collections import Counter\n", 47 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", 48 | "from sklearn.decomposition import NMF, LatentDirichletAllocation\n", 49 | "from sklearn.metrics.pairwise import cosine_similarity\n", 50 | "from sklearn.preprocessing import normalize\n", 51 | "from sklearn.metrics import accuracy_score\n", 52 | "from sklearn.svm import LinearSVC\n", 53 | "import pickle\n", 54 | "#!python -m spacy download en_core_web_md en\n", 55 | "import en_core_web_md\n", 56 | "nlp = en_core_web_md.load()" 57 | ], 58 | "execution_count": 21, 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "metadata": { 64 | "id": "AwyD_8OWrOYM" 65 | }, 66 | "source": [ 67 | "def basic_tokenizer(doc, model=nlp):\n", 68 | " \n", 69 | " parsed_doc = model(doc)\n", 70 | "\n", 71 | " # Tokens are those that are comprised of alphabetic characters and not urls and not stop words \n", 72 | " return [t.lemma_ for t in parsed_doc if (t.is_alpha)&(not t.like_url)&(not t.is_stop)]" 73 | ], 74 | "execution_count": 23, 75 | "outputs": [] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "metadata": { 80 | "colab": { 81 | "base_uri": "https://localhost:8080/" 82 | }, 83 | "id": "zaRN4jxStIuY", 84 | "outputId": "e901ae07-0bee-494b-a113-2a043b22551b" 85 | }, 86 | "source": [ 87 | "# Here we use scikit learn's count vectorizer with our tokenizer\n", 88 | "cv = CountVectorizer(tokenizer=basic_tokenizer)\n", 89 | "\n", 90 | "# Our mini corpus\n", 91 | "text_data = [\"A friend gave me these when I was recently diagnosed with breast cancer. I bought another pair because they are the best pjs I’ve ever owned. As soon as I came out of surgery, I asked the nurses to help me change into them. This is me wearing them on my first walk down the hospital hall. Now I only take them off to wash them and then put them immediately back on. They’ve survived 100 washes and still look new. 
They’re the best gift I’ve received during my breast cancer treatment\",\n", 92 | " \"While as others claimed these do run a little on the longer side, at 5'7'' and a size small, I found the length luxurious rather than sloppy. Other than the length, the fit is pretty true to size. They wash and wear well, don't get stretched out and have an extra button at the neck for when you need a little extra warmth. The elastic waist band is thick and just generous enough not to be tight, but just in case you're tiny waisted, there is a drawstring as well. These have taken over as my new favorite!\",\n", 93 | " \"I bought these pjs in navy and white stripes a few years ago and have been in love with them forever, so soft and dreamy just like the name. I decided to finally splurge on a second pair recently and it seems like the quality has gotten cheaper. The seams are EXTREMELY itchy. There’s a weird plastic piece that runs along all the seams and is constantly scratching my skin....not the best feeling in bed. Please get rid of the weird plastic seam!!\",\n", 94 | " \"These are great soft pajamas, except the size small pants I received have a 33' inseam, which is obviously crazy long. I'm not sure if the pair I received is flawed, given the description says they have a 27' inseam\"]\n", 95 | "\n", 96 | "v = cv.fit_transform(text_data).toarray()\n", 97 | "print(v)\n" 98 | ], 99 | "execution_count": 26, 100 | "outputs": [ 101 | { 102 | "output_type": "stream", 103 | "text": [ 104 | "[[0 1 0 0 2 0 1 2 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 2\n", 105 | " 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0\n", 106 | " 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 2 1 0 0 0]\n", 107 | " [0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 2 0 1 0 0 1 1 0 0 0 1 0 0 0 0\n", 108 | " 0 0 0 0 0 0 0 2 0 2 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0\n", 109 | " 0 2 0 1 1 0 0 0 1 0 0 0 0 1 1 1 1 0 1 1 1 0 1 1 1 0 0 0]\n", 110 | " [1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 0 1 0 1 1 0 0 0 1 0 0 1 0 0 1\n", 111 | " 0 0 0 0 0 0 1 0 2 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 1 2 0 1 0 1 1 1 0 1 3\n", 112 | " 1 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 1]\n", 113 | " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0\n", 114 | " 1 0 0 0 0 2 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 2 0 0 0 1 0 0\n", 115 | " 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]\n" 116 | ], 117 | "name": "stdout" 118 | } 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "colab": { 125 | "base_uri": "https://localhost:8080/" 126 | }, 127 | "id": "gbv3QspOw0fZ", 128 | "outputId": "354f867a-7d91-4027-fc41-1607d4cd866c" 129 | }, 130 | "source": [ 131 | "# Make these arrays human readable with feature names \n", 132 | "dict(zip(cv.get_feature_names(), v.sum(axis=0)))" 133 | ], 134 | "execution_count": 27, 135 | "outputs": [ 136 | { 137 | "output_type": "execute_result", 138 | "data": { 139 | "text/plain": [ 140 | "{'ago': 1,\n", 141 | " 'ask': 1,\n", 142 | " 'band': 1,\n", 143 | " 'bed': 1,\n", 144 | " 'breast': 2,\n", 145 | " 'button': 1,\n", 146 | " 'buy': 2,\n", 147 | " 'cancer': 2,\n", 148 | " 'case': 1,\n", 149 | " 'change': 1,\n", 150 | " 'cheap': 1,\n", 151 | " 'claim': 1,\n", 152 | " 'come': 1,\n", 153 | " 'constantly': 1,\n", 154 | " 'crazy': 1,\n", 155 | " 'decide': 1,\n", 156 | " 'description': 1,\n", 157 | " 'diagnose': 1,\n", 158 | " 'drawstring': 1,\n", 159 | " 'dreamy': 1,\n", 160 | " 'elastic': 1,\n", 161 | " 'extra': 2,\n", 162 | " 
'extremely': 1,\n", 163 | " 'favorite': 1,\n", 164 | " 'feeling': 1,\n", 165 | " 'finally': 1,\n", 166 | " 'find': 1,\n", 167 | " 'fit': 1,\n", 168 | " 'flawed': 1,\n", 169 | " 'forever': 1,\n", 170 | " 'friend': 1,\n", 171 | " 'generous': 1,\n", 172 | " 'get': 1,\n", 173 | " 'gift': 1,\n", 174 | " 'give': 2,\n", 175 | " 'good': 3,\n", 176 | " 'great': 1,\n", 177 | " 'hall': 1,\n", 178 | " 'help': 1,\n", 179 | " 'hospital': 1,\n", 180 | " 'immediately': 1,\n", 181 | " 'inseam': 2,\n", 182 | " 'itchy': 1,\n", 183 | " 'length': 2,\n", 184 | " 'like': 2,\n", 185 | " 'little': 2,\n", 186 | " 'long': 2,\n", 187 | " 'look': 1,\n", 188 | " 'love': 1,\n", 189 | " 'luxurious': 1,\n", 190 | " 'navy': 1,\n", 191 | " 'neck': 1,\n", 192 | " 'need': 1,\n", 193 | " 'new': 2,\n", 194 | " 'nurse': 1,\n", 195 | " 'obviously': 1,\n", 196 | " 'own': 1,\n", 197 | " 'pair': 3,\n", 198 | " 'pajama': 1,\n", 199 | " 'pant': 1,\n", 200 | " 'piece': 1,\n", 201 | " 'pjs': 2,\n", 202 | " 'plastic': 2,\n", 203 | " 'pretty': 1,\n", 204 | " 'quality': 1,\n", 205 | " 'receive': 3,\n", 206 | " 'recently': 2,\n", 207 | " 'rid': 1,\n", 208 | " 'run': 2,\n", 209 | " 'say': 1,\n", 210 | " 'scratch': 1,\n", 211 | " 'seam': 3,\n", 212 | " 'second': 1,\n", 213 | " 'size': 3,\n", 214 | " 'skin': 1,\n", 215 | " 'sloppy': 1,\n", 216 | " 'small': 2,\n", 217 | " 'soft': 2,\n", 218 | " 'soon': 1,\n", 219 | " 'splurge': 1,\n", 220 | " 'stretch': 1,\n", 221 | " 'stripes': 1,\n", 222 | " 'sure': 1,\n", 223 | " 'surgery': 1,\n", 224 | " 'survive': 1,\n", 225 | " 'take': 1,\n", 226 | " 'thick': 1,\n", 227 | " 'tight': 1,\n", 228 | " 'tiny': 1,\n", 229 | " 'treatment': 1,\n", 230 | " 'true': 1,\n", 231 | " 'waist': 1,\n", 232 | " 'waisted': 1,\n", 233 | " 'walk': 1,\n", 234 | " 'warmth': 1,\n", 235 | " 'wash': 3,\n", 236 | " 'wear': 2,\n", 237 | " 'weird': 2,\n", 238 | " 'white': 1,\n", 239 | " 'year': 1}" 240 | ] 241 | }, 242 | "metadata": { 243 | "tags": [] 244 | }, 245 | "execution_count": 27 246 | } 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": { 252 | "id": "pR6CnjHgT36E" 253 | }, 254 | "source": [ 255 | "## Dataframe Cleanup" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "metadata": { 261 | "colab": { 262 | "base_uri": "https://localhost:8080/" 263 | }, 264 | "id": "yjVaQwTIxlN6", 265 | "outputId": "e8d3b9d2-1f42-46c6-ce6f-3fa09a31e588" 266 | }, 267 | "source": [ 268 | "# Now let's apply to the review dataset\n", 269 | "DATASET_LINK = \"https://raw.githubusercontent.com/AFAgarap/ecommerce-reviews-analysis/master/Womens%20Clothing%20E-Commerce%20Reviews.csv\"\n", 270 | "df = pd.read_csv(DATASET_LINK, usecols=[\"Clothing ID\", \"Title\", \"Review Text\", \"Rating\"])\n", 271 | "df.shape" 272 | ], 273 | "execution_count": 92, 274 | "outputs": [ 275 | { 276 | "output_type": "execute_result", 277 | "data": { 278 | "text/plain": [ 279 | "(23486, 4)" 280 | ] 281 | }, 282 | "metadata": { 283 | "tags": [] 284 | }, 285 | "execution_count": 92 286 | } 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "metadata": { 292 | "colab": { 293 | "base_uri": "https://localhost:8080/", 294 | "height": 514 295 | }, 296 | "id": "0dfP2miB84lz", 297 | "outputId": "ef1ab0c2-22bb-4a05-9362-e121c1aee43a" 298 | }, 299 | "source": [ 300 | "df.sample(15)" 301 | ], 302 | "execution_count": 91, 303 | "outputs": [ 304 | { 305 | "output_type": "execute_result", 306 | "data": { 307 | "text/html": [ 308 | "
\n", 309 | "\n", 322 | "\n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | "
Clothing IDTitleReview TextRatingReview Tokens
41631078Great dress!This dress is a great casual outfit. tha fabri...4None
2743984Fun jacketLove the fabric and the casual way this jacket...5None
8799907Cute but color different than picturedI purchased this sweater in the grey color and...4None
173691048Flattering and simply perfectI tried these on in the store and instantly fe...5None
12195875Perfect fall pieceI saw this and had to have it! it is so beaut...4None
154661068Pants oversizedI wanted to love these pants since i hate supe...3None
112431022Pleasantly surprisedI ordered these jsut coz they were on sale... ...5None
11927868Cozy elegance!First off, this top is super cozy! i adore an ...5None
103621067BeautifulI love this jumpsuit. i've worn it twice and r...5None
7782936Love it!This sweater is super cozy and comfy and my ne...5None
114121079Unique and lovelyThis dress is unique and lovely! it is a one o...5None
90071059Super comfortable, stylish jumpsuitThis is my first ever jumpsuit purchase, and i...5None
16041868My new favorite topThis top is wonderfully comfortable and well m...5None
41221081Beautiful dress!This dress is so pretty, comfortable and easy ...5None
72431020Versatile skirtWell made. color is great, goes from summer ri...5None
\n", 456 | "
" 457 | ], 458 | "text/plain": [ 459 | " Clothing ID ... Review Tokens\n", 460 | "4163 1078 ... None\n", 461 | "2743 984 ... None\n", 462 | "8799 907 ... None\n", 463 | "17369 1048 ... None\n", 464 | "12195 875 ... None\n", 465 | "15466 1068 ... None\n", 466 | "11243 1022 ... None\n", 467 | "11927 868 ... None\n", 468 | "10362 1067 ... None\n", 469 | "7782 936 ... None\n", 470 | "11412 1079 ... None\n", 471 | "9007 1059 ... None\n", 472 | "16041 868 ... None\n", 473 | "4122 1081 ... None\n", 474 | "7243 1020 ... None\n", 475 | "\n", 476 | "[15 rows x 5 columns]" 477 | ] 478 | }, 479 | "metadata": { 480 | "tags": [] 481 | }, 482 | "execution_count": 91 483 | } 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "metadata": { 489 | "id": "wRSetiA_92rz" 490 | }, 491 | "source": [ 492 | "# Lets remove any review without a text review or a rating\n", 493 | "df.dropna(how = \"any\", subset=['Review Text', 'Rating'], inplace=True)\n", 494 | "df.reset_index(drop=True, inplace=True)\n", 495 | "df.shape\n", 496 | "# Remove this sampling to ensure broader reach\n", 497 | "df = df.sample(2000)" 498 | ], 499 | "execution_count": 93, 500 | "outputs": [] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "metadata": { 505 | "colab": { 506 | "base_uri": "https://localhost:8080/", 507 | "height": 514 508 | }, 509 | "id": "YQb_oGMrAx8O", 510 | "outputId": "18b9b81b-9522-427b-d864-5e8ba750b8f7" 511 | }, 512 | "source": [ 513 | "df.sample(15)" 514 | ], 515 | "execution_count": 94, 516 | "outputs": [ 517 | { 518 | "output_type": "execute_result", 519 | "data": { 520 | "text/html": [ 521 | "
\n", 522 | "\n", 535 | "\n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | "
Clothing IDTitleReview TextRating
129171072NaNThis dress is one of my recent faves from reta...5
99821056Very nice and versatile pantsI'm typically a 27. this 27 seems snug but the...4
15160865Stylish & comfortableI ordered the green top in store after trying ...4
6511087Simple but different.I bought this dress in the cream color. it was...4
18641867Cute but noThis shirt ran small and was more sheer than i...2
361002NaNThis is a comfortable skirt that can span seas...4
21470451Just don't wash itI loved this dress...until i washed it. the la...3
4669940Like wearing a hugGreat sweater, beautiful detail, warm and cozy...5
10890829Soft and swingingThe color and fabric are really soft and lovel...4
125481081Perfect fit, forgiving bellyThis dress is so soft, and fits like a dream. ...5
2963895NaNI passed this over on first sight because it l...4
19909949So cute and stylish!!Was in my local retailer today and just had to...5
103671037Great lookThese pants fit great. the velvet material is ...5
17897809White tee with a \"detail\"on backI ordered a small. i'm usually a petite small....5
9848927Really lovely sweater coatI love the patterns and the colors! easy to dr...5
\n", 653 | "
" 654 | ], 655 | "text/plain": [ 656 | " Clothing ID ... Rating\n", 657 | "12917 1072 ... 5\n", 658 | "9982 1056 ... 4\n", 659 | "15160 865 ... 4\n", 660 | "651 1087 ... 4\n", 661 | "18641 867 ... 2\n", 662 | "36 1002 ... 4\n", 663 | "21470 451 ... 3\n", 664 | "4669 940 ... 5\n", 665 | "10890 829 ... 4\n", 666 | "12548 1081 ... 5\n", 667 | "2963 895 ... 4\n", 668 | "19909 949 ... 5\n", 669 | "10367 1037 ... 5\n", 670 | "17897 809 ... 5\n", 671 | "9848 927 ... 5\n", 672 | "\n", 673 | "[15 rows x 4 columns]" 674 | ] 675 | }, 676 | "metadata": { 677 | "tags": [] 678 | }, 679 | "execution_count": 94 680 | } 681 | ] 682 | }, 683 | { 684 | "cell_type": "markdown", 685 | "metadata": { 686 | "id": "vVSK-zDVT7Uo" 687 | }, 688 | "source": [ 689 | "## From Tokens to Vectors" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "metadata": { 695 | "id": "rTZVEGzmDRY2" 696 | }, 697 | "source": [ 698 | "count = CountVectorizer(tokenizer=basic_tokenizer)\n", 699 | "count_vecs = count.fit_transform(df['Review Text'])\n", 700 | "count_df = pd.DataFrame(count_vecs.toarray(), columns=count.get_feature_names())" 701 | ], 702 | "execution_count": 101, 703 | "outputs": [] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "metadata": { 708 | "colab": { 709 | "base_uri": "https://localhost:8080/", 710 | "height": 253 711 | }, 712 | "id": "YPxlRO5IHg14", 713 | "outputId": "0fef3f5a-0edf-4224-afcb-b7889f9ef974" 714 | }, 715 | "source": [ 716 | "count_df.head()" 717 | ], 718 | "execution_count": 104, 719 | "outputs": [ 720 | { 721 | "output_type": "execute_result", 722 | "data": { 723 | "text/html": [ 724 | "
\n", 725 | "\n", 738 | "\n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | 
" \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | "
ababbyabckabdomenableabsoluteabsolutelyabstractabtabundanceacaccentaccentedaccentuateaccentuateaaccentuatedacceptacceptableacceptablyaccessaccessorizeaccessorizingaccessoryaccidentallyaccoaccommodateaccomodateaccompanyaccomplishaccordaccumulateaccuracyaccurateaccuratelyachieveacknowledgeacrylicactactualactuallly...wrinklewrinkledwrinklingwrinklywristwritewrongxxlxsxsmallxspxspetitexxsxxspyyankyarnyayyearyellowyellowedyellowyyesyesterdayyikesyoyogayogiyolkyoungyryuckyummyzerozipziploczippedzipperzoom
00000000000000000000000000000000000000000...0000000000000000000000000000000000000000
10000000000000000000000000000000000000000...0000000001000000000000000000000000000000
20000000000000000000000000000000000000000...0000000000000000000100000000000000000000
30000000000000000000000000000000000000000...0000000000000000000000000000000000000000
40000000000000000000000000000000000000000...0000000000000000000000000000000000000000
\n", 1248 | "

5 rows × 3613 columns

\n", 1249 | "
" 1250 | ], 1251 | "text/plain": [ 1252 | " ab abby abck abdomen able ... zip ziploc zipped zipper zoom\n", 1253 | "0 0 0 0 0 0 ... 0 0 0 0 0\n", 1254 | "1 0 0 0 0 0 ... 0 0 0 0 0\n", 1255 | "2 0 0 0 0 0 ... 0 0 0 0 0\n", 1256 | "3 0 0 0 0 0 ... 0 0 0 0 0\n", 1257 | "4 0 0 0 0 0 ... 0 0 0 0 0\n", 1258 | "\n", 1259 | "[5 rows x 3613 columns]" 1260 | ] 1261 | }, 1262 | "metadata": { 1263 | "tags": [] 1264 | }, 1265 | "execution_count": 104 1266 | } 1267 | ] 1268 | }, 1269 | { 1270 | "cell_type": "code", 1271 | "metadata": { 1272 | "id": "6llw13j8HP2i" 1273 | }, 1274 | "source": [ 1275 | "tfidf = TfidfVectorizer(tokenizer=basic_tokenizer)\n", 1276 | "tfidf_vecs = tfidf.fit_transform(df['Review Text'])\n", 1277 | "tfidf_df = pd.DataFrame(tfidf_vecs.toarray(), columns=tfidf.get_feature_names())" 1278 | ], 1279 | "execution_count": 102, 1280 | "outputs": [] 1281 | }, 1282 | { 1283 | "cell_type": "code", 1284 | "metadata": { 1285 | "colab": { 1286 | "base_uri": "https://localhost:8080/", 1287 | "height": 253 1288 | }, 1289 | "id": "s5AUjPITHZj1", 1290 | "outputId": "5fbaeade-3d52-4612-aa47-c24ddcdb9b3c" 1291 | }, 1292 | "source": [ 1293 | "tfidf_df.head()" 1294 | ], 1295 | "execution_count": 103, 1296 | "outputs": [ 1297 | { 1298 | "output_type": "execute_result", 1299 | "data": { 1300 | "text/html": [ 1301 | "
\n", 1302 | "\n", 1315 | "\n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " 
\n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | " \n", 1740 | " \n", 1741 | " \n", 1742 | " \n", 1743 | " \n", 1744 | " \n", 1745 | " \n", 1746 | " \n", 1747 | " \n", 1748 | " \n", 1749 | " \n", 1750 | " \n", 1751 | " \n", 1752 | " \n", 1753 | " \n", 1754 | " \n", 1755 | " \n", 1756 | " \n", 1757 | " \n", 1758 | " \n", 1759 | " \n", 1760 | " \n", 1761 | " \n", 1762 | " \n", 1763 | " \n", 1764 | " \n", 1765 | " \n", 1766 | " \n", 1767 | " \n", 1768 | " \n", 1769 | " \n", 1770 | " \n", 1771 | " \n", 1772 | " \n", 1773 | " \n", 1774 | " \n", 1775 | " \n", 1776 | " \n", 1777 | " \n", 1778 | " \n", 1779 | " \n", 1780 | " \n", 1781 | " \n", 1782 | " \n", 1783 | " \n", 1784 | " \n", 1785 | " \n", 1786 | " \n", 1787 | " \n", 1788 | " \n", 1789 | " \n", 1790 | " \n", 1791 | " \n", 1792 | " \n", 1793 | " \n", 1794 | " \n", 1795 | " \n", 1796 | " \n", 1797 | " \n", 1798 | " \n", 1799 | " \n", 1800 | " \n", 1801 | " \n", 1802 | " \n", 1803 | " \n", 1804 | " \n", 1805 | " \n", 1806 | " \n", 1807 | " \n", 1808 | " \n", 1809 | " \n", 1810 | " \n", 1811 | " \n", 1812 | " \n", 1813 | " \n", 1814 | " \n", 1815 | " \n", 1816 | " \n", 1817 | " \n", 1818 | " \n", 1819 | " \n", 1820 | " \n", 1821 | 
" \n", 1822 | " \n", 1823 | " \n", 1824 | "
ababbyabckabdomenableabsoluteabsolutelyabstractabtabundanceacaccentaccentedaccentuateaccentuateaaccentuatedacceptacceptableacceptablyaccessaccessorizeaccessorizingaccessoryaccidentallyaccoaccommodateaccomodateaccompanyaccomplishaccordaccumulateaccuracyaccurateaccuratelyachieveacknowledgeacrylicactactualactuallly...wrinklewrinkledwrinklingwrinklywristwritewrongxxlxsxsmallxspxspetitexxsxxspyyankyarnyayyearyellowyellowedyellowyyesyesterdayyikesyoyogayogiyolkyoungyryuckyummyzerozipziploczippedzipperzoom
00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.1134780.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.2219890.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
40.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
\n", 1825 | "

5 rows × 3613 columns

\n", 1826 | "
" 1827 | ], 1828 | "text/plain": [ 1829 | " ab abby abck abdomen able ... zip ziploc zipped zipper zoom\n", 1830 | "0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0\n", 1831 | "1 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0\n", 1832 | "2 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0\n", 1833 | "3 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0\n", 1834 | "4 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0\n", 1835 | "\n", 1836 | "[5 rows x 3613 columns]" 1837 | ] 1838 | }, 1839 | "metadata": { 1840 | "tags": [] 1841 | }, 1842 | "execution_count": 103 1843 | } 1844 | ] 1845 | }, 1846 | { 1847 | "cell_type": "code", 1848 | "metadata": { 1849 | "colab": { 1850 | "base_uri": "https://localhost:8080/" 1851 | }, 1852 | "id": "CnGpoj23NCJo", 1853 | "outputId": "56b6b559-922d-4f3f-bd01-e2ebad69a67e" 1854 | }, 1855 | "source": [ 1856 | "def top_tfidf_words(tfidf_df):\n", 1857 | " return(tfidf_df[tfidf_df>0.3].mean(axis=0))\n", 1858 | "top_tfidf_words(tfidf_df)" 1859 | ], 1860 | "execution_count": 136, 1861 | "outputs": [ 1862 | { 1863 | "output_type": "execute_result", 1864 | "data": { 1865 | "text/plain": [ 1866 | "ab 0.327036\n", 1867 | "abby 0.322623\n", 1868 | "abck NaN\n", 1869 | "abdomen NaN\n", 1870 | "able 0.301735\n", 1871 | " ... \n", 1872 | "zip 0.376426\n", 1873 | "ziploc NaN\n", 1874 | "zipped 0.319633\n", 1875 | "zipper 0.376748\n", 1876 | "zoom NaN\n", 1877 | "Length: 3613, dtype: float64" 1878 | ] 1879 | }, 1880 | "metadata": { 1881 | "tags": [] 1882 | }, 1883 | "execution_count": 136 1884 | } 1885 | ] 1886 | }, 1887 | { 1888 | "cell_type": "code", 1889 | "metadata": { 1890 | "colab": { 1891 | "base_uri": "https://localhost:8080/" 1892 | }, 1893 | "id": "Lqc_16oENTEE", 1894 | "outputId": "317fc208-d3a0-4c24-bdb4-01bbbb609eeb" 1895 | }, 1896 | "source": [ 1897 | "# Get similarities\n", 1898 | "count_sims = cosine_similarity(count_vecs)\n", 1899 | "tfidf_sims = cosine_similarity(tfidf_vecs)\n", 1900 | "\n", 1901 | "count_sims" 1902 | ], 1903 | "execution_count": 135, 1904 | "outputs": [ 1905 | { 1906 | "output_type": "execute_result", 1907 | "data": { 1908 | "text/plain": [ 1909 | "array([[1. , 0.04351941, 0.07106691, ..., 0.04065578, 0. ,\n", 1910 | " 0.07312724],\n", 1911 | " [0.04351941, 1. , 0.20412415, ..., 0.21408721, 0.16556654,\n", 1912 | " 0.14002801],\n", 1913 | " [0.07106691, 0.20412415, 1. , ..., 0.2224746 , 0.10814761,\n", 1914 | " 0.17149859],\n", 1915 | " ...,\n", 1916 | " [0.04065578, 0.21408721, 0.2224746 , ..., 1. , 0.12373764,\n", 1917 | " 0.29433147],\n", 1918 | " [0. , 0.16556654, 0.10814761, ..., 0.12373764, 1. ,\n", 1919 | " 0.05564149],\n", 1920 | " [0.07312724, 0.14002801, 0.17149859, ..., 0.29433147, 0.05564149,\n", 1921 | " 1. ]])" 1922 | ] 1923 | }, 1924 | "metadata": { 1925 | "tags": [] 1926 | }, 1927 | "execution_count": 135 1928 | } 1929 | ] 1930 | }, 1931 | { 1932 | "cell_type": "markdown", 1933 | "metadata": { 1934 | "id": "PZ9BckdlOhDH" 1935 | }, 1936 | "source": [ 1937 | "### Exercise Time!\n", 1938 | "\n", 1939 | "How do we use these arrays of similarities to identify documents that are similar to the first review?" 
1940 | ] 1941 | }, 1942 | { 1943 | "cell_type": "markdown", 1944 | "metadata": { 1945 | "id": "snP8PfzmUC1n" 1946 | }, 1947 | "source": [ 1948 | "## From Review to Document Vectors" 1949 | ] 1950 | }, 1951 | { 1952 | "cell_type": "code", 1953 | "metadata": { 1954 | "id": "hDSUBz-dHp5H" 1955 | }, 1956 | "source": [ 1957 | "# Use spaCy's document vectors (the values below are overwritten on every pass; storing them is the exercise that follows)\n", 1958 | "toy_df = df.head(10)\n", 1959 | "\n", 1960 | "for index, row in toy_df.iterrows():\n", 1961 | "  rating = row[\"Rating\"]\n", 1962 | "  doc = nlp(row[\"Review Text\"])\n", 1963 | "\n", 1964 | "  # A 1D numpy array representing the document’s semantics.\n", 1965 | "  doc_vector = doc.vector\n", 1966 | "  # The L2 norm of the vector representation.\n", 1967 | "  doc_vector_norm = doc.vector_norm\n" 1968 | ], 1969 | "execution_count": 137, 1970 | "outputs": [] 1971 | }, 1972 | { 1973 | "cell_type": "markdown", 1974 | "metadata": { 1975 | "id": "ZaW9u0iDTIsW" 1976 | }, 1977 | "source": [ 1978 | "### Exercise Time!\n", 1979 | "Generate a 2D array / dataframe with the 300d vectors + Rating (so 301 columns, with 10 rows for the toy dataframe)" 1980 | ] 1981 | }, 1982 | { 1983 | "cell_type": "markdown", 1984 | "metadata": { 1985 | "id": "Dcd7rO83UTYF" 1986 | }, 1987 | "source": [ 1988 | "## Future Work\n", 1989 | "Add document vectors as features for the models below" 1990 | ] 1991 | }, 1992 | { 1993 | "cell_type": "code", 1994 | "metadata": { 1995 | "id": "9uMF14ISTHw_" 1996 | }, 1997 | "source": [ 1998 | "" 1999 | ], 2000 | "execution_count": null, 2001 | "outputs": [] 2002 | }, 2003 | { 2004 | "cell_type": "markdown", 2005 | "metadata": { 2006 | "id": "F8PalDAhj7PD" 2007 | }, 2008 | "source": [ 2009 | "## Decide which machine learning algorithm to use\n" 2010 | ] 2011 | }, 2012 | { 2013 | "cell_type": "markdown", 2014 | "metadata": { 2015 | "id": "ucGki7Qdk0Ns" 2016 | }, 2017 | "source": [ 2018 | "![](https://scikit-learn.org/stable/_static/ml_map.png)\n", 2019 | "[Reference](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html)\n" 2020 | ] 2021 | }, 2022 | { 2023 | "cell_type": "code", 2024 | "metadata": { 2025 | "id": "N-1uGSBYUpaE" 2026 | }, 2027 | "source": [ 2028 | "class MeanEmbeddingVectorizer(object):\n", 2029 | "    def __init__(self, word2vec):\n", 2030 | "        self.word2vec = word2vec\n", 2031 | "        # if a text is empty we should return a vector of zeros\n", 2032 | "        # with the same dimensionality as all the other vectors\n", 2033 | "        self.dim = len(next(iter(word2vec.values())))\n", 2034 | "\n", 2035 | "    def fit(self, X, y):\n", 2036 | "        return self\n", 2037 | "\n", 2038 | "    def transform(self, X):\n", 2039 | "        return np.array([\n", 2040 | "            np.mean([self.word2vec[w] for w in words if w in self.word2vec]\n", 2041 | "                    or [np.zeros(self.dim)], axis=0)\n", 2042 | "            for words in X\n", 2043 | "        ])\n", 2044 | "\n", 2045 | "class TfidfEmbeddingVectorizer(object):\n", 2046 | "    def __init__(self, word2vec):\n", 2047 | "        self.word2vec = word2vec\n", 2048 | "        self.word2weight = None\n", 2049 | "        self.dim = len(next(iter(word2vec.values())))\n", 2050 | "\n", 2051 | "    def fit(self, X, y):\n", 2052 | "        tfidf = TfidfVectorizer(analyzer=lambda x: x)\n", 2053 | "        tfidf.fit(X)\n", 2054 | "        # if a word was never seen - it must be at least as infrequent\n", 2055 | "        # as any of the known words - so the default idf is the max of\n", 2056 | "        # known idf's\n", 2057 | "        max_idf = max(tfidf.idf_)\n", 2058 | "        self.word2weight = defaultdict(\n", 2059 | "            lambda: max_idf,\n", 2060 | "            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])\n", 2061 | "\n", 2062 | "        return self\n", 2063 | 
"\n", 2064 | " def transform(self, X):\n", 2065 | " return np.array([\n", 2066 | " np.mean([self.word2vec[w] * self.word2weight[w]\n", 2067 | " for w in words if w in self.word2vec] or\n", 2068 | " [np.zeros(self.dim)], axis=0)\n", 2069 | " for words in X\n", 2070 | " ])" 2071 | ], 2072 | "execution_count": null, 2073 | "outputs": [] 2074 | }, 2075 | { 2076 | "cell_type": "code", 2077 | "metadata": { 2078 | "id": "rCVRRZAcVAru" 2079 | }, 2080 | "source": [ 2081 | "from sklearn.pipeline import Pipeline\n", 2082 | "from sklearn.ensemble import ExtraTreesClassifier\n", 2083 | "\n", 2084 | "etree_w2v = Pipeline([\n", 2085 | " (\"word2vec vectorizer\", MeanEmbeddingVectorizer(w2v)),\n", 2086 | " (\"extra trees\", ExtraTreesClassifier(n_estimators=200))])\n", 2087 | "etree_w2v_tfidf = Pipeline([\n", 2088 | " (\"word2vec vectorizer\", TfidfEmbeddingVectorizer(w2v)),\n", 2089 | " (\"extra trees\", ExtraTreesClassifier(n_estimators=200))])" 2090 | ], 2091 | "execution_count": null, 2092 | "outputs": [] 2093 | }, 2094 | { 2095 | "cell_type": "markdown", 2096 | "metadata": { 2097 | "id": "N2Jxpgu0Up-W" 2098 | }, 2099 | "source": [ 2100 | "## Doc2Vec\n", 2101 | "This is the gensim implementation of doc2vec. " 2102 | ] 2103 | }, 2104 | { 2105 | "cell_type": "code", 2106 | "metadata": { 2107 | "id": "_kf-4RIMUsmS" 2108 | }, 2109 | "source": [ 2110 | "# Init the Doc2Vec model\n", 2111 | "doc2vec_model = gensim.models.doc2vec.Doc2Vec(vector_size=20, min_count=4, epochs=20)\n", 2112 | "\n", 2113 | "# Build the Volabulary\n", 2114 | "doc2vec_.build_vocab(train_data)\n", 2115 | "\n", 2116 | "# Train the Doc2Vec model\n", 2117 | "doc2vec_.train(train_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)" 2118 | ], 2119 | "execution_count": null, 2120 | "outputs": [] 2121 | } 2122 | ] 2123 | } --------------------------------------------------------------------------------