├── mika-baumeister-Wpnoqo2plFA-unsplash.jpg ├── .idea └── .gitignore ├── README.md ├── embeddings ├── wordsim_clothing.csv └── references.md ├── .gitignore ├── archive └── README2.md ├── text-analytics └── Text_Analytics.ipynb └── ml-projects └── Using_Embeddings_and_NLP_For_Machine_Learning.ipynb /mika-baumeister-Wpnoqo2plFA-unsplash.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjahanshahi/intermediate-nlp/HEAD/mika-baumeister-Wpnoqo2plFA-unsplash.jpg -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Intermediate Natural Language Processing 2 | 3 | There are three classes in this series: 4 | 1. [Building Text Analytics Pipelines using NLP](/text-analytics) 5 | 2. [Extracting Insights from Text Data using NLP and Word Embeddings](/embeddings) 6 | 3. [Leveraging NLP and Word Embeddings in Machine Learning Projects](/ml-projects) 7 | 8 | -------------------------------------------------------------------------------- /embeddings/wordsim_clothing.csv: -------------------------------------------------------------------------------- 1 | gorgeous,attractive,1 2 | perfect,ugly,0 3 | ugly,terrible,1 4 | cheap,flimsy,1 5 | comfortable,good,1 6 | comfortable,wonderful,1 7 | comfortable,tight,0 8 | cheap,fancy,0 9 | dress,skirt,1 10 | suit,yellow,0 11 | cheap,polyester,1 12 | orange,swimsuit,0 13 | silky,lovely,1 14 | silky,soft,1 15 | silky,pretty,1 16 | capris,shorts,1 17 | capris,crops,1 18 | tacky,sloppy,1 19 | tacky,avoid,1 20 | shiny,purple,0 21 | tacky,orange,0 22 | pregnant,maternity,1 23 | petite,small,1 24 | petite,large,0 25 | petite,tall,0 26 | coat,trench,1 27 | coat,scarf,1 28 | coat,polyester,0 29 | coat,wool,1 30 | yoga,lounge,1 31 | yoga,sweats,1 32 | yoga,gym,1 33 | waist,yoga,0 34 | hem,yoga,0 35 | lounge,band,0 36 | waist,waistband,1 37 | waist,hip,1 38 | boots,sandals,1 39 | boots,heels,1 40 | lines,tailored,1 41 | lines,defined,1 42 | faded,dull,1 43 | boring,plain,1 44 | boring,cheap,1 45 | necklace,earrings,1 46 | necklace,coat,0 47 | necklace,boots,0 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /archive/README2.md: -------------------------------------------------------------------------------- 1 | # Intermediate Natural Language Processing: 2 | ## Real World Applications of Word Embeddings 3 | 4 | ### Notebook Setup 5 | 6 | A hosted notebook is available on [Kaggle Kernel](https://www.kaggle.com/jahanshahi/intermediate-nlp-word-embeddings). 7 | 8 | The chief benefit of using the Kaggle Kernel is to enable everyone to quickly and efficiently use the same environment. These notebooks are hosted on GPUs in the cloud, and a similar setup can be created on Google Colab. I chose Kaggle because they also host the datasets we will be using (as well as many more!), which makes it easy to quickly import them and get models going! 
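If you prefer to follow along locally rather than on Kaggle, the demo notebooks read the Women's Clothing E-Commerce Reviews dataset with pandas. A minimal sketch of loading it is shown below; the local file path is an assumption, so point it at wherever you downloaded the CSV (on a Kaggle Kernel the attached dataset lives under Kaggle's input directory instead).

```python
import pandas as pd

# Assumed local path to the Kaggle "Women's Clothing E-Commerce Reviews" CSV.
df = pd.read_csv("Womens Clothing E-Commerce Reviews.csv", index_col=0)

# Quick sanity check of shape and columns (review text, title, rating, etc.)
print(df.shape)
print(df.columns.tolist())
```
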
9 | 10 | ### Schedule 11 | 12 | #### Segment 1: Introduction to Language Models (Length: 30 min) 13 | 14 | We will discuss: 15 | 16 | - Complexity of natural language requires specific techniques: 17 | - Language models are probability distributions over a sequence of words 18 | - Key uses are in machine learning and unsupervised learning (search/IR and clustering/topic modeling) 19 | - The intuition behind vector space modeling 20 | - Description of some of the similarities and differences between different word embedding algorithms (word2vec, GloVe, PPMI) 21 | 22 | #### Q&A / Break (Length: 10 min) 23 | 24 | #### Segment 2: Using Pretrained Word Embeddings (Length: 30 min) 25 | 26 | We will demonstrate (using a Notebook): 27 | 28 | - Ease of using pretrained embeddings 29 | - Design considerations in using pretrained models including: noise, sentiments, generalization 30 | - Some specific examples using different models (occupy in Twitter / Wikipedia / Common Crawl) 31 | - Limitations using pretrained models: 32 | + Inputs: Implications of design decisions made during preprocessing on casing / stopwords / frequently occurring phrases 33 | + Output: Goal to learn similarity (example of word similarity tests) 34 | 35 | #### Q&A / Break (Length: 10 min) 36 | 37 | #### Segment 3: Training your own Word Embeddings (Length: 30 min) 38 | 39 | We will discuss: 40 | 41 | - Optimizing for different outputs (semantic relations vs semantic similarity): 42 | - Preprocessing for outputs 43 | - Testing word embedding models (visual inspection, similarity pairs) 44 | 45 | We will demonstrate: 46 | 47 | - Training a custom embedding model using spaCy to preprocess and the Gensim and scikit-learn API to train models 48 | 49 | Note: Training an embedding can take many hours, so this notebook will focus on how to do it, and participants can continue to train or experiment in their own time. 50 | 51 | #### Q&A / Break (Length: 10 min) 52 | 53 | #### Segment 4: Applying Word Embeddings (Length: 40 min) 54 | 55 | We will demonstrate: 56 | 57 | - Using word embeddings as inputs to understand documents 58 | + Supervised Machine Learning including Document Classification 59 | + Unsupervised Models including Document Clustering 60 | 61 | We will discuss: 62 | 63 | - Using word embeddings to extract insights from texts 64 | + Static vs Dynamic Embeddings on a high level 65 | + Hacking dynamic embeddings for other types of ordinal structure (grouping by reviews stars) 66 | 67 | #### Q&A / Break (Length: 10 min) 68 | -------------------------------------------------------------------------------- /embeddings/references.md: -------------------------------------------------------------------------------- 1 | # Selected References: 2 | ## Embeddings 3 | - Ruder, Sebastian. "On word embeddings - Part 1". http://ruder.io/word-embeddings-1/, 2016. 4 | - Ruder, Sebastian. "On word embeddings - Part 3: The secret ingredients of word2vec". http://ruder.io/secret-word2vec/, 2016. 5 | 6 | ## Parameter Tuning 7 | - Hardt, Moritz. “Word Embedding: Explaining Their Properties.” Off the Convex Path, http://offconvex.github.io/2016/02/14/word-embeddings-2/. 8 | - Komiya, Kanako, and Hiroyuki Shinnou. “Investigating Effective Parameters for Fine-Tuning of Word Embeddings Using Only a Small Corpus.” Proceedings of the Workshop on Deep Learning Approaches for Low-Resource NLP, Association for Computational Linguistics, 2018, pp. 60–67. ACLWeb, doi:10.18653/v1/W18-3408. 9 | - Landauer, Thomas and Dumais, Susan. 
LSA: A Solution to Plato’s Problem. http://lsa.colorado.edu/papers/plato/plato.annote.html. 10 | - Yin, Zi, and Yuanyuan Shen. “On the Dimensionality of Word Embedding.” ArXiv:1812.04224 [Cs, Stat], Dec. 2018. arXiv.org, http://arxiv.org/abs/1812.04224. 11 | 12 | ## Testing Embeddings 13 | - Bakarov, Amir. “A Survey of Word Embeddings Evaluation Methods.” ArXiv:1801.09536 [Cs], Jan. 2018. arXiv.org, http://arxiv.org/abs/1801.09536. 14 | - Schnabel, Tobias, et al. “Evaluation Methods for Unsupervised Word Embeddings.” Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics, 2015, pp. 298–307. DOI.org (Crossref), doi:10.18653/v1/D15-1036. 15 | - Wang, Bin, et al. “Evaluating Word Embedding Models: Methods and Experimental Results.” APSIPA Transactions on Signal and Information Processing, vol. 8, 2019, p. e19. arXiv.org, doi:10.1017/ATSIP.2019.12. 16 | 17 | 18 | ## Debiasing Embeddings 19 | - Bolukbasi, Tolga, et al. “Man Is to Computer Programmer as Woman Is to Homemaker? Debiasing Word Embeddings.” Proceedings of the 30th International Conference on Neural Information Processing Systems, Curran Associates Inc., 2016, pp. 4356–64. 20 | - Garg, Nikhil, et al. “Word Embeddings Quantify 100 Years of Gender and Ethnic Stereotypes.” Proceedings of the National Academy of Sciences, vol. 115, no. 16, Apr. 2018, pp. E3635–44. DOI.org (Crossref), doi:10.1073/pnas.1720347115. 21 | - Gonen, Hila, and Goldberg, Yoav. “Lipstick on a Pig: Debiasing Methods Cover up Systematic Gender Biases in Word Embeddings But Do Not Remove Them.” ArXiv:1903.03862 [Cs], Sept. 2019. arXiv.org, http://arxiv.org/abs/1903.03862. 22 | - Ethayarajh, Kawin, et al. “Understanding Undesirable Word Embedding Associations.” Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics, 2019, pp. 1696–705. DOI.org (Crossref), doi:10.18653/v1/P19-1166. 23 | - Nissim, Malvina, et al. “Fair Is Better than Sensational: Man Is to Doctor as Woman Is to Doctor.” Computational Linguistics, vol. 46, no. 2, June 2020, pp. 487–97. DOI.org (Crossref), doi:10.1162/coli_a_00379. 24 | - Papakyriakopoulos, Orestis, et al. “Bias in Word Embeddings.” Proceedings of the 2020 Conference on Fairness, Accountability, and Transparency, ACM, 2020, pp. 446–57. DOI.org (Crossref), doi:10.1145/3351095.3372843. 25 | - Zhao, Jieyu, et al. “Gender Bias in Contextualized Word Embeddings.” Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), Association for Computational Linguistics, 2019, pp. 629–34. ACLWeb, doi:10.18653/v1/N19-1064. 
26 | -------------------------------------------------------------------------------- /text-analytics/Text_Analytics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Text Analytics.ipynb", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyMQgNb4AtMt0C9GaQKU6Y3i", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "YQV5BO4qX-uY" 31 | }, 32 | "source": [ 33 | "# Introduction to Text Analysis\n", 34 | "\n", 35 | "Welcome to this colab notebook that I will use for demonstrative purposes. " 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "id": "aYrFID9Zz4k9" 42 | }, 43 | "source": [ 44 | "## Comparing NLTK vs spaCy" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "J0Y53Pe-0AaV" 51 | }, 52 | "source": [ 53 | "import spacy\n", 54 | "from spacy.lang.en import English\n", 55 | "import nltk\n", 56 | "from nltk.tokenize import word_tokenize" 57 | ], 58 | "execution_count": 2, 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "metadata": { 64 | "id": "QkuFGuZyZfek", 65 | "outputId": "02427d2e-105d-491a-b4fe-73d605ac0e3c", 66 | "colab": { 67 | "base_uri": "https://localhost:8080/", 68 | "height": 71 69 | } 70 | }, 71 | "source": [ 72 | "en = English()\n", 73 | "text = 'We are doing Text Analysis.'\n", 74 | "doc = en(text)\n", 75 | "print(type(doc))\n", 76 | "print([(x, type(x)) for x in doc])" 77 | ], 78 | "execution_count": 3, 79 | "outputs": [ 80 | { 81 | "output_type": "stream", 82 | "text": [ 83 | "\n", 84 | "[(We, ), (are, ), (doing, ), (Text, ), (Analysis, ), (., )]\n" 85 | ], 86 | "name": "stdout" 87 | } 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "metadata": { 93 | "id": "4vuUgFcweMVc", 94 | "outputId": "06652f5d-0afa-426c-90ed-0dfb13c76c48", 95 | "colab": { 96 | "base_uri": "https://localhost:8080/", 97 | "height": 105 98 | } 99 | }, 100 | "source": [ 101 | "nltk.download('punkt')\n", 102 | "doc = word_tokenize(text)\n", 103 | "print(type(doc))\n", 104 | "print([(x, type(x)) for x in doc])" 105 | ], 106 | "execution_count": 4, 107 | "outputs": [ 108 | { 109 | "output_type": "stream", 110 | "text": [ 111 | "[nltk_data] Downloading package punkt to /root/nltk_data...\n", 112 | "[nltk_data] Unzipping tokenizers/punkt.zip.\n", 113 | "\n", 114 | "[('We', ), ('are', ), ('doing', ), ('Text', ), ('Analysis', ), ('.', )]\n" 115 | ], 116 | "name": "stdout" 117 | } 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "metadata": { 123 | "id": "sT5_LiR_0SS7", 124 | "outputId": "985ff655-e755-4c00-8d88-c2f79c5ae991", 125 | "colab": { 126 | "base_uri": "https://localhost:8080/", 127 | "height": 105 128 | } 129 | }, 130 | "source": [ 131 | "%timeit en(text)\n", 132 | "%timeit nltk.tokenize.casual_tokenize(text)" 133 | ], 134 | "execution_count": 5, 135 | "outputs": [ 136 | { 137 | "output_type": "stream", 138 | "text": [ 139 | "The slowest run took 26.97 times longer than the fastest. This could mean that an intermediate result is being cached.\n", 140 | "100000 loops, best of 3: 8 µs per loop\n", 141 | "The slowest run took 7.34 times longer than the fastest. 
This could mean that an intermediate result is being cached.\n", 142 | "100000 loops, best of 3: 16.2 µs per loop\n" 143 | ], 144 | "name": "stdout" 145 | } 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "id": "P8lwZEHT01zF" 152 | }, 153 | "source": [ 154 | "## spaCy's Language Models" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "metadata": { 160 | "id": "Ti-ns7PW0dWj", 161 | "outputId": "92594493-67c7-4293-fb21-4cdb0b5d23d0", 162 | "colab": { 163 | "base_uri": "https://localhost:8080/", 164 | "height": 51 165 | } 166 | }, 167 | "source": [ 168 | "from spacy.lang.en import English\n", 169 | "en = English()\n", 170 | "print(en.tokenizer)\n", 171 | "print(en.pipe_names)\n" 172 | ], 173 | "execution_count": 8, 174 | "outputs": [ 175 | { 176 | "output_type": "stream", 177 | "text": [ 178 | "\n", 179 | "[]\n" 180 | ], 181 | "name": "stdout" 182 | } 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "metadata": { 188 | "id": "JpknyeLs1Bmo", 189 | "outputId": "2f47ce7d-9081-44f3-a36e-be9a7a38519d", 190 | "colab": { 191 | "base_uri": "https://localhost:8080/", 192 | "height": 34 193 | } 194 | }, 195 | "source": [ 196 | "nlp = spacy.load('en_core_web_sm')\n", 197 | "print(nlp.pipe_names)" 198 | ], 199 | "execution_count": 9, 200 | "outputs": [ 201 | { 202 | "output_type": "stream", 203 | "text": [ 204 | "['tagger', 'parser', 'ner']\n" 205 | ], 206 | "name": "stdout" 207 | } 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "metadata": { 213 | "id": "OEyN0UUM1MoE", 214 | "outputId": "f90b30f4-4a23-4b73-d136-5f6f80e8d7a6", 215 | "colab": { 216 | "base_uri": "https://localhost:8080/", 217 | "height": 88 218 | } 219 | }, 220 | "source": [ 221 | "doc = en('Text analysis is so much fun!')\n", 222 | "print(doc)\n", 223 | "print(type(doc))\n", 224 | "doc_attrs = set(dir(doc))\n", 225 | "print(doc_attrs)" 226 | ], 227 | "execution_count": 10, 228 | "outputs": [ 229 | { 230 | "output_type": "stream", 231 | "text": [ 232 | "Text analysis is so much fun!\n", 233 | "\n", 234 | "{'lang_', '__ne__', '__sizeof__', 'to_utf8_array', '_realloc', '__init_subclass__', 'sentiment', '__format__', '__lt__', '_vector', 'ents', 'to_disk', 'vector_norm', 'is_parsed', 'get_lca_matrix', '__str__', 'to_bytes', '__unicode__', 'is_sentenced', '__init__', '__iter__', 'doc', 'noun_chunks_iterator', 'remove_extension', '__new__', '__class__', '__reduce_ex__', '_py_tokens', '__setattr__', '_', 'is_nered', 'to_json', '__bytes__', '__delattr__', 'retokenize', 'char_span', '__repr__', '__len__', 'from_array', 'text_with_ws', '__dir__', 'to_array', 'similarity', 'mem', 'count_by', 'from_disk', 'get_extension', 'has_vector', 'noun_chunks', '__getattribute__', '__pyx_vtable__', '_bulk_merge', '__setstate__', 'print_tree', 'sents', 'lang', '__doc__', '__ge__', 'has_extension', 'text', 'tensor', '_vector_norm', 'user_token_hooks', 'cats', '__subclasshook__', 'set_extension', 'user_data', 'extend_tensor', 'user_span_hooks', 'from_bytes', 'vector', 'vocab', '__getitem__', 'user_hooks', '__le__', 'merge', '__hash__', '__gt__', '__eq__', '__reduce__', 'is_tagged'}\n" 235 | ], 236 | "name": "stdout" 237 | } 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": { 243 | "id": "swU1MNr_1ZL1" 244 | }, 245 | "source": [ 246 | "Tokens are units of documents" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "metadata": { 252 | "id": "4lDyworS1Wvz", 253 | "outputId": "14006b2a-3f6f-4d31-b564-7acd9c65d24b", 254 | "colab": { 255 | "base_uri": 
"https://localhost:8080/", 256 | "height": 88 257 | } 258 | }, 259 | "source": [ 260 | "print(doc[0])\n", 261 | "print(type(doc[0]))\n", 262 | "print(dir(doc[0]))" 263 | ], 264 | "execution_count": 11, 265 | "outputs": [ 266 | { 267 | "output_type": "stream", 268 | "text": [ 269 | "Text\n", 270 | "\n", 271 | "['_', '__bytes__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', 'ancestors', 'check_flag', 'children', 'cluster', 'conjuncts', 'dep', 'dep_', 'doc', 'ent_id', 'ent_id_', 'ent_iob', 'ent_iob_', 'ent_kb_id', 'ent_kb_id_', 'ent_type', 'ent_type_', 'get_extension', 'has_extension', 'has_vector', 'head', 'i', 'idx', 'is_alpha', 'is_ancestor', 'is_ascii', 'is_bracket', 'is_currency', 'is_digit', 'is_left_punct', 'is_lower', 'is_oov', 'is_punct', 'is_quote', 'is_right_punct', 'is_sent_start', 'is_space', 'is_stop', 'is_title', 'is_upper', 'lang', 'lang_', 'left_edge', 'lefts', 'lemma', 'lemma_', 'lex_id', 'like_email', 'like_num', 'like_url', 'lower', 'lower_', 'morph', 'n_lefts', 'n_rights', 'nbor', 'norm', 'norm_', 'orth', 'orth_', 'pos', 'pos_', 'prefix', 'prefix_', 'prob', 'rank', 'remove_extension', 'right_edge', 'rights', 'sent', 'sent_start', 'sentiment', 'set_extension', 'shape', 'shape_', 'similarity', 'string', 'subtree', 'suffix', 'suffix_', 'tag', 'tag_', 'tensor', 'text', 'text_with_ws', 'vector', 'vector_norm', 'vocab', 'whitespace_']\n" 272 | ], 273 | "name": "stdout" 274 | } 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "q6m4nX6p1rwv", 281 | "outputId": "80732600-a3ea-4f50-8aae-971bdf16f21d", 282 | "colab": { 283 | "base_uri": "https://localhost:8080/", 284 | "height": 51 285 | } 286 | }, 287 | "source": [ 288 | "print(doc[0])\n", 289 | "print(doc[0].lower_)" 290 | ], 291 | "execution_count": 15, 292 | "outputs": [ 293 | { 294 | "output_type": "stream", 295 | "text": [ 296 | "Text\n", 297 | "text\n" 298 | ], 299 | "name": "stdout" 300 | } 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": { 306 | "id": "QIZFGzoO2Y6-" 307 | }, 308 | "source": [ 309 | "### Text Preprocessing\n", 310 | "\n", 311 | "#### Normalizing case" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "metadata": { 317 | "id": "HZCn1uA72dmV", 318 | "outputId": "56fd66ce-46c7-4592-b6f7-9b0c68533f10", 319 | "colab": { 320 | "base_uri": "https://localhost:8080/", 321 | "height": 34 322 | } 323 | }, 324 | "source": [ 325 | "[x.lower_ for x in en(text)]" 326 | ], 327 | "execution_count": 16, 328 | "outputs": [ 329 | { 330 | "output_type": "execute_result", 331 | "data": { 332 | "text/plain": [ 333 | "['we', 'are', 'doing', 'text', 'analysis', '.']" 334 | ] 335 | }, 336 | "metadata": { 337 | "tags": [] 338 | }, 339 | "execution_count": 16 340 | } 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": { 346 | "id": "loZEO5Zy3Q9W" 347 | }, 348 | "source": [ 349 | "#### Stripping punctuation" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "metadata": { 355 | "id": "I6mAOXms3RWh", 356 | "outputId": "65eead87-7bbb-444b-bcfc-184b6c1a07be", 357 | "colab": { 358 | "base_uri": "https://localhost:8080/", 359 | "height": 34 360 | } 361 | }, 362 | "source": [ 363 | "[x.text for x in en(text) if x.is_alpha]" 364 | ], 365 | 
"execution_count": 17, 366 | "outputs": [ 367 | { 368 | "output_type": "execute_result", 369 | "data": { 370 | "text/plain": [ 371 | "['We', 'are', 'doing', 'text', 'analysis']" 372 | ] 373 | }, 374 | "metadata": { 375 | "tags": [] 376 | }, 377 | "execution_count": 17 378 | } 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "metadata": { 384 | "id": "5Gt8IUVK3RrT", 385 | "outputId": "15dc4c77-bbf1-4221-9f26-6668a7570da2", 386 | "colab": { 387 | "base_uri": "https://localhost:8080/", 388 | "height": 34 389 | } 390 | }, 391 | "source": [ 392 | "text = \"We're doing text analysis and it's fun!\"\n", 393 | "[x.text for x in en(text) if x.is_alpha]" 394 | ], 395 | "execution_count": 19, 396 | "outputs": [ 397 | { 398 | "output_type": "stream", 399 | "text": [ 400 | "Removing non-alpha ['We', 'doing', 'text', 'analysis', 'and', 'it', 'fun']\n" 401 | ], 402 | "name": "stdout" 403 | } 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": { 409 | "id": "bm65jFIE4o8G" 410 | }, 411 | "source": [ 412 | "#### Lemmatizing" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "metadata": { 418 | "id": "YmWC7hNg4oTD", 419 | "outputId": "e230b58d-03c4-459c-bd15-a4070f505e55", 420 | "colab": { 421 | "base_uri": "https://localhost:8080/", 422 | "height": 34 423 | } 424 | }, 425 | "source": [ 426 | "[x.lemma_ for x in nlp(text)]" 427 | ], 428 | "execution_count": 23, 429 | "outputs": [ 430 | { 431 | "output_type": "execute_result", 432 | "data": { 433 | "text/plain": [ 434 | "['-PRON-', 'be', 'do', 'text', 'analysis', 'and', '-PRON-', 'be', 'fun', '!']" 435 | ] 436 | }, 437 | "metadata": { 438 | "tags": [] 439 | }, 440 | "execution_count": 23 441 | } 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": { 447 | "id": "6QkZze7E5CtA" 448 | }, 449 | "source": [ 450 | "#### Stop Words" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "metadata": { 456 | "id": "UQOs2wRi5C4w", 457 | "outputId": "c5d6709e-36f3-41e7-88b1-96db7c117f8b", 458 | "colab": { 459 | "base_uri": "https://localhost:8080/", 460 | "height": 34 461 | } 462 | }, 463 | "source": [ 464 | "[x.text for x in en(text) if not x.is_stop]" 465 | ], 466 | "execution_count": 24, 467 | "outputs": [ 468 | { 469 | "output_type": "execute_result", 470 | "data": { 471 | "text/plain": [ 472 | "['text', 'analysis', 'fun', '!']" 473 | ] 474 | }, 475 | "metadata": { 476 | "tags": [] 477 | }, 478 | "execution_count": 24 479 | } 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "metadata": { 485 | "id": "bWiFwTZ35U-f" 486 | }, 487 | "source": [ 488 | "#### Named Entities\n", 489 | "\n", 490 | "First URLs" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "metadata": { 496 | "id": "Vq7oKqcy5UfL", 497 | "outputId": "30ba53d7-f6f8-4e36-a11c-1c5a68ae8063", 498 | "colab": { 499 | "base_uri": "https://localhost:8080/", 500 | "height": 51 501 | } 502 | }, 503 | "source": [ 504 | "text = \"Check out the course on Github: https://github.com/mjahanshahi/intermediate-nlp\"\n", 505 | "print([x for x in en(text) if not x.like_url])\n", 506 | "print(['-URL-' if x.like_url else x for x in en(text)])" 507 | ], 508 | "execution_count": 25, 509 | "outputs": [ 510 | { 511 | "output_type": "stream", 512 | "text": [ 513 | "[Check, out, the, course, on, Github, :]\n", 514 | "[Check, out, the, course, on, Github, :, '-URL-']\n" 515 | ], 516 | "name": "stdout" 517 | } 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "metadata": { 523 | "id": "NQCUAnL25ld-", 524 | "outputId": 
"862fadb2-41ee-4774-8d94-27bf5ee776b0", 525 | "colab": { 526 | "base_uri": "https://localhost:8080/", 527 | "height": 51 528 | } 529 | }, 530 | "source": [ 531 | "parsed = nlp(text)\n", 532 | "# look at the individual tokens\n", 533 | "tokens = [t for t in parsed]\n", 534 | "print(tokens)\n", 535 | "# look at the identified named-entities and their types\n", 536 | "for e in parsed.ents:\n", 537 | " print(e, type(e), e.label_, spacy.explain(e.label_))" 538 | ], 539 | "execution_count": 26, 540 | "outputs": [ 541 | { 542 | "output_type": "stream", 543 | "text": [ 544 | "[Check, out, the, course, on, Github, :, https://github.com/mjahanshahi/intermediate-nlp]\n", 545 | "Github ORG Companies, agencies, institutions, etc.\n" 546 | ], 547 | "name": "stdout" 548 | } 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": { 554 | "id": "ypPdecxU3Rgx" 555 | }, 556 | "source": [ 557 | "### Putting it all together" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "metadata": { 563 | "id": "ZnFbRR-O3-8r" 564 | }, 565 | "source": [ 566 | "text_data = [\"I'm taking a course on Safari.\",\n", 567 | " \"I'm learning about Text Analysis.\",\n", 568 | " \"We are studying preprocessing text and then analysing it\",\n", 569 | " \"Check out the course on Github: https://github.com/mjahanshahi/intermediate-nlp\"]" 570 | ], 571 | "execution_count": 20, 572 | "outputs": [] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "metadata": { 577 | "id": "FHulb9Nn4YQs" 578 | }, 579 | "source": [ 580 | "def tokenize_full(docs, model=nlp, \n", 581 | " entities=False, \n", 582 | " stop_words=False, \n", 583 | " lowercase=True, \n", 584 | " alpha_only=True, \n", 585 | " lemma=True):\n", 586 | " \"\"\"Full tokenizer with flags for processing steps\n", 587 | " entities: If False, replaces with entity type\n", 588 | " stop_words: If False, removes stop words\n", 589 | " lowercase: If True, lowercases all tokens\n", 590 | " alpha_only: If True, removes all non-alpha characters\n", 591 | " lemma: If True, lemmatizes words\n", 592 | " \"\"\"\n", 593 | " tokenized_docs = []\n", 594 | " for d in docs:\n", 595 | " parsed = model(d)\n", 596 | " # token collector\n", 597 | " tokens = []\n", 598 | " # index pointer\n", 599 | " i = 0\n", 600 | " # entity collector\n", 601 | " ent = ''\n", 602 | " for t in parsed:\n", 603 | " # only need this if we're replacing entities\n", 604 | " if not entities:\n", 605 | " # replace URLs\n", 606 | " if t.like_url:\n", 607 | " tokens.append('URL')\n", 608 | " continue\n", 609 | " # if there's entities collected and current token is non-entity\n", 610 | " if (t.ent_iob_=='O')&(ent!=''):\n", 611 | " tokens.append(ent)\n", 612 | " ent = ''\n", 613 | " continue\n", 614 | " elif t.ent_iob_!='O':\n", 615 | " ent = t.ent_type_\n", 616 | " continue\n", 617 | " # only include stop words if stop words==True\n", 618 | " if (t.is_stop)&(not stop_words):\n", 619 | " continue\n", 620 | " # only include non-alpha is alpha_only==False\n", 621 | " if (not t.is_alpha)&(alpha_only):\n", 622 | " continue\n", 623 | " if lemma:\n", 624 | " t = t.lemma_\n", 625 | " else:\n", 626 | " t = t.text\n", 627 | " if lowercase:\n", 628 | " t.lower()\n", 629 | " tokens.append(t)\n", 630 | " tokenized_docs.append(tokens)\n", 631 | " return(tokenized_docs)" 632 | ], 633 | "execution_count": 21, 634 | "outputs": [] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "metadata": { 639 | "id": "d0xbabiX5vXL", 640 | "outputId": "d0b1b248-45b3-4606-cc6f-2cd06838b8a7", 641 | "colab": { 642 | "base_uri": 
"https://localhost:8080/", 643 | "height": 340 644 | } 645 | }, 646 | "source": [ 647 | "tokenize_full(text_data, stop_words=True, alpha_only=False, entities=True)" 648 | ], 649 | "execution_count": 27, 650 | "outputs": [ 651 | { 652 | "output_type": "execute_result", 653 | "data": { 654 | "text/plain": [ 655 | "[['-PRON-', 'be', 'take', 'a', 'course', 'on', 'Safari', '.'],\n", 656 | " ['-PRON-', 'be', 'learn', 'about', 'Text', 'Analysis', '.'],\n", 657 | " ['-PRON-',\n", 658 | " 'be',\n", 659 | " 'study',\n", 660 | " 'preprocesse',\n", 661 | " 'text',\n", 662 | " 'and',\n", 663 | " 'then',\n", 664 | " 'analyse',\n", 665 | " '-PRON-'],\n", 666 | " ['check',\n", 667 | " 'out',\n", 668 | " 'the',\n", 669 | " 'course',\n", 670 | " 'on',\n", 671 | " 'Github',\n", 672 | " ':',\n", 673 | " 'https://github.com/mjahanshahi/intermediate-nlp']]" 674 | ] 675 | }, 676 | "metadata": { 677 | "tags": [] 678 | }, 679 | "execution_count": 27 680 | } 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "metadata": { 686 | "id": "xer1e8C-6DBI", 687 | "outputId": "ce424f85-d580-4c9c-db9b-9a60ea62a80a", 688 | "colab": { 689 | "base_uri": "https://localhost:8080/", 690 | "height": 193 691 | } 692 | }, 693 | "source": [ 694 | "from sklearn.feature_extraction.text import CountVectorizer\n", 695 | "import pandas as pd\n", 696 | "cv = CountVectorizer()\n", 697 | "v = cv.fit_transform(text_data).toarray()\n", 698 | "pd.DataFrame(v, columns=cv.get_feature_names())" 699 | ], 700 | "execution_count": 31, 701 | "outputs": [ 702 | { 703 | "output_type": "execute_result", 704 | "data": { 705 | "text/html": [ 706 | "
\n", 707 | "\n", 720 | "\n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | "
aboutanalysinganalysisandarecheckcomcoursegithubhttpsintermediateitlearningmjahanshahinlponoutpreprocessingsafaristudyingtakingtextthethenwe
00000000100000001001010000
11010000000001000000001000
20101100000010000010101011
30000011121100111100000100
\n", 866 | "
" 867 | ], 868 | "text/plain": [ 869 | " about analysing analysis and are ... taking text the then we\n", 870 | "0 0 0 0 0 0 ... 1 0 0 0 0\n", 871 | "1 1 0 1 0 0 ... 0 1 0 0 0\n", 872 | "2 0 1 0 1 1 ... 0 1 0 1 1\n", 873 | "3 0 0 0 0 0 ... 0 0 1 0 0\n", 874 | "\n", 875 | "[4 rows x 25 columns]" 876 | ] 877 | }, 878 | "metadata": { 879 | "tags": [] 880 | }, 881 | "execution_count": 31 882 | } 883 | ] 884 | }, 885 | { 886 | "cell_type": "code", 887 | "metadata": { 888 | "id": "3x9uut7r6fKz", 889 | "outputId": "3a3fe346-48cb-4e3d-e15e-972bb2c3dd82", 890 | "colab": { 891 | "base_uri": "https://localhost:8080/", 892 | "height": 173 893 | } 894 | }, 895 | "source": [ 896 | "cv = CountVectorizer(vocabulary=['text', 'analysis', 'preprocessing', 'safari'])\n", 897 | "v = cv.fit_transform(text_data).toarray()\n", 898 | "pd.DataFrame(v, columns=cv.get_feature_names())" 899 | ], 900 | "execution_count": 33, 901 | "outputs": [ 902 | { 903 | "output_type": "execute_result", 904 | "data": { 905 | "text/html": [ 906 | "
\n", 907 | "\n", 920 | "\n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | "
textanalysispreprocessingsafari
00001
11100
21010
30000
\n", 961 | 
" 962 | ], 963 | "text/plain": [ 964 | " text analysis preprocessing safari\n", 965 | "0 0 0 0 1\n", 966 | "1 1 1 0 0\n", 967 | "2 1 0 1 0\n", 968 | "3 0 0 0 0" 969 | ] 970 | }, 971 | "metadata": { 972 | "tags": [] 973 | }, 974 | "execution_count": 33 975 | } 976 | ] 977 | }, 978 | { 979 | "cell_type": "code", 980 | "metadata": { 981 | "id": "6tWo1FTq47qZ" 982 | }, 983 | "source": [ 984 | "\n" 985 | ], 986 | "execution_count": null, 987 | "outputs": [] 988 | }, 989 | { 990 | "cell_type": "markdown", 991 | "metadata": { 992 | "id": "aM05Ku_TZgdy" 993 | }, 994 | "source": [ 995 | "## Using a Dictionary to Analyse Review Sentiment\n", 996 | "\n", 997 | "A traditional technique to analyse the sentiment of texts is to use dictionaries of positive and negative connotations and count the incidences of words that are represented in these dictionaries, considering their polarity and valence. \n", 998 | "\n", 999 | "In this section, we will use the Afinn package, which has 2.5k words coded by polarity and valence. " 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "metadata": { 1005 | "id": "R1FButi2-SYZ", 1006 | "outputId": "bd607287-69a2-4e7e-aea1-fa4c3e018d57", 1007 | "colab": { 1008 | "base_uri": "https://localhost:8080/", 1009 | "height": 207 1010 | } 1011 | }, 1012 | "source": [ 1013 | "!pip install afinn" 1014 | ], 1015 | "execution_count": 2, 1016 | "outputs": [ 1017 | { 1018 | "output_type": "stream", 1019 | "text": [ 1020 | "Collecting afinn\n", 1021 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/86/e5/ffbb7ee3cca21ac6d310ac01944fb163c20030b45bda25421d725d8a859a/afinn-0.1.tar.gz (52kB)\n", 1022 | "\r\u001b[K |██████▎ | 10kB 17.1MB/s eta 0:00:01\r\u001b[K |████████████▌ | 20kB 1.7MB/s eta 0:00:01\r\u001b[K |██████████████████▊ | 30kB 2.2MB/s eta 0:00:01\r\u001b[K |█████████████████████████ | 40kB 2.5MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▏| 51kB 2.0MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 61kB 1.8MB/s \n", 1023 | "\u001b[?25hBuilding wheels for collected packages: afinn\n", 1024 | " Building wheel for afinn (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 1025 | " Created wheel for afinn: filename=afinn-0.1-cp36-none-any.whl size=53453 sha256=693b118b7381dc265be177ae816b0ac4923bd634c0e3d1a309d10533dcafecd3\n", 1026 | " Stored in directory: /root/.cache/pip/wheels/b5/1c/de/428301f3333ca509dcf20ff358690eb23a1388fbcbbde008b2\n", 1027 | "Successfully built afinn\n", 1028 | "Installing collected packages: afinn\n", 1029 | "Successfully installed afinn-0.1\n" 1030 | ], 1031 | "name": "stdout" 1032 | } 1033 | ] 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "metadata": { 1038 | "id": "iRMuZzAd-J_k" 1039 | }, 1040 | "source": [ 1041 | "from afinn import Afinn\n", 1042 | "afinn = Afinn(language='en')" 1043 | ], 1044 | "execution_count": 13, 1045 | "outputs": [] 1046 | }, 1047 | { 1048 | "cell_type": "code", 1049 | "metadata": { 1050 | "id": "M_ceBooO-hk_", 1051 | "outputId": "d49bba4d-4ca3-4d7b-fa1a-03eb1c5431c6", 1052 | "colab": { 1053 | "base_uri": "https://localhost:8080/", 1054 | "height": 34 1055 | } 1056 | }, 1057 | "source": [ 1058 | "afinn.score('Great')" 1059 | ], 1060 | "execution_count": 8, 1061 | "outputs": [ 1062 | { 1063 | "output_type": "execute_result", 1064 | "data": { 1065 | "text/plain": [ 1066 | "3.0" 1067 | ] 1068 | }, 1069 | "metadata": { 1070 | "tags": [] 1071 | }, 1072 | "execution_count": 8 1073 | } 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "code", 1078 | "metadata": { 1079 | "id": "-xQ1yVkc-iyf", 1080 | "outputId": "37601631-06ff-4f63-fb74-69e5d95aec91", 1081 | "colab": { 1082 | "base_uri": "https://localhost:8080/", 1083 | "height": 34 1084 | } 1085 | }, 1086 | "source": [ 1087 | "afinn.score('Good')" 1088 | ], 1089 | "execution_count": 7, 1090 | "outputs": [ 1091 | { 1092 | "output_type": "execute_result", 1093 | "data": { 1094 | "text/plain": [ 1095 | "3.0" 1096 | ] 1097 | }, 1098 | "metadata": { 1099 | "tags": [] 1100 | }, 1101 | "execution_count": 7 1102 | } 1103 | ] 1104 | }, 1105 | { 1106 | "cell_type": "code", 1107 | "metadata": { 1108 | "id": "uCzCtFsp-rQj", 1109 | "outputId": "24acb631-6f8a-40bc-90d4-3d214e81cb19", 1110 | "colab": { 1111 | "base_uri": "https://localhost:8080/", 1112 | "height": 34 1113 | } 1114 | }, 1115 | "source": [ 1116 | "afinn.score('Terrible')" 1117 | ], 1118 | "execution_count": 9, 1119 | "outputs": [ 1120 | { 1121 | "output_type": "execute_result", 1122 | "data": { 1123 | "text/plain": [ 1124 | "-3.0" 1125 | ] 1126 | }, 1127 | "metadata": { 1128 | "tags": [] 1129 | }, 1130 | "execution_count": 9 1131 | } 1132 | ] 1133 | }, 1134 | { 1135 | "cell_type": "code", 1136 | "metadata": { 1137 | "id": "e8hSr8lL-u0V", 1138 | "outputId": "91c3a047-7635-49a5-fa64-7cad8ac2543c", 1139 | "colab": { 1140 | "base_uri": "https://localhost:8080/", 1141 | "height": 34 1142 | } 1143 | }, 1144 | "source": [ 1145 | "afinn.score('I feel great! :)')" 1146 | ], 1147 | "execution_count": 16, 1148 | "outputs": [ 1149 | { 1150 | "output_type": "execute_result", 1151 | "data": { 1152 | "text/plain": [ 1153 | "5.0" 1154 | ] 1155 | }, 1156 | "metadata": { 1157 | "tags": [] 1158 | }, 1159 | "execution_count": 16 1160 | } 1161 | ] 1162 | }, 1163 | { 1164 | "cell_type": "markdown", 1165 | "metadata": { 1166 | "id": "MXMzguH5_toC" 1167 | }, 1168 | "source": [ 1169 | "Let's apply this to an actual dataset! This is the Women's Clothing E-Commerce Reviews from [this Kaggle Challenge](https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews). 
Bi-directional LSTMs have [reached an F1 score of 0.93.](https://github.com/AFAgarap/ecommerce-reviews-analysis)" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "code", 1174 | "metadata": { 1175 | "id": "JWnWrRWY_rFi" 1176 | }, 1177 | "source": [ 1178 | "import pandas as pd" 1179 | ], 1180 | "execution_count": 18, 1181 | "outputs": [] 1182 | }, 1183 | { 1184 | "cell_type": "code", 1185 | "metadata": { 1186 | "id": "kL1MO06S-0bi" 1187 | }, 1188 | "source": [ 1189 | "df = pd.read_csv('/Womens Clothing E-Commerce Reviews.csv', index_col=0)" 1190 | ], 1191 | "execution_count": 30, 1192 | "outputs": [] 1193 | }, 1194 | { 1195 | "cell_type": "code", 1196 | "metadata": { 1197 | "id": "owFapvdiBqmu", 1198 | "outputId": "81908281-712e-4ec5-b593-df4bf49e7524", 1199 | "colab": { 1200 | "base_uri": "https://localhost:8080/", 1201 | "height": 615 1202 | } 1203 | }, 1204 | "source": [ 1205 | "df.head()" 1206 | ], 1207 | "execution_count": 31, 1208 | "outputs": [ 1209 | { 1210 | "output_type": "execute_result", 1211 | "data": { 1212 | "text/html": [ 1213 | "
\n", 1214 | "\n", 1227 | "\n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | "
Clothing IDAgeTitleReview TextRatingRecommended INDPositive Feedback CountDivision NameDepartment NameClass Name
076733NaNAbsolutely wonderful - silky and sexy and comf...410InitmatesIntimateIntimates
1108034NaNLove this dress! it's sooo pretty. i happene...514GeneralDressesDresses
2107760Some major design flawsI had such high hopes for this dress and reall...300GeneralDressesDresses
3104950My favorite buy!I love, love, love this jumpsuit. it's fun, fl...510General PetiteBottomsPants
484747Flattering shirtThis shirt is very flattering to all due to th...516GeneralTopsBlouses
\n", 1311 | 
" 1312 | ], 1313 | "text/plain": [ 1314 | " Clothing ID Age ... Department Name Class Name\n", 1315 | "0 767 33 ... Intimate Intimates\n", 1316 | "1 1080 34 ... Dresses Dresses\n", 1317 | "2 1077 60 ... Dresses Dresses\n", 1318 | "3 1049 50 ... Bottoms Pants\n", 1319 | "4 847 47 ... Tops Blouses\n", 1320 | "\n", 1321 | "[5 rows x 10 columns]" 1322 | ] 1323 | }, 1324 | "metadata": { 1325 | "tags": [] 1326 | }, 1327 | "execution_count": 31 1328 | } 1329 | ] 1330 | }, 1331 | { 1332 | "cell_type": "markdown", 1333 | "metadata": { 1334 | "id": "-OE42CRqTGGo" 1335 | }, 1336 | "source": [ 1337 | "We can see that some titles are null. It's also possible that some reviews do not contain any text. " 1338 | ] 1339 | }, 1340 | { 1341 | "cell_type": "code", 1342 | "metadata": { 1343 | "id": "XclWIqTbTBfV", 1344 | "outputId": "97c50bbe-073e-4f2f-a0d6-14e87276a30e", 1345 | "colab": { 1346 | "base_uri": "https://localhost:8080/", 1347 | "height": 34 1348 | } 1349 | }, 1350 | "source": [ 1351 | "df[(df[\"Review Text\"].isnull()) & (df[\"Title\"].isnull())].shape[0]" 1352 | ], 1353 | "execution_count": 151, 1354 | "outputs": [ 1355 | { 1356 | "output_type": "execute_result", 1357 | "data": { 1358 | "text/plain": [ 1359 | "844" 1360 | ] 1361 | }, 1362 | "metadata": { 1363 | "tags": [] 1364 | }, 1365 | "execution_count": 151 1366 | } 1367 | ] 1368 | }, 1369 | { 1370 | "cell_type": "markdown", 1371 | "metadata": { 1372 | "id": "TSgJsei_TT5R" 1373 | }, 1374 | "source": [ 1375 | "We should remove these from the dataframe since this analysis aims to infer sentiment from text. " 1376 | ] 1377 | }, 1378 | { 1379 | "cell_type": "code", 1380 | "metadata": { 1381 | "id": "hR3HtKyLTS38" 1382 | }, 1383 | "source": [ 1384 | "df.drop(df[(df[\"Review Text\"].isnull()) & (df[\"Title\"].isnull())].index, inplace=True)" 1385 | ], 1386 | "execution_count": 159, 1387 | "outputs": [] 1388 | }, 1389 | { 1390 | "cell_type": "markdown", 1391 | "metadata": { 1392 | "id": "LJwAt-_uD2sk" 1393 | }, 1394 | "source": [ 1395 | "There are two columns that may convey sentiment:\n", 1396 | "- `Review Text`\n", 1397 | "- `Title`\n", 1398 | "\n", 1399 | "To calculate the Afinn sentiment score for all of the responses in the dataframe, we can apply the scorer to the `Review Text` column and create a new column `text_score`. We do the same to generate a `title_score` column. 
\n", 1400 | "\n" 1401 | ] 1402 | }, 1403 | { 1404 | "cell_type": "code", 1405 | "metadata": { 1406 | "id": "4KByiMWNEObu" 1407 | }, 1408 | "source": [ 1409 | "\n", 1410 | "#df['text_score'] = df[df[\"Review Text\"].notnull()].loc[\"Review Text\"].apply(afinn.score)\n", 1411 | "for index, row in df.iterrows():\n", 1412 | " if pd.notna(row['Review Text']):\n", 1413 | " df.at[index, \"text_score\"] = afinn.score(row['Review Text'])\n" 1414 | ], 1415 | "execution_count": 82, 1416 | "outputs": [] 1417 | }, 1418 | { 1419 | "cell_type": "code", 1420 | "metadata": { 1421 | "id": "koSaS8YEKEYW" 1422 | }, 1423 | "source": [ 1424 | "for index, row in df.iterrows():\n", 1425 | " if pd.notna(row['Title']):\n", 1426 | " df.at[index, \"title_score\"] = afinn.score(row['Title'])" 1427 | ], 1428 | "execution_count": 87, 1429 | "outputs": [] 1430 | }, 1431 | { 1432 | "cell_type": "code", 1433 | "metadata": { 1434 | "id": "TJlQuxhMKHnx" 1435 | }, 1436 | "source": [ 1437 | "df[\"total_score\"] = 2 * df[\"title_score\"] + df[\"text_score\"]" 1438 | ], 1439 | "execution_count": 88, 1440 | "outputs": [] 1441 | }, 1442 | { 1443 | "cell_type": "code", 1444 | "metadata": { 1445 | "id": "THGeTLn6Mm02", 1446 | "outputId": "efc0eae7-39e8-4428-80bb-f7f0265723de", 1447 | "colab": { 1448 | "base_uri": "https://localhost:8080/", 1449 | "height": 170 1450 | } 1451 | }, 1452 | "source": [ 1453 | "df['total_score'].describe()" 1454 | ], 1455 | "execution_count": 107, 1456 | "outputs": [ 1457 | { 1458 | "output_type": "execute_result", 1459 | "data": { 1460 | "text/plain": [ 1461 | "count 23486.000000\n", 1462 | "mean 11.325172\n", 1463 | "std 7.615414\n", 1464 | "min -20.000000\n", 1465 | "25% 6.000000\n", 1466 | "50% 11.000000\n", 1467 | "75% 16.000000\n", 1468 | "max 52.000000\n", 1469 | "Name: total_score, dtype: float64" 1470 | ] 1471 | }, 1472 | "metadata": { 1473 | "tags": [] 1474 | }, 1475 | "execution_count": 107 1476 | } 1477 | ] 1478 | }, 1479 | { 1480 | "cell_type": "code", 1481 | "metadata": { 1482 | "id": "IqmjIXItJPnl", 1483 | "outputId": "b0da71fc-7e2f-4c4f-d2bb-683538a875ed", 1484 | "colab": { 1485 | "base_uri": "https://localhost:8080/", 1486 | "height": 252 1487 | } 1488 | }, 1489 | "source": [ 1490 | "df.groupby(\"Rating\").median()" 1491 | ], 1492 | "execution_count": 100, 1493 | "outputs": [ 1494 | { 1495 | "output_type": "execute_result", 1496 | "data": { 1497 | "text/html": [ 1498 | "
\n", 1499 | "\n", 1512 | "\n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | "
Clothing IDAgeRecommended INDPositive Feedback Counttext_scoretitle_scoretotal_score
Rating
19364201303
29364101405
39364001607
492841117211
593641119213
\n", 1588 | "
" 1589 | ], 1590 | "text/plain": [ 1591 | " Clothing ID Age Recommended IND ... text_score title_score total_score\n", 1592 | "Rating ... \n", 1593 | "1 936 42 0 ... 3 0 3\n", 1594 | "2 936 41 0 ... 4 0 5\n", 1595 | "3 936 40 0 ... 6 0 7\n", 1596 | "4 928 41 1 ... 7 2 11\n", 1597 | "5 936 41 1 ... 9 2 13\n", 1598 | "\n", 1599 | "[5 rows x 7 columns]" 1600 | ] 1601 | }, 1602 | "metadata": { 1603 | "tags": [] 1604 | }, 1605 | "execution_count": 100 1606 | } 1607 | ] 1608 | }, 1609 | { 1610 | "cell_type": "code", 1611 | "metadata": { 1612 | "id": "xy-mQTkSKiCJ", 1613 | "outputId": "5656bb02-e4a9-4d3b-c49c-aa2855873844", 1614 | "colab": { 1615 | "base_uri": "https://localhost:8080/", 1616 | "height": 1000 1617 | } 1618 | }, 1619 | "source": [ 1620 | "df[(df[\"total_score\"]<10) & (df[\"Rating\"]==5)]" 1621 | ], 1622 | "execution_count": 160, 1623 | "outputs": [ 1624 | { 1625 | "output_type": "execute_result", 1626 | "data": { 1627 | "text/html": [ 1628 | "
\n", 1629 | "\n", 1642 | "\n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | " \n", 1740 | " \n", 1741 | " \n", 1742 | " \n", 1743 | " \n", 1744 | " \n", 1745 | " \n", 1746 | " \n", 1747 | " \n", 1748 | " \n", 1749 | " \n", 1750 | " \n", 1751 | " \n", 1752 | " \n", 1753 | " \n", 1754 | " \n", 1755 | " \n", 1756 | " \n", 1757 | " \n", 1758 | " \n", 1759 | " \n", 1760 | " \n", 1761 | " \n", 1762 | " \n", 1763 | " \n", 1764 | " \n", 1765 | " \n", 1766 | " \n", 1767 | " \n", 1768 | " \n", 1769 | " \n", 1770 | " \n", 1771 | " \n", 1772 | " \n", 1773 | " \n", 1774 | " \n", 1775 | " \n", 1776 | " \n", 1777 | " \n", 1778 | " \n", 1779 | " \n", 1780 | " \n", 1781 | " \n", 1782 | " \n", 1783 | " \n", 1784 | " \n", 1785 | " \n", 1786 | " \n", 1787 | " \n", 1788 | " \n", 1789 | " \n", 1790 | " \n", 1791 | " \n", 1792 | " \n", 1793 | " \n", 1794 | " \n", 1795 | " \n", 1796 | " \n", 1797 | " \n", 1798 | " \n", 1799 | " \n", 1800 | " \n", 1801 | " \n", 1802 | " \n", 1803 | " \n", 1804 | " \n", 1805 | " \n", 1806 | " \n", 1807 | " \n", 1808 | " \n", 1809 | " \n", 1810 | " \n", 1811 | " \n", 1812 | " \n", 1813 | " \n", 1814 | " \n", 1815 | " \n", 1816 | " \n", 1817 | " \n", 1818 | " \n", 1819 | " \n", 1820 | " \n", 1821 | " \n", 1822 | " \n", 1823 | " \n", 1824 | " \n", 1825 | " \n", 1826 | " \n", 1827 | " \n", 1828 | " \n", 1829 | " \n", 1830 | " \n", 1831 | " \n", 1832 | " \n", 1833 | " \n", 1834 | " \n", 1835 | " \n", 1836 | " \n", 1837 | " \n", 1838 | " \n", 1839 | " \n", 1840 | " \n", 1841 | " \n", 1842 | " \n", 1843 | " \n", 1844 | " \n", 1845 | " \n", 1846 | " \n", 1847 | " \n", 1848 | " \n", 1849 | " \n", 1850 | " \n", 1851 | " \n", 1852 | " \n", 1853 | " \n", 1854 | " \n", 1855 | " \n", 1856 | " \n", 1857 | " \n", 1858 | " \n", 1859 | " \n", 1860 | " \n", 1861 | " \n", 1862 | " \n", 1863 | "
Clothing IDAgeTitleReview TextRatingRecommended INDPositive Feedback CountDivision NameDepartment NameClass Nametext_scoretitle_scoretotal_scoreword_countnormalized_score
484747Flattering shirtThis shirt is very flattering to all due to th...516GeneralTopsBlouses606360.166667
685839Cagrcoal shimmer funI aded this in my basket at hte last mintue to...511General PetiteTopsKnits-14710111.990099
8107724FlatteringI love this dress. i usually get an xs but it ...510GeneralDressesDresses303340.088235
11109539NaNThis dress is perfection! so pretty and flatte...512General PetiteDressesDresses40480.500000
1376744Runs bigBought the black xs to go under the larkspur m...510InitmatesIntimateIntimates113693.014493
................................................
2341785039Get it quick!Can i tell you this top is amazing?! get it qu...510GeneralTopsBlouses606150.400000
2343818168Just rightI feel like snagging a pair of these was the e...510InitmatesIntimateLegwear404620.064516
23441110463Sweet surpriseDon't know why but i didn't have high expectat...5125General PetiteDressesDresses226886.022727
23442110439Flattering dressLove this dress, very flattering fit and the f...510General PetiteDressesDresses606410.146341
2345886263NaNThis is my new favorite sweater. it is lightwe...510General PetiteTopsKnits202180.111111
\n", 1864 | "

3537 rows × 15 columns

\n", 1865 | "
" 1866 | ], 1867 | "text/plain": [ 1868 | " Clothing ID Age ... word_count normalized_score\n", 1869 | "4 847 47 ... 36 0.166667\n", 1870 | "6 858 39 ... 101 11.990099\n", 1871 | "8 1077 24 ... 34 0.088235\n", 1872 | "11 1095 39 ... 8 0.500000\n", 1873 | "13 767 44 ... 69 3.014493\n", 1874 | "... ... ... ... ... ...\n", 1875 | "23417 850 39 ... 15 0.400000\n", 1876 | "23438 181 68 ... 62 0.064516\n", 1877 | "23441 1104 63 ... 88 6.022727\n", 1878 | "23442 1104 39 ... 41 0.146341\n", 1879 | "23458 862 63 ... 18 0.111111\n", 1880 | "\n", 1881 | "[3537 rows x 15 columns]" 1882 | ] 1883 | }, 1884 | "metadata": { 1885 | "tags": [] 1886 | }, 1887 | "execution_count": 160 1888 | } 1889 | ] 1890 | }, 1891 | { 1892 | "cell_type": "markdown", 1893 | "metadata": { 1894 | "id": "10ZNusm3M1bK" 1895 | }, 1896 | "source": [ 1897 | "One of the drawbacks to using the raw Afinn score is that longer texts may yield higher values simply because they contain more words. To adjust for that, we can divide the score by the number of words in the text." 1898 | ] 1899 | }, 1900 | { 1901 | "cell_type": "code", 1902 | "metadata": { 1903 | "id": "SBG3pn3gL8tx" 1904 | }, 1905 | "source": [ 1906 | "df['word_count'] = 0\n", 1907 | "for index, row in df.iterrows():\n", 1908 | " if pd.notna(row['Review Text']):\n", 1909 | " df.at[index, \"word_count\"] = len(row['Review Text'].split())\n", 1910 | "df[\"normalized_score\"] = (df[\"text_score\"] / df[\"word_count\"]) + (2* df[\"title_score\"])" 1911 | ], 1912 | "execution_count": 161, 1913 | "outputs": [] 1914 | }, 1915 | { 1916 | "cell_type": "code", 1917 | "metadata": { 1918 | "id": "djCBJnEINpaq", 1919 | "outputId": "1bc6581e-15d9-44a7-b5a5-7719e344029f", 1920 | "colab": { 1921 | "base_uri": "https://localhost:8080/", 1922 | "height": 170 1923 | } 1924 | }, 1925 | "source": [ 1926 | "df[\"normalized_score\"].describe()" 1927 | ], 1928 | "execution_count": 162, 1929 | "outputs": [ 1930 | { 1931 | "output_type": "execute_result", 1932 | "data": { 1933 | "text/plain": [ 1934 | "count 22641.000000\n", 1935 | "mean 3.552963\n", 1936 | "std 3.909686\n", 1937 | "min -11.865979\n", 1938 | "25% 0.146341\n", 1939 | "50% 4.070175\n", 1940 | "75% 6.183099\n", 1941 | "max 24.164706\n", 1942 | "Name: normalized_score, dtype: float64" 1943 | ] 1944 | }, 1945 | "metadata": { 1946 | "tags": [] 1947 | }, 1948 | "execution_count": 162 1949 | } 1950 | ] 1951 | }, 1952 | { 1953 | "cell_type": "code", 1954 | "metadata": { 1955 | "id": "wQd3067IVOWu" 1956 | }, 1957 | "source": [ 1958 | "def generate_confusion_matrix(df, score_column, threshold):\n", 1959 | " total = df[df[\"Rating\"]!=3].shape[0]\n", 1960 | " tp = df[(df[score_column]>=threshold) & (df[\"Rating\"]>3)].shape[0]\n", 1961 | " fp = df[(df[score_column]>=threshold) & (df[\"Rating\"]<3)].shape[0]\n", 1962 | " tn = df[(df[score_column]<threshold) & (df[\"Rating\"]<3)].shape[0]\n", 1963 | " fn = df[(df[score_column]<threshold) & (df[\"Rating\"]>3)].shape[0]\n", 1964 | " return tp / (tp + 0.5*(fp + fn)) # F1 score for the positive (Rating > 3) class" 1965 | ], 1966 | "execution_count": 213, 1967 | "outputs": [] 1968 | }, 1969 | { 1970 | "cell_type": "code", 1971 | "metadata": { 1972 | "id": "g2-9A8ksSsHp", 1973 | "outputId": "e05859dd-27f8-418b-e55f-715fbd38c60f", 1974 | "colab": { 1975 | "base_uri": "https://localhost:8080/", 1976 | "height": 34 1977 | } 1978 | }, 1979 | "source": [ 1980 | "generate_confusion_matrix(df, \"normalized_score\", -1)" 1981 | ], 1982 | "execution_count": 214, 1983 | "outputs": [ 1984 | { 1985 | "output_type": "execute_result", 1986 | "data": { 1987 | "text/plain": [ 1988 | "0.9426901223776224" 1989 | ] 1990 | }, 1991 | "metadata": { 1992 | "tags": [] 1993 | 
}, 1994 | "execution_count": 214 1995 | } 1996 | ] 1997 | }, 1998 | { 1999 | "cell_type": "code", 2000 | "metadata": { 2001 | "id": "0krSzOa3UO7w", 2002 | "outputId": "818a330a-150b-4f15-a549-34fbf19cc1fc", 2003 | "colab": { 2004 | "base_uri": "https://localhost:8080/", 2005 | "height": 34 2006 | } 2007 | }, 2008 | "source": [ 2009 | "generate_confusion_matrix(df, \"total_score\", 2)" 2010 | ], 2011 | "execution_count": 215, 2012 | "outputs": [ 2013 | { 2014 | "output_type": "execute_result", 2015 | "data": { 2016 | "text/plain": [ 2017 | "0.9424460431654677" 2018 | ] 2019 | }, 2020 | "metadata": { 2021 | "tags": [] 2022 | }, 2023 | "execution_count": 215 2024 | } 2025 | ] 2026 | }, 2027 | { 2028 | "cell_type": "markdown", 2029 | "metadata": { 2030 | "id": "KHZ1ocfvZNt4" 2031 | }, 2032 | "source": [ 2033 | "Feature engineering with an out-of-the-box dictionary gives us some pretty good results!" 2034 | ] 2035 | }, 2036 | { 2037 | "cell_type": "markdown", 2038 | "metadata": { 2039 | "id": "bsUsr140adsJ" 2040 | }, 2041 | "source": [ 2042 | "## Creating your own classifier\n", 2043 | "\n", 2044 | "It's possible that you may want to create a new set of words that relates to your specific use case." 2045 | ] 2046 | }, 2047 | { 2048 | "cell_type": "code", 2049 | "metadata": { 2050 | "id": "pSvlE_LiadTu" 2051 | }, 2052 | "source": [ 2053 | "def get_score(text, custom_set):\n", 2054 | "  # First we lowercase the text, strip punctuation and tokenize\n", 2055 | "  text = text.lower()\n", 2056 | "  punctuation = '\"!#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'\n", 2057 | "  tokenized_text = \"\".join([ch for ch in text if ch not in punctuation]).split()\n", 2058 | "  tokenized_set = set(tokenized_text)\n", 2059 | "  \n", 2060 | "  return len(tokenized_set.intersection(custom_set)) * 2  # each matched word adds 2 points\n" 2061 | ], 2062 | "execution_count": 223, 2063 | "outputs": [] 2064 | }, 2065 | { 2066 | "cell_type": "code", 2067 | "metadata": { 2068 | "id": "iM7Srh8TcjQj" 2069 | }, 2070 | "source": [ 2071 | "custom_set = set([\"flattering\", \"quick\", \"well\", \"right\", \"comfortable\", \"slimming\", \"confident\"])" 2072 | ], 2073 | "execution_count": 242, 2074 | "outputs": [] 2075 | }, 2076 | { 2077 | "cell_type": "code", 2078 | "metadata": { 2079 | "id": "90UYTAm5dVbR", 2080 | "outputId": "3630c9b1-93a7-40b9-8643-d94bb7815f8f", 2081 | "colab": { 2082 | "base_uri": "https://localhost:8080/", 2083 | "height": 71 2084 | } 2085 | }, 2086 | "source": [ 2087 | "df.at[4, \"Review Text\"]" 2088 | ], 2089 | "execution_count": 235, 2090 | "outputs": [ 2091 | { 2092 | "output_type": "execute_result", 2093 | "data": { 2094 | "application/vnd.google.colaboratory.intrinsic+json": { 2095 | "type": "string" 2096 | }, 2097 | "text/plain": [ 2098 | "'This shirt is very flattering to all due to the adjustable front tie. it is the perfect length to wear with leggings and it is sleeveless so it pairs well with any cardigan. 
love this shirt!!!'" 2099 | ] 2100 | }, 2101 | "metadata": { 2102 | "tags": [] 2103 | }, 2104 | "execution_count": 235 2105 | } 2106 | ] 2107 | }, 2108 | { 2109 | "cell_type": "code", 2110 | "metadata": { 2111 | "id": "tcou-woRcpjd", 2112 | "outputId": "2ee15852-9aca-44f5-ca1b-66369f484d4e", 2113 | "colab": { 2114 | "base_uri": "https://localhost:8080/", 2115 | "height": 34 2116 | } 2117 | }, 2118 | "source": [ 2119 | "get_score(df.at[4, \"Review Text\"], custom_set)" 2120 | ], 2121 | "execution_count": 243, 2122 | "outputs": [ 2123 | { 2124 | "output_type": "execute_result", 2125 | "data": { 2126 | "text/plain": [ 2127 | "4" 2128 | ] 2129 | }, 2130 | "metadata": { 2131 | "tags": [] 2132 | }, 2133 | "execution_count": 243 2134 | } 2135 | ] 2136 | }, 2137 | { 2138 | "cell_type": "code", 2139 | "metadata": { 2140 | "id": "fYcjhV85cyYz", 2141 | "outputId": "b18ee760-bed3-4907-bba5-e2e4423b6d4e", 2142 | "colab": { 2143 | "base_uri": "https://localhost:8080/", 2144 | "height": 71 2145 | } 2146 | }, 2147 | "source": [ 2148 | "df.at[23442, \"Review Text\"]" 2149 | ], 2150 | "execution_count": 237, 2151 | "outputs": [ 2152 | { 2153 | "output_type": "execute_result", 2154 | "data": { 2155 | "application/vnd.google.colaboratory.intrinsic+json": { 2156 | "type": "string" 2157 | }, 2158 | "text/plain": [ 2159 | "'Love this dress, very flattering fit and the fabric does not feel heavy but is sturdy - i wore it for first dinner out with my husband after losing most of my baby weight and felt great and confident in it.'" 2160 | ] 2161 | }, 2162 | "metadata": { 2163 | "tags": [] 2164 | }, 2165 | "execution_count": 237 2166 | } 2167 | ] 2168 | }, 2169 | { 2170 | "cell_type": "code", 2171 | "metadata": { 2172 | "id": "1D30axVaczGh", 2173 | "outputId": "f2b2504f-2caa-4de7-9e3d-d99201d186cb", 2174 | "colab": { 2175 | "base_uri": "https://localhost:8080/", 2176 | "height": 34 2177 | } 2178 | }, 2179 | "source": [ 2180 | "get_score(df.at[23442, \"Review Text\"], custom_set)" 2181 | ], 2182 | "execution_count": 244, 2183 | "outputs": [ 2184 | { 2185 | "output_type": "execute_result", 2186 | "data": { 2187 | "text/plain": [ 2188 | "4" 2189 | ] 2190 | }, 2191 | "metadata": { 2192 | "tags": [] 2193 | }, 2194 | "execution_count": 244 2195 | } 2196 | ] 2197 | }, 2198 | { 2199 | "cell_type": "code", 2200 | "metadata": { 2201 | "id": "sOoO5DWodL6a", 2202 | "outputId": "e5961457-3c22-4528-b7a0-4fb4d8521b93", 2203 | "colab": { 2204 | "base_uri": "https://localhost:8080/", 2205 | "height": 34 2206 | } 2207 | }, 2208 | "source": [ 2209 | "get_score(df.at[23438, \"Review Text\"], custom_set)" 2210 | ], 2211 | "execution_count": 245, 2212 | "outputs": [ 2213 | { 2214 | "output_type": "execute_result", 2215 | "data": { 2216 | "text/plain": [ 2217 | "4" 2218 | ] 2219 | }, 2220 | "metadata": { 2221 | "tags": [] 2222 | }, 2223 | "execution_count": 245 2224 | } 2225 | ] 2226 | }, 2227 | { 2228 | "cell_type": "code", 2229 | "metadata": { 2230 | "id": "TQmTY7DLdnuT", 2231 | "outputId": "f536eea3-a4e5-4087-c75b-c1175bb36953", 2232 | "colab": { 2233 | "base_uri": "https://localhost:8080/", 2234 | "height": 88 2235 | } 2236 | }, 2237 | "source": [ 2238 | "df.at[23438, \"Review Text\"]" 2239 | ], 2240 | "execution_count": 241, 2241 | "outputs": [ 2242 | { 2243 | "output_type": "execute_result", 2244 | "data": { 2245 | "application/vnd.google.colaboratory.intrinsic+json": { 2246 | "type": "string" 2247 | }, 2248 | "text/plain": [ 2249 | "\"I feel like snagging a pair of these was the equivalent to standing in line for black friday, as 
they always seem to be out of stock. now i know why. these are soft, comfortable, and slimming. they're somewhere between the hold of control top pantyhose and spans--they don't fall and sag throughout the day and are nicely slimming without being pain-inducing.\"" 2250 | ] 2251 | }, 2252 | "metadata": { 2253 | "tags": [] 2254 | }, 2255 | "execution_count": 241 2256 | } 2257 | ] 2258 | } 2259 | ] 2260 | } -------------------------------------------------------------------------------- /ml-projects/Using_Embeddings_and_NLP_For_Machine_Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Using Embeddings and NLP For Machine Learning.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "authorship_tag": "ABX9TyMdnG2F37FVgXskY8ZIm00B", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "xE90bjnUlEYa" 32 | }, 33 | "source": [ 34 | "# Data processing" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "metadata": { 40 | "id": "EKtOpxlZq1Xe" 41 | }, 42 | "source": [ 43 | "import spacy\n", 44 | "import numpy as np\n", 45 | "import pandas as pd\n", 46 | "from collections import Counter\n", 47 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", 48 | "from sklearn.decomposition import NMF, LatentDirichletAllocation\n", 49 | "from sklearn.metrics.pairwise import cosine_similarity\n", 50 | "from sklearn.preprocessing import normalize\n", 51 | "from sklearn.metrics import accuracy_score\n", 52 | "from sklearn.svm import LinearSVC\n", 53 | "import pickle\n", 54 | "#!python -m spacy download en_core_web_md en\n", 55 | "import en_core_web_md\n", 56 | "nlp = en_core_web_md.load()" 57 | ], 58 | "execution_count": 21, 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "metadata": { 64 | "id": "AwyD_8OWrOYM" 65 | }, 66 | "source": [ 67 | "def basic_tokenizer(doc, model=nlp):\n", 68 | " \n", 69 | " parsed_doc = model(doc)\n", 70 | "\n", 71 | " # Tokens are those that are comprised of alphabetic characters and not urls and not stop words \n", 72 | " return [t.lemma_ for t in parsed_doc if (t.is_alpha)&(not t.like_url)&(not t.is_stop)]" 73 | ], 74 | "execution_count": 23, 75 | "outputs": [] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "metadata": { 80 | "colab": { 81 | "base_uri": "https://localhost:8080/" 82 | }, 83 | "id": "zaRN4jxStIuY", 84 | "outputId": "e901ae07-0bee-494b-a113-2a043b22551b" 85 | }, 86 | "source": [ 87 | "# Here we use scikit learn's count vectorizer with our tokenizer\n", 88 | "cv = CountVectorizer(tokenizer=basic_tokenizer)\n", 89 | "\n", 90 | "# Our mini corpus\n", 91 | "text_data = [\"A friend gave me these when I was recently diagnosed with breast cancer. I bought another pair because they are the best pjs I’ve ever owned. As soon as I came out of surgery, I asked the nurses to help me change into them. This is me wearing them on my first walk down the hospital hall. Now I only take them off to wash them and then put them immediately back on. They’ve survived 100 washes and still look new. 
They’re the best gift I’ve received during my breast cancer treatment\",\n", 92 | " \"While as others claimed these do run a little on the longer side, at 5'7'' and a size small, I found the length luxurious rather than sloppy. Other than the length, the fit is pretty true to size. They wash and wear well, don't get stretched out and have an extra button at the neck for when you need a little extra warmth. The elastic waist band is thick and just generous enough not to be tight, but just in case you're tiny waisted, there is a drawstring as well. These have taken over as my new favorite!\",\n", 93 | " \"I bought these pjs in navy and white stripes a few years ago and have been in love with them forever, so soft and dreamy just like the name. I decided to finally splurge on a second pair recently and it seems like the quality has gotten cheaper. The seams are EXTREMELY itchy. There’s a weird plastic piece that runs along all the seams and is constantly scratching my skin....not the best feeling in bed. Please get rid of the weird plastic seam!!\",\n", 94 | " \"These are great soft pajamas, except the size small pants I received have a 33' inseam, which is obviously crazy long. I'm not sure if the pair I received is flawed, given the description says they have a 27' inseam\"]\n", 95 | "\n", 96 | "v = cv.fit_transform(text_data).toarray()\n", 97 | "print(v)\n" 98 | ], 99 | "execution_count": 26, 100 | "outputs": [ 101 | { 102 | "output_type": "stream", 103 | "text": [ 104 | "[[0 1 0 0 2 0 1 2 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 2\n", 105 | " 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0\n", 106 | " 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 2 1 0 0 0]\n", 107 | " [0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 2 0 1 0 0 1 1 0 0 0 1 0 0 0 0\n", 108 | " 0 0 0 0 0 0 0 2 0 2 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0\n", 109 | " 0 2 0 1 1 0 0 0 1 0 0 0 0 1 1 1 1 0 1 1 1 0 1 1 1 0 0 0]\n", 110 | " [1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 0 1 0 1 1 0 0 0 1 0 0 1 0 0 1\n", 111 | " 0 0 0 0 0 0 1 0 2 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 1 2 0 1 0 1 1 1 0 1 3\n", 112 | " 1 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 1]\n", 113 | " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0\n", 114 | " 1 0 0 0 0 2 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 2 0 0 0 1 0 0\n", 115 | " 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]\n" 116 | ], 117 | "name": "stdout" 118 | } 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "colab": { 125 | "base_uri": "https://localhost:8080/" 126 | }, 127 | "id": "gbv3QspOw0fZ", 128 | "outputId": "354f867a-7d91-4027-fc41-1607d4cd866c" 129 | }, 130 | "source": [ 131 | "# Make these arrays human readable with feature names \n", 132 | "dict(zip(cv.get_feature_names(), v.sum(axis=0)))" 133 | ], 134 | "execution_count": 27, 135 | "outputs": [ 136 | { 137 | "output_type": "execute_result", 138 | "data": { 139 | "text/plain": [ 140 | "{'ago': 1,\n", 141 | " 'ask': 1,\n", 142 | " 'band': 1,\n", 143 | " 'bed': 1,\n", 144 | " 'breast': 2,\n", 145 | " 'button': 1,\n", 146 | " 'buy': 2,\n", 147 | " 'cancer': 2,\n", 148 | " 'case': 1,\n", 149 | " 'change': 1,\n", 150 | " 'cheap': 1,\n", 151 | " 'claim': 1,\n", 152 | " 'come': 1,\n", 153 | " 'constantly': 1,\n", 154 | " 'crazy': 1,\n", 155 | " 'decide': 1,\n", 156 | " 'description': 1,\n", 157 | " 'diagnose': 1,\n", 158 | " 'drawstring': 1,\n", 159 | " 'dreamy': 1,\n", 160 | " 'elastic': 1,\n", 161 | " 'extra': 2,\n", 162 | " 
'extremely': 1,\n", 163 | " 'favorite': 1,\n", 164 | " 'feeling': 1,\n", 165 | " 'finally': 1,\n", 166 | " 'find': 1,\n", 167 | " 'fit': 1,\n", 168 | " 'flawed': 1,\n", 169 | " 'forever': 1,\n", 170 | " 'friend': 1,\n", 171 | " 'generous': 1,\n", 172 | " 'get': 1,\n", 173 | " 'gift': 1,\n", 174 | " 'give': 2,\n", 175 | " 'good': 3,\n", 176 | " 'great': 1,\n", 177 | " 'hall': 1,\n", 178 | " 'help': 1,\n", 179 | " 'hospital': 1,\n", 180 | " 'immediately': 1,\n", 181 | " 'inseam': 2,\n", 182 | " 'itchy': 1,\n", 183 | " 'length': 2,\n", 184 | " 'like': 2,\n", 185 | " 'little': 2,\n", 186 | " 'long': 2,\n", 187 | " 'look': 1,\n", 188 | " 'love': 1,\n", 189 | " 'luxurious': 1,\n", 190 | " 'navy': 1,\n", 191 | " 'neck': 1,\n", 192 | " 'need': 1,\n", 193 | " 'new': 2,\n", 194 | " 'nurse': 1,\n", 195 | " 'obviously': 1,\n", 196 | " 'own': 1,\n", 197 | " 'pair': 3,\n", 198 | " 'pajama': 1,\n", 199 | " 'pant': 1,\n", 200 | " 'piece': 1,\n", 201 | " 'pjs': 2,\n", 202 | " 'plastic': 2,\n", 203 | " 'pretty': 1,\n", 204 | " 'quality': 1,\n", 205 | " 'receive': 3,\n", 206 | " 'recently': 2,\n", 207 | " 'rid': 1,\n", 208 | " 'run': 2,\n", 209 | " 'say': 1,\n", 210 | " 'scratch': 1,\n", 211 | " 'seam': 3,\n", 212 | " 'second': 1,\n", 213 | " 'size': 3,\n", 214 | " 'skin': 1,\n", 215 | " 'sloppy': 1,\n", 216 | " 'small': 2,\n", 217 | " 'soft': 2,\n", 218 | " 'soon': 1,\n", 219 | " 'splurge': 1,\n", 220 | " 'stretch': 1,\n", 221 | " 'stripes': 1,\n", 222 | " 'sure': 1,\n", 223 | " 'surgery': 1,\n", 224 | " 'survive': 1,\n", 225 | " 'take': 1,\n", 226 | " 'thick': 1,\n", 227 | " 'tight': 1,\n", 228 | " 'tiny': 1,\n", 229 | " 'treatment': 1,\n", 230 | " 'true': 1,\n", 231 | " 'waist': 1,\n", 232 | " 'waisted': 1,\n", 233 | " 'walk': 1,\n", 234 | " 'warmth': 1,\n", 235 | " 'wash': 3,\n", 236 | " 'wear': 2,\n", 237 | " 'weird': 2,\n", 238 | " 'white': 1,\n", 239 | " 'year': 1}" 240 | ] 241 | }, 242 | "metadata": { 243 | "tags": [] 244 | }, 245 | "execution_count": 27 246 | } 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": { 252 | "id": "pR6CnjHgT36E" 253 | }, 254 | "source": [ 255 | "## Dataframe Cleanup" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "metadata": { 261 | "colab": { 262 | "base_uri": "https://localhost:8080/" 263 | }, 264 | "id": "yjVaQwTIxlN6", 265 | "outputId": "e8d3b9d2-1f42-46c6-ce6f-3fa09a31e588" 266 | }, 267 | "source": [ 268 | "# Now let's apply to the review dataset\n", 269 | "DATASET_LINK = \"https://raw.githubusercontent.com/AFAgarap/ecommerce-reviews-analysis/master/Womens%20Clothing%20E-Commerce%20Reviews.csv\"\n", 270 | "df = pd.read_csv(DATASET_LINK, usecols=[\"Clothing ID\", \"Title\", \"Review Text\", \"Rating\"])\n", 271 | "df.shape" 272 | ], 273 | "execution_count": 92, 274 | "outputs": [ 275 | { 276 | "output_type": "execute_result", 277 | "data": { 278 | "text/plain": [ 279 | "(23486, 4)" 280 | ] 281 | }, 282 | "metadata": { 283 | "tags": [] 284 | }, 285 | "execution_count": 92 286 | } 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "metadata": { 292 | "colab": { 293 | "base_uri": "https://localhost:8080/", 294 | "height": 514 295 | }, 296 | "id": "0dfP2miB84lz", 297 | "outputId": "ef1ab0c2-22bb-4a05-9362-e121c1aee43a" 298 | }, 299 | "source": [ 300 | "df.sample(15)" 301 | ], 302 | "execution_count": 91, 303 | "outputs": [ 304 | { 305 | "output_type": "execute_result", 306 | "data": { 307 | "text/html": [ 308 | "
\n", 309 | "\n", 322 | "\n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | "
Clothing IDTitleReview TextRatingReview Tokens
41631078Great dress!This dress is a great casual outfit. tha fabri...4None
2743984Fun jacketLove the fabric and the casual way this jacket...5None
8799907Cute but color different than picturedI purchased this sweater in the grey color and...4None
173691048Flattering and simply perfectI tried these on in the store and instantly fe...5None
12195875Perfect fall pieceI saw this and had to have it! it is so beaut...4None
154661068Pants oversizedI wanted to love these pants since i hate supe...3None
112431022Pleasantly surprisedI ordered these jsut coz they were on sale... ...5None
11927868Cozy elegance!First off, this top is super cozy! i adore an ...5None
103621067BeautifulI love this jumpsuit. i've worn it twice and r...5None
7782936Love it!This sweater is super cozy and comfy and my ne...5None
114121079Unique and lovelyThis dress is unique and lovely! it is a one o...5None
90071059Super comfortable, stylish jumpsuitThis is my first ever jumpsuit purchase, and i...5None
16041868My new favorite topThis top is wonderfully comfortable and well m...5None
41221081Beautiful dress!This dress is so pretty, comfortable and easy ...5None
72431020Versatile skirtWell made. color is great, goes from summer ri...5None
\n", 456 | "
" 457 | ], 458 | "text/plain": [ 459 | " Clothing ID ... Review Tokens\n", 460 | "4163 1078 ... None\n", 461 | "2743 984 ... None\n", 462 | "8799 907 ... None\n", 463 | "17369 1048 ... None\n", 464 | "12195 875 ... None\n", 465 | "15466 1068 ... None\n", 466 | "11243 1022 ... None\n", 467 | "11927 868 ... None\n", 468 | "10362 1067 ... None\n", 469 | "7782 936 ... None\n", 470 | "11412 1079 ... None\n", 471 | "9007 1059 ... None\n", 472 | "16041 868 ... None\n", 473 | "4122 1081 ... None\n", 474 | "7243 1020 ... None\n", 475 | "\n", 476 | "[15 rows x 5 columns]" 477 | ] 478 | }, 479 | "metadata": { 480 | "tags": [] 481 | }, 482 | "execution_count": 91 483 | } 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "metadata": { 489 | "id": "wRSetiA_92rz" 490 | }, 491 | "source": [ 492 | "# Lets remove any review without a text review or a rating\n", 493 | "df.dropna(how = \"any\", subset=['Review Text', 'Rating'], inplace=True)\n", 494 | "df.reset_index(drop=True, inplace=True)\n", 495 | "df.shape\n", 496 | "# Remove this sampling to ensure broader reach\n", 497 | "df = df.sample(2000)" 498 | ], 499 | "execution_count": 93, 500 | "outputs": [] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "metadata": { 505 | "colab": { 506 | "base_uri": "https://localhost:8080/", 507 | "height": 514 508 | }, 509 | "id": "YQb_oGMrAx8O", 510 | "outputId": "18b9b81b-9522-427b-d864-5e8ba750b8f7" 511 | }, 512 | "source": [ 513 | "df.sample(15)" 514 | ], 515 | "execution_count": 94, 516 | "outputs": [ 517 | { 518 | "output_type": "execute_result", 519 | "data": { 520 | "text/html": [ 521 | "
\n", 522 | "\n", 535 | "\n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | "
Clothing IDTitleReview TextRating
129171072NaNThis dress is one of my recent faves from reta...5
99821056Very nice and versatile pantsI'm typically a 27. this 27 seems snug but the...4
15160865Stylish & comfortableI ordered the green top in store after trying ...4
6511087Simple but different.I bought this dress in the cream color. it was...4
18641867Cute but noThis shirt ran small and was more sheer than i...2
361002NaNThis is a comfortable skirt that can span seas...4
21470451Just don't wash itI loved this dress...until i washed it. the la...3
4669940Like wearing a hugGreat sweater, beautiful detail, warm and cozy...5
10890829Soft and swingingThe color and fabric are really soft and lovel...4
125481081Perfect fit, forgiving bellyThis dress is so soft, and fits like a dream. ...5
2963895NaNI passed this over on first sight because it l...4
19909949So cute and stylish!!Was in my local retailer today and just had to...5
103671037Great lookThese pants fit great. the velvet material is ...5
17897809White tee with a \"detail\"on backI ordered a small. i'm usually a petite small....5
9848927Really lovely sweater coatI love the patterns and the colors! easy to dr...5
\n", 653 | "
" 654 | ], 655 | "text/plain": [ 656 | " Clothing ID ... Rating\n", 657 | "12917 1072 ... 5\n", 658 | "9982 1056 ... 4\n", 659 | "15160 865 ... 4\n", 660 | "651 1087 ... 4\n", 661 | "18641 867 ... 2\n", 662 | "36 1002 ... 4\n", 663 | "21470 451 ... 3\n", 664 | "4669 940 ... 5\n", 665 | "10890 829 ... 4\n", 666 | "12548 1081 ... 5\n", 667 | "2963 895 ... 4\n", 668 | "19909 949 ... 5\n", 669 | "10367 1037 ... 5\n", 670 | "17897 809 ... 5\n", 671 | "9848 927 ... 5\n", 672 | "\n", 673 | "[15 rows x 4 columns]" 674 | ] 675 | }, 676 | "metadata": { 677 | "tags": [] 678 | }, 679 | "execution_count": 94 680 | } 681 | ] 682 | }, 683 | { 684 | "cell_type": "markdown", 685 | "metadata": { 686 | "id": "vVSK-zDVT7Uo" 687 | }, 688 | "source": [ 689 | "## From Tokens to Vectors" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "metadata": { 695 | "id": "rTZVEGzmDRY2" 696 | }, 697 | "source": [ 698 | "count = CountVectorizer(tokenizer=basic_tokenizer)\n", 699 | "count_vecs = count.fit_transform(df['Review Text'])\n", 700 | "count_df = pd.DataFrame(count_vecs.toarray(), columns=count.get_feature_names())" 701 | ], 702 | "execution_count": 101, 703 | "outputs": [] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "metadata": { 708 | "colab": { 709 | "base_uri": "https://localhost:8080/", 710 | "height": 253 711 | }, 712 | "id": "YPxlRO5IHg14", 713 | "outputId": "0fef3f5a-0edf-4224-afcb-b7889f9ef974" 714 | }, 715 | "source": [ 716 | "count_df.head()" 717 | ], 718 | "execution_count": 104, 719 | "outputs": [ 720 | { 721 | "output_type": "execute_result", 722 | "data": { 723 | "text/html": [ 724 | "
\n", 725 | "\n", 738 | "\n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | 
" \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | "
ababbyabckabdomenableabsoluteabsolutelyabstractabtabundanceacaccentaccentedaccentuateaccentuateaaccentuatedacceptacceptableacceptablyaccessaccessorizeaccessorizingaccessoryaccidentallyaccoaccommodateaccomodateaccompanyaccomplishaccordaccumulateaccuracyaccurateaccuratelyachieveacknowledgeacrylicactactualactuallly...wrinklewrinkledwrinklingwrinklywristwritewrongxxlxsxsmallxspxspetitexxsxxspyyankyarnyayyearyellowyellowedyellowyyesyesterdayyikesyoyogayogiyolkyoungyryuckyummyzerozipziploczippedzipperzoom
00000000000000000000000000000000000000000...0000000000000000000000000000000000000000
10000000000000000000000000000000000000000...0000000001000000000000000000000000000000
20000000000000000000000000000000000000000...0000000000000000000100000000000000000000
30000000000000000000000000000000000000000...0000000000000000000000000000000000000000
40000000000000000000000000000000000000000...0000000000000000000000000000000000000000
\n", 1248 | "

5 rows × 3613 columns

\n", 1249 | "
" 1250 | ], 1251 | "text/plain": [ 1252 | " ab abby abck abdomen able ... zip ziploc zipped zipper zoom\n", 1253 | "0 0 0 0 0 0 ... 0 0 0 0 0\n", 1254 | "1 0 0 0 0 0 ... 0 0 0 0 0\n", 1255 | "2 0 0 0 0 0 ... 0 0 0 0 0\n", 1256 | "3 0 0 0 0 0 ... 0 0 0 0 0\n", 1257 | "4 0 0 0 0 0 ... 0 0 0 0 0\n", 1258 | "\n", 1259 | "[5 rows x 3613 columns]" 1260 | ] 1261 | }, 1262 | "metadata": { 1263 | "tags": [] 1264 | }, 1265 | "execution_count": 104 1266 | } 1267 | ] 1268 | }, 1269 | { 1270 | "cell_type": "code", 1271 | "metadata": { 1272 | "id": "6llw13j8HP2i" 1273 | }, 1274 | "source": [ 1275 | "tfidf = TfidfVectorizer(tokenizer=basic_tokenizer)\n", 1276 | "tfidf_vecs = tfidf.fit_transform(df['Review Text'])\n", 1277 | "tfidf_df = pd.DataFrame(tfidf_vecs.toarray(), columns=tfidf.get_feature_names())" 1278 | ], 1279 | "execution_count": 102, 1280 | "outputs": [] 1281 | }, 1282 | { 1283 | "cell_type": "code", 1284 | "metadata": { 1285 | "colab": { 1286 | "base_uri": "https://localhost:8080/", 1287 | "height": 253 1288 | }, 1289 | "id": "s5AUjPITHZj1", 1290 | "outputId": "5fbaeade-3d52-4612-aa47-c24ddcdb9b3c" 1291 | }, 1292 | "source": [ 1293 | "tfidf_df.head()" 1294 | ], 1295 | "execution_count": 103, 1296 | "outputs": [ 1297 | { 1298 | "output_type": "execute_result", 1299 | "data": { 1300 | "text/html": [ 1301 | "
\n", 1302 | "\n", 1315 | "\n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " 
\n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | " \n", 1740 | " \n", 1741 | " \n", 1742 | " \n", 1743 | " \n", 1744 | " \n", 1745 | " \n", 1746 | " \n", 1747 | " \n", 1748 | " \n", 1749 | " \n", 1750 | " \n", 1751 | " \n", 1752 | " \n", 1753 | " \n", 1754 | " \n", 1755 | " \n", 1756 | " \n", 1757 | " \n", 1758 | " \n", 1759 | " \n", 1760 | " \n", 1761 | " \n", 1762 | " \n", 1763 | " \n", 1764 | " \n", 1765 | " \n", 1766 | " \n", 1767 | " \n", 1768 | " \n", 1769 | " \n", 1770 | " \n", 1771 | " \n", 1772 | " \n", 1773 | " \n", 1774 | " \n", 1775 | " \n", 1776 | " \n", 1777 | " \n", 1778 | " \n", 1779 | " \n", 1780 | " \n", 1781 | " \n", 1782 | " \n", 1783 | " \n", 1784 | " \n", 1785 | " \n", 1786 | " \n", 1787 | " \n", 1788 | " \n", 1789 | " \n", 1790 | " \n", 1791 | " \n", 1792 | " \n", 1793 | " \n", 1794 | " \n", 1795 | " \n", 1796 | " \n", 1797 | " \n", 1798 | " \n", 1799 | " \n", 1800 | " \n", 1801 | " \n", 1802 | " \n", 1803 | " \n", 1804 | " \n", 1805 | " \n", 1806 | " \n", 1807 | " \n", 1808 | " \n", 1809 | " \n", 1810 | " \n", 1811 | " \n", 1812 | " \n", 1813 | " \n", 1814 | " \n", 1815 | " \n", 1816 | " \n", 1817 | " \n", 1818 | " \n", 1819 | " \n", 1820 | " \n", 1821 | 
" \n", 1822 | " \n", 1823 | " \n", 1824 | "
ababbyabckabdomenableabsoluteabsolutelyabstractabtabundanceacaccentaccentedaccentuateaccentuateaaccentuatedacceptacceptableacceptablyaccessaccessorizeaccessorizingaccessoryaccidentallyaccoaccommodateaccomodateaccompanyaccomplishaccordaccumulateaccuracyaccurateaccuratelyachieveacknowledgeacrylicactactualactuallly...wrinklewrinkledwrinklingwrinklywristwritewrongxxlxsxsmallxspxspetitexxsxxspyyankyarnyayyearyellowyellowedyellowyyesyesterdayyikesyoyogayogiyolkyoungyryuckyummyzerozipziploczippedzipperzoom
00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.1134780.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.2219890.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
40.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.0000000.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
\n", 1825 | "

5 rows × 3613 columns

\n", 1826 | "
" 1827 | ], 1828 | "text/plain": [ 1829 | " ab abby abck abdomen able ... zip ziploc zipped zipper zoom\n", 1830 | "0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0\n", 1831 | "1 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0\n", 1832 | "2 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0\n", 1833 | "3 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0\n", 1834 | "4 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0\n", 1835 | "\n", 1836 | "[5 rows x 3613 columns]" 1837 | ] 1838 | }, 1839 | "metadata": { 1840 | "tags": [] 1841 | }, 1842 | "execution_count": 103 1843 | } 1844 | ] 1845 | }, 1846 | { 1847 | "cell_type": "code", 1848 | "metadata": { 1849 | "colab": { 1850 | "base_uri": "https://localhost:8080/" 1851 | }, 1852 | "id": "CnGpoj23NCJo", 1853 | "outputId": "56b6b559-922d-4f3f-bd01-e2ebad69a67e" 1854 | }, 1855 | "source": [ 1856 | "def top_tfidf_words(tfidf_df):\n", 1857 | " return(tfidf_df[tfidf_df>0.3].mean(axis=0))\n", 1858 | "top_tfidf_words(tfidf_df)" 1859 | ], 1860 | "execution_count": 136, 1861 | "outputs": [ 1862 | { 1863 | "output_type": "execute_result", 1864 | "data": { 1865 | "text/plain": [ 1866 | "ab 0.327036\n", 1867 | "abby 0.322623\n", 1868 | "abck NaN\n", 1869 | "abdomen NaN\n", 1870 | "able 0.301735\n", 1871 | " ... \n", 1872 | "zip 0.376426\n", 1873 | "ziploc NaN\n", 1874 | "zipped 0.319633\n", 1875 | "zipper 0.376748\n", 1876 | "zoom NaN\n", 1877 | "Length: 3613, dtype: float64" 1878 | ] 1879 | }, 1880 | "metadata": { 1881 | "tags": [] 1882 | }, 1883 | "execution_count": 136 1884 | } 1885 | ] 1886 | }, 1887 | { 1888 | "cell_type": "code", 1889 | "metadata": { 1890 | "colab": { 1891 | "base_uri": "https://localhost:8080/" 1892 | }, 1893 | "id": "Lqc_16oENTEE", 1894 | "outputId": "317fc208-d3a0-4c24-bdb4-01bbbb609eeb" 1895 | }, 1896 | "source": [ 1897 | "# Get similarities\n", 1898 | "count_sims = cosine_similarity(count_vecs)\n", 1899 | "tfidf_sims = cosine_similarity(tfidf_vecs)\n", 1900 | "\n", 1901 | "count_sims" 1902 | ], 1903 | "execution_count": 135, 1904 | "outputs": [ 1905 | { 1906 | "output_type": "execute_result", 1907 | "data": { 1908 | "text/plain": [ 1909 | "array([[1. , 0.04351941, 0.07106691, ..., 0.04065578, 0. ,\n", 1910 | " 0.07312724],\n", 1911 | " [0.04351941, 1. , 0.20412415, ..., 0.21408721, 0.16556654,\n", 1912 | " 0.14002801],\n", 1913 | " [0.07106691, 0.20412415, 1. , ..., 0.2224746 , 0.10814761,\n", 1914 | " 0.17149859],\n", 1915 | " ...,\n", 1916 | " [0.04065578, 0.21408721, 0.2224746 , ..., 1. , 0.12373764,\n", 1917 | " 0.29433147],\n", 1918 | " [0. , 0.16556654, 0.10814761, ..., 0.12373764, 1. ,\n", 1919 | " 0.05564149],\n", 1920 | " [0.07312724, 0.14002801, 0.17149859, ..., 0.29433147, 0.05564149,\n", 1921 | " 1. ]])" 1922 | ] 1923 | }, 1924 | "metadata": { 1925 | "tags": [] 1926 | }, 1927 | "execution_count": 135 1928 | } 1929 | ] 1930 | }, 1931 | { 1932 | "cell_type": "markdown", 1933 | "metadata": { 1934 | "id": "PZ9BckdlOhDH" 1935 | }, 1936 | "source": [ 1937 | "### Exercise Time!\n", 1938 | "\n", 1939 | "How do we use these arrays of similarities to identify documents that are similar to the first review?" 
1940 | ] 1941 | }, 1942 | { 1943 | "cell_type": "markdown", 1944 | "metadata": { 1945 | "id": "snP8PfzmUC1n" 1946 | }, 1947 | "source": [ 1948 | "## From Review to Document Vectors" 1949 | ] 1950 | }, 1951 | { 1952 | "cell_type": "code", 1953 | "metadata": { 1954 | "id": "hDSUBz-dHp5H" 1955 | }, 1956 | "source": [ 1957 | "# Use spaCy's document vectors (the values below are overwritten on every pass; storing them is the exercise that follows)\n", 1958 | "toy_df = df.head(10)\n", 1959 | "\n", 1960 | "for index, row in toy_df.iterrows():\n", 1961 | "  rating = row[\"Rating\"]\n", 1962 | "  doc = nlp(row[\"Review Text\"])\n", 1963 | "\n", 1964 | "  # A 1D numpy array representing the document’s semantics.\n", 1965 | "  doc_vector = doc.vector\n", 1966 | "  # The L2 norm of the vector representation.\n", 1967 | "  doc_vector_norm = doc.vector_norm\n" 1968 | ], 1969 | "execution_count": 137, 1970 | "outputs": [] 1971 | }, 1972 | { 1973 | "cell_type": "markdown", 1974 | "metadata": { 1975 | "id": "ZaW9u0iDTIsW" 1976 | }, 1977 | "source": [ 1978 | "### Exercise Time!\n", 1979 | "Generate a 2D array / dataframe with the 300d vectors + Rating (so 301 columns, with 10 rows for the toy dataframe)" 1980 | ] 1981 | }, 1982 | { 1983 | "cell_type": "markdown", 1984 | "metadata": { 1985 | "id": "Dcd7rO83UTYF" 1986 | }, 1987 | "source": [ 1988 | "## Future Work\n", 1989 | "Add document vectors as features for the models below" 1990 | ] 1991 | }, 1992 | { 1993 | "cell_type": "code", 1994 | "metadata": { 1995 | "id": "9uMF14ISTHw_" 1996 | }, 1997 | "source": [ 1998 | "" 1999 | ], 2000 | "execution_count": null, 2001 | "outputs": [] 2002 | }, 2003 | { 2004 | "cell_type": "markdown", 2005 | "metadata": { 2006 | "id": "F8PalDAhj7PD" 2007 | }, 2008 | "source": [ 2009 | "## Decide which machine learning algorithm to use\n" 2010 | ] 2011 | }, 2012 | { 2013 | "cell_type": "markdown", 2014 | "metadata": { 2015 | "id": "ucGki7Qdk0Ns" 2016 | }, 2017 | "source": [ 2018 | "![](https://scikit-learn.org/stable/_static/ml_map.png)\n", 2019 | "[Reference](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html)\n" 2020 | ] 2021 | }, 2022 | { 2023 | "cell_type": "code", 2024 | "metadata": { 2025 | "id": "N-1uGSBYUpaE" 2026 | }, 2027 | "source": [ 2028 | "class MeanEmbeddingVectorizer(object):\n", 2029 | "    def __init__(self, word2vec):\n", 2030 | "        self.word2vec = word2vec\n", 2031 | "        # if a text is empty we should return a vector of zeros\n", 2032 | "        # with the same dimensionality as all the other vectors\n", 2033 | "        self.dim = len(next(iter(word2vec.values())))\n", 2034 | "\n", 2035 | "    def fit(self, X, y):\n", 2036 | "        return self\n", 2037 | "\n", 2038 | "    def transform(self, X):\n", 2039 | "        return np.array([\n", 2040 | "            np.mean([self.word2vec[w] for w in words if w in self.word2vec]\n", 2041 | "                    or [np.zeros(self.dim)], axis=0)\n", 2042 | "            for words in X\n", 2043 | "        ])\n", 2044 | "\n", 2045 | "class TfidfEmbeddingVectorizer(object):\n", 2046 | "    def __init__(self, word2vec):\n", 2047 | "        self.word2vec = word2vec\n", 2048 | "        self.word2weight = None\n", 2049 | "        self.dim = len(next(iter(word2vec.values())))\n", 2050 | "\n", 2051 | "    def fit(self, X, y):\n", 2052 | "        tfidf = TfidfVectorizer(analyzer=lambda x: x)\n", 2053 | "        tfidf.fit(X)\n", 2054 | "        # if a word was never seen - it must be at least as infrequent\n", 2055 | "        # as any of the known words - so the default idf is the max of\n", 2056 | "        # known idf's\n", 2057 | "        max_idf = max(tfidf.idf_)\n", 2058 | "        self.word2weight = defaultdict(\n", 2059 | "            lambda: max_idf,\n", 2060 | "            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])\n", 2061 | "\n", 2062 | "        return self\n", 2063 | 
"\n", 2064 | " def transform(self, X):\n", 2065 | " return np.array([\n", 2066 | " np.mean([self.word2vec[w] * self.word2weight[w]\n", 2067 | " for w in words if w in self.word2vec] or\n", 2068 | " [np.zeros(self.dim)], axis=0)\n", 2069 | " for words in X\n", 2070 | " ])" 2071 | ], 2072 | "execution_count": null, 2073 | "outputs": [] 2074 | }, 2075 | { 2076 | "cell_type": "code", 2077 | "metadata": { 2078 | "id": "rCVRRZAcVAru" 2079 | }, 2080 | "source": [ 2081 | "from sklearn.pipeline import Pipeline\n", 2082 | "from sklearn.ensemble import ExtraTreesClassifier\n", 2083 | "\n", 2084 | "etree_w2v = Pipeline([\n", 2085 | " (\"word2vec vectorizer\", MeanEmbeddingVectorizer(w2v)),\n", 2086 | " (\"extra trees\", ExtraTreesClassifier(n_estimators=200))])\n", 2087 | "etree_w2v_tfidf = Pipeline([\n", 2088 | " (\"word2vec vectorizer\", TfidfEmbeddingVectorizer(w2v)),\n", 2089 | " (\"extra trees\", ExtraTreesClassifier(n_estimators=200))])" 2090 | ], 2091 | "execution_count": null, 2092 | "outputs": [] 2093 | }, 2094 | { 2095 | "cell_type": "markdown", 2096 | "metadata": { 2097 | "id": "N2Jxpgu0Up-W" 2098 | }, 2099 | "source": [ 2100 | "## Doc2Vec\n", 2101 | "This is the gensim implementation of doc2vec. " 2102 | ] 2103 | }, 2104 | { 2105 | "cell_type": "code", 2106 | "metadata": { 2107 | "id": "_kf-4RIMUsmS" 2108 | }, 2109 | "source": [ 2110 | "# Init the Doc2Vec model\n", 2111 | "doc2vec_model = gensim.models.doc2vec.Doc2Vec(vector_size=20, min_count=4, epochs=20)\n", 2112 | "\n", 2113 | "# Build the Volabulary\n", 2114 | "doc2vec_.build_vocab(train_data)\n", 2115 | "\n", 2116 | "# Train the Doc2Vec model\n", 2117 | "doc2vec_.train(train_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)" 2118 | ], 2119 | "execution_count": null, 2120 | "outputs": [] 2121 | } 2122 | ] 2123 | } --------------------------------------------------------------------------------