├── Part-08 Web Deployments ├── .gitignore ├── requirements.txt ├── templates │ └── index.html ├── model_predict.py ├── README.md ├── model_train.py ├── utils.py └── api.py ├── tokenization.png ├── requirements.txt ├── _config.yml ├── LICENSE ├── .github └── workflows │ └── deploy-book.yml ├── .gitignore ├── README.md ├── environment.yml ├── 09_Basic_IE.ipynb ├── 10_Coherence_Check.ipynb ├── tokenization.svg ├── 07_Build_Chatbot_in_30minutes.ipynb ├── 02-B.ipynb ├── 02-A.ipynb ├── 04_Text_Representations.ipynb └── 06_DL_for_NLP.ipynb /Part-08 Web Deployments/.gitignore: -------------------------------------------------------------------------------- 1 | extra_32x32.mat 2 | model.pkl -------------------------------------------------------------------------------- /tokenization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NirantK/NLP_Quickbook/HEAD/tokenization.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter-book==0.10.2 2 | jupyter 3 | black 4 | isort 5 | pandas 6 | spacy 7 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | # in _config.yml 2 | title: "Quick NLP Projects" 3 | # logo: images/logo.png 4 | execute: 5 | execute_notebooks: "off" 6 | -------------------------------------------------------------------------------- /Part-08 Web Deployments/requirements.txt: -------------------------------------------------------------------------------- 1 | flask==1.0 2 | Jinja2==2.11.3 3 | numpy==1.13.1 4 | scikit-learn==0.18.1 5 | scipy==0.18.1 6 | virtualenv==15.1.0 7 | 8 | -------------------------------------------------------------------------------- /Part-08 Web Deployments/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Text Classification model as a Flask API 5 | 6 | 7 | 8 | 9 | 10 | 11 |

Movie Sentiment Analysis

12 | 13 |
14 | 15 | 16 |

Prediction: {% if label %} {{ label }} {% endif %}

17 |
18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /Part-08 Web Deployments/model_predict.py: -------------------------------------------------------------------------------- 1 | from sklearn.externals import joblib 2 | 3 | 4 | def read_predict(): 5 | model = joblib.load("model.pkl") 6 | # print(model) 7 | 8 | with open(r"1_3.txt", "r") as infile: 9 | test_contents = infile.read() 10 | 11 | with open(r".\\data\\aclImdb\\train\neg\\1_1.txt", "r") as infile: 12 | test_neg_contents = infile.read() 13 | 14 | with open(r".\\data\\aclImdb\\train\pos\\0_9.txt", "r") as infile: 15 | test_pos_contents = infile.read() 16 | 17 | predictions = model.predict([test_contents, test_neg_contents, test_pos_contents]) 18 | return predictions 19 | 20 | 21 | predictions = read_predict() 22 | for p in predictions: 23 | print("pos" if p else "neg") 24 | -------------------------------------------------------------------------------- /Part-08 Web Deployments/README.md: -------------------------------------------------------------------------------- 1 | # Deploying a ML model with Flask API 2 | 3 | Based on Scikit-Learn stack 4 | 5 | --- 6 | 7 | In this tutorial, 8 | 9 | 1. We build a small text classification model, and write it to disk in `model_train.py` 10 | 2. Reuse this model, in `model_predict.py` 11 | 3. Expose the model using Flask with `api.py` 12 | 13 | Using Flask to create an API, we can deploy this model and create a simple web page to load and classify new movie reviews. 14 | 15 | ## To run locally 16 | 17 | - Install pip and Python 3 18 | - Clone this repository 19 | - Navigate to the working directory 20 | - Install the Python dependencies `pip install -r requirements.txt` 21 | - Run the API `python api.py` 22 | - Open a web browser and go to `http://localhost:8000` -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Nirant 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/deploy-book.yml: -------------------------------------------------------------------------------- 1 | name: deploy-book 2 | 3 | # Only run this when the master branch changes 4 | on: 5 | push: 6 | branches: 7 | - master 8 | # If your git repository has the Jupyter Book within some-subfolder next to 9 | # unrelated files, you can make this run only if a file within that specific 10 | # folder has been modified. 11 | # 12 | # paths: 13 | # - some-subfolder/** 14 | 15 | # This job installs dependencies, build the book, and pushes it to `gh-pages` 16 | jobs: 17 | deploy-book: 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v2 21 | 22 | # Install dependencies 23 | - name: Set up Python 3.7 24 | uses: actions/setup-python@v1 25 | with: 26 | python-version: 3.7 27 | 28 | - name: Install dependencies 29 | run: | 30 | pip install -r requirements.txt 31 | 32 | # Build a table of contents 33 | - name: ToC Build 34 | run: | 35 | jupyter-book toc . 36 | # Build the book 37 | - name: Build the book 38 | run: | 39 | jupyter-book build . 40 | 41 | # Push the book's HTML to github-pages 42 | - name: GitHub Pages action 43 | uses: peaceiris/actions-gh-pages@v3.6.1 44 | with: 45 | github_token: ${{ secrets.GITHUB_TOKEN }} 46 | publish_dir: ./_build/html 47 | -------------------------------------------------------------------------------- /Part-08 Web Deployments/model_train.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import logging 3 | import os 4 | from pathlib import Path 5 | from urllib.request import urlretrieve 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.externals import joblib 10 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 11 | from sklearn.linear_model import LogisticRegression as LR 12 | from sklearn.pipeline import Pipeline 13 | from tqdm import tqdm 14 | 15 | from utils import get_data, read_data 16 | 17 | # create logger 18 | logger = logging.getLogger(__name__) 19 | logger.setLevel(logging.DEBUG) 20 | # create file handler which logs even debug messages 21 | fh = logging.FileHandler(str(__name__) + ".log") 22 | fh.setLevel(logging.DEBUG) 23 | # create console handler with a higher log level 24 | ch = logging.StreamHandler() 25 | ch.setLevel(logging.ERROR) 26 | # create formatter and add it to the handlers 27 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 28 | fh.setFormatter(formatter) 29 | ch.setFormatter(formatter) 30 | # add the handlers to the logger 31 | logger.addHandler(fh) 32 | logger.addHandler(ch) 33 | 34 | data_path = Path(os.getcwd()) / "data" / "aclImdb" 35 | logger.info(data_path) 36 | 37 | if not data_path.exists(): 38 | data_url = "http://files.fast.ai/data/aclImdb.tgz" 39 | get_data(data_url, "data/imdb.tgz") 40 | 41 | train_path = data_path / "train" 42 | # load data file as dict object 43 | train = read_data(train_path) 44 | 45 | # extract the images (X) and labels (y) from the dict 46 | X_train, y_train = train["text"], train["label"] 47 | 48 | 49 | lr_clf = Pipeline( 50 | [("vect", CountVectorizer()), ("tfidf", TfidfTransformer()), ("clf", LR())] 51 | ) 52 | lr_clf.fit(X=X_train, y=y_train) 53 | 54 | # save model 55 | joblib.dump(lr_clf, "model.pkl") 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | data/* 103 | .data/* 104 | .vscode/settings.json 105 | *.pkl 106 | -------------------------------------------------------------------------------- /Part-08 Web Deployments/utils.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | from pathlib import Path 4 | from urllib.request import urlretrieve 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from tqdm import tqdm 9 | 10 | 11 | class TqdmUpTo(tqdm): 12 | def update_to(self, b=1, bsize=1, tsize=None): 13 | if tsize is not None: 14 | self.total = tsize 15 | self.update(b * bsize - self.n) 16 | 17 | 18 | def get_data(url, filename): 19 | """ 20 | Download data if the filename does not exist already 21 | Uses Tqdm to show download progress 22 | """ 23 | if not os.path.exists(filename): 24 | 25 | dirname = os.path.dirname(filename) 26 | if not os.path.exists(dirname): 27 | os.makedirs(dirname) 28 | 29 | with TqdmUpTo( 30 | unit="B", unit_scale=True, miniters=1, desc=url.split("/")[-1] 31 | ) as t: 32 | urlretrieve(url, filename, reporthook=t.update_to) 33 | 34 | 35 | def read_data(dir_path): 36 | """read data into pandas dataframe""" 37 | 38 | def load_dir_reviews(reviews_path): 39 | files_list = list(reviews_path.iterdir()) 40 | reviews = [] 41 | for filename in files_list: 42 | f = open(filename, "r", encoding="utf-8") 43 | reviews.append(f.read()) 44 | return pd.DataFrame({"text": reviews}) 45 | 46 | pos_path = dir_path / "pos" 47 | neg_path = dir_path / "neg" 48 | 49 | pos_reviews, neg_reviews = load_dir_reviews(pos_path), load_dir_reviews(neg_path) 50 | 51 | pos_reviews["label"] = 1 52 | neg_reviews["label"] = 0 53 | 54 | merged = pd.concat([pos_reviews, neg_reviews]) 55 | df = merged.sample(frac=1.0) # shuffle the rows 56 | df.reset_index(inplace=True) # don't carry index from previous 57 | df.drop(columns=["index"], inplace=True) # drop the column 'index' 58 | return df 59 | -------------------------------------------------------------------------------- /Part-08 Web Deployments/api.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import flask 4 | import os 5 | import numpy as np 6 | from flask import Flask, jsonify, render_template, request 7 | from scipy import misc 8 | from sklearn.externals import joblib 9 | 10 | app = Flask(__name__) 11 | 12 | # create logger 13 | logger = logging.getLogger(__name__) 14 | logger.setLevel(logging.DEBUG) 15 | # create file handler which logs even debug messages 16 | fh = logging.FileHandler(str(__name__) + ".log") 17 | fh.setLevel(logging.DEBUG) 18 | # create console handler with a higher log level 19 | ch = logging.StreamHandler() 20 | ch.setLevel(logging.INFO) 21 | # create formatter and add it to the handlers 22 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 23 | fh.setFormatter(formatter) 24 | ch.setFormatter(formatter) 25 | # add the handlers to the logger 26 | logger.addHandler(fh) 27 | logger.addHandler(ch) 28 | 29 | 30 | @app.route("/") 31 | @app.route("/index") 32 | def index(): 33 | return flask.render_template("index.html", label=False) 34 | 35 | 36 | @app.route("/status", methods=["GET"]) 37 | def get_status(): 38 | return jsonify({"version": "0.0.1", "status": True}) 39 | 40 | 41 | @app.route("/predict", methods=["POST"]) 42 | def make_prediction(): 43 | if request.method == "POST": 44 | # get uploaded file if it exists 45 | logger.debug(request.files) 46 | 47 | f = request.files["file"] 48 | f.save(f.filename) # save file to disk 49 | logger.info(f"{f.filename} saved to disk") 50 | 51 | # read file from disk 52 | with open(f.filename, "r") as infile: 53 | text_content = infile.read() 54 | logger.info(f"Text Content from file read") 55 | 56 | prediction = model.predict([text_content]) 57 | logger.info(f"prediction: {prediction}") 58 | prediction = "pos" if prediction[0] == 1 else "neg" 59 | os.remove(f.filename) 60 | return flask.render_template("index.html", label=prediction) 61 | 62 | 63 | if __name__ == "__main__": 64 | # load ml model from disk 65 | model = joblib.load("model.pkl") 66 | # start api 67 | app.run(host="0.0.0.0", port=8000, debug=True) 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Natural Language Processing Notebooks 2 | -- 3 | 4 | # Available as a Book: [NLP in Python - Quickstart Guide](https://www.amazon.in/dp/B07L3PLQS1) 5 | 6 | ### Written for Practicing Engineers 7 | 8 | This work builds on the outstanding work which exists on Natural Language Processing. These range from classics like Jurafsky's Speech and Language Processing to rather modern work in The Deep Learning Book by Ian Goodfellow et al. 9 | 10 | While they are great as introductory textbooks for college students - this is intended for practitioners to quickly read, skim, select what is useful and then proceed. There are several notebooks divided into 7 logical themes. 11 | 12 | Each section builds on ideas and code from previous notebooks, but you can fill in the gaps mentally and jump directly to what interests you. 13 | 14 | ## Chapter 01 15 | [Introduction To Text Processing, with Text Classification](https://github.com/NirantK/nlp-python-deep-learning/blob/master/Part-01.ipynb) 16 | - Perfect for Getting Started! We learn better with code-first approaches 17 | 18 | ## Chapter 02 19 | - [Text Cleaning](https://github.com/NirantK/nlp-python-deep-learning/blob/master/02-A.ipynb) notebook, code-first approaches with supporting explanation. Covers some simple ideas like: 20 | - Stop words removal 21 | - Lemmatization 22 | - [Spell Correction](https://github.com/NirantK/nlp-python-deep-learning/blob/master/02-B.ipynb) covers **almost everything** that you will ever need to get started with spell correction, similar words problems and so on 23 | 24 | ## Chapter 03 25 | [Leveraging Linguistics](https://github.com/NirantK/nlp-python-deep-learning/blob/master/Part-03%20NLP%20with%20spaCy%20and%20Textacy.ipynb) is an important toolkit in any practitioners toolkit. Using **spaCy** and textacy we look at two interesting challenges and how to tackle them: 26 | - Redacting names 27 | - Named Entity Recognition 28 | - Question and Answer Generation 29 | - Part of Speech Tagging 30 | - Dependency Parsing 31 | 32 | ## Chapter 04 33 | [Text Representations](https://github.com/NirantK/nlp-python-deep-learning/blob/master/Part-04%20Text%20Representations.ipynb) is about converting text to numerical representations aka vectors 34 | - Covers popular celebrities: word2vec, fasttext and doc2vec - document similarity using the same 35 | - Programmer's Guide to **gensim** 36 | 37 | ## Chapter 05 38 | [Modern Methods for Text Classification](https://github.com/NirantK/nlp-python-deep-learning/blob/master/Part-05%20Modern%20Text%20Classification.ipynb) is simple, exploratory and talks about: 39 | - Simple Classifiers and How to Optimize Them from **scikit-learn** 40 | - How to combine and **ensemble** them for increased performance 41 | - Builds intuition for ensembling - so that you can write your own ensembling techniques 42 | 43 | ## Chapter 06 44 | [Deep Learning for NLP](https://github.com/NirantK/nlp-python-deep-learning/blob/master/Part-06%20Deep%20Learning%20for%20NLP.ipynb) is less about fancy data modeling, and more engineering for Deep Learning 45 | - From scratch code tutorial with Text Classification as an example 46 | - Using **PyTorch** and *torchtext* 47 | - Write our own data loaders, pre-processing, training loop and other utilities 48 | 49 | ## Chapter 07 50 | [Building your own Chatbot](https://github.com/NirantK/nlp-python-deep-learning/blob/master/Part-07%20Building%20your%20own%20Chatbot%20in%2030%20minutes.ipynb) from scratch in 30 minutes. We use this to explore unsupervised learning and put together several of the ideas we have already seen. 51 | - simpler, direct problem formulation instead of complicated chatbot tutorials commonly seen 52 | - intents, responses and templates in chat bot parlance 53 | - hacking word based similarity engine to work with little to no training samples 54 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: win-64 4 | _nb_ext_conf=0.4.0=py36_1 5 | anaconda-client=1.6.14=py36_0 6 | asn1crypto=0.24.0=py36_0 7 | backcall=0.1.0=py36_0 8 | blas=1.0=mkl 9 | bleach=2.1.3=py36_0 10 | boto=2.48.0=py36_1 11 | boto3=1.7.45=py36_0 12 | botocore=1.10.45=py36_0 13 | ca-certificates=2018.03.07=0 14 | certifi=2018.4.16=py36_0 15 | cffi=1.11.5=py36h945400d_0 16 | chardet=3.0.4=py36h420ce6e_1 17 | clyent=1.2.2=py36hb10d595_1 18 | colorama=0.3.9=py36h029ae33_0 19 | cryptography=2.2.2=py36hfa6e2cd_0 20 | cuda91=1.0=0 21 | cymem=1.31.2=py36h51d26f2_0 22 | cytoolz=0.8.2=py36h547e66e_0 23 | decorator=4.3.0=py36_0 24 | dill=0.2.8.2=py36_0 25 | docutils=0.14=py36h6012d8f_0 26 | entrypoints=0.2.3=py36hfd66bb0_2 27 | ftfy=4.4.3=py36hb6b3982_0 28 | gensim=3.4.0=py36h452e1ab_0 29 | html5lib=1.0.1=py36h047fa9f_0 30 | icc_rt=2017.0.4=h97af966_0 31 | idna=2.7=py36_0 32 | intel-openmp=2018.0.3=0 33 | ipykernel=4.8.2=py36_0 34 | ipython=6.4.0=py36_0 35 | ipython_genutils=0.2.0=py36h3c5d0ee_0 36 | ipywidgets=7.2.1=py36_0 37 | jedi=0.12.0=py36_1 38 | jinja2=2.10=py36h292fed1_0 39 | jmespath=0.9.3=py36_0 40 | jsonschema=2.6.0=py36h7636477_0 41 | jupyter_client=5.2.3=py36_0 42 | jupyter_core=4.4.0=py36h56e9d50_0 43 | libiconv=1.15=h1df5818_7 44 | libsodium=1.0.16=h9d3ae62_0 45 | libxml2=2.9.8=hadb2253_1 46 | libxslt=1.1.32=hf6f1972_0 47 | lxml=4.2.2=py36hef2cd61_0 48 | m2w64-gcc-libgfortran=5.3.0=6 49 | m2w64-gcc-libs=5.3.0=7 50 | m2w64-gcc-libs-core=5.3.0=7 51 | m2w64-gmp=6.1.0=2 52 | m2w64-libwinpthread-git=5.0.0.4634.697f757=2 53 | markupsafe=1.0=py36h0e26971_1 54 | mistune=0.8.3=py36hfa6e2cd_1 55 | mkl=2018.0.3=1 56 | mkl_fft=1.0.1=py36h452e1ab_0 57 | mkl_random=1.0.1=py36h9258bd6_0 58 | msgpack-numpy=0.4.1=py36h93564ae_0 59 | msgpack-python=0.4.8=py36h58b1e9d_0 60 | msys2-conda-epoch=20160418=1 61 | murmurhash=0.28.0=py36h866ba4d_0 62 | nb_anacondacloud=1.4.0=py36_0 63 | nb_conda=2.2.0=py36_0 64 | nb_conda_kernels=2.1.0=py36_0 65 | nbconvert=5.3.1=py36h8dc0fde_0 66 | nbformat=4.4.0=py36h3a5bc1b_0 67 | nbpresent=3.0.2=py36_0 68 | notebook=5.5.0=py36_0 69 | numpy=1.14.5=py36h9fa60d3_3 70 | numpy-base=1.14.5=py36h5c71026_3 71 | openssl=1.0.2o=h8ea7d77_0 72 | pandoc=2.2.1=h1a437c5_0 73 | pandocfilters=1.4.2=py36h3ef6317_1 74 | parso=0.2.1=py36_0 75 | pickleshare=0.7.4=py36h9de030f_0 76 | pip=10.0.1=py36_0 77 | plac=0.9.6=py36_0 78 | preshed=1.0.0=py36h065ec1e_0 79 | prompt_toolkit=1.0.15=py36h60b8f86_0 80 | pycparser=2.18=py36hd053e01_1 81 | pygments=2.2.0=py36hb010967_0 82 | pyopenssl=18.0.0=py36_0 83 | pyreadline=2.1=py36h094d952_1 84 | pysocks=1.6.8=py36_0 85 | python=3.6.6=hea74fb7_0 86 | python-dateutil=2.7.3=py36_0 87 | pytorch=0.4.0=py36_cuda91_cudnn7he774522_1 88 | pytz=2018.5=py36_0 89 | pywinpty=0.5.4=py36_0 90 | pyyaml=3.12=py36h1d1928f_1 91 | pyzmq=17.0.0=py36hfa6e2cd_1 92 | regex=2017.11.09=py36ha090894_0 93 | requests=2.19.1=py36_0 94 | s3transfer=0.1.13=py36_0 95 | scipy=1.1.0=py36h672f292_0 96 | send2trash=1.5.0=py36_0 97 | setuptools=39.2.0=py36_0 98 | simplegeneric=0.8.1=py36_2 99 | six=1.11.0=py36h4db2310_1 100 | smart_open=1.6.0=py36_0 101 | spacy=2.0.11=py36h8300f20_0 102 | termcolor=1.1.0=py36_1 103 | terminado=0.8.1=py36_1 104 | testpath=0.3.1=py36h2698cfe_0 105 | thinc=6.10.2=py36h830ac7b_0 106 | toolz=0.9.0=py36_0 107 | tornado=5.0.2=py36_0 108 | tqdm=4.23.4=py36_0 109 | traitlets=4.3.2=py36h096827d_0 110 | ujson=1.35=py36_0 111 | urllib3=1.23=py36_0 112 | vc=14.1=h0510ff6_3 113 | vs2015_runtime=15.5.2=3 114 | wcwidth=0.1.7=py36h3d5aa90_0 115 | webencodings=0.5.1=py36h67c50ae_1 116 | wheel=0.31.1=py36_0 117 | widgetsnbextension=3.2.1=py36_0 118 | win_inet_pton=1.0.1=py36he67d7fd_1 119 | wincertstore=0.2=py36h7fe50ca_0 120 | winpty=0.4.3=4 121 | wrapt=1.10.11=py36hfa6e2cd_2 122 | yaml=0.1.7=hc54c509_2 123 | zeromq=4.2.5=he025d50_1 124 | zlib=1.2.11=h8395fce_2 125 | -------------------------------------------------------------------------------- /09_Basic_IE.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## [Basic] Information Extraction\n", 8 | "\n", 9 | "As an example task, consider the challenge of automating Amazon Retail's customer service email response. We should be able to find the following attributes or mark them as missing with high confidence:\n", 10 | "\n", 11 | "- Order Id\n", 12 | "- Dates (such as Shopping Date, Order Delivery) \n", 13 | "- Any `$` amounts \n", 14 | "\n", 15 | "Please note that I don't have any relation to Amazon other than shopping from there. \n", 16 | "\n", 17 | "Let's consider the following totally imagined complaint email from me to Jeff Bezos, the CEO of Amazon:\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "complaint_email = \"\"\"Hello Jeff,\n", 29 | "\n", 30 | "I am Nirant, a loyal Amazon in first customer for months now. I am a huge fan of Kindle as well. \n", 31 | "I am stuck in a new city without a phone thanks to a sequence of problems - and are now compounded by Amazon's inhumane behaviour.\n", 32 | "\n", 33 | "The particular issues I am facing: My new phone bought from Amazon stopped working. What did I do? Requested a replacement on Jul 23\n", 34 | "- First Issue: The system did not allow a pick up on July 23 forcing a delay of more than a day to 24 July 8:00 - 11:00 AM\n", 35 | "- Second Issue: Despite requesting the customer service on chat THRICE, the pickup is delayed to July 24 8:00 - 11:00 AM\n", 36 | "- Third Issue: The pickup is rescheduled without any reason!\n", 37 | "\n", 38 | "Is this how you want Amazon to be world's most customer centric company?\n", 39 | "\n", 40 | "Here is how Amazon can help me:\n", 41 | "- Pick up the order as urgently as possible\n", 42 | "- Deliver the phone on a priority basis on Monday i.e. July 25 itself\n", 43 | "\n", 44 | "Here are the order numbers for reference: \n", 45 | "ORDER # 402-4870778-5154753 and ORDER # 404-8689779-9721113\n", 46 | "\n", 47 | "Here is my phone number: +91 7737887058\n", 48 | "\n", 49 | "I am stuck in a new city, where I don't know the language or directions without a working phone. I would really appreciate it if you could help in anyway. \n", 50 | "\n", 51 | "Regards,\n", 52 | "Nirant Kasliwal\"\"\"" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "Yikes, that is a lot of text. \n", 60 | "\n", 61 | "**The information to pull from this are (1) dates + times (2) phone number and (3) order numbers**. Let's figure out how to do that\n", 62 | "\n", 63 | "### Extract Date and Times\n", 64 | "\n", 65 | "If you are new to regex, consider reading the amazing [HOWTO on Python Regex](https://docs.python.org/3/howto/regex.html) and then coming back here. Let's warm up our regex muscles a bit: " 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 2, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "import re" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "['12', '11', '10']" 88 | ] 89 | }, 90 | "execution_count": 3, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "p = re.compile(r'\\d+')\n", 97 | "p.findall('12 drummers drumming, 11 pipers piping, 10 lords a-leaping')" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "['12', '11', '10']" 109 | ] 110 | }, 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "p = re.compile(r'\\d+')\n", 118 | "p.findall('12 drummers drumming, 11 pipers piping, 10 lords a-leaping')" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "TKX: Add compile and findall explanations here\n", 126 | "\n", 127 | "TKX: Add d+ explanations here" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 5, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "Wall time: 0 ns\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "%%time\n", 145 | "date_pattern = r\"^(Jan|Feb|Mar|Apr|May|Jun|July|Aug|Sep|Oct|Nov|Dec)$\"\n", 146 | "p = re.compile(date_pattern)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 6, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "data": { 156 | "text/plain": [ 157 | "[]" 158 | ] 159 | }, 160 | "execution_count": 6, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "p.findall(complaint_email)" 167 | ] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "fastAI", 173 | "language": "python", 174 | "name": "fastai" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.6.5" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 2 191 | } 192 | -------------------------------------------------------------------------------- /10_Coherence_Check.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "813a6f6d", 6 | "metadata": {}, 7 | "source": [ 8 | "# Coherence Check\n", 9 | "\n", 10 | "Goal: We've a sample of generated text. We want to select the ones which are coherent and discard the rest. \n", 11 | "\n", 12 | "Not a Goal: Checking for semantic correctness.\n", 13 | "\n", 14 | "## Possible Approaches for Coherence Checks\n", 15 | "\n", 16 | "1. [**Recommended**] Using a different Language Model and calculating perplexity of the sentence and threshold to select only coherent variants. Use a LM fine-tuned on your training corpus to make sure that the perple\n", 17 | "\n", 18 | "2. Using dependency parsing from spaCy to see if there are conditions/patterns which incoherent sentences fail, but coherent sentences meet. Common example: The root verb in the sentence should be directly connected to the subject. There should be no dangling clauses.\n", 19 | "\n", 20 | "3. For longer text generation, in addition for training with the next sentence prediction task. Generate multiple next sentences and use the [CLS] emb + classifier to mark each sentence as coherent or not. \n", 21 | "\n", 22 | "> We encode each sentence by adding [CLS] token to the last position, and feed the hidden state of this token to a double dot-product regression model. The final output is from a logistic regression predicting if the two sentences come from the same paragraph or not.\n", 23 | "> - From [Improving Language Generation with Sentence Coherence Objective](https://www.arxiv-vanity.com/papers/2009.06358/)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 14, 29 | "id": "f87ca8b7", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# TODO\n", 34 | "# Add example of perplexity change using GPT-2 or T5" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "id": "5eb4f340", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import spacy\n", 45 | "from spacy import displacy\n", 46 | "\n", 47 | "nlp = spacy.load(\"en_core_web_sm\")" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 12, 53 | "id": "e704c419", 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/html": [ 59 | "\n", 60 | "\n", 61 | " Stitch\n", 62 | " VERB\n", 63 | "\n", 64 | "\n", 65 | "\n", 66 | " in\n", 67 | " ADP\n", 68 | "\n", 69 | "\n", 70 | "\n", 71 | " time,\n", 72 | " NOUN\n", 73 | "\n", 74 | "\n", 75 | "\n", 76 | " saves\n", 77 | " VERB\n", 78 | "\n", 79 | "\n", 80 | "\n", 81 | " Nine\n", 82 | " NUM\n", 83 | "\n", 84 | "\n", 85 | "\n", 86 | " \n", 87 | " \n", 88 | " prep\n", 89 | " \n", 90 | " \n", 91 | "\n", 92 | "\n", 93 | "\n", 94 | " \n", 95 | " \n", 96 | " pobj\n", 97 | " \n", 98 | " \n", 99 | "\n", 100 | "\n", 101 | "\n", 102 | " \n", 103 | " \n", 104 | " conj\n", 105 | " \n", 106 | " \n", 107 | "\n", 108 | "\n", 109 | "\n", 110 | " \n", 111 | " \n", 112 | " dobj\n", 113 | " \n", 114 | " \n", 115 | "\n", 116 | "" 117 | ], 118 | "text/plain": [ 119 | "" 120 | ] 121 | }, 122 | "metadata": {}, 123 | "output_type": "display_data" 124 | } 125 | ], 126 | "source": [ 127 | "doc = nlp(\"Stitch in time, saves Nine\")\n", 128 | "options = {\"bg\": \"#09a3d5\", \"color\": \"white\"}\n", 129 | "displacy.render(doc, style=\"dep\", options=options)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 13, 135 | "id": "72c5d731", 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/html": [ 141 | "\n", 142 | "\n", 143 | " Stitch\n", 144 | " VERB\n", 145 | "\n", 146 | "\n", 147 | "\n", 148 | " in\n", 149 | " ADP\n", 150 | "\n", 151 | "\n", 152 | "\n", 153 | " time\n", 154 | " NOUN\n", 155 | "\n", 156 | "\n", 157 | "\n", 158 | " \n", 159 | " \n", 160 | " prep\n", 161 | " \n", 162 | " \n", 163 | "\n", 164 | "\n", 165 | "\n", 166 | " \n", 167 | " \n", 168 | " pobj\n", 169 | " \n", 170 | " \n", 171 | "\n", 172 | "" 173 | ], 174 | "text/plain": [ 175 | "" 176 | ] 177 | }, 178 | "metadata": {}, 179 | "output_type": "display_data" 180 | } 181 | ], 182 | "source": [ 183 | "doc = nlp(\"Stitch in time\")\n", 184 | "displacy.render(doc, style=\"dep\", options=options)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "id": "cc8148af", 190 | "metadata": {}, 191 | "source": [ 192 | "Notice that when the phrase/clause is used - the verb \"Stitch\" does not have a subject. While the previous one, does via `conj` (conjuction). \n", 193 | "\n", 194 | "As an example of rule/filter, we can enforce a constraint that every verb needs to have a subject." 195 | ] 196 | } 197 | ], 198 | "metadata": { 199 | "kernelspec": { 200 | "display_name": "Python 3", 201 | "language": "python", 202 | "name": "python3" 203 | }, 204 | "language_info": { 205 | "codemirror_mode": { 206 | "name": "ipython", 207 | "version": 3 208 | }, 209 | "file_extension": ".py", 210 | "mimetype": "text/x-python", 211 | "name": "python", 212 | "nbconvert_exporter": "python", 213 | "pygments_lexer": "ipython3", 214 | "version": "3.7.0" 215 | } 216 | }, 217 | "nbformat": 4, 218 | "nbformat_minor": 5 219 | } 220 | -------------------------------------------------------------------------------- /tokenization.svg: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | 8 | “Let’s 9 | 10 | 11 | go 12 | 13 | 14 | to 15 | 16 | 17 | N.Y.!” 18 | 19 | 20 | 21 | 22 | 23 | Let’s 24 | 25 | 26 | go 27 | 28 | 29 | to 30 | 31 | 32 | N.Y.!” 33 | 34 | 35 | 36 | 37 | Let 38 | 39 | 40 | go 41 | 42 | 43 | to 44 | 45 | 46 | N.Y.!” 47 | 48 | 49 | ’s 50 | 51 | 52 | 53 | 54 | 55 | Let 56 | 57 | 58 | go 59 | 60 | 61 | to 62 | 63 | 64 | N.Y.! 65 | 66 | 67 | ’s 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | Let 77 | 78 | 79 | go 80 | 81 | 82 | to 83 | 84 | 85 | N.Y. 86 | 87 | 88 | ’s 89 | 90 | 91 | 92 | 93 | 94 | ! 95 | 96 | 97 | 98 | Let 99 | 100 | go 101 | 102 | to 103 | 104 | N.Y. 105 | 106 | ’s 107 | 108 | 109 | 110 | ! 111 | 112 | EXCEPTION 113 | 114 | PREFIX 115 | 116 | SUFFIX 117 | 118 | SUFFIX 119 | 120 | EXCEPTION 121 | 122 | DONE 123 | -------------------------------------------------------------------------------- /07_Build_Chatbot_in_30minutes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Building your own Chatbot\n", 8 | "\n", 9 | "## Why should I build the service again? \n", 10 | "\n", 11 | "##### Related: Why can't I use FB/MSFT/some other cloud service?" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Word Vectors + Heuristic - Fancy Stuff = Quick Working Code" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "Gensim version: 3.4.0\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "import numpy as np\n", 36 | "import gensim\n", 37 | "print(f\"Gensim version: {gensim.__version__}\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "File already exists, please remove if you wish to download again\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "from tqdm import tqdm\n", 55 | "class TqdmUpTo(tqdm):\n", 56 | " def update_to(self, b=1, bsize=1, tsize=None):\n", 57 | " if tsize is not None: self.total = tsize\n", 58 | " self.update(b * bsize - self.n)\n", 59 | "\n", 60 | "def get_data(url, filename):\n", 61 | " \"\"\"\n", 62 | " Download data if the filename does not exist already\n", 63 | " Uses Tqdm to show download progress\n", 64 | " \"\"\"\n", 65 | " import os\n", 66 | " from urllib.request import urlretrieve\n", 67 | " \n", 68 | " if not os.path.exists(filename):\n", 69 | "\n", 70 | " dirname = os.path.dirname(filename)\n", 71 | " if not os.path.exists(dirname):\n", 72 | " os.makedirs(dirname)\n", 73 | "\n", 74 | " with TqdmUpTo(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:\n", 75 | " urlretrieve(url, filename, reporthook=t.update_to)\n", 76 | " else:\n", 77 | " print(\"File already exists, please remove if you wish to download again\")\n", 78 | "\n", 79 | "embedding_url = 'http://nlp.stanford.edu/data/glove.6B.zip'\n", 80 | "get_data(embedding_url, 'data/glove.6B.zip')" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 3, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# !unzip data/glove.6B.zip \n", 90 | "# !mv -v glove.6B.300d.txt data/glove.6B.300d.txt \n", 91 | "# !mv -v glove.6B.200d.txt data/glove.6B.200d.txt \n", 92 | "# !mv -v glove.6B.100d.txt data/glove.6B.100d.txt \n", 93 | "# !mv -v glove.6B.50d.txt data/glove.6B.50d.txt \n", 94 | "\n", 95 | "from gensim.scripts.glove2word2vec import glove2word2vec\n", 96 | "glove_input_file = 'data/glove.6B.300d.txt'\n", 97 | "word2vec_output_file = 'data/glove.6B.300d.txt.word2vec'\n", 98 | "import os\n", 99 | "if not os.path.exists(word2vec_output_file):\n", 100 | " glove2word2vec(glove_input_file, word2vec_output_file)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 4, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "CPU times: user 1min 49s, sys: 2.11 s, total: 1min 51s\n", 113 | "Wall time: 1min 47s\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "%%time\n", 119 | "from gensim.models import KeyedVectors\n", 120 | "filename = word2vec_output_file\n", 121 | "embed = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 5, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "assert embed['awesome'] is not None" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "'awesome', this works!" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "## Use Case: Food Order Bot\n", 145 | "\n", 146 | "### Do word vectors even work for this? " 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 6, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "cuisine_refs = [\"mexican\", \"thai\", \"british\", \"american\", \"italian\"]\n", 156 | "sample_sentence = \"I’m looking for a cheap Indian or Chinese place in Indiranagar\"" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 7, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "looking: 7.448504447937012\n", 169 | "for: 10.627421379089355\n", 170 | "a: 11.809560775756836\n", 171 | "cheap: 7.09670877456665\n", 172 | "indian: 18.64516258239746\n", 173 | "or: 9.692893981933594\n", 174 | "chinese: 19.09498405456543\n", 175 | "place: 7.651237487792969\n", 176 | "in: 10.085711479187012\n", 177 | "['indian', 'chinese']\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "tokens = sample_sentence.split()\n", 183 | "tokens = [x.lower().strip() for x in tokens] \n", 184 | "threshold = 18.3\n", 185 | "found = []\n", 186 | "for term in tokens:\n", 187 | " if term in embed.vocab:\n", 188 | " scores = []\n", 189 | " for C in cuisine_refs:\n", 190 | " scores.append(np.dot(embed[C], embed[term].T))\n", 191 | " # hint replace above above np.dot with: \n", 192 | " # scores.append(embed.cosine_similarities(, ))\n", 193 | " mean_score = np.mean(scores)\n", 194 | " print(f\"{term}: {mean_score}\")\n", 195 | " if mean_score > threshold:\n", 196 | " found.append(term)\n", 197 | "print(found)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "### Next Stop: Classifying user intent" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 8, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "(300,)\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "def sum_vecs(embed,text):\n", 222 | "\n", 223 | " tokens = text.split(' ')\n", 224 | " vec = np.zeros(embed.vector_size)\n", 225 | "\n", 226 | " for idx, term in enumerate(tokens):\n", 227 | " if term in embed.vocab:\n", 228 | " vec = vec + embed[term]\n", 229 | " return vec\n", 230 | "\n", 231 | "sentence_vector = sum_vecs(embed, sample_sentence)\n", 232 | "print(sentence_vector.shape)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 9, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "data={\n", 242 | " \"greet\": {\n", 243 | " \"examples\" : [\"hello\",\"hey there\",\"howdy\",\"hello\",\"hi\",\"hey\",\"hey ho\"],\n", 244 | " \"centroid\" : None\n", 245 | " },\n", 246 | " \"inform\": {\n", 247 | " \"examples\" : [\n", 248 | " \"i'd like something asian\",\n", 249 | " \"maybe korean\",\n", 250 | " \"what mexican options do i have\",\n", 251 | " \"what italian options do i have\",\n", 252 | " \"i want korean food\",\n", 253 | " \"i want german food\",\n", 254 | " \"i want vegetarian food\",\n", 255 | " \"i would like chinese food\",\n", 256 | " \"i would like indian food\",\n", 257 | " \"what japanese options do i have\",\n", 258 | " \"korean please\",\n", 259 | " \"what about indian\",\n", 260 | " \"i want some chicken\",\n", 261 | " \"maybe thai\",\n", 262 | " \"i'd like something vegetarian\",\n", 263 | " \"show me french restaurants\",\n", 264 | " \"show me a cool malaysian spot\",\n", 265 | " \"where can I get some spicy food\"\n", 266 | " ],\n", 267 | " \"centroid\" : None\n", 268 | " },\n", 269 | " \"deny\": {\n", 270 | " \"examples\" : [\n", 271 | " \"nah\",\n", 272 | " \"any other places ?\",\n", 273 | " \"anything else\",\n", 274 | " \"no thanks\"\n", 275 | " \"not that one\",\n", 276 | " \"i do not like that place\",\n", 277 | " \"something else please\",\n", 278 | " \"no please show other options\"\n", 279 | " ],\n", 280 | " \"centroid\" : None\n", 281 | " },\n", 282 | " \"affirm\":{\n", 283 | " \"examples\":[\n", 284 | " \"yeah\",\n", 285 | " \"that works\",\n", 286 | " \"good, thanks\",\n", 287 | " \"this works\",\n", 288 | " \"sounds good\",\n", 289 | " \"thanks, this is perfect\",\n", 290 | " \"just what I wanted\"\n", 291 | " ],\n", 292 | " \"centroid\": None\n", 293 | " }\n", 294 | "\n", 295 | "}" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 10, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "def get_centroid(embed, examples):\n", 305 | " C = np.zeros((len(examples),embed.vector_size))\n", 306 | " for idx, text in enumerate(examples):\n", 307 | " C[idx,:] = sum_vecs(embed,text)\n", 308 | "\n", 309 | " centroid = np.mean(C,axis=0)\n", 310 | " assert centroid.shape[0] == embed.vector_size\n", 311 | " return centroid" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 11, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "# Adding Centroid to data dictionary\n", 321 | "for label in data.keys():\n", 322 | " data[label][\"centroid\"] = get_centroid(embed,data[label][\"examples\"])" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 12, 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "name": "stdout", 332 | "output_type": "stream", 333 | "text": [ 334 | "greet: ['hello', 'hey there', 'howdy', 'hello', 'hi', 'hey', 'hey ho']\n", 335 | "inform: [\"i'd like something asian\", 'maybe korean', 'what mexican options do i have', 'what italian options do i have', 'i want korean food', 'i want german food', 'i want vegetarian food', 'i would like chinese food', 'i would like indian food', 'what japanese options do i have', 'korean please', 'what about indian', 'i want some chicken', 'maybe thai', \"i'd like something vegetarian\", 'show me french restaurants', 'show me a cool malaysian spot', 'where can I get some spicy food']\n", 336 | "deny: ['nah', 'any other places ?', 'anything else', 'no thanksnot that one', 'i do not like that place', 'something else please', 'no please show other options']\n", 337 | "affirm: ['yeah', 'that works', 'good, thanks', 'this works', 'sounds good', 'thanks, this is perfect', 'just what I wanted']\n" 338 | ] 339 | } 340 | ], 341 | "source": [ 342 | "for label in data.keys():\n", 343 | " print(f\"{label}: {data[label]['examples']}\")" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 13, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "def get_intent(embed,data, text):\n", 353 | " intents = list(data.keys())\n", 354 | " vec = sum_vecs(embed,text)\n", 355 | " scores = np.array([ np.linalg.norm(vec-data[label][\"centroid\"]) for label in intents])\n", 356 | " return intents[np.argmin(scores)]" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 14, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "name": "stdout", 366 | "output_type": "stream", 367 | "text": [ 368 | "text : 'hey ', predicted_label : 'greet'\n", 369 | "text : 'i am looking for chinese food', predicted_label : 'inform'\n", 370 | "text : 'not for me', predicted_label : 'deny'\n", 371 | "text : 'ok, this is good', predicted_label : 'affirm'\n" 372 | ] 373 | } 374 | ], 375 | "source": [ 376 | "for text in [\"hey \",\"i am looking for chinese food\",\"not for me\", \"ok, this is good\"]:\n", 377 | " print(f\"text : '{text}', predicted_label : '{get_intent(embed, data, text)}'\")" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "## Bot Responses" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 15, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "templates = {\n", 394 | " \"utter_greet\": [\"hey there!\", \"Hey! How you doin'? \"],\n", 395 | " \"utter_options\": [\"ok, let me check some more\"],\n", 396 | " \"utter_goodbye\": [\"Great, I'll go now. Bye bye\", \"bye bye\", \"Goodbye!\"],\n", 397 | " \"utter_default\": [\"Sorry, I didn't quite follow\"],\n", 398 | " \"utter_confirm\": [\"Got it\", \"Gotcha\", \"Your order is confirmed now\"]\n", 399 | " }" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 16, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "response_map = {\n", 409 | " \"greet\": \"utter_greet\",\n", 410 | " \"affirm\": \"utter_goodbye\",\n", 411 | " \"deny\": \"utter_options\",\n", 412 | " \"inform\": \"utter_confirm\",\n", 413 | " \"default\": \"utter_default\",\n", 414 | "}" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 17, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [ 423 | "import random\n", 424 | "def get_bot_response(bot_response_map, bot_templates, intent):\n", 425 | " if intent not in list(response_map):\n", 426 | " intent = \"default\"\n", 427 | " select_template = bot_response_map[intent]\n", 428 | " templates = bot_templates[select_template]\n", 429 | " return random.choice(templates)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 18, 435 | "metadata": {}, 436 | "outputs": [ 437 | { 438 | "data": { 439 | "text/plain": [ 440 | "'Got it'" 441 | ] 442 | }, 443 | "execution_count": 18, 444 | "metadata": {}, 445 | "output_type": "execute_result" 446 | } 447 | ], 448 | "source": [ 449 | "user_intent = get_intent(embed, data, \"i want indian food\")\n", 450 | "get_bot_response(response_map, templates, user_intent)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "metadata": {}, 456 | "source": [ 457 | "**Better Response Personalisation?**:" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 19, 463 | "metadata": {}, 464 | "outputs": [ 465 | { 466 | "name": "stdout", 467 | "output_type": "stream", 468 | "text": [ 469 | "text : 'hey', intent: greet, bot: Hey! How you doin'? \n", 470 | "text : 'i am looking for italian food', intent: inform, bot: Gotcha\n", 471 | "text : 'not for me', intent: deny, bot: ok, let me check some more\n", 472 | "text : 'ok, this is good', intent: affirm, bot: Goodbye!\n" 473 | ] 474 | } 475 | ], 476 | "source": [ 477 | "for text in [\"hey\",\"i am looking for italian food\",\"not for me\", \"ok, this is good\"]:\n", 478 | " user_intent = get_intent(embed, data, text)\n", 479 | " bot_reply = get_bot_response(response_map, templates, user_intent)\n", 480 | " print(f\"text : '{text}', intent: {user_intent}, bot: {bot_reply}\")" 481 | ] 482 | } 483 | ], 484 | "metadata": { 485 | "kernelspec": { 486 | "display_name": "fastAI", 487 | "language": "python", 488 | "name": "fastai" 489 | }, 490 | "language_info": { 491 | "codemirror_mode": { 492 | "name": "ipython", 493 | "version": 3 494 | }, 495 | "file_extension": ".py", 496 | "mimetype": "text/x-python", 497 | "name": "python", 498 | "nbconvert_exporter": "python", 499 | "pygments_lexer": "ipython3", 500 | "version": "3.6.6" 501 | } 502 | }, 503 | "nbformat": 4, 504 | "nbformat_minor": 2 505 | } 506 | -------------------------------------------------------------------------------- /02-B.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "Python 3.6.6 :: Anaconda, Inc.\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "!python --version\n", 18 | "__author__ = \"nirant.bits@gmail.com\"" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "# Spell Correction" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "Collecting fuzzywuzzy[speedup]\n", 38 | " Downloading https://files.pythonhosted.org/packages/d8/f1/5a267addb30ab7eaa1beab2b9323073815da4551076554ecc890a3595ec9/fuzzywuzzy-0.17.0-py2.py3-none-any.whl\n", 39 | "Requirement already satisfied: python-levenshtein>=0.12; extra == \"speedup\" in d:\\miniconda3\\envs\\nlp\\lib\\site-packages (from fuzzywuzzy[speedup]) (0.12.0)\n", 40 | "Requirement already satisfied: setuptools in d:\\miniconda3\\envs\\nlp\\lib\\site-packages (from python-levenshtein>=0.12; extra == \"speedup\"->fuzzywuzzy[speedup]) (39.2.0)\n", 41 | "Installing collected packages: fuzzywuzzy\n", 42 | "Successfully installed fuzzywuzzy-0.17.0\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "import sys\n", 48 | "# !{sys.executable} -m pip install fuzzywuzzy\n", 49 | "# alternative for 4-10x faster computation: \n", 50 | "!{sys.executable} -m pip install fuzzywuzzy[speedup]" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "from fuzzywuzzy import fuzz" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "82" 71 | ] 72 | }, 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "fuzz.ratio(\"Electronic City Phase One\", \"Electronic City Phase One, Bangalore\")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 5, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "100" 91 | ] 92 | }, 93 | "execution_count": 5, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "fuzz.partial_ratio(\"Electronic City Phase One\", \"Electronic City Phase One, Bangalore\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 6, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "90" 111 | ] 112 | }, 113 | "execution_count": 6, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "fuzz.ratio('Narendra Modi', 'Narendra D. Modi')" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 7, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "77" 131 | ] 132 | }, 133 | "execution_count": 7, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "fuzz.partial_ratio('Narendra Modi', 'Narendra D. Modi')" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 8, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "93" 151 | ] 152 | }, 153 | "execution_count": 8, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "fuzz.token_sort_ratio('Narendra Modi', 'Narendra D. Modi')" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 9, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "data": { 169 | "text/plain": [ 170 | "100" 171 | ] 172 | }, 173 | "execution_count": 9, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "fuzz.token_set_ratio('Narendra Modi', 'Narendra D. Modi')" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 10, 185 | "metadata": { 186 | "collapsed": true 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "from fuzzywuzzy import process" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 11, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "[('Gujarat', 92), ('Gujarat Govt.', 75), ('Gujjar', 67)]\n" 203 | ] 204 | }, 205 | { 206 | "data": { 207 | "text/plain": [ 208 | "('Gujarat', 92)" 209 | ] 210 | }, 211 | "execution_count": 11, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | } 215 | ], 216 | "source": [ 217 | "query = 'Gujrat'\n", 218 | "choices = ['Gujarat', 'Gujjar', 'Gujarat Govt.']\n", 219 | "# Get a list of matches ordered by score, default limit to 5\n", 220 | "print(process.extract(query, choices))\n", 221 | "\n", 222 | "# If we want only the top one\n", 223 | "process.extractOne(query, choices)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 12, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "[('Bangalore', 94), ('Bengaluru', 59)]\n" 236 | ] 237 | }, 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "('Bangalore', 94)" 242 | ] 243 | }, 244 | "execution_count": 12, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "query = 'Banglore'\n", 251 | "choices = ['Bangalore', 'Bengaluru']\n", 252 | "print(process.extract(query, choices))\n", 253 | "process.extractOne(query, choices)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 13, 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "[('chilli', 91), ('chilling', 77), ('chilled', 67)]\n" 266 | ] 267 | }, 268 | { 269 | "data": { 270 | "text/plain": [ 271 | "('chilli', 91)" 272 | ] 273 | }, 274 | "execution_count": 13, 275 | "metadata": {}, 276 | "output_type": "execute_result" 277 | } 278 | ], 279 | "source": [ 280 | "# Let's take an example of a common search typo in online shopping:\n", 281 | "query = 'chili'\n", 282 | "choices = ['chilli', 'chilled', 'chilling']\n", 283 | "print(process.extract(query, choices))\n", 284 | "process.extractOne(query, choices)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "## Jellyfish" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 8, 297 | "metadata": {}, 298 | "outputs": [ 299 | { 300 | "name": "stdout", 301 | "output_type": "stream", 302 | "text": [ 303 | "Collecting jellyfish\n", 304 | " Downloading https://files.pythonhosted.org/packages/61/3f/60ac86fb43dfbf976768e80674b5538e535f6eca5aa7806cf2fdfd63550f/jellyfish-0.6.1.tar.gz (132kB)\n", 305 | "Building wheels for collected packages: jellyfish\n", 306 | " Running setup.py bdist_wheel for jellyfish: started\n", 307 | " Running setup.py bdist_wheel for jellyfish: finished with status 'done'\n", 308 | " Stored in directory: C:\\Users\\nirantk\\AppData\\Local\\pip\\Cache\\wheels\\9c\\6f\\33\\92bb9a4b4562a60ba6a80cedbab8907e48bc7a8b1f369ea0ae\n", 309 | "Successfully built jellyfish\n", 310 | "Installing collected packages: jellyfish\n", 311 | "Successfully installed jellyfish-0.6.1\n" 312 | ] 313 | } 314 | ], 315 | "source": [ 316 | "import sys\n", 317 | "!{sys.executable} -m pip install jellyfish" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 9, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "import jellyfish\n", 327 | "correct_example = ('Narendra Modi', 'Narendra Modi')\n", 328 | "damodardas_example = ('Narendra Modi', 'Narendra D. Modi')\n", 329 | "modi_typo_example = ('Narendra Modi', 'Narendar Modi')\n", 330 | "gujarat_typo_example = ('Gujarat', 'Gujrat')\n", 331 | "\n", 332 | "examples = [correct_example, damodardas_example, modi_typo_example, gujarat_typo_example]" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 10, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "def calculate_distance(function, examples=examples):\n", 342 | " for ele in examples:\n", 343 | " print(f'{ele}: {function(*ele)}') " 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 11, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | "('Narendra Modi', 'Narendra Modi'): 0\n", 356 | "('Narendra Modi', 'Narendra D. Modi'): 3\n", 357 | "('Narendra Modi', 'Narendar Modi'): 2\n", 358 | "('Gujarat', 'Gujrat'): 1\n" 359 | ] 360 | } 361 | ], 362 | "source": [ 363 | "calculate_distance(jellyfish.levenshtein_distance)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 12, 369 | "metadata": {}, 370 | "outputs": [ 371 | { 372 | "name": "stdout", 373 | "output_type": "stream", 374 | "text": [ 375 | "('Narendra Modi', 'Narendra Modi'): 0\n", 376 | "('Narendra Modi', 'Narendra D. Modi'): 3\n", 377 | "('Narendra Modi', 'Narendar Modi'): 1\n", 378 | "('Gujarat', 'Gujrat'): 1\n" 379 | ] 380 | } 381 | ], 382 | "source": [ 383 | "calculate_distance(jellyfish.damerau_levenshtein_distance)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 13, 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "name": "stdout", 393 | "output_type": "stream", 394 | "text": [ 395 | "('Narendra Modi', 'Narendra Modi'): 0\n", 396 | "('Narendra Modi', 'Narendra D. Modi'): 7\n", 397 | "('Narendra Modi', 'Narendar Modi'): 2\n", 398 | "('Gujarat', 'Gujrat'): 4\n" 399 | ] 400 | } 401 | ], 402 | "source": [ 403 | "calculate_distance(jellyfish.hamming_distance)" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 14, 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "name": "stdout", 413 | "output_type": "stream", 414 | "text": [ 415 | "('Narendra Modi', 'Narendra Modi'): 1.0\n", 416 | "('Narendra Modi', 'Narendra D. Modi'): 0.9375\n", 417 | "('Narendra Modi', 'Narendar Modi'): 0.9743589743589745\n", 418 | "('Gujarat', 'Gujrat'): 0.8968253968253969\n" 419 | ] 420 | } 421 | ], 422 | "source": [ 423 | "calculate_distance(jellyfish.jaro_distance) " 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 21, 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "('Narendra Modi', 'Narendra Modi'): 1.0\n", 436 | "('Narendra Modi', 'Narendra D. Modi'): 0.9625\n", 437 | "('Narendra Modi', 'Narendar Modi'): 0.9846153846153847\n", 438 | "('Gujarat', 'Gujrat'): 0.9277777777777778\n" 439 | ] 440 | } 441 | ], 442 | "source": [ 443 | "calculate_distance(jellyfish.jaro_winkler)" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": { 449 | "collapsed": true 450 | }, 451 | "source": [ 452 | "### Phonetic Word Similarity\n", 453 | "\n", 454 | "#### What is a phonetic encoding?" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 15, 460 | "metadata": {}, 461 | "outputs": [ 462 | { 463 | "data": { 464 | "text/plain": [ 465 | "'J412'" 466 | ] 467 | }, 468 | "execution_count": 15, 469 | "metadata": {}, 470 | "output_type": "execute_result" 471 | } 472 | ], 473 | "source": [ 474 | "jellyfish.soundex('Jellyfish')" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 16, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "data": { 484 | "text/plain": [ 485 | "'JALYF'" 486 | ] 487 | }, 488 | "execution_count": 16, 489 | "metadata": {}, 490 | "output_type": "execute_result" 491 | } 492 | ], 493 | "source": [ 494 | "jellyfish.nysiis('Jellyfish')" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 17, 500 | "metadata": {}, 501 | "outputs": [ 502 | { 503 | "data": { 504 | "text/plain": [ 505 | "'JLFX'" 506 | ] 507 | }, 508 | "execution_count": 17, 509 | "metadata": {}, 510 | "output_type": "execute_result" 511 | } 512 | ], 513 | "source": [ 514 | "jellyfish.metaphone('Jellyfish')" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 18, 520 | "metadata": {}, 521 | "outputs": [ 522 | { 523 | "data": { 524 | "text/plain": [ 525 | "'JLLFSH'" 526 | ] 527 | }, 528 | "execution_count": 18, 529 | "metadata": {}, 530 | "output_type": "execute_result" 531 | } 532 | ], 533 | "source": [ 534 | "jellyfish.match_rating_codex('Jellyfish')" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": {}, 540 | "source": [ 541 | "#### Metaphone + Levenshtein" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 20, 547 | "metadata": {}, 548 | "outputs": [ 549 | { 550 | "data": { 551 | "text/plain": [ 552 | "0" 553 | ] 554 | }, 555 | "execution_count": 20, 556 | "metadata": {}, 557 | "output_type": "execute_result" 558 | } 559 | ], 560 | "source": [ 561 | "jellyfish.levenshtein_distance(jellyfish.metaphone('write'), jellyfish.metaphone('right'))" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 21, 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [ 570 | "examples+= [('write', 'right'), ('Mangalore', 'Bangalore'), ('Delhi', 'Dilli')] # adding a few examples to show how cool this is" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 28, 576 | "metadata": {}, 577 | "outputs": [ 578 | { 579 | "name": "stdout", 580 | "output_type": "stream", 581 | "text": [ 582 | "Word\t\tSound\t\tWord\t\t\tSound\t\tPhonetic Distance\n", 583 | "Narendra Modi\tNRNTR MT \tNarendra Modi \tNRNTR MT \t0 \n", 584 | "Narendra Modi\tNRNTR MT \tNarendra D. Modi \tNRNTR T MT\t2 \n", 585 | "Narendra Modi\tNRNTR MT \tNarendar Modi \tNRNTR MT \t0 \n", 586 | "Gujarat \tKJRT \tGujrat \tKJRT \t0 \n", 587 | "write \tRT \tright \tRT \t0 \n", 588 | "Mangalore \tMNKLR \tBangalore \tBNKLR \t1 \n", 589 | "Delhi \tTLH \tDilli \tTL \t1 \n" 590 | ] 591 | } 592 | ], 593 | "source": [ 594 | "def calculate_phonetic_distance(phonetic_func, distance_func, examples=examples):\n", 595 | " print(\"Word\\t\\tSound\\t\\tWord\\t\\t\\tSound\\t\\tPhonetic Distance\")\n", 596 | " for ele in examples:\n", 597 | " correct, typo = ele[0], ele[1]\n", 598 | " phonetic_correct, phonetic_typo = phonetic_func(correct), phonetic_func(typo)\n", 599 | " phonetic_distance = distance_func(phonetic_correct, phonetic_typo)\n", 600 | " print(f'{correct:<10}\\t{phonetic_correct:<10}\\t{typo:<20}\\t{phonetic_typo:<10}\\t{phonetic_distance:<10}') \n", 601 | " \n", 602 | "calculate_phonetic_distance(phonetic_func=jellyfish.metaphone, distance_func=jellyfish.levenshtein_distance) " 603 | ] 604 | }, 605 | { 606 | "cell_type": "markdown", 607 | "metadata": {}, 608 | "source": [ 609 | "#### American Soundex" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 29, 615 | "metadata": {}, 616 | "outputs": [ 617 | { 618 | "name": "stdout", 619 | "output_type": "stream", 620 | "text": [ 621 | "Word\t\tSound\t\tWord\t\t\tSound\t\tPhonetic Distance\n", 622 | "Narendra Modi\tN653 \tNarendra Modi \tN653 \t0 \n", 623 | "Narendra Modi\tN653 \tNarendra D. Modi \tN653 \t0 \n", 624 | "Narendra Modi\tN653 \tNarendar Modi \tN653 \t0 \n", 625 | "Gujarat \tG263 \tGujrat \tG263 \t0 \n", 626 | "write \tW630 \tright \tR230 \t2 \n", 627 | "Mangalore \tM524 \tBangalore \tB524 \t1 \n", 628 | "Delhi \tD400 \tDilli \tD400 \t0 \n" 629 | ] 630 | } 631 | ], 632 | "source": [ 633 | "calculate_phonetic_distance(phonetic_func=jellyfish.soundex, distance_func=jellyfish.levenshtein_distance) " 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 | "source": [ 640 | "## Updating the Original Corpus with FlashText" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": 22, 646 | "metadata": {}, 647 | "outputs": [ 648 | { 649 | "name": "stdout", 650 | "output_type": "stream", 651 | "text": [ 652 | "Collecting flashtext\n", 653 | " Downloading https://files.pythonhosted.org/packages/81/d8/2cd0656eae456d615c2f1efbcae8dfca2cb871a31f34ba8925aba47d5e09/flashtext-2.7.tar.gz\n", 654 | "Building wheels for collected packages: flashtext\n", 655 | " Running setup.py bdist_wheel for flashtext: started\n", 656 | " Running setup.py bdist_wheel for flashtext: finished with status 'done'\n", 657 | " Stored in directory: C:\\Users\\nirantk\\AppData\\Local\\pip\\Cache\\wheels\\37\\db\\d7\\fe74f7cb8e5c3afed90fe6f4967c933a6f13d81ab6b3d3128c\n", 658 | "Successfully built flashtext\n", 659 | "Installing collected packages: flashtext\n", 660 | "Successfully installed flashtext-2.7\n" 661 | ] 662 | } 663 | ], 664 | "source": [ 665 | "import sys\n", 666 | "!{sys.executable} -m pip install flashtext" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": 31, 672 | "metadata": {}, 673 | "outputs": [ 674 | { 675 | "data": { 676 | "text/plain": [ 677 | "['NCR', 'Mumbai']" 678 | ] 679 | }, 680 | "execution_count": 31, 681 | "metadata": {}, 682 | "output_type": "execute_result" 683 | } 684 | ], 685 | "source": [ 686 | "from flashtext.keyword import KeywordProcessor\n", 687 | "keyword_processor = KeywordProcessor()\n", 688 | "keyword_processor.add_keyword('Delhi', 'NCR') # notice we are adding tuples here\n", 689 | "keyword_processor.add_keyword('Bombay', 'Mumbai')\n", 690 | "keywords_found = keyword_processor.extract_keywords('I love the food in Delhi and the people in Bombay')\n", 691 | "keywords_found\n", 692 | "# ['NCR', 'Mumbai']" 693 | ] 694 | }, 695 | { 696 | "cell_type": "markdown", 697 | "metadata": {}, 698 | "source": [ 699 | "How about we replace them now?" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": 32, 705 | "metadata": {}, 706 | "outputs": [ 707 | { 708 | "data": { 709 | "text/plain": [ 710 | "'I love the food in NCR and the people in Mumbai'" 711 | ] 712 | }, 713 | "execution_count": 32, 714 | "metadata": {}, 715 | "output_type": "execute_result" 716 | } 717 | ], 718 | "source": [ 719 | "from flashtext.keyword import KeywordProcessor\n", 720 | "keyword_processor = KeywordProcessor()\n", 721 | "keyword_processor.add_keyword('Delhi', 'NCR')\n", 722 | "keyword_processor.add_keyword('Bombay', 'Mumbai')\n", 723 | "replaced_sentence = keyword_processor.replace_keywords('I love the food in Delhi and the people in Bombay')\n", 724 | "replaced_sentence\n", 725 | "# 'I love the food in NCR and the people in Mumbai'" 726 | ] 727 | } 728 | ], 729 | "metadata": { 730 | "kernelspec": { 731 | "display_name": "Python [conda env:nlp]", 732 | "language": "python", 733 | "name": "conda-env-nlp-py" 734 | }, 735 | "language_info": { 736 | "codemirror_mode": { 737 | "name": "ipython", 738 | "version": 3 739 | }, 740 | "file_extension": ".py", 741 | "mimetype": "text/x-python", 742 | "name": "python", 743 | "nbconvert_exporter": "python", 744 | "pygments_lexer": "ipython3", 745 | "version": "3.6.6" 746 | } 747 | }, 748 | "nbformat": 4, 749 | "nbformat_minor": 2 750 | } 751 | -------------------------------------------------------------------------------- /02-A.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text Cleaning" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "url = 'http://www.gutenberg.org/ebooks/1661.txt.utf-8'\n", 17 | "file_name = 'sherlock.txt'" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 3, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import urllib.request\n", 27 | "# Download the file from `url` and save it locally under `file_name`:\n", 28 | "\n", 29 | "with urllib.request.urlopen(url) as response:\n", 30 | " with open(file_name, 'wb') as out_file:\n", 31 | " data = response.read() # a `bytes` object\n", 32 | " out_file.write(data)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 4, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "requirements.txt\n", 45 | "sherlock.txt\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "!ls {*.txt}" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle\n", 63 | "\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "!head -2 sherlock.txt" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 5, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "!sed -i 1,33d sherlock.txt" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 6, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "THE ADVENTURES OF SHERLOCK HOLMES\n", 92 | "\n", 93 | "by\n", 94 | "\n", 95 | "SIR ARTHUR CONAN DOYLE\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "!head -5 sherlock.txt" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Load Data" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 7, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "THE A\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "#let's the load data to RAM\n", 125 | "text = open(file_name, 'r', encoding='utf-8').read() # note that I add an encoding='utf-8' parameter to preserve information\n", 126 | "print(text[:5])" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 8, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | "The file is loaded as datatype: and has 581204 characters in it\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "print(f'The file is loaded as datatype: {type(text)} and has {len(text)} characters in it')" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "### Exploring Loaded Data" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 9, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "['\\n', ' ', '!', '\"', '$', '%', '&', \"'\", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'â', 'è', 'é']\n", 163 | "There are 85 unique characters, including both ASCII and Unicode character\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "# how many unique characters do we see? \n", 169 | "# For reference, ASCII has 127 characters in it - so we expect this to have at most 127 characters\n", 170 | "unique_chars = list(set(text))\n", 171 | "unique_chars.sort()\n", 172 | "print(unique_chars)\n", 173 | "print(f'There are {len(unique_chars)} unique characters, including both ASCII and Unicode character')" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## Tokenization \n", 181 | "\n", 182 | "### Split by Whitespace" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 10, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "107431\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "words = text.split()\n", 200 | "print(len(words))" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 11, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "['To', 'Sherlock', 'Holmes', 'she', 'is', 'always', 'THE', 'woman.', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name.', 'In', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex.', 'It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'Irene', 'Adler.', 'All', 'emotions,', 'and', 'that', 'one', 'particularly,', 'were', 'abhorrent', 'to', 'his', 'cold,', 'precise', 'but', 'admirably', 'balanced', 'mind.', 'He', 'was,', 'I', 'take', 'it,', 'the', 'most', 'perfect', 'reasoning', 'and', 'observing', 'machine', 'that', 'the', 'world', 'has', 'seen,', 'but', 'as', 'a', 'lover', 'he', 'would', 'have', 'placed', 'himself', 'in', 'a', 'false', 'position.', 'He', 'never', 'spoke', 'of', 'the', 'softer', 'passions,', 'save', 'with', 'a', 'gibe', 'and', 'a', 'sneer.', 'They', 'were', 'admirable', 'things', 'for']\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "print(words[90:200]) #start with the first chapeter, ignoring the index for now" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 12, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/plain": [ 228 | "['red-headed', 'woman', 'on', 'the', 'street']" 229 | ] 230 | }, 231 | "execution_count": 12, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "# Let's look at another example: \n", 238 | "'red-headed woman on the street'.split()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "### Split by Word Extraction\n", 246 | "**Introducing Regex**" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 13, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "data": { 256 | "text/plain": [ 257 | "['Words', 'words', 'words', '']" 258 | ] 259 | }, 260 | "execution_count": 13, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "import re\n", 267 | "re.split('\\W+', 'Words, words, words.')" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 14, 273 | "metadata": { 274 | "collapsed": true 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "words_alphanumeric = re.split('\\W+', text)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 15, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/plain": [ 289 | "(109111, 107431)" 290 | ] 291 | }, 292 | "execution_count": 15, 293 | "metadata": {}, 294 | "output_type": "execute_result" 295 | } 296 | ], 297 | "source": [ 298 | "len(words_alphanumeric), len(words)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 16, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "name": "stdout", 308 | "output_type": "stream", 309 | "text": [ 310 | "['BOHEMIA', 'I', 'To', 'Sherlock', 'Holmes', 'she', 'is', 'always', 'THE', 'woman', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name', 'In', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex', 'It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'Irene', 'Adler', 'All', 'emotions', 'and', 'that', 'one', 'particularly', 'were', 'abhorrent', 'to', 'his', 'cold', 'precise', 'but', 'admirably', 'balanced', 'mind', 'He', 'was', 'I', 'take', 'it', 'the', 'most', 'perfect', 'reasoning', 'and', 'observing', 'machine', 'that', 'the', 'world', 'has', 'seen', 'but', 'as', 'a', 'lover', 'he', 'would', 'have', 'placed', 'himself', 'in', 'a', 'false', 'position', 'He', 'never', 'spoke', 'of', 'the', 'softer', 'passions', 'save', 'with', 'a', 'gibe', 'and', 'a', 'sneer', 'They', 'were', 'admirable']\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "print(words_alphanumeric[90:200])" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 17, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "['Isn', 't', 'he', 'coming', 'home', 'for', 'dinner', 'with', 'the', 'red', 'headed', 'girl', '']\n" 328 | ] 329 | } 330 | ], 331 | "source": [ 332 | "words_break = re.split('\\W+', \"Isn't he coming home for dinner with the red-headed girl?\")\n", 333 | "print(words_break)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "### spaCy for Tokenization" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 18, 346 | "metadata": {}, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | "Wall time: 2.46 s\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "%%time\n", 358 | "import spacy\n", 359 | "nlp = spacy.load('en')" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 19, 365 | "metadata": { 366 | "collapsed": true 367 | }, 368 | "outputs": [], 369 | "source": [ 370 | "doc = nlp(text)" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 20, 376 | "metadata": {}, 377 | "outputs": [ 378 | { 379 | "name": "stdout", 380 | "output_type": "stream", 381 | "text": [ 382 | "[whole, of, her, sex, ., It, was, not, that, he, felt, \n", 383 | ", any, emotion, akin, to, love, for, Irene, Adler, ., All, emotions, ,, and, that, \n", 384 | ", one, particularly, ,, were, abhorrent, to, his, cold, ,, precise, but, \n", 385 | ", admirably, balanced, mind, ., He, was, ,, I, take, it, ,]\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "print(list(doc)[150:200])" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": {}, 396 | "source": [ 397 | "Conveniently, spaCy tokenizes all *punctuations and words* and returned those as individual tokens as well. Let's try the example which we didn't like earlier:" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 21, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "[Is, n't, he, coming, home, for, dinner, with, the, red, -, headed, girl, ?]\n" 410 | ] 411 | } 412 | ], 413 | "source": [ 414 | "words = nlp(\"Isn't he coming home for dinner with the red-headed girl?\")\n", 415 | "print([token for token in words])" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 22, 421 | "metadata": {}, 422 | "outputs": [ 423 | { 424 | "name": "stdout", 425 | "output_type": "stream", 426 | "text": [ 427 | "[I. A SCANDAL IN BOHEMIA\n", 428 | "\n", 429 | "I.\n", 430 | "\n", 431 | "To Sherlock Holmes, she is always THE woman., I have seldom heard\n", 432 | "him mention her under any other name., In his eyes she eclipses\n", 433 | "and predominates the whole of her sex., It was not that he felt\n", 434 | "any emotion akin to love for Irene Adler.]\n" 435 | ] 436 | } 437 | ], 438 | "source": [ 439 | "sentences = list(doc.sents)\n", 440 | "print(sentences[13:18])" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "#### STOP WORD REMOVAL & CASE CHANGE" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "spaCy has already marked each token as a stop word or not and stored it in `is_stop` attribute of each token. This makes it very handy for text cleaning. Let's take a quick look: " 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 49, 460 | "metadata": { 461 | "collapsed": true 462 | }, 463 | "outputs": [], 464 | "source": [ 465 | "sentence_example = \"the AI/AGI uprising cannot happen without the progress of NLP\"" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 54, 471 | "metadata": {}, 472 | "outputs": [ 473 | { 474 | "data": { 475 | "text/plain": [ 476 | "[(the, True, False),\n", 477 | " (AI, False, False),\n", 478 | " (/, False, True),\n", 479 | " (AGI, True, False),\n", 480 | " (uprising, False, False),\n", 481 | " (can, True, False),\n", 482 | " (not, True, False),\n", 483 | " (happen, False, False),\n", 484 | " (without, True, False),\n", 485 | " (the, True, False),\n", 486 | " (progress, False, False),\n", 487 | " (of, True, False),\n", 488 | " (NLP, True, False)]" 489 | ] 490 | }, 491 | "execution_count": 54, 492 | "metadata": {}, 493 | "output_type": "execute_result" 494 | } 495 | ], 496 | "source": [ 497 | "[(token, token.is_stop, token.is_punct) for token in nlp(sentence_example)]" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 57, 503 | "metadata": {}, 504 | "outputs": [ 505 | { 506 | "name": "stdout", 507 | "output_type": "stream", 508 | "text": [ 509 | "THE False False\n", 510 | "ADVENTURES False False\n", 511 | "OF False False\n", 512 | "SHERLOCK False False\n", 513 | "HOLMES False False\n" 514 | ] 515 | } 516 | ], 517 | "source": [ 518 | "for token in doc[:5]:\n", 519 | " print(token, token.is_stop, token.is_punct)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 30, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "text_lower = text.lower() # native python function\n", 529 | "doc_lower = nlp(text_lower)" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 32, 535 | "metadata": {}, 536 | "outputs": [ 537 | { 538 | "name": "stdout", 539 | "output_type": "stream", 540 | "text": [ 541 | "the True\n", 542 | "adventures False\n", 543 | "of True\n", 544 | "sherlock False\n", 545 | "holmes False\n" 546 | ] 547 | } 548 | ], 549 | "source": [ 550 | "for token in doc_lower[:5]:\n", 551 | " print(token, token.is_stop)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 28, 557 | "metadata": {}, 558 | "outputs": [ 559 | { 560 | "data": { 561 | "text/plain": [ 562 | "'spaCy has a dictionary of 305 stop words'" 563 | ] 564 | }, 565 | "execution_count": 28, 566 | "metadata": {}, 567 | "output_type": "execute_result" 568 | } 569 | ], 570 | "source": [ 571 | "from spacy.lang.en.stop_words import STOP_WORDS\n", 572 | "f'spaCy has a dictionary of {len(list(STOP_WORDS))} stop words'" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 58, 578 | "metadata": { 579 | "collapsed": true 580 | }, 581 | "outputs": [], 582 | "source": [ 583 | "domain_stop_words = [\"NLP\", \"Processing\", \"AGI\"]\n", 584 | "for word in domain_stop_words:\n", 585 | " STOP_WORDS.add(word)" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 59, 591 | "metadata": {}, 592 | "outputs": [ 593 | { 594 | "data": { 595 | "text/plain": [ 596 | "[(the, True, False),\n", 597 | " (AI, False, False),\n", 598 | " (/, False, True),\n", 599 | " (AGI, True, False),\n", 600 | " (uprising, False, False),\n", 601 | " (can, True, False),\n", 602 | " (not, True, False),\n", 603 | " (happen, False, False),\n", 604 | " (without, True, False),\n", 605 | " (the, True, False),\n", 606 | " (progress, False, False),\n", 607 | " (of, True, False),\n", 608 | " (NLP, True, False)]" 609 | ] 610 | }, 611 | "execution_count": 59, 612 | "metadata": {}, 613 | "output_type": "execute_result" 614 | } 615 | ], 616 | "source": [ 617 | "[(token, token.is_stop, token.is_punct) for token in nlp(sentence_example)]" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 61, 623 | "metadata": {}, 624 | "outputs": [ 625 | { 626 | "data": { 627 | "text/plain": [ 628 | "['AI', 'uprising', 'happen', 'progress']" 629 | ] 630 | }, 631 | "execution_count": 61, 632 | "metadata": {}, 633 | "output_type": "execute_result" 634 | } 635 | ], 636 | "source": [ 637 | "[str(token) for token in nlp(sentence_example) if not token.is_stop and not token.is_punct]" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 62, 643 | "metadata": {}, 644 | "outputs": [ 645 | { 646 | "data": { 647 | "text/plain": [ 648 | "['AI', '/', 'uprising', 'happen', 'progress']" 649 | ] 650 | }, 651 | "execution_count": 62, 652 | "metadata": {}, 653 | "output_type": "execute_result" 654 | } 655 | ], 656 | "source": [ 657 | "[str(token) for token in nlp(sentence_example) if not token.is_stop]" 658 | ] 659 | }, 660 | { 661 | "cell_type": "markdown", 662 | "metadata": {}, 663 | "source": [ 664 | "## Stemming and Lemmatization" 665 | ] 666 | }, 667 | { 668 | "cell_type": "markdown", 669 | "metadata": {}, 670 | "source": [ 671 | "### spaCy for Lemmatization\n", 672 | "**spaCy only supports lemmatization** " 673 | ] 674 | }, 675 | { 676 | "cell_type": "markdown", 677 | "metadata": {}, 678 | "source": [ 679 | "An underscore at end, such as `lemma_` tells spaCy we are looking for something which is human readable. spaCy stores the internal hash or identifier which spaCy stores in `token.lemma`. " 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": 72, 685 | "metadata": {}, 686 | "outputs": [ 687 | { 688 | "data": { 689 | "text/plain": [ 690 | "[(Their, '-PRON-', 561228191312463089, 'ADJ'),\n", 691 | " (Apples, 'apples', 14374618037326464786, 'PROPN'),\n", 692 | " (&, '&', 15473034735919704609, 'CCONJ'),\n", 693 | " (Banana, 'banana', 2525716904149915114, 'PROPN'),\n", 694 | " (fruit, 'fruit', 17674554054627885835, 'NOUN'),\n", 695 | " (salads, 'salad', 16382906660984395826, 'NOUN'),\n", 696 | " (are, 'be', 10382539506755952630, 'VERB'),\n", 697 | " (amazing, 'amazing', 12968186374132960503, 'ADJ'),\n", 698 | " (., '.', 12646065887601541794, 'PUNCT'),\n", 699 | " (Would, 'would', 6992604926141104606, 'VERB'),\n", 700 | " (you, '-PRON-', 561228191312463089, 'PRON'),\n", 701 | " (like, 'like', 18194338103975822726, 'VERB'),\n", 702 | " (meeting, 'meet', 6880656908171229526, 'VERB'),\n", 703 | " (me, '-PRON-', 561228191312463089, 'PRON'),\n", 704 | " (at, 'at', 11667289587015813222, 'ADP'),\n", 705 | " (the, 'the', 7425985699627899538, 'DET'),\n", 706 | " (cafe, 'cafe', 10569699879655997926, 'NOUN'),\n", 707 | " (?, '?', 8205403955989537350, 'PUNCT')]" 708 | ] 709 | }, 710 | "execution_count": 72, 711 | "metadata": {}, 712 | "output_type": "execute_result" 713 | } 714 | ], 715 | "source": [ 716 | "lemma_sentence_example = \"Their Apples & Banana fruit salads are amazing. Would you like meeting me at the cafe?\"\n", 717 | "[(token, token.lemma_, token.lemma, token.pos_ ) for token in nlp(lemma_sentence_example)]" 718 | ] 719 | } 720 | ], 721 | "metadata": { 722 | "kernelspec": { 723 | "display_name": "Python [conda env:nlp]", 724 | "language": "python", 725 | "name": "conda-env-nlp-py" 726 | }, 727 | "language_info": { 728 | "codemirror_mode": { 729 | "name": "ipython", 730 | "version": 3 731 | }, 732 | "file_extension": ".py", 733 | "mimetype": "text/x-python", 734 | "name": "python", 735 | "nbconvert_exporter": "python", 736 | "pygments_lexer": "ipython3", 737 | "version": "3.6.6" 738 | } 739 | }, 740 | "nbformat": 4, 741 | "nbformat_minor": 2 742 | } 743 | -------------------------------------------------------------------------------- /04_Text_Representations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "D:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\gensim\\utils.py:1197: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n", 13 | " warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n" 14 | ] 15 | }, 16 | { 17 | "name": "stdout", 18 | "output_type": "stream", 19 | "text": [ 20 | "gensim: 3.4.0\n" 21 | ] 22 | } 23 | ], 24 | "source": [ 25 | "import gensim\n", 26 | "print(f'gensim: {gensim.__version__}')" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "Let's download some pre-trained GLove embeddings: " 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Solving environment: ...working... done\n", 46 | "\n", 47 | "# All requested packages already installed.\n", 48 | "\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "!conda install -y tqdm" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "from tqdm import tqdm\n", 63 | "class TqdmUpTo(tqdm):\n", 64 | " def update_to(self, b=1, bsize=1, tsize=None):\n", 65 | " if tsize is not None: self.total = tsize\n", 66 | " self.update(b * bsize - self.n)\n", 67 | "\n", 68 | "def get_data(url, filename):\n", 69 | " \"\"\"\n", 70 | " Download data if the filename does not exist already\n", 71 | " Uses Tqdm to show download progress\n", 72 | " \"\"\"\n", 73 | " import os\n", 74 | " from urllib.request import urlretrieve\n", 75 | " \n", 76 | " if not os.path.exists(filename):\n", 77 | "\n", 78 | " dirname = os.path.dirname(filename)\n", 79 | " if not os.path.exists(dirname):\n", 80 | " os.makedirs(dirname)\n", 81 | "\n", 82 | " with TqdmUpTo(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:\n", 83 | " urlretrieve(url, filename, reporthook=t.update_to)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "embedding_url = 'http://nlp.stanford.edu/data/glove.6B.zip'\n", 93 | "get_data(embedding_url, 'data/glove.6B.zip')" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "Archive: data/glove.6B.zip\n", 106 | " inflating: glove.6B.50d.txt \n", 107 | " inflating: glove.6B.100d.txt \n", 108 | " inflating: glove.6B.200d.txt \n", 109 | " inflating: glove.6B.300d.txt \n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "# # We need to run this only once, can unzip manually unzip to the data directory too\n", 115 | "!unzip data/glove.6B.zip \n", 116 | "!mv glove.6B.300d.txt data/glove.6B.300d.txt \n", 117 | "!mv glove.6B.200d.txt data/glove.6B.200d.txt \n", 118 | "!mv glove.6B.100d.txt data/glove.6B.100d.txt \n", 119 | "!mv glove.6B.50d.txt data/glove.6B.50d.txt " 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 6, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "from gensim.scripts.glove2word2vec import glove2word2vec\n", 129 | "glove_input_file = 'data/glove.6B.300d.txt'\n", 130 | "\n", 131 | "word2vec_output_file = 'data/glove.6B.300d.word2vec.txt'" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "import os\n", 141 | "if not os.path.exists(word2vec_output_file):\n", 142 | " glove2word2vec(glove_input_file, word2vec_output_file)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### KeyedVectors API" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 8, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "from gensim.models import KeyedVectors\n", 159 | "filename = word2vec_output_file " 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 9, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "name": "stdout", 169 | "output_type": "stream", 170 | "text": [ 171 | "Wall time: 1min 24s\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "%%time\n", 177 | "# load the Stanford GloVe model from file, this is Disk I/O and can be slow\n", 178 | "pretrained_w2v_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)\n", 179 | "# binary=False format for human readable text (.txt) files, and binary=True for .bin files " 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 10, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "name": "stderr", 189 | "output_type": "stream", 190 | "text": [ 191 | "D:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\ipykernel_launcher.py:2: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n", 192 | " \n" 193 | ] 194 | }, 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "[('queen', 0.6713277101516724)]\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "# calculate: (king - man) + woman = ?\n", 205 | "result = pretrained_w2v_model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)\n", 206 | "print(result)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 11, 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "[('twitter', 0.37966805696487427)]\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "# calculate: (india - canada) + = ?\n", 224 | "result = pretrained_w2v_model.most_similar(positive=['quora', 'facebook'], negative=['linkedin'], topn=1)\n", 225 | "print(result)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 12, 231 | "metadata": { 232 | "scrolled": true 233 | }, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "[('indian', 0.7355823516845703),\n", 239 | " ('pakistan', 0.7285579442977905),\n", 240 | " ('delhi', 0.6846905946731567),\n", 241 | " ('bangladesh', 0.620319128036499),\n", 242 | " ('lanka', 0.609517514705658),\n", 243 | " ('sri', 0.6011613607406616),\n", 244 | " ('kashmir', 0.5746493935585022),\n", 245 | " ('nepal', 0.5421023964881897),\n", 246 | " ('pradesh', 0.5405810475349426),\n", 247 | " ('maharashtra', 0.518537700176239)]" 248 | ] 249 | }, 250 | "execution_count": 12, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "pretrained_w2v_model.most_similar('india')" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "#### What is missing in both word2vec and GloVe? " 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 13, 269 | "metadata": { 270 | "scrolled": true 271 | }, 272 | "outputs": [ 273 | { 274 | "name": "stderr", 275 | "output_type": "stream", 276 | "text": [ 277 | "D:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\ipykernel_launcher.py:2: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n", 278 | " \n" 279 | ] 280 | }, 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "\"word 'nirant' not in vocabulary\"\n" 286 | ] 287 | } 288 | ], 289 | "source": [ 290 | "try:\n", 291 | " pretrained_w2v_model.wv.most_similar('nirant')\n", 292 | "except Exception as e:\n", 293 | " print(e)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "### How to handle OOV words? " 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 14, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "ted_dataset = \"https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip\"\n", 310 | "get_data(ted_dataset, \"data/ted_en.zip\")" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 15, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "import zipfile\n", 320 | "import lxml.etree\n", 321 | "with zipfile.ZipFile('data/ted_en.zip', 'r') as z:\n", 322 | " doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))\n", 323 | "input_text = '\\n'.join(doc.xpath('//content/text()'))" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 16, 329 | "metadata": { 330 | "scrolled": true 331 | }, 332 | "outputs": [ 333 | { 334 | "data": { 335 | "text/plain": [ 336 | "\"Here are two reasons companies fail: they only do more of the same, or they only do what's new.\\nTo me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation. Both are necessary, but it can be too much of a good thing.\\nConsider Facit. I'm actually old enough to remember them. Facit was a fantastic company. They were born deep in the Swedish forest, and they made the best mechanical calculators in the world. Everybody used them. A\"" 337 | ] 338 | }, 339 | "execution_count": 16, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "input_text[:500]" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 17, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "import re\n", 355 | "# remove parenthesis \n", 356 | "input_text_noparens = re.sub(r'\\([^)]*\\)', '', input_text)\n", 357 | "\n", 358 | "# store as list of sentences\n", 359 | "sentences_strings_ted = []\n", 360 | "for line in input_text_noparens.split('\\n'):\n", 361 | " m = re.match(r'^(?:(?P[^:]{,20}):)?(?P.*)$', line)\n", 362 | " sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)\n", 363 | "\n", 364 | "# store as list of lists of words\n", 365 | "sentences_ted = []\n", 366 | "for sent_str in sentences_strings_ted:\n", 367 | " tokens = re.sub(r\"[^a-z0-9]+\", \" \", sent_str.lower()).split()\n", 368 | " sentences_ted.append(tokens)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 18, 374 | "metadata": { 375 | "scrolled": true 376 | }, 377 | "outputs": [ 378 | { 379 | "name": "stdout", 380 | "output_type": "stream", 381 | "text": [ 382 | "[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'], ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']]\n" 383 | ] 384 | } 385 | ], 386 | "source": [ 387 | "print(sentences_ted[:2])" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 19, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "import json\n", 397 | "with open('ted_clean_sentences.json', 'w') as fp:\n", 398 | " json.dump(sentences_ted, fp)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 20, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "with open('ted_clean_sentences.json', 'r') as fp:\n", 408 | " sentences_ted = json.load(fp)" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 21, 414 | "metadata": {}, 415 | "outputs": [ 416 | { 417 | "name": "stdout", 418 | "output_type": "stream", 419 | "text": [ 420 | "[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'], ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']]\n" 421 | ] 422 | } 423 | ], 424 | "source": [ 425 | "print(sentences_ted[:2])" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "### Train FastText Embedddings" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 22, 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "from gensim.models.fasttext import FastText" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 23, 447 | "metadata": {}, 448 | "outputs": [ 449 | { 450 | "name": "stdout", 451 | "output_type": "stream", 452 | "text": [ 453 | "Wall time: 5.48 s\n" 454 | ] 455 | } 456 | ], 457 | "source": [ 458 | "%%time\n", 459 | "fasttext_ted_model = FastText(sentences_ted, size=100, window=5, min_count=5, workers=-1, sg=1)\n", 460 | "# sg = 1 denotes skipgram, else CBOW is used" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 24, 466 | "metadata": {}, 467 | "outputs": [ 468 | { 469 | "data": { 470 | "text/plain": [ 471 | "[('indians', 0.5911639928817749),\n", 472 | " ('indian', 0.5406097769737244),\n", 473 | " ('indiana', 0.4898717999458313),\n", 474 | " ('indicated', 0.44004374742507935),\n", 475 | " ('indicate', 0.4042605757713318),\n", 476 | " ('internal', 0.39166826009750366),\n", 477 | " ('interior', 0.3871103823184967),\n", 478 | " ('byproducts', 0.37529298663139343),\n", 479 | " ('princesses', 0.37265270948410034),\n", 480 | " ('indications', 0.369659960269928)]" 481 | ] 482 | }, 483 | "execution_count": 24, 484 | "metadata": {}, 485 | "output_type": "execute_result" 486 | } 487 | ], 488 | "source": [ 489 | "fasttext_ted_model.wv.most_similar(\"india\")" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": {}, 495 | "source": [ 496 | "### Train word2vec Embeddings" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 25, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "from gensim.models.word2vec import Word2Vec" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 26, 511 | "metadata": {}, 512 | "outputs": [ 513 | { 514 | "name": "stdout", 515 | "output_type": "stream", 516 | "text": [ 517 | "Wall time: 1.44 s\n" 518 | ] 519 | } 520 | ], 521 | "source": [ 522 | "%%time\n", 523 | "word2vec_ted_model = Word2Vec(sentences=sentences_ted, size=100, window=5, min_count=5, workers=-1, sg=1)" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": 27, 529 | "metadata": {}, 530 | "outputs": [ 531 | { 532 | "data": { 533 | "text/plain": [ 534 | "[('bordered', 0.41709238290786743),\n", 535 | " ('hovering', 0.4083016514778137),\n", 536 | " ('almost', 0.3865964710712433),\n", 537 | " ('sad', 0.3704090118408203),\n", 538 | " ('supporters', 0.3616541624069214),\n", 539 | " ('spite', 0.3598758280277252),\n", 540 | " ('wrinkles', 0.3590206205844879),\n", 541 | " ('guaranteed', 0.3535975515842438),\n", 542 | " ('hd', 0.3512127995491028),\n", 543 | " ('assistant', 0.346971333026886)]" 544 | ] 545 | }, 546 | "execution_count": 27, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "word2vec_ted_model.wv.most_similar(\"india\")" 553 | ] 554 | }, 555 | { 556 | "cell_type": "markdown", 557 | "metadata": {}, 558 | "source": [ 559 | "## fastText or word2vec? " 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": {}, 565 | "source": [ 566 | "# Document Embeddings" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 28, 572 | "metadata": {}, 573 | "outputs": [], 574 | "source": [ 575 | "from gensim.models.doc2vec import Doc2Vec, TaggedDocument\n", 576 | "import gensim\n", 577 | "from pprint import pprint\n", 578 | "import multiprocessing" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": 29, 584 | "metadata": {}, 585 | "outputs": [], 586 | "source": [ 587 | "import zipfile\n", 588 | "import lxml.etree\n", 589 | "with zipfile.ZipFile('data/ted_en.zip', 'r') as z:\n", 590 | " doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))\n", 591 | " \n", 592 | "talks = doc.xpath('//content/text()')" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 30, 598 | "metadata": {}, 599 | "outputs": [], 600 | "source": [ 601 | "def read_corpus(talks, tokens_only=False):\n", 602 | " for i, line in enumerate(talks):\n", 603 | " if tokens_only:\n", 604 | " yield gensim.utils.simple_preprocess(line)\n", 605 | " else:\n", 606 | " # For training data, add tags\n", 607 | " yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": 31, 613 | "metadata": {}, 614 | "outputs": [ 615 | { 616 | "data": { 617 | "text/plain": [ 618 | "" 619 | ] 620 | }, 621 | "execution_count": 31, 622 | "metadata": {}, 623 | "output_type": "execute_result" 624 | } 625 | ], 626 | "source": [ 627 | "read_corpus(talks)" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 32, 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "ted_talk_docs = list(read_corpus(talks)) " 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 33, 642 | "metadata": {}, 643 | "outputs": [ 644 | { 645 | "data": { 646 | "text/plain": [ 647 | "TaggedDocument(words=['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 'new', 'to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation', 'both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'good', 'thing', 'consider', 'facit', 'actually', 'old', 'enough', 'to', 'remember', 'them', 'facit', 'was', 'fantastic', 'company', 'they', 'were', 'born', 'deep', 'in', 'the', 'swedish', 'forest', 'and', 'they', 'made', 'the', 'best', 'mechanical', 'calculators', 'in', 'the', 'world', 'everybody', 'used', 'them', 'and', 'what', 'did', 'facit', 'do', 'when', 'the', 'electronic', 'calculator', 'came', 'along', 'they', 'continued', 'doing', 'exactly', 'the', 'same', 'in', 'six', 'months', 'they', 'went', 'from', 'maximum', 'revenue', 'and', 'they', 'were', 'gone', 'gone', 'to', 'me', 'the', 'irony', 'about', 'the', 'facit', 'story', 'is', 'hearing', 'about', 'the', 'facit', 'engineers', 'who', 'had', 'bought', 'cheap', 'small', 'electronic', 'calculators', 'in', 'japan', 'that', 'they', 'used', 'to', 'double', 'check', 'their', 'calculators', 'laughter', 'facit', 'did', 'too', 'much', 'exploitation', 'but', 'exploration', 'can', 'go', 'wild', 'too', 'few', 'years', 'back', 'worked', 'closely', 'alongside', 'european', 'biotech', 'company', 'let', 'call', 'them', 'oncosearch', 'the', 'company', 'was', 'brilliant', 'they', 'had', 'applications', 'that', 'promised', 'to', 'diagnose', 'even', 'cure', 'certain', 'forms', 'of', 'blood', 'cancer', 'every', 'day', 'was', 'about', 'creating', 'something', 'new', 'they', 'were', 'extremely', 'innovative', 'and', 'the', 'mantra', 'was', 'when', 'we', 'only', 'get', 'it', 'right', 'or', 'even', 'we', 'want', 'it', 'perfect', 'the', 'sad', 'thing', 'is', 'before', 'they', 'became', 'perfect', 'even', 'good', 'enough', 'they', 'became', 'obsolete', 'oncosearch', 'did', 'too', 'much', 'exploration', 'first', 'heard', 'about', 'exploration', 'and', 'exploitation', 'about', 'years', 'ago', 'when', 'worked', 'as', 'visiting', 'scholar', 'at', 'stanford', 'university', 'the', 'founder', 'of', 'the', 'idea', 'is', 'jim', 'march', 'and', 'to', 'me', 'the', 'power', 'of', 'the', 'idea', 'is', 'its', 'practicality', 'exploration', 'exploration', 'is', 'about', 'coming', 'up', 'with', 'what', 'new', 'it', 'about', 'search', 'it', 'about', 'discovery', 'it', 'about', 'new', 'products', 'it', 'about', 'new', 'innovations', 'it', 'about', 'changing', 'our', 'frontiers', 'our', 'heroes', 'are', 'people', 'who', 'have', 'done', 'exploration', 'madame', 'curie', 'picasso', 'neil', 'armstrong', 'sir', 'edmund', 'hillary', 'etc', 'come', 'from', 'norway', 'all', 'our', 'heroes', 'are', 'explorers', 'and', 'they', 'deserve', 'to', 'be', 'we', 'all', 'know', 'that', 'exploration', 'is', 'risky', 'we', 'don', 'know', 'the', 'answers', 'we', 'don', 'know', 'if', 'we', 're', 'going', 'to', 'find', 'them', 'and', 'we', 'know', 'that', 'the', 'risks', 'are', 'high', 'exploitation', 'is', 'the', 'opposite', 'exploitation', 'is', 'taking', 'the', 'knowledge', 'we', 'have', 'and', 'making', 'good', 'better', 'exploitation', 'is', 'about', 'making', 'our', 'trains', 'run', 'on', 'time', 'it', 'about', 'making', 'good', 'products', 'faster', 'and', 'cheaper', 'exploitation', 'is', 'not', 'risky', 'in', 'the', 'short', 'term', 'but', 'if', 'we', 'only', 'exploit', 'it', 'very', 'risky', 'in', 'the', 'long', 'term', 'and', 'think', 'we', 'all', 'have', 'memories', 'of', 'the', 'famous', 'pop', 'groups', 'who', 'keep', 'singing', 'the', 'same', 'songs', 'again', 'and', 'again', 'until', 'they', 'become', 'obsolete', 'or', 'even', 'pathetic', 'that', 'the', 'risk', 'of', 'exploitation', 'so', 'if', 'we', 'take', 'long', 'term', 'perspective', 'we', 'explore', 'if', 'we', 'take', 'short', 'term', 'perspective', 'we', 'exploit', 'small', 'children', 'they', 'explore', 'all', 'day', 'all', 'day', 'it', 'about', 'exploration', 'as', 'we', 'grow', 'older', 'we', 'explore', 'less', 'because', 'we', 'have', 'more', 'knowledge', 'to', 'exploit', 'on', 'the', 'same', 'goes', 'for', 'companies', 'companies', 'become', 'by', 'nature', 'less', 'innovative', 'as', 'they', 'become', 'more', 'competent', 'and', 'this', 'is', 'of', 'course', 'big', 'worry', 'to', 'ceos', 'and', 'hear', 'very', 'often', 'questions', 'phrased', 'in', 'different', 'ways', 'for', 'example', 'how', 'can', 'both', 'effectively', 'run', 'and', 'reinvent', 'my', 'company', 'or', 'how', 'can', 'make', 'sure', 'that', 'our', 'company', 'changes', 'before', 'we', 'become', 'obsolete', 'or', 'are', 'hit', 'by', 'crisis', 'so', 'doing', 'one', 'well', 'is', 'difficult', 'doing', 'both', 'well', 'as', 'the', 'same', 'time', 'is', 'art', 'pushing', 'both', 'exploration', 'and', 'exploitation', 'so', 'one', 'thing', 'we', 've', 'found', 'is', 'only', 'about', 'two', 'percent', 'of', 'companies', 'are', 'able', 'to', 'effectively', 'explore', 'and', 'exploit', 'at', 'the', 'same', 'time', 'in', 'parallel', 'but', 'when', 'they', 'do', 'the', 'payoffs', 'are', 'huge', 'so', 'we', 'have', 'lots', 'of', 'great', 'examples', 'we', 'have', 'nestlé', 'creating', 'nespresso', 'we', 'have', 'lego', 'going', 'into', 'animated', 'films', 'toyota', 'creating', 'the', 'hybrids', 'unilever', 'pushing', 'into', 'sustainability', 'there', 'are', 'lots', 'of', 'examples', 'and', 'the', 'benefits', 'are', 'huge', 'why', 'is', 'balancing', 'so', 'difficult', 'think', 'it', 'difficult', 'because', 'there', 'are', 'so', 'many', 'traps', 'that', 'keep', 'us', 'where', 'we', 'are', 'so', 'll', 'talk', 'about', 'two', 'but', 'there', 'are', 'many', 'so', 'let', 'talk', 'about', 'the', 'perpetual', 'search', 'trap', 'we', 'discover', 'something', 'but', 'we', 'don', 'have', 'the', 'patience', 'or', 'the', 'persistence', 'to', 'get', 'at', 'it', 'and', 'make', 'it', 'work', 'so', 'instead', 'of', 'staying', 'with', 'it', 'we', 'create', 'something', 'new', 'but', 'the', 'same', 'goes', 'for', 'that', 'then', 'we', 're', 'in', 'the', 'vicious', 'circle', 'of', 'actually', 'coming', 'up', 'with', 'ideas', 'but', 'being', 'frustrated', 'oncosearch', 'was', 'good', 'example', 'famous', 'example', 'is', 'of', 'course', 'xerox', 'but', 'we', 'don', 'only', 'see', 'this', 'in', 'companies', 'we', 'see', 'this', 'in', 'the', 'public', 'sector', 'as', 'well', 'we', 'all', 'know', 'that', 'any', 'kind', 'of', 'effective', 'reform', 'of', 'education', 'research', 'health', 'care', 'even', 'defense', 'takes', 'maybe', 'years', 'to', 'work', 'but', 'still', 'we', 'change', 'much', 'more', 'often', 'we', 'really', 'don', 'give', 'them', 'the', 'chance', 'another', 'trap', 'is', 'the', 'success', 'trap', 'facit', 'fell', 'into', 'the', 'success', 'trap', 'they', 'literally', 'held', 'the', 'future', 'in', 'their', 'hands', 'but', 'they', 'couldn', 'see', 'it', 'they', 'were', 'simply', 'so', 'good', 'at', 'making', 'what', 'they', 'loved', 'doing', 'that', 'they', 'wouldn', 'change', 'we', 'are', 'like', 'that', 'too', 'when', 'we', 'know', 'something', 'well', 'it', 'difficult', 'to', 'change', 'bill', 'gates', 'has', 'said', 'success', 'is', 'lousy', 'teacher', 'it', 'seduces', 'us', 'into', 'thinking', 'we', 'cannot', 'fail', 'that', 'the', 'challenge', 'with', 'success', 'so', 'think', 'there', 'are', 'some', 'lessons', 'and', 'think', 'they', 'apply', 'to', 'us', 'and', 'they', 'apply', 'to', 'our', 'companies', 'the', 'first', 'lesson', 'is', 'get', 'ahead', 'of', 'the', 'crisis', 'and', 'any', 'company', 'that', 'able', 'to', 'innovate', 'is', 'actually', 'able', 'to', 'also', 'buy', 'an', 'insurance', 'in', 'the', 'future', 'netflix', 'they', 'could', 'so', 'easily', 'have', 'been', 'content', 'with', 'earlier', 'generations', 'of', 'distribution', 'but', 'they', 'always', 'and', 'think', 'they', 'will', 'always', 'keep', 'pushing', 'for', 'the', 'next', 'battle', 'see', 'other', 'companies', 'that', 'say', 'll', 'win', 'the', 'next', 'innovation', 'cycle', 'whatever', 'it', 'takes', 'second', 'one', 'think', 'in', 'multiple', 'time', 'scales', 'll', 'share', 'chart', 'with', 'you', 'and', 'think', 'it', 'wonderful', 'one', 'any', 'company', 'we', 'look', 'at', 'taking', 'one', 'year', 'perspective', 'and', 'looking', 'at', 'the', 'valuation', 'of', 'the', 'company', 'innovation', 'typically', 'accounts', 'for', 'only', 'about', 'percent', 'so', 'when', 'we', 'think', 'one', 'year', 'innovation', 'isn', 'really', 'that', 'important', 'move', 'ahead', 'take', 'year', 'perspective', 'on', 'the', 'same', 'company', 'suddenly', 'innovation', 'and', 'ability', 'to', 'renew', 'account', 'for', 'percent', 'but', 'companies', 'can', 'choose', 'they', 'need', 'to', 'fund', 'the', 'journey', 'and', 'lead', 'the', 'long', 'term', 'third', 'invite', 'talent', 'don', 'think', 'it', 'possible', 'for', 'any', 'of', 'us', 'to', 'be', 'able', 'to', 'balance', 'exploration', 'and', 'exploitation', 'by', 'ourselves', 'think', 'it', 'team', 'sport', 'think', 'we', 'need', 'to', 'allow', 'challenging', 'think', 'the', 'mark', 'of', 'great', 'company', 'is', 'being', 'open', 'to', 'be', 'challenged', 'and', 'the', 'mark', 'of', 'good', 'corporate', 'board', 'is', 'to', 'constructively', 'challenge', 'think', 'that', 'also', 'what', 'good', 'parenting', 'is', 'about', 'last', 'one', 'be', 'skeptical', 'of', 'success', 'maybe', 'it', 'useful', 'to', 'think', 'back', 'at', 'the', 'old', 'triumph', 'marches', 'in', 'rome', 'when', 'the', 'generals', 'after', 'big', 'victory', 'were', 'given', 'their', 'celebration', 'riding', 'into', 'rome', 'on', 'the', 'carriage', 'they', 'always', 'had', 'companion', 'whispering', 'in', 'their', 'ear', 'remember', 'you', 're', 'only', 'human', 'so', 'hope', 'made', 'the', 'point', 'balancing', 'exploration', 'and', 'exploitation', 'has', 'huge', 'payoff', 'but', 'it', 'difficult', 'and', 'we', 'need', 'to', 'be', 'conscious', 'want', 'to', 'just', 'point', 'out', 'two', 'questions', 'that', 'think', 'are', 'useful', 'first', 'question', 'is', 'looking', 'at', 'your', 'own', 'company', 'in', 'which', 'areas', 'do', 'you', 'see', 'that', 'the', 'company', 'is', 'at', 'the', 'risk', 'of', 'falling', 'into', 'success', 'traps', 'of', 'just', 'going', 'on', 'autopilot', 'and', 'what', 'can', 'you', 'do', 'to', 'challenge', 'second', 'question', 'is', 'when', 'did', 'explore', 'something', 'new', 'last', 'and', 'what', 'kind', 'of', 'effect', 'did', 'it', 'have', 'on', 'me', 'is', 'that', 'something', 'should', 'do', 'more', 'of', 'in', 'my', 'case', 'yes', 'so', 'let', 'me', 'leave', 'you', 'with', 'this', 'whether', 'you', 're', 'an', 'explorer', 'by', 'nature', 'or', 'whether', 'you', 'tend', 'to', 'exploit', 'what', 'you', 'already', 'know', 'don', 'forget', 'the', 'beauty', 'is', 'in', 'the', 'balance', 'thank', 'you', 'applause'], tags=[0])" 648 | ] 649 | }, 650 | "execution_count": 33, 651 | "metadata": {}, 652 | "output_type": "execute_result" 653 | } 654 | ], 655 | "source": [ 656 | "ted_talk_docs[0]" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 34, 662 | "metadata": {}, 663 | "outputs": [ 664 | { 665 | "name": "stdout", 666 | "output_type": "stream", 667 | "text": [ 668 | "8\n" 669 | ] 670 | } 671 | ], 672 | "source": [ 673 | "cores = multiprocessing.cpu_count()\n", 674 | "print(cores)" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": 35, 680 | "metadata": {}, 681 | "outputs": [], 682 | "source": [ 683 | "model = Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, epochs=5, workers=cores)" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": 36, 689 | "metadata": {}, 690 | "outputs": [ 691 | { 692 | "name": "stdout", 693 | "output_type": "stream", 694 | "text": [ 695 | "Wall time: 1.4 s\n" 696 | ] 697 | } 698 | ], 699 | "source": [ 700 | "%time model.build_vocab(ted_talk_docs)" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": 37, 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "sentence_1 = 'Modern medicine has changed the way we think about healthcare, life spans and by extension career and marriage'" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 38, 715 | "metadata": {}, 716 | "outputs": [], 717 | "source": [ 718 | "sentence_2 = 'Modern medicine is not just a boon to the rich, making the raw chemicals behind these is also pollutes the poorest neighborhoods'" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 39, 724 | "metadata": {}, 725 | "outputs": [], 726 | "source": [ 727 | "sentence_3 = 'Modern medicine has changed the way we think about healthcare, and increased life spans, delaying weddings'" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 40, 733 | "metadata": {}, 734 | "outputs": [ 735 | { 736 | "data": { 737 | "text/plain": [ 738 | "-0.14454556996040863" 739 | ] 740 | }, 741 | "execution_count": 40, 742 | "metadata": {}, 743 | "output_type": "execute_result" 744 | } 745 | ], 746 | "source": [ 747 | "model.docvecs.similarity_unseen_docs(model, sentence_1.split(), sentence_3.split())" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 41, 753 | "metadata": {}, 754 | "outputs": [ 755 | { 756 | "data": { 757 | "text/plain": [ 758 | "-0.04978240807521571" 759 | ] 760 | }, 761 | "execution_count": 41, 762 | "metadata": {}, 763 | "output_type": "execute_result" 764 | } 765 | ], 766 | "source": [ 767 | "model.docvecs.similarity_unseen_docs(model, sentence_1.split(), sentence_2.split())" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": 42, 773 | "metadata": {}, 774 | "outputs": [ 775 | { 776 | "name": "stdout", 777 | "output_type": "stream", 778 | "text": [ 779 | "Wall time: 6.77 s\n" 780 | ] 781 | } 782 | ], 783 | "source": [ 784 | "%time model.train(ted_talk_docs, total_examples=model.corpus_count, epochs=model.epochs)" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": 43, 790 | "metadata": {}, 791 | "outputs": [ 792 | { 793 | "data": { 794 | "text/plain": [ 795 | "array([ 0.20152442, 0.07655947, 0.04110149, -0.09114903, -0.02466601,\n", 796 | " 0.10063498, -0.04590227, -0.16054891, -0.23367156, -0.07714292,\n", 797 | " -0.32246125, 0.10532021, 0.11020374, -0.02373328, -0.06048575,\n", 798 | " 0.06041928, -0.20840394, 0.11885054, -0.09653657, 0.02215091,\n", 799 | " 0.01846626, 0.06881414, -0.01988592, 0.01138998, 0.06924792,\n", 800 | " 0.11989842, 0.09510404, 0.01230403, 0.05453861, 0.05833528,\n", 801 | " 0.22496092, 0.06185873, 0.15445319, -0.13073249, 0.1320086 ,\n", 802 | " 0.15955518, 0.09083826, -0.262743 , 0.07112081, -0.12404393,\n", 803 | " -0.07876749, -0.17020509, -0.08309909, 0.20299006, -0.07867863,\n", 804 | " -0.19080839, -0.00371094, -0.2119167 , -0.11631834, -0.12984131,\n", 805 | " -0.11451794, 0.12690201, -0.02519317, 0.23437414, -0.11313629,\n", 806 | " 0.06674401, -0.0190409 , 0.3384525 , -0.13124712, -0.12843844,\n", 807 | " -0.2605964 , 0.22317892, -0.20078087, -0.05607577, -0.08431446,\n", 808 | " -0.20859231, 0.15535517, 0.0073873 , -0.11435535, 0.16722508,\n", 809 | " -0.0567028 , 0.23436148, -0.1829926 , 0.05211424, -0.14246033,\n", 810 | " 0.20756294, 0.03515876, 0.10574302, -0.05463392, 0.09465599,\n", 811 | " -0.24758984, -0.04593265, -0.13151605, 0.3317288 , -0.13002025,\n", 812 | " -0.37372893, -0.26798424, -0.27239782, -0.16636257, 0.19000524,\n", 813 | " 0.12744325, -0.14971398, -0.11483772, -0.01594907, 0.02319706,\n", 814 | " 0.03037767, -0.01439404, 0.08120204, -0.188371 , -0.21033412],\n", 815 | " dtype=float32)" 816 | ] 817 | }, 818 | "execution_count": 43, 819 | "metadata": {}, 820 | "output_type": "execute_result" 821 | } 822 | ], 823 | "source": [ 824 | "model.infer_vector(sentence_1.split())" 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": 44, 830 | "metadata": {}, 831 | "outputs": [ 832 | { 833 | "data": { 834 | "text/plain": [ 835 | "0.9073806748252071" 836 | ] 837 | }, 838 | "execution_count": 44, 839 | "metadata": {}, 840 | "output_type": "execute_result" 841 | } 842 | ], 843 | "source": [ 844 | "model.docvecs.similarity_unseen_docs(model, sentence_1.split(), sentence_3.split())" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 45, 850 | "metadata": {}, 851 | "outputs": [ 852 | { 853 | "data": { 854 | "text/plain": [ 855 | "0.7626341790517841" 856 | ] 857 | }, 858 | "execution_count": 45, 859 | "metadata": {}, 860 | "output_type": "execute_result" 861 | } 862 | ], 863 | "source": [ 864 | "model.docvecs.similarity_unseen_docs(model, sentence_1.split(), sentence_2.split())" 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | "execution_count": 46, 870 | "metadata": {}, 871 | "outputs": [ 872 | { 873 | "data": { 874 | "text/plain": [ 875 | "0.8026655396100536" 876 | ] 877 | }, 878 | "execution_count": 46, 879 | "metadata": {}, 880 | "output_type": "execute_result" 881 | } 882 | ], 883 | "source": [ 884 | "model.docvecs.similarity_unseen_docs(model, sentence_2.split(), sentence_3.split())" 885 | ] 886 | }, 887 | { 888 | "cell_type": "markdown", 889 | "metadata": {}, 890 | "source": [ 891 | "# Model Assessment" 892 | ] 893 | }, 894 | { 895 | "cell_type": "code", 896 | "execution_count": 47, 897 | "metadata": {}, 898 | "outputs": [], 899 | "source": [ 900 | "ranks = []\n", 901 | "for idx in range(len(ted_talk_docs)):\n", 902 | " inferred_vector = model.infer_vector(ted_talk_docs[idx].words)\n", 903 | " sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))\n", 904 | " rank = [docid for docid, sim in sims].index(idx)\n", 905 | " ranks.append(rank)" 906 | ] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": 48, 911 | "metadata": {}, 912 | "outputs": [ 913 | { 914 | "data": { 915 | "text/plain": [ 916 | "Counter({0: 2080, 3: 1, 4: 1, 1: 1, 2: 1, 6: 1})" 917 | ] 918 | }, 919 | "execution_count": 48, 920 | "metadata": {}, 921 | "output_type": "execute_result" 922 | } 923 | ], 924 | "source": [ 925 | "import collections\n", 926 | "collections.Counter(ranks) # Results vary due to random seeding + very small corpus" 927 | ] 928 | }, 929 | { 930 | "cell_type": "code", 931 | "execution_count": 49, 932 | "metadata": { 933 | "scrolled": false 934 | }, 935 | "outputs": [ 936 | { 937 | "name": "stdout", 938 | "output_type": "stream", 939 | "text": [ 940 | "Document (2084): «if you re here today and very happy that you are you ve all heard about how sustainable development will save us from ourselves however when we re not at ted we are often told that real sustainability policy agenda is just not feasible especially in large urban areas like new york city and that because most people with decision making powers in both the public and the private sector really don feel as though they re in danger the reason why here today in part is because of dog an abandoned puppy»\n", 941 | "\n", 942 | "SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)\n", 943 | "MOST (2084, 0.8938855528831482): «if you re here today and very happy that you are you ve all heard about how sustainable development will save us from ourselves however when we re not at ted we are often told that real sustainability policy agenda is just not feasible especially in large urban areas like new york city and that because most people with decision making powers in both the public and the private sector really don feel as though they re in danger the reason why here today in part is because of dog an abandoned puppy»\n", 944 | "\n", 945 | "MEDIAN (949, 0.40211787819862366): «we conventionally divide space into private and public realms and we know these legal distinctions very well because we ve become experts at protecting our private property and private space but we re less attuned to the nuances of the public what translates generic public space into qualitative space mean this is something that our studio has been working on for the past decade and we re doing this through some case studies large chunk of our work has been put into transforming this neglected i»\n", 946 | "\n", 947 | "LEAST (876, 0.11194153130054474): «so the machine going to talk you about is what call the greatest machine that never was it was machine that was never built and yet it will be built it was machine that was designed long before anyone thought about computers if you know anything about the history of computers you will know that in the and the simple computers were created that started the computer revolution we have today and you would be correct except for you have the wrong century the first computer was really designed in the»\n", 948 | "\n" 949 | ] 950 | } 951 | ], 952 | "source": [ 953 | "doc_slice = ' '.join(ted_talk_docs[idx].words)[:500]\n", 954 | "print(f'Document ({idx}): «{doc_slice}»\\n')\n", 955 | "print(f'SIMILAR/DISSIMILAR DOCS PER MODEL {model}')\n", 956 | "for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:\n", 957 | " doc_slice = ' '.join(ted_talk_docs[sims[index][0]].words)[:500]\n", 958 | " print(f'{label} {sims[index]}: «{doc_slice}»\\n')" 959 | ] 960 | } 961 | ], 962 | "metadata": { 963 | "kernelspec": { 964 | "display_name": "Python [conda env:nlp]", 965 | "language": "python", 966 | "name": "conda-env-nlp-py" 967 | }, 968 | "language_info": { 969 | "codemirror_mode": { 970 | "name": "ipython", 971 | "version": 3 972 | }, 973 | "file_extension": ".py", 974 | "mimetype": "text/x-python", 975 | "name": "python", 976 | "nbconvert_exporter": "python", 977 | "pygments_lexer": "ipython3", 978 | "version": "3.6.6" 979 | } 980 | }, 981 | "nbformat": 4, 982 | "nbformat_minor": 2 983 | } 984 | -------------------------------------------------------------------------------- /06_DL_for_NLP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Deep Learning for NLP" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Understanding Deep Learning" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "# Kaggle: Text Categorization Challenge\n", 22 | "\n", 23 | "# Getting the Data\n", 24 | "\n", 25 | "**Direct Download**: You can get the train and test data from the [data tab on challenge website](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data). " 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Exploring the Data" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 1, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "Solving environment: ...working... done\n", 45 | "\n", 46 | "## Package Plan ##\n", 47 | "\n", 48 | " environment location: D:\\Miniconda3\\envs\\nlp\n", 49 | "\n", 50 | " added / updated specs: \n", 51 | " - pandas\n", 52 | "\n", 53 | "\n", 54 | "The following packages will be UPDATED:\n", 55 | "\n", 56 | " pandas: 0.23.3-py36h830ac7b_0 --> 0.23.4-py36h830ac7b_0\n", 57 | "\n", 58 | "Preparing transaction: ...working... done\n", 59 | "Verifying transaction: ...working... done\n", 60 | "Executing transaction: ...working... done\n", 61 | "Solving environment: ...working... done\n", 62 | "\n", 63 | "## Package Plan ##\n", 64 | "\n", 65 | " environment location: D:\\Miniconda3\\envs\\nlp\n", 66 | "\n", 67 | " added / updated specs: \n", 68 | " - numpy\n", 69 | "\n", 70 | "\n", 71 | "The following packages will be UPDATED:\n", 72 | "\n", 73 | " numpy: 1.15.1-py36ha559c80_0 --> 1.15.4-py36ha559c80_0\n", 74 | " numpy-base: 1.15.1-py36h8128ebf_0 --> 1.15.4-py36h8128ebf_0\n", 75 | "\n", 76 | "Preparing transaction: ...working... done\n", 77 | "Verifying transaction: ...working... done\n", 78 | "Executing transaction: ...working... done\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "!conda install -y pandas\n", 84 | "!conda install -y numpy" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 2, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "import pandas as pd\n", 94 | "import numpy as np" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 3, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "ename": "FileNotFoundError", 104 | "evalue": "File b'data/train.csv' does not exist", 105 | "output_type": "error", 106 | "traceback": [ 107 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 108 | "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", 109 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mtrain_df\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"data/train.csv\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 110 | "\u001b[1;32mD:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[0;32m 676\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[0;32m 677\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 678\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 679\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 680\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 111 | "\u001b[1;32mD:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 438\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 439\u001b[0m \u001b[1;31m# Create the parser.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 440\u001b[1;33m \u001b[0mparser\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 441\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 442\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 112 | "\u001b[1;32mD:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 785\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'has_index_names'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'has_index_names'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 786\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 787\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 788\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 789\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 113 | "\u001b[1;32mD:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[1;34m(self, engine)\u001b[0m\n\u001b[0;32m 1012\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'c'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1013\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'c'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1014\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1015\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1016\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'python'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 114 | "\u001b[1;32mD:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, src, **kwds)\u001b[0m\n\u001b[0;32m 1706\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'usecols'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0musecols\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1707\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1708\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparsers\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1709\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1710\u001b[0m \u001b[0mpassed_names\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnames\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 115 | "\u001b[1;32mpandas\\_libs\\parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__\u001b[1;34m()\u001b[0m\n", 116 | "\u001b[1;32mpandas\\_libs\\parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._setup_parser_source\u001b[1;34m()\u001b[0m\n", 117 | "\u001b[1;31mFileNotFoundError\u001b[0m: File b'data/train.csv' does not exist" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "train_df = pd.read_csv(\"data/train.csv\")" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "train_df.head()" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "val_df = pd.read_csv(\"data/valid.csv\")" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "val_df.head()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "## Multiple Target Dataset!" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "test_df = pd.read_csv(\"data/test.csv\")" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "test_df.head()" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "# Why PyTorch? " 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "# PyTorch and torchtext" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "!conda install -y pytorch cuda92 -c pytorch" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "!pip install --upgrade git+https://github.com/pytorch/text" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "import torch\n", 216 | "import torch.nn as nn\n", 217 | "import torch.nn.functional as F\n", 218 | "import torchtext" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 12, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "use_gpu = True\n", 228 | "if use_gpu:\n", 229 | " assert torch.cuda.is_available(), 'You either do not have a GPU or is not accessible to PyTorch'" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "Let's see how many GPU devices are available to PyTorch on this machine" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 13, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "1" 248 | ] 249 | }, 250 | "execution_count": 13, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "torch.cuda.device_count()" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "## Data Loaders with torchtext\n", 264 | "\n", 265 | "### Conventions and Style" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 14, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "from torchtext.data import Field " 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 15, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "LABEL = Field(sequential=False, use_vocab=False)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 16, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "tokenize = lambda x: x.split()\n", 293 | "TEXT = Field(sequential=True, tokenize=tokenize, lower=True)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 17, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "from torchtext.data import TabularDataset" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 18, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "tv_datafields = [(\"id\", None), # we won't be needing the id, so we pass in None as the field\n", 312 | " (\"comment_text\", TEXT), (\"toxic\", LABEL),\n", 313 | " (\"severe_toxic\", LABEL), (\"threat\", LABEL),\n", 314 | " (\"obscene\", LABEL), (\"insult\", LABEL),\n", 315 | " (\"identity_hate\", LABEL)]" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 19, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "trn, vld = TabularDataset.splits(\n", 325 | " path=\"data\", # the root directory where the data lies\n", 326 | " train='train.csv', validation=\"valid.csv\",\n", 327 | " format='csv',\n", 328 | " skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!\n", 329 | " fields=tv_datafields)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 20, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "tst_datafields = [(\"id\", None), # we won't be needing the id, so we pass in None as the field\n", 339 | " (\"comment_text\", TEXT)\n", 340 | " ]" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 21, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "tst = TabularDataset(\n", 350 | " path=\"data/test.csv\", # the file path\n", 351 | " format='csv',\n", 352 | " skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!\n", 353 | " fields=tst_datafields)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "## Exploring the Dataset Objects" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 22, 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "data": { 370 | "text/plain": [ 371 | "(,\n", 372 | " ,\n", 373 | " )" 374 | ] 375 | }, 376 | "execution_count": 22, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "trn, vld, tst" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 23, 388 | "metadata": {}, 389 | "outputs": [ 390 | { 391 | "data": { 392 | "text/plain": [ 393 | "(,\n", 394 | " ,\n", 395 | " )" 396 | ] 397 | }, 398 | "execution_count": 23, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "trn[0], vld[0], tst[0]" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 24, 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "data": { 414 | "text/plain": [ 415 | "dict_keys(['comment_text', 'toxic', 'severe_toxic', 'threat', 'obscene', 'insult', 'identity_hate'])" 416 | ] 417 | }, 418 | "execution_count": 24, 419 | "metadata": {}, 420 | "output_type": "execute_result" 421 | } 422 | ], 423 | "source": [ 424 | "trn[0].__dict__.keys()" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 25, 430 | "metadata": {}, 431 | "outputs": [ 432 | { 433 | "data": { 434 | "text/plain": [ 435 | "['explanation', 'why', 'the', 'edits', 'made']" 436 | ] 437 | }, 438 | "execution_count": 25, 439 | "metadata": {}, 440 | "output_type": "execute_result" 441 | } 442 | ], 443 | "source": [ 444 | "trn[0].__dict__['comment_text'][:5]" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 26, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "TEXT.build_vocab(trn)" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 27, 459 | "metadata": {}, 460 | "outputs": [ 461 | { 462 | "data": { 463 | "text/plain": [ 464 | "" 465 | ] 466 | }, 467 | "execution_count": 27, 468 | "metadata": {}, 469 | "output_type": "execute_result" 470 | } 471 | ], 472 | "source": [ 473 | "TEXT.vocab" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 28, 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "data": { 483 | "text/plain": [ 484 | "collections.Counter" 485 | ] 486 | }, 487 | "execution_count": 28, 488 | "metadata": {}, 489 | "output_type": "execute_result" 490 | } 491 | ], 492 | "source": [ 493 | "type(TEXT.vocab.freqs)" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 29, 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "data": { 503 | "text/plain": [ 504 | "[('the', 78), ('to', 41), ('you', 33), ('of', 30), ('and', 26)]" 505 | ] 506 | }, 507 | "execution_count": 29, 508 | "metadata": {}, 509 | "output_type": "execute_result" 510 | } 511 | ], 512 | "source": [ 513 | "TEXT.vocab.freqs.most_common(5)" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 30, 519 | "metadata": {}, 520 | "outputs": [ 521 | { 522 | "data": { 523 | "text/plain": [ 524 | "(list, collections.defaultdict, 784, 784)" 525 | ] 526 | }, 527 | "execution_count": 30, 528 | "metadata": {}, 529 | "output_type": "execute_result" 530 | } 531 | ], 532 | "source": [ 533 | "type(TEXT.vocab.itos), type(TEXT.vocab.stoi), len(TEXT.vocab.itos), len(TEXT.vocab.stoi.keys()), " 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 31, 539 | "metadata": {}, 540 | "outputs": [ 541 | { 542 | "data": { 543 | "text/plain": [ 544 | "(7, 'and')" 545 | ] 546 | }, 547 | "execution_count": 31, 548 | "metadata": {}, 549 | "output_type": "execute_result" 550 | } 551 | ], 552 | "source": [ 553 | "TEXT.vocab.stoi['and'], TEXT.vocab.itos[7]" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "## Iterators!" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 32, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "from torchtext.data import Iterator, BucketIterator" 570 | ] 571 | }, 572 | { 573 | "cell_type": "markdown", 574 | "metadata": {}, 575 | "source": [ 576 | "## BucketIterator" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 33, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "train_iter, val_iter = BucketIterator.splits(\n", 586 | " (trn, vld), # we pass in the datasets we want the iterator to draw data from\n", 587 | " batch_sizes=(32, 32),\n", 588 | " sort_key=lambda x: len(x.comment_text), # the BucketIterator needs to be told what function it should use to group the data.\n", 589 | " sort_within_batch=False,\n", 590 | " repeat=False # we pass repeat=False because we want to wrap this Iterator layer.\n", 591 | ")" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 34, 597 | "metadata": {}, 598 | "outputs": [ 599 | { 600 | "data": { 601 | "text/plain": [ 602 | "" 603 | ] 604 | }, 605 | "execution_count": 34, 606 | "metadata": {}, 607 | "output_type": "execute_result" 608 | } 609 | ], 610 | "source": [ 611 | "train_iter" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 35, 617 | "metadata": {}, 618 | "outputs": [ 619 | { 620 | "data": { 621 | "text/plain": [ 622 | "\n", 623 | "[torchtext.data.batch.Batch of size 25]\n", 624 | "\t[.comment_text]:[torch.LongTensor of size 494x25]\n", 625 | "\t[.toxic]:[torch.LongTensor of size 25]\n", 626 | "\t[.severe_toxic]:[torch.LongTensor of size 25]\n", 627 | "\t[.threat]:[torch.LongTensor of size 25]\n", 628 | "\t[.obscene]:[torch.LongTensor of size 25]\n", 629 | "\t[.insult]:[torch.LongTensor of size 25]\n", 630 | "\t[.identity_hate]:[torch.LongTensor of size 25]" 631 | ] 632 | }, 633 | "execution_count": 35, 634 | "metadata": {}, 635 | "output_type": "execute_result" 636 | } 637 | ], 638 | "source": [ 639 | "batch = next(train_iter.__iter__())\n", 640 | "batch" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": 36, 646 | "metadata": {}, 647 | "outputs": [ 648 | { 649 | "data": { 650 | "text/plain": [ 651 | "dict_keys(['batch_size', 'dataset', 'fields', 'comment_text', 'toxic', 'severe_toxic', 'threat', 'obscene', 'insult', 'identity_hate'])" 652 | ] 653 | }, 654 | "execution_count": 36, 655 | "metadata": {}, 656 | "output_type": "execute_result" 657 | } 658 | ], 659 | "source": [ 660 | "batch.__dict__.keys()" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": 37, 666 | "metadata": {}, 667 | "outputs": [ 668 | { 669 | "data": { 670 | "text/plain": [ 671 | "(,\n", 672 | " ,\n", 673 | " True)" 674 | ] 675 | }, 676 | "execution_count": 37, 677 | "metadata": {}, 678 | "output_type": "execute_result" 679 | } 680 | ], 681 | "source": [ 682 | "batch.__dict__['dataset'], trn, batch.__dict__['dataset']==trn" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": 38, 688 | "metadata": {}, 689 | "outputs": [], 690 | "source": [ 691 | "test_iter = Iterator(tst, batch_size=64, sort=False, sort_within_batch=False, repeat=False)" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": 39, 697 | "metadata": {}, 698 | "outputs": [ 699 | { 700 | "data": { 701 | "text/plain": [ 702 | "\n", 703 | "[torchtext.data.batch.Batch of size 33]\n", 704 | "\t[.comment_text]:[torch.LongTensor of size 158x33]" 705 | ] 706 | }, 707 | "execution_count": 39, 708 | "metadata": {}, 709 | "output_type": "execute_result" 710 | } 711 | ], 712 | "source": [ 713 | "next(test_iter.__iter__())" 714 | ] 715 | }, 716 | { 717 | "cell_type": "markdown", 718 | "metadata": {}, 719 | "source": [ 720 | "## BatchWrapper" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 40, 726 | "metadata": {}, 727 | "outputs": [], 728 | "source": [ 729 | "class BatchWrapper:\n", 730 | " def __init__(self, dl, x_var, y_vars):\n", 731 | " self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y\n", 732 | " \n", 733 | " def __iter__(self):\n", 734 | " for batch in self.dl:\n", 735 | " x = getattr(batch, self.x_var) # we assume only one input in this wrapper\n", 736 | " \n", 737 | " if self.y_vars is not None: # we will concatenate y into a single tensor\n", 738 | " y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()\n", 739 | " else:\n", 740 | " y = torch.zeros((1))\n", 741 | " if use_gpu:\n", 742 | " yield (x.cuda(), y.cuda())\n", 743 | " else:\n", 744 | " yield (x, y)\n", 745 | " \n", 746 | " def __len__(self):\n", 747 | " return len(self.dl)" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 41, 753 | "metadata": {}, 754 | "outputs": [], 755 | "source": [ 756 | "train_dl = BatchWrapper(train_iter, \"comment_text\", [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"])\n", 757 | "valid_dl = BatchWrapper(val_iter, \"comment_text\", [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"])\n", 758 | "test_dl = BatchWrapper(test_iter, \"comment_text\", None)" 759 | ] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": 42, 764 | "metadata": {}, 765 | "outputs": [ 766 | { 767 | "data": { 768 | "text/plain": [ 769 | "(tensor([[ 453, 63, 15, ..., 454, 660, 778],\n", 770 | " [ 523, 4, 601, ..., 78, 11, 650],\n", 771 | " [ 30, 664, 242, ..., 8, 2, 22],\n", 772 | " ...,\n", 773 | " [ 1, 1, 1, ..., 1, 1, 1],\n", 774 | " [ 1, 1, 1, ..., 1, 1, 1],\n", 775 | " [ 1, 1, 1, ..., 1, 1, 1]], device='cuda:0'),\n", 776 | " tensor([[ 0., 0., 0., 0., 0., 0.],\n", 777 | " [ 0., 0., 0., 0., 0., 0.],\n", 778 | " [ 0., 0., 0., 0., 0., 0.],\n", 779 | " [ 0., 0., 0., 0., 0., 0.],\n", 780 | " [ 0., 0., 0., 0., 0., 0.],\n", 781 | " [ 0., 0., 0., 0., 0., 0.],\n", 782 | " [ 0., 0., 0., 0., 0., 0.],\n", 783 | " [ 1., 1., 0., 1., 1., 0.],\n", 784 | " [ 0., 0., 0., 0., 0., 0.],\n", 785 | " [ 0., 0., 0., 0., 0., 0.],\n", 786 | " [ 0., 0., 0., 0., 0., 0.],\n", 787 | " [ 0., 0., 0., 0., 0., 0.],\n", 788 | " [ 0., 0., 0., 0., 0., 0.],\n", 789 | " [ 0., 0., 0., 0., 0., 0.],\n", 790 | " [ 0., 0., 0., 0., 0., 0.],\n", 791 | " [ 0., 0., 0., 0., 0., 0.],\n", 792 | " [ 0., 0., 0., 0., 0., 0.],\n", 793 | " [ 0., 0., 0., 0., 0., 0.],\n", 794 | " [ 0., 0., 0., 0., 0., 0.],\n", 795 | " [ 0., 0., 0., 0., 0., 0.],\n", 796 | " [ 1., 0., 0., 0., 0., 0.],\n", 797 | " [ 0., 0., 0., 0., 0., 0.],\n", 798 | " [ 1., 0., 0., 0., 0., 0.],\n", 799 | " [ 0., 0., 0., 0., 0., 0.],\n", 800 | " [ 0., 0., 0., 0., 0., 0.]], device='cuda:0'))" 801 | ] 802 | }, 803 | "execution_count": 42, 804 | "metadata": {}, 805 | "output_type": "execute_result" 806 | } 807 | ], 808 | "source": [ 809 | "next(train_dl.__iter__())" 810 | ] 811 | }, 812 | { 813 | "cell_type": "markdown", 814 | "metadata": {}, 815 | "source": [ 816 | "## Training a Text Classifier" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": 43, 822 | "metadata": {}, 823 | "outputs": [], 824 | "source": [ 825 | "class SimpleLSTMBaseline(nn.Module):\n", 826 | " def __init__(self, hidden_dim, emb_dim=300,\n", 827 | " spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=2):\n", 828 | " super().__init__() # don't forget to call this!\n", 829 | " self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)\n", 830 | " self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=num_linear, dropout=recurrent_dropout)\n", 831 | " self.linear_layers = []\n", 832 | " for _ in range(num_linear - 1):\n", 833 | " self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))\n", 834 | " self.linear_layers = nn.ModuleList(self.linear_layers)\n", 835 | " self.predictor = nn.Linear(hidden_dim, 6)\n", 836 | " \n", 837 | " def forward(self, seq):\n", 838 | " hdn, _ = self.encoder(self.embedding(seq))\n", 839 | " feature = hdn[-1, :, :]\n", 840 | " for layer in self.linear_layers:\n", 841 | " feature = layer(feature)\n", 842 | " preds = self.predictor(feature)\n", 843 | " return preds" 844 | ] 845 | }, 846 | { 847 | "cell_type": "markdown", 848 | "metadata": {}, 849 | "source": [ 850 | "### Initializing the Model" 851 | ] 852 | }, 853 | { 854 | "cell_type": "code", 855 | "execution_count": 44, 856 | "metadata": {}, 857 | "outputs": [ 858 | { 859 | "name": "stdout", 860 | "output_type": "stream", 861 | "text": [ 862 | "SimpleLSTMBaseline(\n", 863 | " (embedding): Embedding(784, 300)\n", 864 | " (encoder): LSTM(300, 500, num_layers=2, dropout=0.1)\n", 865 | " (linear_layers): ModuleList(\n", 866 | " (0): Linear(in_features=500, out_features=500, bias=True)\n", 867 | " )\n", 868 | " (predictor): Linear(in_features=500, out_features=6, bias=True)\n", 869 | ")\n" 870 | ] 871 | } 872 | ], 873 | "source": [ 874 | "em_sz = 300\n", 875 | "nh = 500\n", 876 | "model = SimpleLSTMBaseline(nh, emb_dim=em_sz)\n", 877 | "print(model)" 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": 45, 883 | "metadata": {}, 884 | "outputs": [ 885 | { 886 | "name": "stdout", 887 | "output_type": "stream", 888 | "text": [ 889 | "4.096706 million parameters\n" 890 | ] 891 | } 892 | ], 893 | "source": [ 894 | "def model_size(model: torch.nn)->int:\n", 895 | " \"\"\"\n", 896 | " Calculates the number of trainable parameters in any model\n", 897 | " \n", 898 | " Returns:\n", 899 | " params (int): the total count of all model weights\n", 900 | " \"\"\"\n", 901 | " model_parameters = filter(lambda p: p.requires_grad, model.parameters())\n", 902 | "# model_parameters = model.parameters()\n", 903 | " params = sum([np.prod(p.size()) for p in model_parameters])\n", 904 | " return params\n", 905 | "\n", 906 | "print(f'{model_size(model)/10**6} million parameters')" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": 46, 912 | "metadata": {}, 913 | "outputs": [], 914 | "source": [ 915 | "if use_gpu:\n", 916 | " model = model.cuda()" 917 | ] 918 | }, 919 | { 920 | "cell_type": "markdown", 921 | "metadata": {}, 922 | "source": [ 923 | "**Putting together the pieces again**:" 924 | ] 925 | }, 926 | { 927 | "cell_type": "code", 928 | "execution_count": 47, 929 | "metadata": {}, 930 | "outputs": [], 931 | "source": [ 932 | "from torch import optim\n", 933 | "opt = optim.Adam(model.parameters(), lr=1e-2)\n", 934 | "loss_func = nn.BCEWithLogitsLoss().cuda()\n", 935 | "epochs = 3" 936 | ] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "metadata": {}, 941 | "source": [ 942 | "## Training Loop" 943 | ] 944 | }, 945 | { 946 | "cell_type": "code", 947 | "execution_count": 48, 948 | "metadata": {}, 949 | "outputs": [ 950 | { 951 | "name": "stderr", 952 | "output_type": "stream", 953 | "text": [ 954 | "100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.34it/s]\n" 955 | ] 956 | }, 957 | { 958 | "name": "stdout", 959 | "output_type": "stream", 960 | "text": [ 961 | "Epoch: 1, Training Loss: 13.5037, Validation Loss: 4.6498\n" 962 | ] 963 | }, 964 | { 965 | "name": "stderr", 966 | "output_type": "stream", 967 | "text": [ 968 | "100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 4.58it/s]\n" 969 | ] 970 | }, 971 | { 972 | "name": "stdout", 973 | "output_type": "stream", 974 | "text": [ 975 | "Epoch: 2, Training Loss: 7.8243, Validation Loss: 24.5401\n" 976 | ] 977 | }, 978 | { 979 | "name": "stderr", 980 | "output_type": "stream", 981 | "text": [ 982 | "100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3.35it/s]\n" 983 | ] 984 | }, 985 | { 986 | "name": "stdout", 987 | "output_type": "stream", 988 | "text": [ 989 | "Epoch: 3, Training Loss: 57.4577, Validation Loss: 4.0107\n" 990 | ] 991 | } 992 | ], 993 | "source": [ 994 | "from tqdm import tqdm\n", 995 | "for epoch in range(1, epochs + 1):\n", 996 | " running_loss = 0.0\n", 997 | " running_corrects = 0\n", 998 | " model.train() # turn on training mode\n", 999 | " for x, y in tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!\n", 1000 | " opt.zero_grad()\n", 1001 | " preds = model(x)\n", 1002 | " loss = loss_func(preds, y)\n", 1003 | " loss.backward()\n", 1004 | " opt.step()\n", 1005 | " \n", 1006 | " running_loss += loss.item() * x.size(0)\n", 1007 | " \n", 1008 | " epoch_loss = running_loss / len(trn)\n", 1009 | " \n", 1010 | " # calculate the validation loss for this epoch\n", 1011 | " val_loss = 0.0\n", 1012 | " model.eval() # turn on evaluation mode\n", 1013 | " for x, y in valid_dl:\n", 1014 | " preds = model(x)\n", 1015 | " loss = loss_func(preds, y)\n", 1016 | " val_loss += loss.item() * x.size(0)\n", 1017 | "\n", 1018 | " val_loss /= len(vld)\n", 1019 | " print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))" 1020 | ] 1021 | }, 1022 | { 1023 | "cell_type": "markdown", 1024 | "metadata": {}, 1025 | "source": [ 1026 | "## Prediction Mode" 1027 | ] 1028 | }, 1029 | { 1030 | "cell_type": "code", 1031 | "execution_count": 49, 1032 | "metadata": {}, 1033 | "outputs": [ 1034 | { 1035 | "name": "stderr", 1036 | "output_type": "stream", 1037 | "text": [ 1038 | "100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 9.64it/s]\n" 1039 | ] 1040 | } 1041 | ], 1042 | "source": [ 1043 | "test_preds = []\n", 1044 | "for x, y in tqdm(test_dl):\n", 1045 | " preds = model(x)\n", 1046 | " # if you're data is on the GPU, you need to move the data back to the cpu\n", 1047 | " # preds = preds.data.cpu().numpy()\n", 1048 | " preds = preds.data.cpu().numpy()\n", 1049 | " # the actual outputs of the model are logits, so we need to pass these values to the sigmoid function\n", 1050 | " preds = 1 / (1 + np.exp(-preds))\n", 1051 | " test_preds.append(preds)\n", 1052 | "test_preds = np.hstack(test_preds)" 1053 | ] 1054 | }, 1055 | { 1056 | "cell_type": "markdown", 1057 | "metadata": {}, 1058 | "source": [ 1059 | "### Convert predictions to a pandas dataframe\n", 1060 | "\n", 1061 | "This helps us convert the predictions to a more interpretable format. Let's insert the predictions in the correct column and then we can preview few rows of the dataframe: " 1062 | ] 1063 | }, 1064 | { 1065 | "cell_type": "code", 1066 | "execution_count": 50, 1067 | "metadata": {}, 1068 | "outputs": [], 1069 | "source": [ 1070 | "test_df = pd.read_csv(\"data/test.csv\")\n", 1071 | "for i, col in enumerate([\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]):\n", 1072 | " test_df[col] = test_preds[:, i]" 1073 | ] 1074 | }, 1075 | { 1076 | "cell_type": "code", 1077 | "execution_count": 51, 1078 | "metadata": {}, 1079 | "outputs": [ 1080 | { 1081 | "data": { 1082 | "text/html": [ 1083 | "
\n", 1084 | "\n", 1097 | "\n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | "
idcomment_texttoxicsevere_toxicobscenethreatinsultidentity_hate
000001cee341fdb12Yo bitch Ja Rule is more succesful then you'll...0.6291460.1167210.4386060.1568480.1396960.388736
10000247867823ef7== From RfC == \\r\\n\\r\\n The title is fine as i...0.6291460.1167210.4386060.1568480.1396960.388736
200013b17ad220c46\" \\r\\n\\r\\n == Sources == \\r\\n\\r\\n * Zawe Ashto...0.6291460.1167210.4386060.1568480.1396960.388736
\n", 1147 | "
" 1148 | ], 1149 | "text/plain": [ 1150 | " id comment_text \\\n", 1151 | "0 00001cee341fdb12 Yo bitch Ja Rule is more succesful then you'll... \n", 1152 | "1 0000247867823ef7 == From RfC == \\r\\n\\r\\n The title is fine as i... \n", 1153 | "2 00013b17ad220c46 \" \\r\\n\\r\\n == Sources == \\r\\n\\r\\n * Zawe Ashto... \n", 1154 | "\n", 1155 | " toxic severe_toxic obscene threat insult identity_hate \n", 1156 | "0 0.629146 0.116721 0.438606 0.156848 0.139696 0.388736 \n", 1157 | "1 0.629146 0.116721 0.438606 0.156848 0.139696 0.388736 \n", 1158 | "2 0.629146 0.116721 0.438606 0.156848 0.139696 0.388736 " 1159 | ] 1160 | }, 1161 | "execution_count": 51, 1162 | "metadata": {}, 1163 | "output_type": "execute_result" 1164 | } 1165 | ], 1166 | "source": [ 1167 | "test_df.head(3)" 1168 | ] 1169 | } 1170 | ], 1171 | "metadata": { 1172 | "kernelspec": { 1173 | "display_name": "Python [conda env:nlp]", 1174 | "language": "python", 1175 | "name": "conda-env-nlp-py" 1176 | }, 1177 | "language_info": { 1178 | "codemirror_mode": { 1179 | "name": "ipython", 1180 | "version": 3 1181 | }, 1182 | "file_extension": ".py", 1183 | "mimetype": "text/x-python", 1184 | "name": "python", 1185 | "nbconvert_exporter": "python", 1186 | "pygments_lexer": "ipython3", 1187 | "version": "3.6.6" 1188 | } 1189 | }, 1190 | "nbformat": 4, 1191 | "nbformat_minor": 2 1192 | } 1193 | --------------------------------------------------------------------------------