├── Part-08 Web Deployments
├── .gitignore
├── requirements.txt
├── templates
│ └── index.html
├── model_predict.py
├── README.md
├── model_train.py
├── utils.py
└── api.py
├── tokenization.png
├── requirements.txt
├── _config.yml
├── LICENSE
├── .github
└── workflows
│ └── deploy-book.yml
├── .gitignore
├── README.md
├── environment.yml
├── 09_Basic_IE.ipynb
├── 10_Coherence_Check.ipynb
├── tokenization.svg
├── 07_Build_Chatbot_in_30minutes.ipynb
├── 02-B.ipynb
├── 02-A.ipynb
├── 04_Text_Representations.ipynb
└── 06_DL_for_NLP.ipynb
/Part-08 Web Deployments/.gitignore:
--------------------------------------------------------------------------------
1 | extra_32x32.mat
2 | model.pkl
--------------------------------------------------------------------------------
/tokenization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NirantK/NLP_Quickbook/HEAD/tokenization.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter-book==0.10.2
2 | jupyter
3 | black
4 | isort
5 | pandas
6 | spacy
7 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | # in _config.yml
2 | title: "Quick NLP Projects"
3 | # logo: images/logo.png
4 | execute:
5 | execute_notebooks: "off"
6 |
--------------------------------------------------------------------------------
/Part-08 Web Deployments/requirements.txt:
--------------------------------------------------------------------------------
1 | flask==1.0
2 | Jinja2==2.11.3
3 | numpy==1.13.1
4 | scikit-learn==0.18.1
5 | scipy==0.18.1
6 | virtualenv==15.1.0
7 |
8 |
--------------------------------------------------------------------------------
/Part-08 Web Deployments/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Text Classification model as a Flask API
5 |
6 |
7 |
8 |
9 |
10 |
11 | Movie Sentiment Analysis
12 |
13 |
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/Part-08 Web Deployments/model_predict.py:
--------------------------------------------------------------------------------
1 | from sklearn.externals import joblib
2 |
3 |
def read_predict(model_path="model.pkl", review_paths=None):
    """Load the pickled classifier and predict the sentiment of sample reviews.

    Args:
        model_path: Location of the model file to load with joblib.
        review_paths: Iterable of text files to classify. When ``None``, the
            samples are ``1_3.txt`` in the working directory plus one
            negative and one positive review from ``data/aclImdb/train``.

    Returns:
        The result of ``model.predict`` for the review texts, in the same
        order as ``review_paths``.
    """
    # Local import: this module does not import pathlib at file level.
    from pathlib import Path

    if review_paths is None:
        # Build the paths from components so they resolve on every platform;
        # hard-coded backslash separators only work on Windows.
        train_dir = Path("data") / "aclImdb" / "train"
        review_paths = [
            Path("1_3.txt"),
            train_dir / "neg" / "1_1.txt",
            train_dir / "pos" / "0_9.txt",
        ]

    model = joblib.load(model_path)

    reviews = []
    for review_path in review_paths:
        # The training data is read as UTF-8, so read the samples the same
        # way instead of relying on the platform default encoding.
        with open(str(review_path), "r", encoding="utf-8") as infile:
            reviews.append(infile.read())

    return model.predict(reviews)
19 |
20 |
# Classify the sample reviews when the script runs and print one label per
# line: a truthy prediction is reported as "pos", anything else as "neg".
predictions = read_predict()
for predicted in predictions:
    if predicted:
        print("pos")
    else:
        print("neg")
24 |
--------------------------------------------------------------------------------
/Part-08 Web Deployments/README.md:
--------------------------------------------------------------------------------
1 | # Deploying an ML model with a Flask API
2 |
3 | Based on Scikit-Learn stack
4 |
5 | ---
6 |
7 | In this tutorial,
8 |
9 | 1. We build a small text classification model, and write it to disk in `model_train.py`
10 | 2. Reuse this model, in `model_predict.py`
11 | 3. Expose the model using Flask with `api.py`
12 |
13 | Using Flask to create an API, we can deploy this model and create a simple web page to load and classify new movie reviews.
14 |
15 | ## To run locally
16 |
17 | - Install pip and Python 3
18 | - Clone this repository
19 | - Navigate to the working directory
20 | - Install the Python dependencies `pip install -r requirements.txt`
21 | - Train the model with `python model_train.py` (this writes `model.pkl`), then run the API with `python api.py`
22 | - Open a web browser and go to `http://localhost:8000`
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Nirant
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.github/workflows/deploy-book.yml:
--------------------------------------------------------------------------------
1 | name: deploy-book
2 |
3 | # Only run this when the master branch changes
4 | on:
5 | push:
6 | branches:
7 | - master
8 | # If your git repository has the Jupyter Book within some-subfolder next to
9 | # unrelated files, you can make this run only if a file within that specific
10 | # folder has been modified.
11 | #
12 | # paths:
13 | # - some-subfolder/**
14 |
15 | # This job installs dependencies, builds the book, and pushes it to `gh-pages`
16 | jobs:
17 | deploy-book:
18 | runs-on: ubuntu-latest
19 | steps:
20 | - uses: actions/checkout@v2
21 |
22 | # Install dependencies
23 | - name: Set up Python 3.7
24 | uses: actions/setup-python@v1
25 | with:
26 | python-version: 3.7
27 |
28 | - name: Install dependencies
29 | run: |
30 | pip install -r requirements.txt
31 |
32 | # Build a table of contents
33 | - name: ToC Build
34 | run: |
35 | jupyter-book toc .
36 | # Build the book
37 | - name: Build the book
38 | run: |
39 | jupyter-book build .
40 |
41 | # Push the book's HTML to github-pages
42 | - name: GitHub Pages action
43 | uses: peaceiris/actions-gh-pages@v3.6.1
44 | with:
45 | github_token: ${{ secrets.GITHUB_TOKEN }}
46 | publish_dir: ./_build/html
47 |
--------------------------------------------------------------------------------
/Part-08 Web Deployments/model_train.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import logging
3 | import os
4 | from pathlib import Path
5 | from urllib.request import urlretrieve
6 |
7 | import numpy as np
8 | import pandas as pd
9 | from sklearn.externals import joblib
10 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
11 | from sklearn.linear_model import LogisticRegression as LR
12 | from sklearn.pipeline import Pipeline
13 | from tqdm import tqdm
14 |
15 | from utils import get_data, read_data
16 |
17 | # create logger
# Logging setup: every record (DEBUG and up) goes to a log file named after
# this module, while the console handler only shows ERROR and above.
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

fh = logging.FileHandler(str(__name__) + ".log")
ch = logging.StreamHandler()
for handler, threshold in ((fh, logging.DEBUG), (ch, logging.ERROR)):
    handler.setLevel(threshold)
    handler.setFormatter(formatter)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(fh)
logger.addHandler(ch)
33 |
# Training script: make sure the IMDB reviews are available locally, fit a
# bag-of-words + TF-IDF + logistic-regression pipeline, and save it to disk.
data_path = Path(os.getcwd()) / "data" / "aclImdb"
logger.info(data_path)

if not data_path.exists():
    import tarfile

    data_url = "http://files.fast.ai/data/aclImdb.tgz"
    archive_path = "data/imdb.tgz"
    get_data(data_url, archive_path)
    # get_data only downloads the archive; unpack it next to the archive so
    # that data_path exists before it is read below.
    # Assumes the archive's top-level directory is "aclImdb" - TODO confirm.
    # NOTE(review): extractall trusts the member paths inside the archive;
    # only use it with an archive source you trust.
    with tarfile.open(archive_path, "r:gz") as archive:
        archive.extractall(path=str(data_path.parent))

train_path = data_path / "train"
# load the training reviews as a pandas DataFrame
train = read_data(train_path)

# extract the review texts (X) and sentiment labels (y) from the DataFrame
X_train, y_train = train["text"], train["label"]


# token counts -> TF-IDF weights -> logistic regression
lr_clf = Pipeline(
    [("vect", CountVectorizer()), ("tfidf", TfidfTransformer()), ("clf", LR())]
)
lr_clf.fit(X=X_train, y=y_train)

# save model
joblib.dump(lr_clf, "model.pkl")
56 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 | data/*
103 | .data/*
104 | .vscode/settings.json
105 | *.pkl
106 |
--------------------------------------------------------------------------------
/Part-08 Web Deployments/utils.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import os
3 | from pathlib import Path
4 | from urllib.request import urlretrieve
5 |
6 | import numpy as np
7 | import pandas as pd
8 | from tqdm import tqdm
9 |
10 |
class TqdmUpTo(tqdm):
    """tqdm progress bar with an ``update_to`` method usable as a download hook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar forward to ``b * bsize``.

        Args:
            b: Multiplier applied to ``bsize`` (default 1).
            bsize: Size of one unit of ``b`` (default 1).
            tsize: When not ``None``, stored as the bar's ``total``.
        """
        if tsize is not None:
            self.total = tsize
        transferred = b * bsize
        # update() takes an increment, so pass the distance from the current
        # count self.n to the new absolute position.
        self.update(transferred - self.n)
16 |
17 |
def get_data(url, filename):
    """Download ``url`` to ``filename`` unless ``filename`` already exists.

    Download progress is shown with a ``TqdmUpTo`` bar labelled with the last
    path segment of ``url``.

    Args:
        url: Address of the file to download.
        filename: Destination path; missing parent directories are created.
    """
    if os.path.exists(filename):
        return

    dirname = os.path.dirname(filename)
    # dirname is "" when filename has no directory part, and os.makedirs("")
    # raises, so only create the directory when there is one to create.
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    with TqdmUpTo(
        unit="B", unit_scale=True, miniters=1, desc=url.split("/")[-1]
    ) as t:
        urlretrieve(url, filename, reporthook=t.update_to)
33 |
34 |
def read_data(dir_path):
    """Read the reviews under ``dir_path`` into a shuffled pandas DataFrame.

    Args:
        dir_path: ``pathlib.Path`` of a directory containing ``pos`` and
            ``neg`` sub-directories whose entries are UTF-8 text files.

    Returns:
        DataFrame with a ``text`` column (file contents) and a ``label``
        column (1 for ``pos``, 0 for ``neg``), rows in random order and
        re-indexed from 0.
    """

    def load_dir_reviews(reviews_path):
        # read_text opens and closes each file, so no file handles are left
        # open while the remaining reviews are loaded.
        reviews = [
            review_file.read_text(encoding="utf-8")
            for review_file in reviews_path.iterdir()
        ]
        return pd.DataFrame({"text": reviews})

    pos_reviews = load_dir_reviews(dir_path / "pos")
    neg_reviews = load_dir_reviews(dir_path / "neg")

    pos_reviews["label"] = 1
    neg_reviews["label"] = 0

    merged = pd.concat([pos_reviews, neg_reviews])
    # frac=1.0 samples every row, i.e. shuffles them; drop=True discards the
    # pre-shuffle index instead of keeping it as an "index" column.
    return merged.sample(frac=1.0).reset_index(drop=True)
59 |
--------------------------------------------------------------------------------
/Part-08 Web Deployments/api.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import flask
4 | import os
5 | import numpy as np
6 | from flask import Flask, jsonify, render_template, request
7 | from scipy import misc
8 | from sklearn.externals import joblib
9 |
10 | app = Flask(__name__)
11 |
12 | # create logger
# Logging setup: every record (DEBUG and up) goes to a log file named after
# this module, while the console handler shows INFO and above.
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

fh = logging.FileHandler(str(__name__) + ".log")
ch = logging.StreamHandler()
for handler, threshold in ((fh, logging.DEBUG), (ch, logging.INFO)):
    handler.setLevel(threshold)
    handler.setFormatter(formatter)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(fh)
logger.addHandler(ch)
28 |
29 |
@app.route("/")
@app.route("/index")
def index():
    """Render ``index.html`` with ``label`` set to ``False`` (no prediction yet)."""
    page = render_template("index.html", label=False)
    return page
34 |
35 |
@app.route("/status", methods=["GET"])
def get_status():
    """Return the API version string and a ``True`` status flag as JSON."""
    payload = {"version": "0.0.1", "status": True}
    return jsonify(payload)
39 |
40 |
@app.route("/predict", methods=["POST"])
def make_prediction():
    """Classify the uploaded text file and render the page with the result.

    Expects a multipart upload under the form field ``file``.

    Returns:
        The rendered ``index.html`` with ``label`` set to ``"pos"`` or
        ``"neg"``.
    """
    # The route only accepts POST, so no further method check is needed.
    logger.debug(request.files)

    uploaded = request.files["file"]
    # Read the upload straight from the request instead of saving it under
    # the client-chosen file name: that name is untrusted, and saving then
    # removing it could overwrite and delete files such as model.pkl or
    # api.py in the working directory.
    # The training data is read as UTF-8; undecodable bytes are replaced
    # rather than failing the whole request.
    text_content = uploaded.read().decode("utf-8", errors="replace")
    logger.info("Text Content from file read")

    # NOTE(review): `model` is only assigned in the __main__ block, so this
    # line raises NameError unless the app is started with `python api.py`.
    prediction = model.predict([text_content])
    logger.info(f"prediction: {prediction}")
    label = "pos" if prediction[0] == 1 else "neg"
    return render_template("index.html", label=label)
61 |
62 |
if __name__ == "__main__":
    # load ml model from disk
    model = joblib.load("model.pkl")
    # The interactive debugger allows arbitrary code execution, so it must
    # not be on by default while the server listens on 0.0.0.0 (reachable
    # from other machines). Opt in with FLASK_DEBUG=1 during development.
    debug = os.environ.get("FLASK_DEBUG", "0") == "1"
    # start api
    app.run(host="0.0.0.0", port=8000, debug=debug)
68 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Natural Language Processing Notebooks
2 | --
3 |
4 | # Available as a Book: [NLP in Python - Quickstart Guide](https://www.amazon.in/dp/B07L3PLQS1)
5 |
6 | ### Written for Practicing Engineers
7 |
8 | This work builds on the outstanding work which exists on Natural Language Processing. These range from classics like Jurafsky's Speech and Language Processing to rather modern work in The Deep Learning Book by Ian Goodfellow et al.
9 |
10 | While they are great as introductory textbooks for college students - this is intended for practitioners to quickly read, skim, select what is useful and then proceed. There are several notebooks divided into 7 logical themes.
11 |
12 | Each section builds on ideas and code from previous notebooks, but you can fill in the gaps mentally and jump directly to what interests you.
13 |
14 | ## Chapter 01
15 | [Introduction To Text Processing, with Text Classification](https://github.com/NirantK/nlp-python-deep-learning/blob/master/Part-01.ipynb)
16 | - Perfect for Getting Started! We learn better with code-first approaches
17 |
18 | ## Chapter 02
19 | - [Text Cleaning](https://github.com/NirantK/nlp-python-deep-learning/blob/master/02-A.ipynb) notebook, code-first approaches with supporting explanation. Covers some simple ideas like:
20 | - Stop words removal
21 | - Lemmatization
22 | - [Spell Correction](https://github.com/NirantK/nlp-python-deep-learning/blob/master/02-B.ipynb) covers **almost everything** that you will ever need to get started with spell correction, similar words problems and so on
23 |
24 | ## Chapter 03
25 | [Leveraging Linguistics](https://github.com/NirantK/nlp-python-deep-learning/blob/master/Part-03%20NLP%20with%20spaCy%20and%20Textacy.ipynb) is an important tool in any practitioner's toolkit. Using **spaCy** and textacy we look at several interesting challenges and how to tackle them:
26 | - Redacting names
27 | - Named Entity Recognition
28 | - Question and Answer Generation
29 | - Part of Speech Tagging
30 | - Dependency Parsing
31 |
32 | ## Chapter 04
33 | [Text Representations](https://github.com/NirantK/nlp-python-deep-learning/blob/master/Part-04%20Text%20Representations.ipynb) is about converting text to numerical representations aka vectors
34 | - Covers popular celebrities: word2vec, fasttext and doc2vec - document similarity using the same
35 | - Programmer's Guide to **gensim**
36 |
37 | ## Chapter 05
38 | [Modern Methods for Text Classification](https://github.com/NirantK/nlp-python-deep-learning/blob/master/Part-05%20Modern%20Text%20Classification.ipynb) is simple, exploratory and talks about:
39 | - Simple Classifiers and How to Optimize Them from **scikit-learn**
40 | - How to combine and **ensemble** them for increased performance
41 | - Builds intuition for ensembling - so that you can write your own ensembling techniques
42 |
43 | ## Chapter 06
44 | [Deep Learning for NLP](https://github.com/NirantK/nlp-python-deep-learning/blob/master/Part-06%20Deep%20Learning%20for%20NLP.ipynb) is less about fancy data modeling, and more engineering for Deep Learning
45 | - From scratch code tutorial with Text Classification as an example
46 | - Using **PyTorch** and *torchtext*
47 | - Write our own data loaders, pre-processing, training loop and other utilities
48 |
49 | ## Chapter 07
50 | [Building your own Chatbot](https://github.com/NirantK/nlp-python-deep-learning/blob/master/Part-07%20Building%20your%20own%20Chatbot%20in%2030%20minutes.ipynb) from scratch in 30 minutes. We use this to explore unsupervised learning and put together several of the ideas we have already seen.
51 | - simpler, direct problem formulation instead of complicated chatbot tutorials commonly seen
52 | - intents, responses and templates in chat bot parlance
53 | - hacking word based similarity engine to work with little to no training samples
54 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | # This file may be used to create an environment using:
2 | # $ conda create --name <env> --file <this file>
3 | # platform: win-64
4 | _nb_ext_conf=0.4.0=py36_1
5 | anaconda-client=1.6.14=py36_0
6 | asn1crypto=0.24.0=py36_0
7 | backcall=0.1.0=py36_0
8 | blas=1.0=mkl
9 | bleach=2.1.3=py36_0
10 | boto=2.48.0=py36_1
11 | boto3=1.7.45=py36_0
12 | botocore=1.10.45=py36_0
13 | ca-certificates=2018.03.07=0
14 | certifi=2018.4.16=py36_0
15 | cffi=1.11.5=py36h945400d_0
16 | chardet=3.0.4=py36h420ce6e_1
17 | clyent=1.2.2=py36hb10d595_1
18 | colorama=0.3.9=py36h029ae33_0
19 | cryptography=2.2.2=py36hfa6e2cd_0
20 | cuda91=1.0=0
21 | cymem=1.31.2=py36h51d26f2_0
22 | cytoolz=0.8.2=py36h547e66e_0
23 | decorator=4.3.0=py36_0
24 | dill=0.2.8.2=py36_0
25 | docutils=0.14=py36h6012d8f_0
26 | entrypoints=0.2.3=py36hfd66bb0_2
27 | ftfy=4.4.3=py36hb6b3982_0
28 | gensim=3.4.0=py36h452e1ab_0
29 | html5lib=1.0.1=py36h047fa9f_0
30 | icc_rt=2017.0.4=h97af966_0
31 | idna=2.7=py36_0
32 | intel-openmp=2018.0.3=0
33 | ipykernel=4.8.2=py36_0
34 | ipython=6.4.0=py36_0
35 | ipython_genutils=0.2.0=py36h3c5d0ee_0
36 | ipywidgets=7.2.1=py36_0
37 | jedi=0.12.0=py36_1
38 | jinja2=2.10=py36h292fed1_0
39 | jmespath=0.9.3=py36_0
40 | jsonschema=2.6.0=py36h7636477_0
41 | jupyter_client=5.2.3=py36_0
42 | jupyter_core=4.4.0=py36h56e9d50_0
43 | libiconv=1.15=h1df5818_7
44 | libsodium=1.0.16=h9d3ae62_0
45 | libxml2=2.9.8=hadb2253_1
46 | libxslt=1.1.32=hf6f1972_0
47 | lxml=4.2.2=py36hef2cd61_0
48 | m2w64-gcc-libgfortran=5.3.0=6
49 | m2w64-gcc-libs=5.3.0=7
50 | m2w64-gcc-libs-core=5.3.0=7
51 | m2w64-gmp=6.1.0=2
52 | m2w64-libwinpthread-git=5.0.0.4634.697f757=2
53 | markupsafe=1.0=py36h0e26971_1
54 | mistune=0.8.3=py36hfa6e2cd_1
55 | mkl=2018.0.3=1
56 | mkl_fft=1.0.1=py36h452e1ab_0
57 | mkl_random=1.0.1=py36h9258bd6_0
58 | msgpack-numpy=0.4.1=py36h93564ae_0
59 | msgpack-python=0.4.8=py36h58b1e9d_0
60 | msys2-conda-epoch=20160418=1
61 | murmurhash=0.28.0=py36h866ba4d_0
62 | nb_anacondacloud=1.4.0=py36_0
63 | nb_conda=2.2.0=py36_0
64 | nb_conda_kernels=2.1.0=py36_0
65 | nbconvert=5.3.1=py36h8dc0fde_0
66 | nbformat=4.4.0=py36h3a5bc1b_0
67 | nbpresent=3.0.2=py36_0
68 | notebook=5.5.0=py36_0
69 | numpy=1.14.5=py36h9fa60d3_3
70 | numpy-base=1.14.5=py36h5c71026_3
71 | openssl=1.0.2o=h8ea7d77_0
72 | pandoc=2.2.1=h1a437c5_0
73 | pandocfilters=1.4.2=py36h3ef6317_1
74 | parso=0.2.1=py36_0
75 | pickleshare=0.7.4=py36h9de030f_0
76 | pip=10.0.1=py36_0
77 | plac=0.9.6=py36_0
78 | preshed=1.0.0=py36h065ec1e_0
79 | prompt_toolkit=1.0.15=py36h60b8f86_0
80 | pycparser=2.18=py36hd053e01_1
81 | pygments=2.2.0=py36hb010967_0
82 | pyopenssl=18.0.0=py36_0
83 | pyreadline=2.1=py36h094d952_1
84 | pysocks=1.6.8=py36_0
85 | python=3.6.6=hea74fb7_0
86 | python-dateutil=2.7.3=py36_0
87 | pytorch=0.4.0=py36_cuda91_cudnn7he774522_1
88 | pytz=2018.5=py36_0
89 | pywinpty=0.5.4=py36_0
90 | pyyaml=3.12=py36h1d1928f_1
91 | pyzmq=17.0.0=py36hfa6e2cd_1
92 | regex=2017.11.09=py36ha090894_0
93 | requests=2.19.1=py36_0
94 | s3transfer=0.1.13=py36_0
95 | scipy=1.1.0=py36h672f292_0
96 | send2trash=1.5.0=py36_0
97 | setuptools=39.2.0=py36_0
98 | simplegeneric=0.8.1=py36_2
99 | six=1.11.0=py36h4db2310_1
100 | smart_open=1.6.0=py36_0
101 | spacy=2.0.11=py36h8300f20_0
102 | termcolor=1.1.0=py36_1
103 | terminado=0.8.1=py36_1
104 | testpath=0.3.1=py36h2698cfe_0
105 | thinc=6.10.2=py36h830ac7b_0
106 | toolz=0.9.0=py36_0
107 | tornado=5.0.2=py36_0
108 | tqdm=4.23.4=py36_0
109 | traitlets=4.3.2=py36h096827d_0
110 | ujson=1.35=py36_0
111 | urllib3=1.23=py36_0
112 | vc=14.1=h0510ff6_3
113 | vs2015_runtime=15.5.2=3
114 | wcwidth=0.1.7=py36h3d5aa90_0
115 | webencodings=0.5.1=py36h67c50ae_1
116 | wheel=0.31.1=py36_0
117 | widgetsnbextension=3.2.1=py36_0
118 | win_inet_pton=1.0.1=py36he67d7fd_1
119 | wincertstore=0.2=py36h7fe50ca_0
120 | winpty=0.4.3=4
121 | wrapt=1.10.11=py36hfa6e2cd_2
122 | yaml=0.1.7=hc54c509_2
123 | zeromq=4.2.5=he025d50_1
124 | zlib=1.2.11=h8395fce_2
125 |
--------------------------------------------------------------------------------
/09_Basic_IE.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## [Basic] Information Extraction\n",
8 | "\n",
9 | "As an example task, consider the challenge of automating Amazon Retail's customer service email response. We should be able to find the following attributes or mark them as missing with high confidence:\n",
10 | "\n",
11 | "- Order Id\n",
12 | "- Dates (such as Shopping Date, Order Delivery) \n",
13 | "- Any `$` amounts \n",
14 | "\n",
15 | "Please note that I don't have any relation to Amazon other than shopping from there. \n",
16 | "\n",
17 | "Let's consider the following totally imagined complaint email from me to Jeff Bezos, the CEO of Amazon:\n"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 1,
23 | "metadata": {
24 | "collapsed": true
25 | },
26 | "outputs": [],
27 | "source": [
28 | "complaint_email = \"\"\"Hello Jeff,\n",
29 | "\n",
30 | "I am Nirant, a loyal Amazon in first customer for months now. I am a huge fan of Kindle as well. \n",
31 | "I am stuck in a new city without a phone thanks to a sequence of problems - and are now compounded by Amazon's inhumane behaviour.\n",
32 | "\n",
33 | "The particular issues I am facing: My new phone bought from Amazon stopped working. What did I do? Requested a replacement on Jul 23\n",
34 | "- First Issue: The system did not allow a pick up on July 23 forcing a delay of more than a day to 24 July 8:00 - 11:00 AM\n",
35 | "- Second Issue: Despite requesting the customer service on chat THRICE, the pickup is delayed to July 24 8:00 - 11:00 AM\n",
36 | "- Third Issue: The pickup is rescheduled without any reason!\n",
37 | "\n",
38 | "Is this how you want Amazon to be world's most customer centric company?\n",
39 | "\n",
40 | "Here is how Amazon can help me:\n",
41 | "- Pick up the order as urgently as possible\n",
42 | "- Deliver the phone on a priority basis on Monday i.e. July 25 itself\n",
43 | "\n",
44 | "Here are the order numbers for reference: \n",
45 | "ORDER # 402-4870778-5154753 and ORDER # 404-8689779-9721113\n",
46 | "\n",
47 | "Here is my phone number: +91 7737887058\n",
48 | "\n",
49 | "I am stuck in a new city, where I don't know the language or directions without a working phone. I would really appreciate it if you could help in anyway. \n",
50 | "\n",
51 | "Regards,\n",
52 | "Nirant Kasliwal\"\"\""
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "Yikes, that is a lot of text. \n",
60 | "\n",
61 | "**The information to pull from this are (1) dates + times (2) phone number and (3) order numbers**. Let's figure out how to do that\n",
62 | "\n",
63 | "### Extract Date and Times\n",
64 | "\n",
65 | "If you are new to regex, consider reading the amazing [HOWTO on Python Regex](https://docs.python.org/3/howto/regex.html) and then coming back here. Let's warm up our regex muscles a bit: "
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 2,
71 | "metadata": {
72 | "collapsed": true
73 | },
74 | "outputs": [],
75 | "source": [
76 | "import re"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 3,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "data": {
86 | "text/plain": [
87 | "['12', '11', '10']"
88 | ]
89 | },
90 | "execution_count": 3,
91 | "metadata": {},
92 | "output_type": "execute_result"
93 | }
94 | ],
95 | "source": [
96 | "p = re.compile(r'\\d+')\n",
97 | "p.findall('12 drummers drumming, 11 pipers piping, 10 lords a-leaping')"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 4,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "data": {
107 | "text/plain": [
108 | "['12', '11', '10']"
109 | ]
110 | },
111 | "execution_count": 4,
112 | "metadata": {},
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": [
117 | "p = re.compile(r'\\d+')\n",
118 | "p.findall('12 drummers drumming, 11 pipers piping, 10 lords a-leaping')"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "TKX: Add compile and findall explanations here\n",
126 | "\n",
127 | "TKX: Add d+ explanations here"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 5,
133 | "metadata": {},
134 | "outputs": [
135 | {
136 | "name": "stdout",
137 | "output_type": "stream",
138 | "text": [
139 | "Wall time: 0 ns\n"
140 | ]
141 | }
142 | ],
143 | "source": [
144 | "%%time\n",
145 | "date_pattern = r\"^(Jan|Feb|Mar|Apr|May|Jun|July|Aug|Sep|Oct|Nov|Dec)$\"\n",
146 | "p = re.compile(date_pattern)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 6,
152 | "metadata": {},
153 | "outputs": [
154 | {
155 | "data": {
156 | "text/plain": [
157 | "[]"
158 | ]
159 | },
160 | "execution_count": 6,
161 | "metadata": {},
162 | "output_type": "execute_result"
163 | }
164 | ],
165 | "source": [
166 | "p.findall(complaint_email)"
167 | ]
168 | }
169 | ],
170 | "metadata": {
171 | "kernelspec": {
172 | "display_name": "fastAI",
173 | "language": "python",
174 | "name": "fastai"
175 | },
176 | "language_info": {
177 | "codemirror_mode": {
178 | "name": "ipython",
179 | "version": 3
180 | },
181 | "file_extension": ".py",
182 | "mimetype": "text/x-python",
183 | "name": "python",
184 | "nbconvert_exporter": "python",
185 | "pygments_lexer": "ipython3",
186 | "version": "3.6.5"
187 | }
188 | },
189 | "nbformat": 4,
190 | "nbformat_minor": 2
191 | }
192 |
--------------------------------------------------------------------------------
/10_Coherence_Check.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "813a6f6d",
6 | "metadata": {},
7 | "source": [
8 | "# Coherence Check\n",
9 | "\n",
10 | "Goal: We've a sample of generated text. We want to select the ones which are coherent and discard the rest. \n",
11 | "\n",
12 | "Not a Goal: Checking for semantic correctness.\n",
13 | "\n",
14 | "## Possible Approaches for Coherence Checks\n",
15 | "\n",
16 | "1. [**Recommended**] Use a different Language Model to calculate the perplexity of each sentence, and apply a threshold to select only the coherent variants. Use an LM fine-tuned on your training corpus to make sure that the perplexity scores reflect the language of your domain.\n",
17 | "\n",
18 | "2. Using dependency parsing from spaCy to see if there are conditions/patterns which incoherent sentences fail, but coherent sentences meet. Common example: The root verb in the sentence should be directly connected to the subject. There should be no dangling clauses.\n",
19 | "\n",
20 | "3. For longer text generation, additionally train with the next sentence prediction task. Generate multiple next sentences and use the [CLS] embedding + classifier to mark each sentence as coherent or not. \n",
21 | "\n",
22 | "> We encode each sentence by adding [CLS] token to the last position, and feed the hidden state of this token to a double dot-product regression model. The final output is from a logistic regression predicting if the two sentences come from the same paragraph or not.\n",
23 | "> - From [Improving Language Generation with Sentence Coherence Objective](https://www.arxiv-vanity.com/papers/2009.06358/)"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 14,
29 | "id": "f87ca8b7",
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "# TODO\n",
34 | "# Add example of perplexity change using GPT-2 or T5"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 3,
40 | "id": "5eb4f340",
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "import spacy\n",
45 | "from spacy import displacy\n",
46 | "\n",
47 | "nlp = spacy.load(\"en_core_web_sm\")"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 12,
53 | "id": "e704c419",
54 | "metadata": {},
55 | "outputs": [
56 | {
57 | "data": {
58 | "text/html": [
59 | ""
117 | ],
118 | "text/plain": [
119 | ""
120 | ]
121 | },
122 | "metadata": {},
123 | "output_type": "display_data"
124 | }
125 | ],
126 | "source": [
127 | "doc = nlp(\"Stitch in time, saves Nine\")\n",
128 | "options = {\"bg\": \"#09a3d5\", \"color\": \"white\"}\n",
129 | "displacy.render(doc, style=\"dep\", options=options)"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 13,
135 | "id": "72c5d731",
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "data": {
140 | "text/html": [
141 | ""
173 | ],
174 | "text/plain": [
175 | ""
176 | ]
177 | },
178 | "metadata": {},
179 | "output_type": "display_data"
180 | }
181 | ],
182 | "source": [
183 | "doc = nlp(\"Stitch in time\")\n",
184 | "displacy.render(doc, style=\"dep\", options=options)"
185 | ]
186 | },
187 | {
188 | "cell_type": "markdown",
189 | "id": "cc8148af",
190 | "metadata": {},
191 | "source": [
192 | "Notice that when only the phrase/clause is used, the verb \"Stitch\" does not have a subject, while in the previous sentence it does, via `conj` (conjunction). \n",
193 | "\n",
194 | "As an example of a rule/filter, we can enforce the constraint that every verb must have a subject."
195 | ]
196 | }
197 | ],
198 | "metadata": {
199 | "kernelspec": {
200 | "display_name": "Python 3",
201 | "language": "python",
202 | "name": "python3"
203 | },
204 | "language_info": {
205 | "codemirror_mode": {
206 | "name": "ipython",
207 | "version": 3
208 | },
209 | "file_extension": ".py",
210 | "mimetype": "text/x-python",
211 | "name": "python",
212 | "nbconvert_exporter": "python",
213 | "pygments_lexer": "ipython3",
214 | "version": "3.7.0"
215 | }
216 | },
217 | "nbformat": 4,
218 | "nbformat_minor": 5
219 | }
220 |
--------------------------------------------------------------------------------
/tokenization.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/07_Build_Chatbot_in_30minutes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Building your own Chatbot\n",
8 | "\n",
9 | "## Why should I build the service again? \n",
10 | "\n",
11 | "##### Related: Why can't I use FB/MSFT/some other cloud service?"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Word Vectors + Heuristic - Fancy Stuff = Quick Working Code"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "name": "stdout",
28 | "output_type": "stream",
29 | "text": [
30 | "Gensim version: 3.4.0\n"
31 | ]
32 | }
33 | ],
34 | "source": [
35 | "import numpy as np\n",
36 | "import gensim\n",
37 | "print(f\"Gensim version: {gensim.__version__}\")"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 2,
43 | "metadata": {},
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "File already exists, please remove if you wish to download again\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 | "from tqdm import tqdm\n",
55 | "class TqdmUpTo(tqdm):\n",
56 | " def update_to(self, b=1, bsize=1, tsize=None):\n",
57 | " if tsize is not None: self.total = tsize\n",
58 | " self.update(b * bsize - self.n)\n",
59 | "\n",
60 | "def get_data(url, filename):\n",
61 | " \"\"\"\n",
62 | " Download data if the filename does not exist already\n",
63 | " Uses Tqdm to show download progress\n",
64 | " \"\"\"\n",
65 | " import os\n",
66 | " from urllib.request import urlretrieve\n",
67 | " \n",
68 | " if not os.path.exists(filename):\n",
69 | "\n",
70 | " dirname = os.path.dirname(filename)\n",
71 | " if not os.path.exists(dirname):\n",
72 | " os.makedirs(dirname)\n",
73 | "\n",
74 | " with TqdmUpTo(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:\n",
75 | " urlretrieve(url, filename, reporthook=t.update_to)\n",
76 | " else:\n",
77 | " print(\"File already exists, please remove if you wish to download again\")\n",
78 | "\n",
79 | "embedding_url = 'http://nlp.stanford.edu/data/glove.6B.zip'\n",
80 | "get_data(embedding_url, 'data/glove.6B.zip')"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 3,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "# !unzip data/glove.6B.zip \n",
90 | "# !mv -v glove.6B.300d.txt data/glove.6B.300d.txt \n",
91 | "# !mv -v glove.6B.200d.txt data/glove.6B.200d.txt \n",
92 | "# !mv -v glove.6B.100d.txt data/glove.6B.100d.txt \n",
93 | "# !mv -v glove.6B.50d.txt data/glove.6B.50d.txt \n",
94 | "\n",
95 | "from gensim.scripts.glove2word2vec import glove2word2vec\n",
96 | "glove_input_file = 'data/glove.6B.300d.txt'\n",
97 | "word2vec_output_file = 'data/glove.6B.300d.txt.word2vec'\n",
98 | "import os\n",
99 | "if not os.path.exists(word2vec_output_file):\n",
100 | " glove2word2vec(glove_input_file, word2vec_output_file)"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 4,
106 | "metadata": {},
107 | "outputs": [
108 | {
109 | "name": "stdout",
110 | "output_type": "stream",
111 | "text": [
112 | "CPU times: user 1min 49s, sys: 2.11 s, total: 1min 51s\n",
113 | "Wall time: 1min 47s\n"
114 | ]
115 | }
116 | ],
117 | "source": [
118 | "%%time\n",
119 | "from gensim.models import KeyedVectors\n",
120 | "filename = word2vec_output_file\n",
121 | "embed = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 5,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "assert embed['awesome'] is not None"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "'awesome', this works!"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "## Use Case: Food Order Bot\n",
145 | "\n",
146 | "### Do word vectors even work for this? "
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 6,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "cuisine_refs = [\"mexican\", \"thai\", \"british\", \"american\", \"italian\"]\n",
156 | "sample_sentence = \"I’m looking for a cheap Indian or Chinese place in Indiranagar\""
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 7,
162 | "metadata": {},
163 | "outputs": [
164 | {
165 | "name": "stdout",
166 | "output_type": "stream",
167 | "text": [
168 | "looking: 7.448504447937012\n",
169 | "for: 10.627421379089355\n",
170 | "a: 11.809560775756836\n",
171 | "cheap: 7.09670877456665\n",
172 | "indian: 18.64516258239746\n",
173 | "or: 9.692893981933594\n",
174 | "chinese: 19.09498405456543\n",
175 | "place: 7.651237487792969\n",
176 | "in: 10.085711479187012\n",
177 | "['indian', 'chinese']\n"
178 | ]
179 | }
180 | ],
181 | "source": [
182 | "tokens = sample_sentence.split()\n",
183 | "tokens = [x.lower().strip() for x in tokens] \n",
184 | "threshold = 18.3\n",
185 | "found = []\n",
186 | "for term in tokens:\n",
187 | " if term in embed.vocab:\n",
188 | " scores = []\n",
189 | " for C in cuisine_refs:\n",
190 | " scores.append(np.dot(embed[C], embed[term].T))\n",
191 | " # hint: replace the np.dot above with: \n",
192 | " # scores.append(embed.cosine_similarities(vector_1, vectors_all))\n",
193 | " mean_score = np.mean(scores)\n",
194 | " print(f\"{term}: {mean_score}\")\n",
195 | " if mean_score > threshold:\n",
196 | " found.append(term)\n",
197 | "print(found)"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "### Next Stop: Classifying user intent"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 8,
210 | "metadata": {},
211 | "outputs": [
212 | {
213 | "name": "stdout",
214 | "output_type": "stream",
215 | "text": [
216 | "(300,)\n"
217 | ]
218 | }
219 | ],
220 | "source": [
221 | "def sum_vecs(embed,text):\n",
222 | "\n",
223 | " tokens = text.split(' ')\n",
224 | " vec = np.zeros(embed.vector_size)\n",
225 | "\n",
226 | " for idx, term in enumerate(tokens):\n",
227 | " if term in embed.vocab:\n",
228 | " vec = vec + embed[term]\n",
229 | " return vec\n",
230 | "\n",
231 | "sentence_vector = sum_vecs(embed, sample_sentence)\n",
232 | "print(sentence_vector.shape)"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 9,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": [
241 | "data={\n",
242 | " \"greet\": {\n",
243 | " \"examples\" : [\"hello\",\"hey there\",\"howdy\",\"hello\",\"hi\",\"hey\",\"hey ho\"],\n",
244 | " \"centroid\" : None\n",
245 | " },\n",
246 | " \"inform\": {\n",
247 | " \"examples\" : [\n",
248 | " \"i'd like something asian\",\n",
249 | " \"maybe korean\",\n",
250 | " \"what mexican options do i have\",\n",
251 | " \"what italian options do i have\",\n",
252 | " \"i want korean food\",\n",
253 | " \"i want german food\",\n",
254 | " \"i want vegetarian food\",\n",
255 | " \"i would like chinese food\",\n",
256 | " \"i would like indian food\",\n",
257 | " \"what japanese options do i have\",\n",
258 | " \"korean please\",\n",
259 | " \"what about indian\",\n",
260 | " \"i want some chicken\",\n",
261 | " \"maybe thai\",\n",
262 | " \"i'd like something vegetarian\",\n",
263 | " \"show me french restaurants\",\n",
264 | " \"show me a cool malaysian spot\",\n",
265 | " \"where can I get some spicy food\"\n",
266 | " ],\n",
267 | " \"centroid\" : None\n",
268 | " },\n",
269 | " \"deny\": {\n",
270 | " \"examples\" : [\n",
271 | " \"nah\",\n",
272 | " \"any other places ?\",\n",
273 | " \"anything else\",\n",
274 | " \"no thanks\"\n",
275 | " \"not that one\",\n",
276 | " \"i do not like that place\",\n",
277 | " \"something else please\",\n",
278 | " \"no please show other options\"\n",
279 | " ],\n",
280 | " \"centroid\" : None\n",
281 | " },\n",
282 | " \"affirm\":{\n",
283 | " \"examples\":[\n",
284 | " \"yeah\",\n",
285 | " \"that works\",\n",
286 | " \"good, thanks\",\n",
287 | " \"this works\",\n",
288 | " \"sounds good\",\n",
289 | " \"thanks, this is perfect\",\n",
290 | " \"just what I wanted\"\n",
291 | " ],\n",
292 | " \"centroid\": None\n",
293 | " }\n",
294 | "\n",
295 | "}"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 10,
301 | "metadata": {},
302 | "outputs": [],
303 | "source": [
304 | "def get_centroid(embed, examples):\n",
305 | " C = np.zeros((len(examples),embed.vector_size))\n",
306 | " for idx, text in enumerate(examples):\n",
307 | " C[idx,:] = sum_vecs(embed,text)\n",
308 | "\n",
309 | " centroid = np.mean(C,axis=0)\n",
310 | " assert centroid.shape[0] == embed.vector_size\n",
311 | " return centroid"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 11,
317 | "metadata": {},
318 | "outputs": [],
319 | "source": [
320 | "# Adding Centroid to data dictionary\n",
321 | "for label in data.keys():\n",
322 | " data[label][\"centroid\"] = get_centroid(embed,data[label][\"examples\"])"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 12,
328 | "metadata": {},
329 | "outputs": [
330 | {
331 | "name": "stdout",
332 | "output_type": "stream",
333 | "text": [
334 | "greet: ['hello', 'hey there', 'howdy', 'hello', 'hi', 'hey', 'hey ho']\n",
335 | "inform: [\"i'd like something asian\", 'maybe korean', 'what mexican options do i have', 'what italian options do i have', 'i want korean food', 'i want german food', 'i want vegetarian food', 'i would like chinese food', 'i would like indian food', 'what japanese options do i have', 'korean please', 'what about indian', 'i want some chicken', 'maybe thai', \"i'd like something vegetarian\", 'show me french restaurants', 'show me a cool malaysian spot', 'where can I get some spicy food']\n",
336 | "deny: ['nah', 'any other places ?', 'anything else', 'no thanksnot that one', 'i do not like that place', 'something else please', 'no please show other options']\n",
337 | "affirm: ['yeah', 'that works', 'good, thanks', 'this works', 'sounds good', 'thanks, this is perfect', 'just what I wanted']\n"
338 | ]
339 | }
340 | ],
341 | "source": [
342 | "for label in data.keys():\n",
343 | " print(f\"{label}: {data[label]['examples']}\")"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 13,
349 | "metadata": {},
350 | "outputs": [],
351 | "source": [
352 | "def get_intent(embed,data, text):\n",
353 | " intents = list(data.keys())\n",
354 | " vec = sum_vecs(embed,text)\n",
355 | " scores = np.array([ np.linalg.norm(vec-data[label][\"centroid\"]) for label in intents])\n",
356 | " return intents[np.argmin(scores)]"
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": 14,
362 | "metadata": {},
363 | "outputs": [
364 | {
365 | "name": "stdout",
366 | "output_type": "stream",
367 | "text": [
368 | "text : 'hey ', predicted_label : 'greet'\n",
369 | "text : 'i am looking for chinese food', predicted_label : 'inform'\n",
370 | "text : 'not for me', predicted_label : 'deny'\n",
371 | "text : 'ok, this is good', predicted_label : 'affirm'\n"
372 | ]
373 | }
374 | ],
375 | "source": [
376 | "for text in [\"hey \",\"i am looking for chinese food\",\"not for me\", \"ok, this is good\"]:\n",
377 | " print(f\"text : '{text}', predicted_label : '{get_intent(embed, data, text)}'\")"
378 | ]
379 | },
380 | {
381 | "cell_type": "markdown",
382 | "metadata": {},
383 | "source": [
384 | "## Bot Responses"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": 15,
390 | "metadata": {},
391 | "outputs": [],
392 | "source": [
393 | "templates = {\n",
394 | " \"utter_greet\": [\"hey there!\", \"Hey! How you doin'? \"],\n",
395 | " \"utter_options\": [\"ok, let me check some more\"],\n",
396 | " \"utter_goodbye\": [\"Great, I'll go now. Bye bye\", \"bye bye\", \"Goodbye!\"],\n",
397 | " \"utter_default\": [\"Sorry, I didn't quite follow\"],\n",
398 | " \"utter_confirm\": [\"Got it\", \"Gotcha\", \"Your order is confirmed now\"]\n",
399 | " }"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": 16,
405 | "metadata": {},
406 | "outputs": [],
407 | "source": [
408 | "response_map = {\n",
409 | " \"greet\": \"utter_greet\",\n",
410 | " \"affirm\": \"utter_goodbye\",\n",
411 | " \"deny\": \"utter_options\",\n",
412 | " \"inform\": \"utter_confirm\",\n",
413 | " \"default\": \"utter_default\",\n",
414 | "}"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": 17,
420 | "metadata": {},
421 | "outputs": [],
422 | "source": [
423 | "import random\n",
424 | "def get_bot_response(bot_response_map, bot_templates, intent):\n",
425 | " if intent not in list(response_map):\n",
426 | " intent = \"default\"\n",
427 | " select_template = bot_response_map[intent]\n",
428 | " templates = bot_templates[select_template]\n",
429 | " return random.choice(templates)"
430 | ]
431 | },
432 | {
433 | "cell_type": "code",
434 | "execution_count": 18,
435 | "metadata": {},
436 | "outputs": [
437 | {
438 | "data": {
439 | "text/plain": [
440 | "'Got it'"
441 | ]
442 | },
443 | "execution_count": 18,
444 | "metadata": {},
445 | "output_type": "execute_result"
446 | }
447 | ],
448 | "source": [
449 | "user_intent = get_intent(embed, data, \"i want indian food\")\n",
450 | "get_bot_response(response_map, templates, user_intent)"
451 | ]
452 | },
453 | {
454 | "cell_type": "markdown",
455 | "metadata": {},
456 | "source": [
457 | "**Better Response Personalisation?**:"
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": 19,
463 | "metadata": {},
464 | "outputs": [
465 | {
466 | "name": "stdout",
467 | "output_type": "stream",
468 | "text": [
469 | "text : 'hey', intent: greet, bot: Hey! How you doin'? \n",
470 | "text : 'i am looking for italian food', intent: inform, bot: Gotcha\n",
471 | "text : 'not for me', intent: deny, bot: ok, let me check some more\n",
472 | "text : 'ok, this is good', intent: affirm, bot: Goodbye!\n"
473 | ]
474 | }
475 | ],
476 | "source": [
477 | "for text in [\"hey\",\"i am looking for italian food\",\"not for me\", \"ok, this is good\"]:\n",
478 | " user_intent = get_intent(embed, data, text)\n",
479 | " bot_reply = get_bot_response(response_map, templates, user_intent)\n",
480 | " print(f\"text : '{text}', intent: {user_intent}, bot: {bot_reply}\")"
481 | ]
482 | }
483 | ],
484 | "metadata": {
485 | "kernelspec": {
486 | "display_name": "fastAI",
487 | "language": "python",
488 | "name": "fastai"
489 | },
490 | "language_info": {
491 | "codemirror_mode": {
492 | "name": "ipython",
493 | "version": 3
494 | },
495 | "file_extension": ".py",
496 | "mimetype": "text/x-python",
497 | "name": "python",
498 | "nbconvert_exporter": "python",
499 | "pygments_lexer": "ipython3",
500 | "version": "3.6.6"
501 | }
502 | },
503 | "nbformat": 4,
504 | "nbformat_minor": 2
505 | }
506 |
--------------------------------------------------------------------------------
/02-B.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "Python 3.6.6 :: Anaconda, Inc.\n"
13 | ]
14 | }
15 | ],
16 | "source": [
17 | "!python --version\n",
18 | "__author__ = \"nirant.bits@gmail.com\""
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "# Spell Correction"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "name": "stdout",
35 | "output_type": "stream",
36 | "text": [
37 | "Collecting fuzzywuzzy[speedup]\n",
38 | " Downloading https://files.pythonhosted.org/packages/d8/f1/5a267addb30ab7eaa1beab2b9323073815da4551076554ecc890a3595ec9/fuzzywuzzy-0.17.0-py2.py3-none-any.whl\n",
39 | "Requirement already satisfied: python-levenshtein>=0.12; extra == \"speedup\" in d:\\miniconda3\\envs\\nlp\\lib\\site-packages (from fuzzywuzzy[speedup]) (0.12.0)\n",
40 | "Requirement already satisfied: setuptools in d:\\miniconda3\\envs\\nlp\\lib\\site-packages (from python-levenshtein>=0.12; extra == \"speedup\"->fuzzywuzzy[speedup]) (39.2.0)\n",
41 | "Installing collected packages: fuzzywuzzy\n",
42 | "Successfully installed fuzzywuzzy-0.17.0\n"
43 | ]
44 | }
45 | ],
46 | "source": [
47 | "import sys\n",
48 | "# !{sys.executable} -m pip install fuzzywuzzy\n",
49 | "# alternative for 4-10x faster computation: \n",
50 | "!{sys.executable} -m pip install fuzzywuzzy[speedup]"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 3,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "from fuzzywuzzy import fuzz"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 4,
65 | "metadata": {},
66 | "outputs": [
67 | {
68 | "data": {
69 | "text/plain": [
70 | "82"
71 | ]
72 | },
73 | "execution_count": 4,
74 | "metadata": {},
75 | "output_type": "execute_result"
76 | }
77 | ],
78 | "source": [
79 | "fuzz.ratio(\"Electronic City Phase One\", \"Electronic City Phase One, Bangalore\")"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 5,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "data": {
89 | "text/plain": [
90 | "100"
91 | ]
92 | },
93 | "execution_count": 5,
94 | "metadata": {},
95 | "output_type": "execute_result"
96 | }
97 | ],
98 | "source": [
99 | "fuzz.partial_ratio(\"Electronic City Phase One\", \"Electronic City Phase One, Bangalore\")"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 6,
105 | "metadata": {},
106 | "outputs": [
107 | {
108 | "data": {
109 | "text/plain": [
110 | "90"
111 | ]
112 | },
113 | "execution_count": 6,
114 | "metadata": {},
115 | "output_type": "execute_result"
116 | }
117 | ],
118 | "source": [
119 | "fuzz.ratio('Narendra Modi', 'Narendra D. Modi')"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 7,
125 | "metadata": {},
126 | "outputs": [
127 | {
128 | "data": {
129 | "text/plain": [
130 | "77"
131 | ]
132 | },
133 | "execution_count": 7,
134 | "metadata": {},
135 | "output_type": "execute_result"
136 | }
137 | ],
138 | "source": [
139 | "fuzz.partial_ratio('Narendra Modi', 'Narendra D. Modi')"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 8,
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "data": {
149 | "text/plain": [
150 | "93"
151 | ]
152 | },
153 | "execution_count": 8,
154 | "metadata": {},
155 | "output_type": "execute_result"
156 | }
157 | ],
158 | "source": [
159 | "fuzz.token_sort_ratio('Narendra Modi', 'Narendra D. Modi')"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 9,
165 | "metadata": {},
166 | "outputs": [
167 | {
168 | "data": {
169 | "text/plain": [
170 | "100"
171 | ]
172 | },
173 | "execution_count": 9,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | }
177 | ],
178 | "source": [
179 | "fuzz.token_set_ratio('Narendra Modi', 'Narendra D. Modi')"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 10,
185 | "metadata": {
186 | "collapsed": true
187 | },
188 | "outputs": [],
189 | "source": [
190 | "from fuzzywuzzy import process"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": 11,
196 | "metadata": {},
197 | "outputs": [
198 | {
199 | "name": "stdout",
200 | "output_type": "stream",
201 | "text": [
202 | "[('Gujarat', 92), ('Gujarat Govt.', 75), ('Gujjar', 67)]\n"
203 | ]
204 | },
205 | {
206 | "data": {
207 | "text/plain": [
208 | "('Gujarat', 92)"
209 | ]
210 | },
211 | "execution_count": 11,
212 | "metadata": {},
213 | "output_type": "execute_result"
214 | }
215 | ],
216 | "source": [
217 | "query = 'Gujrat'\n",
218 | "choices = ['Gujarat', 'Gujjar', 'Gujarat Govt.']\n",
219 | "# Get a list of matches ordered by score, default limit to 5\n",
220 | "print(process.extract(query, choices))\n",
221 | "\n",
222 | "# If we want only the top one\n",
223 | "process.extractOne(query, choices)"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 12,
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "name": "stdout",
233 | "output_type": "stream",
234 | "text": [
235 | "[('Bangalore', 94), ('Bengaluru', 59)]\n"
236 | ]
237 | },
238 | {
239 | "data": {
240 | "text/plain": [
241 | "('Bangalore', 94)"
242 | ]
243 | },
244 | "execution_count": 12,
245 | "metadata": {},
246 | "output_type": "execute_result"
247 | }
248 | ],
249 | "source": [
250 | "query = 'Banglore'\n",
251 | "choices = ['Bangalore', 'Bengaluru']\n",
252 | "print(process.extract(query, choices))\n",
253 | "process.extractOne(query, choices)"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 13,
259 | "metadata": {},
260 | "outputs": [
261 | {
262 | "name": "stdout",
263 | "output_type": "stream",
264 | "text": [
265 | "[('chilli', 91), ('chilling', 77), ('chilled', 67)]\n"
266 | ]
267 | },
268 | {
269 | "data": {
270 | "text/plain": [
271 | "('chilli', 91)"
272 | ]
273 | },
274 | "execution_count": 13,
275 | "metadata": {},
276 | "output_type": "execute_result"
277 | }
278 | ],
279 | "source": [
280 | "# Let's take an example of a common search typo in online shopping:\n",
281 | "query = 'chili'\n",
282 | "choices = ['chilli', 'chilled', 'chilling']\n",
283 | "print(process.extract(query, choices))\n",
284 | "process.extractOne(query, choices)"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "## Jellyfish"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 8,
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "name": "stdout",
301 | "output_type": "stream",
302 | "text": [
303 | "Collecting jellyfish\n",
304 | " Downloading https://files.pythonhosted.org/packages/61/3f/60ac86fb43dfbf976768e80674b5538e535f6eca5aa7806cf2fdfd63550f/jellyfish-0.6.1.tar.gz (132kB)\n",
305 | "Building wheels for collected packages: jellyfish\n",
306 | " Running setup.py bdist_wheel for jellyfish: started\n",
307 | " Running setup.py bdist_wheel for jellyfish: finished with status 'done'\n",
308 | " Stored in directory: C:\\Users\\nirantk\\AppData\\Local\\pip\\Cache\\wheels\\9c\\6f\\33\\92bb9a4b4562a60ba6a80cedbab8907e48bc7a8b1f369ea0ae\n",
309 | "Successfully built jellyfish\n",
310 | "Installing collected packages: jellyfish\n",
311 | "Successfully installed jellyfish-0.6.1\n"
312 | ]
313 | }
314 | ],
315 | "source": [
316 | "import sys\n",
317 | "!{sys.executable} -m pip install jellyfish"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": 9,
323 | "metadata": {},
324 | "outputs": [],
325 | "source": [
326 | "import jellyfish\n",
327 | "correct_example = ('Narendra Modi', 'Narendra Modi')\n",
328 | "damodardas_example = ('Narendra Modi', 'Narendra D. Modi')\n",
329 | "modi_typo_example = ('Narendra Modi', 'Narendar Modi')\n",
330 | "gujarat_typo_example = ('Gujarat', 'Gujrat')\n",
331 | "\n",
332 | "examples = [correct_example, damodardas_example, modi_typo_example, gujarat_typo_example]"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": 10,
338 | "metadata": {},
339 | "outputs": [],
340 | "source": [
341 | "def calculate_distance(function, examples=examples):\n",
342 | " for ele in examples:\n",
343 | " print(f'{ele}: {function(*ele)}') "
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 11,
349 | "metadata": {},
350 | "outputs": [
351 | {
352 | "name": "stdout",
353 | "output_type": "stream",
354 | "text": [
355 | "('Narendra Modi', 'Narendra Modi'): 0\n",
356 | "('Narendra Modi', 'Narendra D. Modi'): 3\n",
357 | "('Narendra Modi', 'Narendar Modi'): 2\n",
358 | "('Gujarat', 'Gujrat'): 1\n"
359 | ]
360 | }
361 | ],
362 | "source": [
363 | "calculate_distance(jellyfish.levenshtein_distance)"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": 12,
369 | "metadata": {},
370 | "outputs": [
371 | {
372 | "name": "stdout",
373 | "output_type": "stream",
374 | "text": [
375 | "('Narendra Modi', 'Narendra Modi'): 0\n",
376 | "('Narendra Modi', 'Narendra D. Modi'): 3\n",
377 | "('Narendra Modi', 'Narendar Modi'): 1\n",
378 | "('Gujarat', 'Gujrat'): 1\n"
379 | ]
380 | }
381 | ],
382 | "source": [
383 | "calculate_distance(jellyfish.damerau_levenshtein_distance)"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 13,
389 | "metadata": {},
390 | "outputs": [
391 | {
392 | "name": "stdout",
393 | "output_type": "stream",
394 | "text": [
395 | "('Narendra Modi', 'Narendra Modi'): 0\n",
396 | "('Narendra Modi', 'Narendra D. Modi'): 7\n",
397 | "('Narendra Modi', 'Narendar Modi'): 2\n",
398 | "('Gujarat', 'Gujrat'): 4\n"
399 | ]
400 | }
401 | ],
402 | "source": [
403 | "calculate_distance(jellyfish.hamming_distance)"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": 14,
409 | "metadata": {},
410 | "outputs": [
411 | {
412 | "name": "stdout",
413 | "output_type": "stream",
414 | "text": [
415 | "('Narendra Modi', 'Narendra Modi'): 1.0\n",
416 | "('Narendra Modi', 'Narendra D. Modi'): 0.9375\n",
417 | "('Narendra Modi', 'Narendar Modi'): 0.9743589743589745\n",
418 | "('Gujarat', 'Gujrat'): 0.8968253968253969\n"
419 | ]
420 | }
421 | ],
422 | "source": [
423 | "calculate_distance(jellyfish.jaro_distance) "
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 21,
429 | "metadata": {},
430 | "outputs": [
431 | {
432 | "name": "stdout",
433 | "output_type": "stream",
434 | "text": [
435 | "('Narendra Modi', 'Narendra Modi'): 1.0\n",
436 | "('Narendra Modi', 'Narendra D. Modi'): 0.9625\n",
437 | "('Narendra Modi', 'Narendar Modi'): 0.9846153846153847\n",
438 | "('Gujarat', 'Gujrat'): 0.9277777777777778\n"
439 | ]
440 | }
441 | ],
442 | "source": [
443 | "calculate_distance(jellyfish.jaro_winkler)"
444 | ]
445 | },
446 | {
447 | "cell_type": "markdown",
448 | "metadata": {
449 | "collapsed": true
450 | },
451 | "source": [
452 | "### Phonetic Word Similarity\n",
453 | "\n",
454 | "#### What is a phonetic encoding?"
455 | ]
456 | },
457 | {
458 | "cell_type": "code",
459 | "execution_count": 15,
460 | "metadata": {},
461 | "outputs": [
462 | {
463 | "data": {
464 | "text/plain": [
465 | "'J412'"
466 | ]
467 | },
468 | "execution_count": 15,
469 | "metadata": {},
470 | "output_type": "execute_result"
471 | }
472 | ],
473 | "source": [
474 | "jellyfish.soundex('Jellyfish')"
475 | ]
476 | },
477 | {
478 | "cell_type": "code",
479 | "execution_count": 16,
480 | "metadata": {},
481 | "outputs": [
482 | {
483 | "data": {
484 | "text/plain": [
485 | "'JALYF'"
486 | ]
487 | },
488 | "execution_count": 16,
489 | "metadata": {},
490 | "output_type": "execute_result"
491 | }
492 | ],
493 | "source": [
494 | "jellyfish.nysiis('Jellyfish')"
495 | ]
496 | },
497 | {
498 | "cell_type": "code",
499 | "execution_count": 17,
500 | "metadata": {},
501 | "outputs": [
502 | {
503 | "data": {
504 | "text/plain": [
505 | "'JLFX'"
506 | ]
507 | },
508 | "execution_count": 17,
509 | "metadata": {},
510 | "output_type": "execute_result"
511 | }
512 | ],
513 | "source": [
514 | "jellyfish.metaphone('Jellyfish')"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": 18,
520 | "metadata": {},
521 | "outputs": [
522 | {
523 | "data": {
524 | "text/plain": [
525 | "'JLLFSH'"
526 | ]
527 | },
528 | "execution_count": 18,
529 | "metadata": {},
530 | "output_type": "execute_result"
531 | }
532 | ],
533 | "source": [
534 | "jellyfish.match_rating_codex('Jellyfish')"
535 | ]
536 | },
537 | {
538 | "cell_type": "markdown",
539 | "metadata": {},
540 | "source": [
541 | "#### Metaphone + Levenshtein"
542 | ]
543 | },
544 | {
545 | "cell_type": "code",
546 | "execution_count": 20,
547 | "metadata": {},
548 | "outputs": [
549 | {
550 | "data": {
551 | "text/plain": [
552 | "0"
553 | ]
554 | },
555 | "execution_count": 20,
556 | "metadata": {},
557 | "output_type": "execute_result"
558 | }
559 | ],
560 | "source": [
561 | "jellyfish.levenshtein_distance(jellyfish.metaphone('write'), jellyfish.metaphone('right'))"
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": 21,
567 | "metadata": {},
568 | "outputs": [],
569 | "source": [
570 | "examples+= [('write', 'right'), ('Mangalore', 'Bangalore'), ('Delhi', 'Dilli')] # adding a few examples to show how cool this is"
571 | ]
572 | },
573 | {
574 | "cell_type": "code",
575 | "execution_count": 28,
576 | "metadata": {},
577 | "outputs": [
578 | {
579 | "name": "stdout",
580 | "output_type": "stream",
581 | "text": [
582 | "Word\t\tSound\t\tWord\t\t\tSound\t\tPhonetic Distance\n",
583 | "Narendra Modi\tNRNTR MT \tNarendra Modi \tNRNTR MT \t0 \n",
584 | "Narendra Modi\tNRNTR MT \tNarendra D. Modi \tNRNTR T MT\t2 \n",
585 | "Narendra Modi\tNRNTR MT \tNarendar Modi \tNRNTR MT \t0 \n",
586 | "Gujarat \tKJRT \tGujrat \tKJRT \t0 \n",
587 | "write \tRT \tright \tRT \t0 \n",
588 | "Mangalore \tMNKLR \tBangalore \tBNKLR \t1 \n",
589 | "Delhi \tTLH \tDilli \tTL \t1 \n"
590 | ]
591 | }
592 | ],
593 | "source": [
594 | "def calculate_phonetic_distance(phonetic_func, distance_func, examples=examples):\n",
595 | " print(\"Word\\t\\tSound\\t\\tWord\\t\\t\\tSound\\t\\tPhonetic Distance\")\n",
596 | " for ele in examples:\n",
597 | " correct, typo = ele[0], ele[1]\n",
598 | " phonetic_correct, phonetic_typo = phonetic_func(correct), phonetic_func(typo)\n",
599 | " phonetic_distance = distance_func(phonetic_correct, phonetic_typo)\n",
600 | " print(f'{correct:<10}\\t{phonetic_correct:<10}\\t{typo:<20}\\t{phonetic_typo:<10}\\t{phonetic_distance:<10}') \n",
601 | " \n",
602 | "calculate_phonetic_distance(phonetic_func=jellyfish.metaphone, distance_func=jellyfish.levenshtein_distance) "
603 | ]
604 | },
605 | {
606 | "cell_type": "markdown",
607 | "metadata": {},
608 | "source": [
609 | "#### American Soundex"
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": 29,
615 | "metadata": {},
616 | "outputs": [
617 | {
618 | "name": "stdout",
619 | "output_type": "stream",
620 | "text": [
621 | "Word\t\tSound\t\tWord\t\t\tSound\t\tPhonetic Distance\n",
622 | "Narendra Modi\tN653 \tNarendra Modi \tN653 \t0 \n",
623 | "Narendra Modi\tN653 \tNarendra D. Modi \tN653 \t0 \n",
624 | "Narendra Modi\tN653 \tNarendar Modi \tN653 \t0 \n",
625 | "Gujarat \tG263 \tGujrat \tG263 \t0 \n",
626 | "write \tW630 \tright \tR230 \t2 \n",
627 | "Mangalore \tM524 \tBangalore \tB524 \t1 \n",
628 | "Delhi \tD400 \tDilli \tD400 \t0 \n"
629 | ]
630 | }
631 | ],
632 | "source": [
633 | "calculate_phonetic_distance(phonetic_func=jellyfish.soundex, distance_func=jellyfish.levenshtein_distance) "
634 | ]
635 | },
636 | {
637 | "cell_type": "markdown",
638 | "metadata": {},
639 | "source": [
640 | "## Updating the Original Corpus with FlashText"
641 | ]
642 | },
643 | {
644 | "cell_type": "code",
645 | "execution_count": 22,
646 | "metadata": {},
647 | "outputs": [
648 | {
649 | "name": "stdout",
650 | "output_type": "stream",
651 | "text": [
652 | "Collecting flashtext\n",
653 | " Downloading https://files.pythonhosted.org/packages/81/d8/2cd0656eae456d615c2f1efbcae8dfca2cb871a31f34ba8925aba47d5e09/flashtext-2.7.tar.gz\n",
654 | "Building wheels for collected packages: flashtext\n",
655 | " Running setup.py bdist_wheel for flashtext: started\n",
656 | " Running setup.py bdist_wheel for flashtext: finished with status 'done'\n",
657 | " Stored in directory: C:\\Users\\nirantk\\AppData\\Local\\pip\\Cache\\wheels\\37\\db\\d7\\fe74f7cb8e5c3afed90fe6f4967c933a6f13d81ab6b3d3128c\n",
658 | "Successfully built flashtext\n",
659 | "Installing collected packages: flashtext\n",
660 | "Successfully installed flashtext-2.7\n"
661 | ]
662 | }
663 | ],
664 | "source": [
665 | "import sys\n",
666 | "!{sys.executable} -m pip install flashtext"
667 | ]
668 | },
669 | {
670 | "cell_type": "code",
671 | "execution_count": 31,
672 | "metadata": {},
673 | "outputs": [
674 | {
675 | "data": {
676 | "text/plain": [
677 | "['NCR', 'Mumbai']"
678 | ]
679 | },
680 | "execution_count": 31,
681 | "metadata": {},
682 | "output_type": "execute_result"
683 | }
684 | ],
685 | "source": [
686 | "from flashtext.keyword import KeywordProcessor\n",
687 | "keyword_processor = KeywordProcessor()\n",
688 | "keyword_processor.add_keyword('Delhi', 'NCR') # arguments are (keyword, clean_name): 'Delhi' in the text maps to 'NCR'\n",
689 | "keyword_processor.add_keyword('Bombay', 'Mumbai')\n",
690 | "keywords_found = keyword_processor.extract_keywords('I love the food in Delhi and the people in Bombay')\n",
691 | "keywords_found\n",
692 | "# ['NCR', 'Mumbai']"
693 | ]
694 | },
695 | {
696 | "cell_type": "markdown",
697 | "metadata": {},
698 | "source": [
699 | "How about we replace them now?"
700 | ]
701 | },
702 | {
703 | "cell_type": "code",
704 | "execution_count": 32,
705 | "metadata": {},
706 | "outputs": [
707 | {
708 | "data": {
709 | "text/plain": [
710 | "'I love the food in NCR and the people in Mumbai'"
711 | ]
712 | },
713 | "execution_count": 32,
714 | "metadata": {},
715 | "output_type": "execute_result"
716 | }
717 | ],
718 | "source": [
719 | "from flashtext.keyword import KeywordProcessor\n",
720 | "keyword_processor = KeywordProcessor()\n",
721 | "keyword_processor.add_keyword('Delhi', 'NCR')\n",
722 | "keyword_processor.add_keyword('Bombay', 'Mumbai')\n",
723 | "replaced_sentence = keyword_processor.replace_keywords('I love the food in Delhi and the people in Bombay')\n",
724 | "replaced_sentence\n",
725 | "# 'I love the food in NCR and the people in Mumbai'"
726 | ]
727 | }
728 | ],
729 | "metadata": {
730 | "kernelspec": {
731 | "display_name": "Python [conda env:nlp]",
732 | "language": "python",
733 | "name": "conda-env-nlp-py"
734 | },
735 | "language_info": {
736 | "codemirror_mode": {
737 | "name": "ipython",
738 | "version": 3
739 | },
740 | "file_extension": ".py",
741 | "mimetype": "text/x-python",
742 | "name": "python",
743 | "nbconvert_exporter": "python",
744 | "pygments_lexer": "ipython3",
745 | "version": "3.6.6"
746 | }
747 | },
748 | "nbformat": 4,
749 | "nbformat_minor": 2
750 | }
751 |
--------------------------------------------------------------------------------
/02-A.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Text Cleaning"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 2,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "url = 'http://www.gutenberg.org/ebooks/1661.txt.utf-8'\n",
17 | "file_name = 'sherlock.txt'"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 3,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "import urllib.request\n",
27 | "# Download the file from `url` and save it locally under `file_name`:\n",
28 | "\n",
29 | "with urllib.request.urlopen(url) as response:\n",
30 | " with open(file_name, 'wb') as out_file:\n",
31 | " data = response.read() # a `bytes` object\n",
32 | " out_file.write(data)"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 4,
38 | "metadata": {},
39 | "outputs": [
40 | {
41 | "name": "stdout",
42 | "output_type": "stream",
43 | "text": [
44 | "requirements.txt\n",
45 | "sherlock.txt\n"
46 | ]
47 | }
48 | ],
49 | "source": [
50 | "!ls {*.txt}"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 5,
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle\n",
63 | "\n"
64 | ]
65 | }
66 | ],
67 | "source": [
68 | "!head -2 sherlock.txt"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 5,
74 | "metadata": {
75 | "collapsed": true
76 | },
77 | "outputs": [],
78 | "source": [
79 | "!sed -i 1,33d sherlock.txt"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 6,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "THE ADVENTURES OF SHERLOCK HOLMES\n",
92 | "\n",
93 | "by\n",
94 | "\n",
95 | "SIR ARTHUR CONAN DOYLE\n"
96 | ]
97 | }
98 | ],
99 | "source": [
100 | "!head -5 sherlock.txt"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "## Load Data"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 7,
113 | "metadata": {},
114 | "outputs": [
115 | {
116 | "name": "stdout",
117 | "output_type": "stream",
118 | "text": [
119 | "THE A\n"
120 | ]
121 | }
122 | ],
123 | "source": [
124 | "# let's load the data into RAM\n",
125 | "text = open(file_name, 'r', encoding='utf-8').read() # note that I add an encoding='utf-8' parameter to preserve information\n",
126 | "print(text[:5])"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 8,
132 | "metadata": {},
133 | "outputs": [
134 | {
135 | "name": "stdout",
136 | "output_type": "stream",
137 | "text": [
138 | "The file is loaded as datatype: <class 'str'> and has 581204 characters in it\n"
139 | ]
140 | }
141 | ],
142 | "source": [
143 | "print(f'The file is loaded as datatype: {type(text)} and has {len(text)} characters in it')"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "### Exploring Loaded Data"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 9,
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "name": "stdout",
160 | "output_type": "stream",
161 | "text": [
162 | "['\\n', ' ', '!', '\"', '$', '%', '&', \"'\", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'â', 'è', 'é']\n",
163 | "There are 85 unique characters, including both ASCII and Unicode character\n"
164 | ]
165 | }
166 | ],
167 | "source": [
168 | "# how many unique characters do we see? \n",
169 | "# For reference, ASCII has 128 characters in it - so pure-ASCII text would have at most 128 unique characters\n",
170 | "unique_chars = list(set(text))\n",
171 | "unique_chars.sort()\n",
172 | "print(unique_chars)\n",
173 | "print(f'There are {len(unique_chars)} unique characters, including both ASCII and Unicode character')"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "## Tokenization \n",
181 | "\n",
182 | "### Split by Whitespace"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 10,
188 | "metadata": {},
189 | "outputs": [
190 | {
191 | "name": "stdout",
192 | "output_type": "stream",
193 | "text": [
194 | "107431\n"
195 | ]
196 | }
197 | ],
198 | "source": [
199 | "words = text.split()\n",
200 | "print(len(words))"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 11,
206 | "metadata": {},
207 | "outputs": [
208 | {
209 | "name": "stdout",
210 | "output_type": "stream",
211 | "text": [
212 | "['To', 'Sherlock', 'Holmes', 'she', 'is', 'always', 'THE', 'woman.', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name.', 'In', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex.', 'It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'Irene', 'Adler.', 'All', 'emotions,', 'and', 'that', 'one', 'particularly,', 'were', 'abhorrent', 'to', 'his', 'cold,', 'precise', 'but', 'admirably', 'balanced', 'mind.', 'He', 'was,', 'I', 'take', 'it,', 'the', 'most', 'perfect', 'reasoning', 'and', 'observing', 'machine', 'that', 'the', 'world', 'has', 'seen,', 'but', 'as', 'a', 'lover', 'he', 'would', 'have', 'placed', 'himself', 'in', 'a', 'false', 'position.', 'He', 'never', 'spoke', 'of', 'the', 'softer', 'passions,', 'save', 'with', 'a', 'gibe', 'and', 'a', 'sneer.', 'They', 'were', 'admirable', 'things', 'for']\n"
213 | ]
214 | }
215 | ],
216 | "source": [
217 | "print(words[90:200]) # start with the first chapter, ignoring the index for now"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 12,
223 | "metadata": {},
224 | "outputs": [
225 | {
226 | "data": {
227 | "text/plain": [
228 | "['red-headed', 'woman', 'on', 'the', 'street']"
229 | ]
230 | },
231 | "execution_count": 12,
232 | "metadata": {},
233 | "output_type": "execute_result"
234 | }
235 | ],
236 | "source": [
237 | "# Let's look at another example: \n",
238 | "'red-headed woman on the street'.split()"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {},
244 | "source": [
245 | "### Split by Word Extraction\n",
246 | "**Introducing Regex**"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 13,
252 | "metadata": {},
253 | "outputs": [
254 | {
255 | "data": {
256 | "text/plain": [
257 | "['Words', 'words', 'words', '']"
258 | ]
259 | },
260 | "execution_count": 13,
261 | "metadata": {},
262 | "output_type": "execute_result"
263 | }
264 | ],
265 | "source": [
266 | "import re\n",
267 | "re.split('\\W+', 'Words, words, words.')"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 14,
273 | "metadata": {
274 | "collapsed": true
275 | },
276 | "outputs": [],
277 | "source": [
278 | "words_alphanumeric = re.split('\\W+', text)"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 15,
284 | "metadata": {},
285 | "outputs": [
286 | {
287 | "data": {
288 | "text/plain": [
289 | "(109111, 107431)"
290 | ]
291 | },
292 | "execution_count": 15,
293 | "metadata": {},
294 | "output_type": "execute_result"
295 | }
296 | ],
297 | "source": [
298 | "len(words_alphanumeric), len(words)"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 16,
304 | "metadata": {},
305 | "outputs": [
306 | {
307 | "name": "stdout",
308 | "output_type": "stream",
309 | "text": [
310 | "['BOHEMIA', 'I', 'To', 'Sherlock', 'Holmes', 'she', 'is', 'always', 'THE', 'woman', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name', 'In', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex', 'It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'Irene', 'Adler', 'All', 'emotions', 'and', 'that', 'one', 'particularly', 'were', 'abhorrent', 'to', 'his', 'cold', 'precise', 'but', 'admirably', 'balanced', 'mind', 'He', 'was', 'I', 'take', 'it', 'the', 'most', 'perfect', 'reasoning', 'and', 'observing', 'machine', 'that', 'the', 'world', 'has', 'seen', 'but', 'as', 'a', 'lover', 'he', 'would', 'have', 'placed', 'himself', 'in', 'a', 'false', 'position', 'He', 'never', 'spoke', 'of', 'the', 'softer', 'passions', 'save', 'with', 'a', 'gibe', 'and', 'a', 'sneer', 'They', 'were', 'admirable']\n"
311 | ]
312 | }
313 | ],
314 | "source": [
315 | "print(words_alphanumeric[90:200])"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 17,
321 | "metadata": {},
322 | "outputs": [
323 | {
324 | "name": "stdout",
325 | "output_type": "stream",
326 | "text": [
327 | "['Isn', 't', 'he', 'coming', 'home', 'for', 'dinner', 'with', 'the', 'red', 'headed', 'girl', '']\n"
328 | ]
329 | }
330 | ],
331 | "source": [
332 | "words_break = re.split('\\W+', \"Isn't he coming home for dinner with the red-headed girl?\")\n",
333 | "print(words_break)"
334 | ]
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "metadata": {},
339 | "source": [
340 | "### spaCy for Tokenization"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": 18,
346 | "metadata": {},
347 | "outputs": [
348 | {
349 | "name": "stdout",
350 | "output_type": "stream",
351 | "text": [
352 | "Wall time: 2.46 s\n"
353 | ]
354 | }
355 | ],
356 | "source": [
357 | "%%time\n",
358 | "import spacy\n",
359 | "nlp = spacy.load('en')"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": 19,
365 | "metadata": {
366 | "collapsed": true
367 | },
368 | "outputs": [],
369 | "source": [
370 | "doc = nlp(text)"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 20,
376 | "metadata": {},
377 | "outputs": [
378 | {
379 | "name": "stdout",
380 | "output_type": "stream",
381 | "text": [
382 | "[whole, of, her, sex, ., It, was, not, that, he, felt, \n",
383 | ", any, emotion, akin, to, love, for, Irene, Adler, ., All, emotions, ,, and, that, \n",
384 | ", one, particularly, ,, were, abhorrent, to, his, cold, ,, precise, but, \n",
385 | ", admirably, balanced, mind, ., He, was, ,, I, take, it, ,]\n"
386 | ]
387 | }
388 | ],
389 | "source": [
390 | "print(list(doc)[150:200])"
391 | ]
392 | },
393 | {
394 | "cell_type": "markdown",
395 | "metadata": {},
396 | "source": [
397 | "Conveniently, spaCy tokenizes both *punctuation and words* and returns each of them as an individual token. Let's try the example which we didn't like earlier:"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": 21,
403 | "metadata": {},
404 | "outputs": [
405 | {
406 | "name": "stdout",
407 | "output_type": "stream",
408 | "text": [
409 | "[Is, n't, he, coming, home, for, dinner, with, the, red, -, headed, girl, ?]\n"
410 | ]
411 | }
412 | ],
413 | "source": [
414 | "words = nlp(\"Isn't he coming home for dinner with the red-headed girl?\")\n",
415 | "print([token for token in words])"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": 22,
421 | "metadata": {},
422 | "outputs": [
423 | {
424 | "name": "stdout",
425 | "output_type": "stream",
426 | "text": [
427 | "[I. A SCANDAL IN BOHEMIA\n",
428 | "\n",
429 | "I.\n",
430 | "\n",
431 | "To Sherlock Holmes, she is always THE woman., I have seldom heard\n",
432 | "him mention her under any other name., In his eyes she eclipses\n",
433 | "and predominates the whole of her sex., It was not that he felt\n",
434 | "any emotion akin to love for Irene Adler.]\n"
435 | ]
436 | }
437 | ],
438 | "source": [
439 | "sentences = list(doc.sents)\n",
440 | "print(sentences[13:18])"
441 | ]
442 | },
443 | {
444 | "cell_type": "markdown",
445 | "metadata": {},
446 | "source": [
447 | "#### STOP WORD REMOVAL & CASE CHANGE"
448 | ]
449 | },
450 | {
451 | "cell_type": "markdown",
452 | "metadata": {},
453 | "source": [
454 | "spaCy has already marked each token as a stop word or not and stored it in `is_stop` attribute of each token. This makes it very handy for text cleaning. Let's take a quick look: "
455 | ]
456 | },
457 | {
458 | "cell_type": "code",
459 | "execution_count": 49,
460 | "metadata": {
461 | "collapsed": true
462 | },
463 | "outputs": [],
464 | "source": [
465 | "sentence_example = \"the AI/AGI uprising cannot happen without the progress of NLP\""
466 | ]
467 | },
468 | {
469 | "cell_type": "code",
470 | "execution_count": 54,
471 | "metadata": {},
472 | "outputs": [
473 | {
474 | "data": {
475 | "text/plain": [
476 | "[(the, True, False),\n",
477 | " (AI, False, False),\n",
478 | " (/, False, True),\n",
479 | " (AGI, True, False),\n",
480 | " (uprising, False, False),\n",
481 | " (can, True, False),\n",
482 | " (not, True, False),\n",
483 | " (happen, False, False),\n",
484 | " (without, True, False),\n",
485 | " (the, True, False),\n",
486 | " (progress, False, False),\n",
487 | " (of, True, False),\n",
488 | " (NLP, True, False)]"
489 | ]
490 | },
491 | "execution_count": 54,
492 | "metadata": {},
493 | "output_type": "execute_result"
494 | }
495 | ],
496 | "source": [
497 | "[(token, token.is_stop, token.is_punct) for token in nlp(sentence_example)]"
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": 57,
503 | "metadata": {},
504 | "outputs": [
505 | {
506 | "name": "stdout",
507 | "output_type": "stream",
508 | "text": [
509 | "THE False False\n",
510 | "ADVENTURES False False\n",
511 | "OF False False\n",
512 | "SHERLOCK False False\n",
513 | "HOLMES False False\n"
514 | ]
515 | }
516 | ],
517 | "source": [
518 | "for token in doc[:5]:\n",
519 | " print(token, token.is_stop, token.is_punct)"
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": 30,
525 | "metadata": {},
526 | "outputs": [],
527 | "source": [
528 | "text_lower = text.lower() # native python function\n",
529 | "doc_lower = nlp(text_lower)"
530 | ]
531 | },
532 | {
533 | "cell_type": "code",
534 | "execution_count": 32,
535 | "metadata": {},
536 | "outputs": [
537 | {
538 | "name": "stdout",
539 | "output_type": "stream",
540 | "text": [
541 | "the True\n",
542 | "adventures False\n",
543 | "of True\n",
544 | "sherlock False\n",
545 | "holmes False\n"
546 | ]
547 | }
548 | ],
549 | "source": [
550 | "for token in doc_lower[:5]:\n",
551 | " print(token, token.is_stop)"
552 | ]
553 | },
554 | {
555 | "cell_type": "code",
556 | "execution_count": 28,
557 | "metadata": {},
558 | "outputs": [
559 | {
560 | "data": {
561 | "text/plain": [
562 | "'spaCy has a dictionary of 305 stop words'"
563 | ]
564 | },
565 | "execution_count": 28,
566 | "metadata": {},
567 | "output_type": "execute_result"
568 | }
569 | ],
570 | "source": [
571 | "from spacy.lang.en.stop_words import STOP_WORDS\n",
572 | "f'spaCy has a dictionary of {len(list(STOP_WORDS))} stop words'"
573 | ]
574 | },
575 | {
576 | "cell_type": "code",
577 | "execution_count": 58,
578 | "metadata": {
579 | "collapsed": true
580 | },
581 | "outputs": [],
582 | "source": [
583 | "domain_stop_words = [\"NLP\", \"Processing\", \"AGI\"]\n",
584 | "for word in domain_stop_words:\n",
585 | " STOP_WORDS.add(word)"
586 | ]
587 | },
588 | {
589 | "cell_type": "code",
590 | "execution_count": 59,
591 | "metadata": {},
592 | "outputs": [
593 | {
594 | "data": {
595 | "text/plain": [
596 | "[(the, True, False),\n",
597 | " (AI, False, False),\n",
598 | " (/, False, True),\n",
599 | " (AGI, True, False),\n",
600 | " (uprising, False, False),\n",
601 | " (can, True, False),\n",
602 | " (not, True, False),\n",
603 | " (happen, False, False),\n",
604 | " (without, True, False),\n",
605 | " (the, True, False),\n",
606 | " (progress, False, False),\n",
607 | " (of, True, False),\n",
608 | " (NLP, True, False)]"
609 | ]
610 | },
611 | "execution_count": 59,
612 | "metadata": {},
613 | "output_type": "execute_result"
614 | }
615 | ],
616 | "source": [
617 | "[(token, token.is_stop, token.is_punct) for token in nlp(sentence_example)]"
618 | ]
619 | },
620 | {
621 | "cell_type": "code",
622 | "execution_count": 61,
623 | "metadata": {},
624 | "outputs": [
625 | {
626 | "data": {
627 | "text/plain": [
628 | "['AI', 'uprising', 'happen', 'progress']"
629 | ]
630 | },
631 | "execution_count": 61,
632 | "metadata": {},
633 | "output_type": "execute_result"
634 | }
635 | ],
636 | "source": [
637 | "[str(token) for token in nlp(sentence_example) if not token.is_stop and not token.is_punct]"
638 | ]
639 | },
640 | {
641 | "cell_type": "code",
642 | "execution_count": 62,
643 | "metadata": {},
644 | "outputs": [
645 | {
646 | "data": {
647 | "text/plain": [
648 | "['AI', '/', 'uprising', 'happen', 'progress']"
649 | ]
650 | },
651 | "execution_count": 62,
652 | "metadata": {},
653 | "output_type": "execute_result"
654 | }
655 | ],
656 | "source": [
657 | "[str(token) for token in nlp(sentence_example) if not token.is_stop]"
658 | ]
659 | },
660 | {
661 | "cell_type": "markdown",
662 | "metadata": {},
663 | "source": [
664 | "## Stemming and Lemmatization"
665 | ]
666 | },
667 | {
668 | "cell_type": "markdown",
669 | "metadata": {},
670 | "source": [
671 | "### spaCy for Lemmatization\n",
672 | "**spaCy only supports lemmatization** "
673 | ]
674 | },
675 | {
676 | "cell_type": "markdown",
677 | "metadata": {},
678 | "source": [
679 | "A trailing underscore, as in `lemma_`, tells spaCy that we want the human-readable string. The corresponding internal hash (integer identifier) is stored in `token.lemma`. "
680 | ]
681 | },
682 | {
683 | "cell_type": "code",
684 | "execution_count": 72,
685 | "metadata": {},
686 | "outputs": [
687 | {
688 | "data": {
689 | "text/plain": [
690 | "[(Their, '-PRON-', 561228191312463089, 'ADJ'),\n",
691 | " (Apples, 'apples', 14374618037326464786, 'PROPN'),\n",
692 | " (&, '&', 15473034735919704609, 'CCONJ'),\n",
693 | " (Banana, 'banana', 2525716904149915114, 'PROPN'),\n",
694 | " (fruit, 'fruit', 17674554054627885835, 'NOUN'),\n",
695 | " (salads, 'salad', 16382906660984395826, 'NOUN'),\n",
696 | " (are, 'be', 10382539506755952630, 'VERB'),\n",
697 | " (amazing, 'amazing', 12968186374132960503, 'ADJ'),\n",
698 | " (., '.', 12646065887601541794, 'PUNCT'),\n",
699 | " (Would, 'would', 6992604926141104606, 'VERB'),\n",
700 | " (you, '-PRON-', 561228191312463089, 'PRON'),\n",
701 | " (like, 'like', 18194338103975822726, 'VERB'),\n",
702 | " (meeting, 'meet', 6880656908171229526, 'VERB'),\n",
703 | " (me, '-PRON-', 561228191312463089, 'PRON'),\n",
704 | " (at, 'at', 11667289587015813222, 'ADP'),\n",
705 | " (the, 'the', 7425985699627899538, 'DET'),\n",
706 | " (cafe, 'cafe', 10569699879655997926, 'NOUN'),\n",
707 | " (?, '?', 8205403955989537350, 'PUNCT')]"
708 | ]
709 | },
710 | "execution_count": 72,
711 | "metadata": {},
712 | "output_type": "execute_result"
713 | }
714 | ],
715 | "source": [
716 | "lemma_sentence_example = \"Their Apples & Banana fruit salads are amazing. Would you like meeting me at the cafe?\"\n",
717 | "[(token, token.lemma_, token.lemma, token.pos_ ) for token in nlp(lemma_sentence_example)]"
718 | ]
719 | }
720 | ],
721 | "metadata": {
722 | "kernelspec": {
723 | "display_name": "Python [conda env:nlp]",
724 | "language": "python",
725 | "name": "conda-env-nlp-py"
726 | },
727 | "language_info": {
728 | "codemirror_mode": {
729 | "name": "ipython",
730 | "version": 3
731 | },
732 | "file_extension": ".py",
733 | "mimetype": "text/x-python",
734 | "name": "python",
735 | "nbconvert_exporter": "python",
736 | "pygments_lexer": "ipython3",
737 | "version": "3.6.6"
738 | }
739 | },
740 | "nbformat": 4,
741 | "nbformat_minor": 2
742 | }
743 |
--------------------------------------------------------------------------------
/04_Text_Representations.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "D:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\gensim\\utils.py:1197: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n",
13 | " warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n"
14 | ]
15 | },
16 | {
17 | "name": "stdout",
18 | "output_type": "stream",
19 | "text": [
20 | "gensim: 3.4.0\n"
21 | ]
22 | }
23 | ],
24 | "source": [
25 | "import gensim\n",
26 | "print(f'gensim: {gensim.__version__}')"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "Let's download some pre-trained GloVe embeddings: "
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "name": "stdout",
43 | "output_type": "stream",
44 | "text": [
45 | "Solving environment: ...working... done\n",
46 | "\n",
47 | "# All requested packages already installed.\n",
48 | "\n"
49 | ]
50 | }
51 | ],
52 | "source": [
53 | "!conda install -y tqdm"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 3,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "from tqdm import tqdm\n",
63 | "class TqdmUpTo(tqdm):\n",
64 | " def update_to(self, b=1, bsize=1, tsize=None):\n",
65 | " if tsize is not None: self.total = tsize\n",
66 | " self.update(b * bsize - self.n)\n",
67 | "\n",
68 | "def get_data(url, filename):\n",
69 | " \"\"\"\n",
70 | " Download data if the filename does not exist already\n",
71 | " Uses Tqdm to show download progress\n",
72 | " \"\"\"\n",
73 | " import os\n",
74 | " from urllib.request import urlretrieve\n",
75 | " \n",
76 | " if not os.path.exists(filename):\n",
77 | "\n",
78 | " dirname = os.path.dirname(filename)\n",
79 | " if not os.path.exists(dirname):\n",
80 | " os.makedirs(dirname)\n",
81 | "\n",
82 | " with TqdmUpTo(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:\n",
83 | " urlretrieve(url, filename, reporthook=t.update_to)"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 4,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "embedding_url = 'http://nlp.stanford.edu/data/glove.6B.zip'\n",
93 | "get_data(embedding_url, 'data/glove.6B.zip')"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 5,
99 | "metadata": {},
100 | "outputs": [
101 | {
102 | "name": "stdout",
103 | "output_type": "stream",
104 | "text": [
105 | "Archive: data/glove.6B.zip\n",
106 | " inflating: glove.6B.50d.txt \n",
107 | " inflating: glove.6B.100d.txt \n",
108 | " inflating: glove.6B.200d.txt \n",
109 | " inflating: glove.6B.300d.txt \n"
110 | ]
111 | }
112 | ],
113 | "source": [
114 | "# We need to run this only once; you can also unzip the archive manually into the data directory\n",
115 | "!unzip data/glove.6B.zip \n",
116 | "!mv glove.6B.300d.txt data/glove.6B.300d.txt \n",
117 | "!mv glove.6B.200d.txt data/glove.6B.200d.txt \n",
118 | "!mv glove.6B.100d.txt data/glove.6B.100d.txt \n",
119 | "!mv glove.6B.50d.txt data/glove.6B.50d.txt "
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 6,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "from gensim.scripts.glove2word2vec import glove2word2vec\n",
129 | "glove_input_file = 'data/glove.6B.300d.txt'\n",
130 | "\n",
131 | "word2vec_output_file = 'data/glove.6B.300d.word2vec.txt'"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 7,
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "import os\n",
141 | "if not os.path.exists(word2vec_output_file):\n",
142 | " glove2word2vec(glove_input_file, word2vec_output_file)"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "### KeyedVectors API"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 8,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "from gensim.models import KeyedVectors\n",
159 | "filename = word2vec_output_file "
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 9,
165 | "metadata": {},
166 | "outputs": [
167 | {
168 | "name": "stdout",
169 | "output_type": "stream",
170 | "text": [
171 | "Wall time: 1min 24s\n"
172 | ]
173 | }
174 | ],
175 | "source": [
176 | "%%time\n",
177 | "# load the Stanford GloVe model from file, this is Disk I/O and can be slow\n",
178 | "pretrained_w2v_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)\n",
179 | "# binary=False format for human readable text (.txt) files, and binary=True for .bin files "
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 10,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "name": "stderr",
189 | "output_type": "stream",
190 | "text": [
191 | "D:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\ipykernel_launcher.py:2: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n",
192 | " \n"
193 | ]
194 | },
195 | {
196 | "name": "stdout",
197 | "output_type": "stream",
198 | "text": [
199 | "[('queen', 0.6713277101516724)]\n"
200 | ]
201 | }
202 | ],
203 | "source": [
204 | "# calculate: (king - man) + woman = ?\n",
205 | "result = pretrained_w2v_model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)\n",
206 | "print(result)"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 11,
212 | "metadata": {},
213 | "outputs": [
214 | {
215 | "name": "stdout",
216 | "output_type": "stream",
217 | "text": [
218 | "[('twitter', 0.37966805696487427)]\n"
219 | ]
220 | }
221 | ],
222 | "source": [
223 | "# calculate: (facebook - linkedin) + quora = ?\n",
224 | "result = pretrained_w2v_model.most_similar(positive=['quora', 'facebook'], negative=['linkedin'], topn=1)\n",
225 | "print(result)"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 12,
231 | "metadata": {
232 | "scrolled": true
233 | },
234 | "outputs": [
235 | {
236 | "data": {
237 | "text/plain": [
238 | "[('indian', 0.7355823516845703),\n",
239 | " ('pakistan', 0.7285579442977905),\n",
240 | " ('delhi', 0.6846905946731567),\n",
241 | " ('bangladesh', 0.620319128036499),\n",
242 | " ('lanka', 0.609517514705658),\n",
243 | " ('sri', 0.6011613607406616),\n",
244 | " ('kashmir', 0.5746493935585022),\n",
245 | " ('nepal', 0.5421023964881897),\n",
246 | " ('pradesh', 0.5405810475349426),\n",
247 | " ('maharashtra', 0.518537700176239)]"
248 | ]
249 | },
250 | "execution_count": 12,
251 | "metadata": {},
252 | "output_type": "execute_result"
253 | }
254 | ],
255 | "source": [
256 | "pretrained_w2v_model.most_similar('india')"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {},
262 | "source": [
263 | "#### What is missing in both word2vec and GloVe? "
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 13,
269 | "metadata": {
270 | "scrolled": true
271 | },
272 | "outputs": [
273 | {
274 | "name": "stderr",
275 | "output_type": "stream",
276 | "text": [
277 | "D:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\ipykernel_launcher.py:2: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n",
278 | " \n"
279 | ]
280 | },
281 | {
282 | "name": "stdout",
283 | "output_type": "stream",
284 | "text": [
285 | "\"word 'nirant' not in vocabulary\"\n"
286 | ]
287 | }
288 | ],
289 | "source": [
290 | "try:\n",
291 | " pretrained_w2v_model.wv.most_similar('nirant')\n",
292 | "except Exception as e:\n",
293 | " print(e)"
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "metadata": {},
299 | "source": [
300 | "### How to handle OOV words? "
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 14,
306 | "metadata": {},
307 | "outputs": [],
308 | "source": [
309 | "ted_dataset = \"https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip\"\n",
310 | "get_data(ted_dataset, \"data/ted_en.zip\")"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": 15,
316 | "metadata": {},
317 | "outputs": [],
318 | "source": [
319 | "import zipfile\n",
320 | "import lxml.etree\n",
321 | "with zipfile.ZipFile('data/ted_en.zip', 'r') as z:\n",
322 | " doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))\n",
323 | "input_text = '\\n'.join(doc.xpath('//content/text()'))"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": 16,
329 | "metadata": {
330 | "scrolled": true
331 | },
332 | "outputs": [
333 | {
334 | "data": {
335 | "text/plain": [
336 | "\"Here are two reasons companies fail: they only do more of the same, or they only do what's new.\\nTo me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation. Both are necessary, but it can be too much of a good thing.\\nConsider Facit. I'm actually old enough to remember them. Facit was a fantastic company. They were born deep in the Swedish forest, and they made the best mechanical calculators in the world. Everybody used them. A\""
337 | ]
338 | },
339 | "execution_count": 16,
340 | "metadata": {},
341 | "output_type": "execute_result"
342 | }
343 | ],
344 | "source": [
345 | "input_text[:500]"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": 17,
351 | "metadata": {},
352 | "outputs": [],
353 | "source": [
354 | "import re\n",
355 | "# remove parenthesized text\n",
356 | "input_text_noparens = re.sub(r'\\([^)]*\\)', '', input_text)\n",
357 | "\n",
358 | "# store as list of sentences\n",
359 | "sentences_strings_ted = []\n",
360 | "for line in input_text_noparens.split('\\n'):\n",
361 | " m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)\n",
362 | " sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)\n",
363 | "\n",
364 | "# store as list of lists of words\n",
365 | "sentences_ted = []\n",
366 | "for sent_str in sentences_strings_ted:\n",
367 | " tokens = re.sub(r\"[^a-z0-9]+\", \" \", sent_str.lower()).split()\n",
368 | " sentences_ted.append(tokens)"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": 18,
374 | "metadata": {
375 | "scrolled": true
376 | },
377 | "outputs": [
378 | {
379 | "name": "stdout",
380 | "output_type": "stream",
381 | "text": [
382 | "[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'], ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']]\n"
383 | ]
384 | }
385 | ],
386 | "source": [
387 | "print(sentences_ted[:2])"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": 19,
393 | "metadata": {},
394 | "outputs": [],
395 | "source": [
396 | "import json\n",
397 | "with open('ted_clean_sentences.json', 'w') as fp:\n",
398 | " json.dump(sentences_ted, fp)"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": 20,
404 | "metadata": {},
405 | "outputs": [],
406 | "source": [
407 | "with open('ted_clean_sentences.json', 'r') as fp:\n",
408 | " sentences_ted = json.load(fp)"
409 | ]
410 | },
411 | {
412 | "cell_type": "code",
413 | "execution_count": 21,
414 | "metadata": {},
415 | "outputs": [
416 | {
417 | "name": "stdout",
418 | "output_type": "stream",
419 | "text": [
420 | "[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'], ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']]\n"
421 | ]
422 | }
423 | ],
424 | "source": [
425 | "print(sentences_ted[:2])"
426 | ]
427 | },
428 | {
429 | "cell_type": "markdown",
430 | "metadata": {},
431 | "source": [
432 | "### Train FastText Embeddings"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": 22,
438 | "metadata": {},
439 | "outputs": [],
440 | "source": [
441 | "from gensim.models.fasttext import FastText"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": 23,
447 | "metadata": {},
448 | "outputs": [
449 | {
450 | "name": "stdout",
451 | "output_type": "stream",
452 | "text": [
453 | "Wall time: 5.48 s\n"
454 | ]
455 | }
456 | ],
457 | "source": [
458 | "%%time\n",
459 | "fasttext_ted_model = FastText(sentences_ted, size=100, window=5, min_count=5, workers=4, sg=1)\n",
460 | "# sg = 1 denotes skipgram, else CBOW is used; workers must be a positive thread count (gensim does not interpret -1 as 'all cores' - with -1 no worker threads run and the vectors stay untrained)"
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": 24,
466 | "metadata": {},
467 | "outputs": [
468 | {
469 | "data": {
470 | "text/plain": [
471 | "[('indians', 0.5911639928817749),\n",
472 | " ('indian', 0.5406097769737244),\n",
473 | " ('indiana', 0.4898717999458313),\n",
474 | " ('indicated', 0.44004374742507935),\n",
475 | " ('indicate', 0.4042605757713318),\n",
476 | " ('internal', 0.39166826009750366),\n",
477 | " ('interior', 0.3871103823184967),\n",
478 | " ('byproducts', 0.37529298663139343),\n",
479 | " ('princesses', 0.37265270948410034),\n",
480 | " ('indications', 0.369659960269928)]"
481 | ]
482 | },
483 | "execution_count": 24,
484 | "metadata": {},
485 | "output_type": "execute_result"
486 | }
487 | ],
488 | "source": [
489 | "fasttext_ted_model.wv.most_similar(\"india\")"
490 | ]
491 | },
492 | {
493 | "cell_type": "markdown",
494 | "metadata": {},
495 | "source": [
496 | "### Train word2vec Embeddings"
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": 25,
502 | "metadata": {},
503 | "outputs": [],
504 | "source": [
505 | "from gensim.models.word2vec import Word2Vec"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": 26,
511 | "metadata": {},
512 | "outputs": [
513 | {
514 | "name": "stdout",
515 | "output_type": "stream",
516 | "text": [
517 | "Wall time: 1.44 s\n"
518 | ]
519 | }
520 | ],
521 | "source": [
522 | "%%time\n",
523 | "word2vec_ted_model = Word2Vec(sentences=sentences_ted, size=100, window=5, min_count=5, workers=4, sg=1)  # workers must be >= 1; with -1 no worker threads run and the vectors stay untrained"
524 | ]
525 | },
526 | {
527 | "cell_type": "code",
528 | "execution_count": 27,
529 | "metadata": {},
530 | "outputs": [
531 | {
532 | "data": {
533 | "text/plain": [
534 | "[('bordered', 0.41709238290786743),\n",
535 | " ('hovering', 0.4083016514778137),\n",
536 | " ('almost', 0.3865964710712433),\n",
537 | " ('sad', 0.3704090118408203),\n",
538 | " ('supporters', 0.3616541624069214),\n",
539 | " ('spite', 0.3598758280277252),\n",
540 | " ('wrinkles', 0.3590206205844879),\n",
541 | " ('guaranteed', 0.3535975515842438),\n",
542 | " ('hd', 0.3512127995491028),\n",
543 | " ('assistant', 0.346971333026886)]"
544 | ]
545 | },
546 | "execution_count": 27,
547 | "metadata": {},
548 | "output_type": "execute_result"
549 | }
550 | ],
551 | "source": [
552 | "word2vec_ted_model.wv.most_similar(\"india\")"
553 | ]
554 | },
555 | {
556 | "cell_type": "markdown",
557 | "metadata": {},
558 | "source": [
559 | "## fastText or word2vec? "
560 | ]
561 | },
562 | {
563 | "cell_type": "markdown",
564 | "metadata": {},
565 | "source": [
566 | "# Document Embeddings"
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "execution_count": 28,
572 | "metadata": {},
573 | "outputs": [],
574 | "source": [
575 | "from gensim.models.doc2vec import Doc2Vec, TaggedDocument\n",
576 | "import gensim\n",
577 | "from pprint import pprint\n",
578 | "import multiprocessing"
579 | ]
580 | },
581 | {
582 | "cell_type": "code",
583 | "execution_count": 29,
584 | "metadata": {},
585 | "outputs": [],
586 | "source": [
587 | "import zipfile\n",
588 | "import lxml.etree\n",
589 | "with zipfile.ZipFile('data/ted_en.zip', 'r') as z:\n",
590 | " doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))\n",
591 | " \n",
592 | "talks = doc.xpath('//content/text()')"
593 | ]
594 | },
595 | {
596 | "cell_type": "code",
597 | "execution_count": 30,
598 | "metadata": {},
599 | "outputs": [],
600 | "source": [
601 | "def read_corpus(talks, tokens_only=False):\n",
602 | "    # Lazily yield one preprocessed talk at a time: a bare token list when tokens_only is set, otherwise a TaggedDocument tagged with the talk's position in `talks`.\n",
603 | "    for doc_id, text in enumerate(talks):\n",
604 | "        tokens = gensim.utils.simple_preprocess(text)\n",
605 | "        if not tokens_only:\n",
606 | "            tokens = gensim.models.doc2vec.TaggedDocument(tokens, [doc_id])\n",
607 | "        yield tokens"
608 | ]
609 | },
610 | {
611 | "cell_type": "code",
612 | "execution_count": 31,
613 | "metadata": {},
614 | "outputs": [
615 | {
616 | "data": {
617 | "text/plain": [
618 | ""
619 | ]
620 | },
621 | "execution_count": 31,
622 | "metadata": {},
623 | "output_type": "execute_result"
624 | }
625 | ],
626 | "source": [
627 | "read_corpus(talks)"
628 | ]
629 | },
630 | {
631 | "cell_type": "code",
632 | "execution_count": 32,
633 | "metadata": {},
634 | "outputs": [],
635 | "source": [
636 | "ted_talk_docs = list(read_corpus(talks)) "
637 | ]
638 | },
639 | {
640 | "cell_type": "code",
641 | "execution_count": 33,
642 | "metadata": {},
643 | "outputs": [
644 | {
645 | "data": {
646 | "text/plain": [
647 | "TaggedDocument(words=['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 'new', 'to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation', 'both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'good', 'thing', 'consider', 'facit', 'actually', 'old', 'enough', 'to', 'remember', 'them', 'facit', 'was', 'fantastic', 'company', 'they', 'were', 'born', 'deep', 'in', 'the', 'swedish', 'forest', 'and', 'they', 'made', 'the', 'best', 'mechanical', 'calculators', 'in', 'the', 'world', 'everybody', 'used', 'them', 'and', 'what', 'did', 'facit', 'do', 'when', 'the', 'electronic', 'calculator', 'came', 'along', 'they', 'continued', 'doing', 'exactly', 'the', 'same', 'in', 'six', 'months', 'they', 'went', 'from', 'maximum', 'revenue', 'and', 'they', 'were', 'gone', 'gone', 'to', 'me', 'the', 'irony', 'about', 'the', 'facit', 'story', 'is', 'hearing', 'about', 'the', 'facit', 'engineers', 'who', 'had', 'bought', 'cheap', 'small', 'electronic', 'calculators', 'in', 'japan', 'that', 'they', 'used', 'to', 'double', 'check', 'their', 'calculators', 'laughter', 'facit', 'did', 'too', 'much', 'exploitation', 'but', 'exploration', 'can', 'go', 'wild', 'too', 'few', 'years', 'back', 'worked', 'closely', 'alongside', 'european', 'biotech', 'company', 'let', 'call', 'them', 'oncosearch', 'the', 'company', 'was', 'brilliant', 'they', 'had', 'applications', 'that', 'promised', 'to', 'diagnose', 'even', 'cure', 'certain', 'forms', 'of', 'blood', 'cancer', 'every', 'day', 'was', 'about', 'creating', 'something', 'new', 'they', 'were', 'extremely', 'innovative', 'and', 'the', 'mantra', 'was', 'when', 'we', 'only', 'get', 'it', 'right', 'or', 'even', 'we', 'want', 'it', 'perfect', 'the', 'sad', 'thing', 'is', 'before', 'they', 'became', 'perfect', 
'even', 'good', 'enough', 'they', 'became', 'obsolete', 'oncosearch', 'did', 'too', 'much', 'exploration', 'first', 'heard', 'about', 'exploration', 'and', 'exploitation', 'about', 'years', 'ago', 'when', 'worked', 'as', 'visiting', 'scholar', 'at', 'stanford', 'university', 'the', 'founder', 'of', 'the', 'idea', 'is', 'jim', 'march', 'and', 'to', 'me', 'the', 'power', 'of', 'the', 'idea', 'is', 'its', 'practicality', 'exploration', 'exploration', 'is', 'about', 'coming', 'up', 'with', 'what', 'new', 'it', 'about', 'search', 'it', 'about', 'discovery', 'it', 'about', 'new', 'products', 'it', 'about', 'new', 'innovations', 'it', 'about', 'changing', 'our', 'frontiers', 'our', 'heroes', 'are', 'people', 'who', 'have', 'done', 'exploration', 'madame', 'curie', 'picasso', 'neil', 'armstrong', 'sir', 'edmund', 'hillary', 'etc', 'come', 'from', 'norway', 'all', 'our', 'heroes', 'are', 'explorers', 'and', 'they', 'deserve', 'to', 'be', 'we', 'all', 'know', 'that', 'exploration', 'is', 'risky', 'we', 'don', 'know', 'the', 'answers', 'we', 'don', 'know', 'if', 'we', 're', 'going', 'to', 'find', 'them', 'and', 'we', 'know', 'that', 'the', 'risks', 'are', 'high', 'exploitation', 'is', 'the', 'opposite', 'exploitation', 'is', 'taking', 'the', 'knowledge', 'we', 'have', 'and', 'making', 'good', 'better', 'exploitation', 'is', 'about', 'making', 'our', 'trains', 'run', 'on', 'time', 'it', 'about', 'making', 'good', 'products', 'faster', 'and', 'cheaper', 'exploitation', 'is', 'not', 'risky', 'in', 'the', 'short', 'term', 'but', 'if', 'we', 'only', 'exploit', 'it', 'very', 'risky', 'in', 'the', 'long', 'term', 'and', 'think', 'we', 'all', 'have', 'memories', 'of', 'the', 'famous', 'pop', 'groups', 'who', 'keep', 'singing', 'the', 'same', 'songs', 'again', 'and', 'again', 'until', 'they', 'become', 'obsolete', 'or', 'even', 'pathetic', 'that', 'the', 'risk', 'of', 'exploitation', 'so', 'if', 'we', 'take', 'long', 'term', 'perspective', 'we', 'explore', 'if', 'we', 'take', 'short', 
'term', 'perspective', 'we', 'exploit', 'small', 'children', 'they', 'explore', 'all', 'day', 'all', 'day', 'it', 'about', 'exploration', 'as', 'we', 'grow', 'older', 'we', 'explore', 'less', 'because', 'we', 'have', 'more', 'knowledge', 'to', 'exploit', 'on', 'the', 'same', 'goes', 'for', 'companies', 'companies', 'become', 'by', 'nature', 'less', 'innovative', 'as', 'they', 'become', 'more', 'competent', 'and', 'this', 'is', 'of', 'course', 'big', 'worry', 'to', 'ceos', 'and', 'hear', 'very', 'often', 'questions', 'phrased', 'in', 'different', 'ways', 'for', 'example', 'how', 'can', 'both', 'effectively', 'run', 'and', 'reinvent', 'my', 'company', 'or', 'how', 'can', 'make', 'sure', 'that', 'our', 'company', 'changes', 'before', 'we', 'become', 'obsolete', 'or', 'are', 'hit', 'by', 'crisis', 'so', 'doing', 'one', 'well', 'is', 'difficult', 'doing', 'both', 'well', 'as', 'the', 'same', 'time', 'is', 'art', 'pushing', 'both', 'exploration', 'and', 'exploitation', 'so', 'one', 'thing', 'we', 've', 'found', 'is', 'only', 'about', 'two', 'percent', 'of', 'companies', 'are', 'able', 'to', 'effectively', 'explore', 'and', 'exploit', 'at', 'the', 'same', 'time', 'in', 'parallel', 'but', 'when', 'they', 'do', 'the', 'payoffs', 'are', 'huge', 'so', 'we', 'have', 'lots', 'of', 'great', 'examples', 'we', 'have', 'nestlé', 'creating', 'nespresso', 'we', 'have', 'lego', 'going', 'into', 'animated', 'films', 'toyota', 'creating', 'the', 'hybrids', 'unilever', 'pushing', 'into', 'sustainability', 'there', 'are', 'lots', 'of', 'examples', 'and', 'the', 'benefits', 'are', 'huge', 'why', 'is', 'balancing', 'so', 'difficult', 'think', 'it', 'difficult', 'because', 'there', 'are', 'so', 'many', 'traps', 'that', 'keep', 'us', 'where', 'we', 'are', 'so', 'll', 'talk', 'about', 'two', 'but', 'there', 'are', 'many', 'so', 'let', 'talk', 'about', 'the', 'perpetual', 'search', 'trap', 'we', 'discover', 'something', 'but', 'we', 'don', 'have', 'the', 'patience', 'or', 'the', 'persistence', 
'to', 'get', 'at', 'it', 'and', 'make', 'it', 'work', 'so', 'instead', 'of', 'staying', 'with', 'it', 'we', 'create', 'something', 'new', 'but', 'the', 'same', 'goes', 'for', 'that', 'then', 'we', 're', 'in', 'the', 'vicious', 'circle', 'of', 'actually', 'coming', 'up', 'with', 'ideas', 'but', 'being', 'frustrated', 'oncosearch', 'was', 'good', 'example', 'famous', 'example', 'is', 'of', 'course', 'xerox', 'but', 'we', 'don', 'only', 'see', 'this', 'in', 'companies', 'we', 'see', 'this', 'in', 'the', 'public', 'sector', 'as', 'well', 'we', 'all', 'know', 'that', 'any', 'kind', 'of', 'effective', 'reform', 'of', 'education', 'research', 'health', 'care', 'even', 'defense', 'takes', 'maybe', 'years', 'to', 'work', 'but', 'still', 'we', 'change', 'much', 'more', 'often', 'we', 'really', 'don', 'give', 'them', 'the', 'chance', 'another', 'trap', 'is', 'the', 'success', 'trap', 'facit', 'fell', 'into', 'the', 'success', 'trap', 'they', 'literally', 'held', 'the', 'future', 'in', 'their', 'hands', 'but', 'they', 'couldn', 'see', 'it', 'they', 'were', 'simply', 'so', 'good', 'at', 'making', 'what', 'they', 'loved', 'doing', 'that', 'they', 'wouldn', 'change', 'we', 'are', 'like', 'that', 'too', 'when', 'we', 'know', 'something', 'well', 'it', 'difficult', 'to', 'change', 'bill', 'gates', 'has', 'said', 'success', 'is', 'lousy', 'teacher', 'it', 'seduces', 'us', 'into', 'thinking', 'we', 'cannot', 'fail', 'that', 'the', 'challenge', 'with', 'success', 'so', 'think', 'there', 'are', 'some', 'lessons', 'and', 'think', 'they', 'apply', 'to', 'us', 'and', 'they', 'apply', 'to', 'our', 'companies', 'the', 'first', 'lesson', 'is', 'get', 'ahead', 'of', 'the', 'crisis', 'and', 'any', 'company', 'that', 'able', 'to', 'innovate', 'is', 'actually', 'able', 'to', 'also', 'buy', 'an', 'insurance', 'in', 'the', 'future', 'netflix', 'they', 'could', 'so', 'easily', 'have', 'been', 'content', 'with', 'earlier', 'generations', 'of', 'distribution', 'but', 'they', 'always', 'and', 'think', 
'they', 'will', 'always', 'keep', 'pushing', 'for', 'the', 'next', 'battle', 'see', 'other', 'companies', 'that', 'say', 'll', 'win', 'the', 'next', 'innovation', 'cycle', 'whatever', 'it', 'takes', 'second', 'one', 'think', 'in', 'multiple', 'time', 'scales', 'll', 'share', 'chart', 'with', 'you', 'and', 'think', 'it', 'wonderful', 'one', 'any', 'company', 'we', 'look', 'at', 'taking', 'one', 'year', 'perspective', 'and', 'looking', 'at', 'the', 'valuation', 'of', 'the', 'company', 'innovation', 'typically', 'accounts', 'for', 'only', 'about', 'percent', 'so', 'when', 'we', 'think', 'one', 'year', 'innovation', 'isn', 'really', 'that', 'important', 'move', 'ahead', 'take', 'year', 'perspective', 'on', 'the', 'same', 'company', 'suddenly', 'innovation', 'and', 'ability', 'to', 'renew', 'account', 'for', 'percent', 'but', 'companies', 'can', 'choose', 'they', 'need', 'to', 'fund', 'the', 'journey', 'and', 'lead', 'the', 'long', 'term', 'third', 'invite', 'talent', 'don', 'think', 'it', 'possible', 'for', 'any', 'of', 'us', 'to', 'be', 'able', 'to', 'balance', 'exploration', 'and', 'exploitation', 'by', 'ourselves', 'think', 'it', 'team', 'sport', 'think', 'we', 'need', 'to', 'allow', 'challenging', 'think', 'the', 'mark', 'of', 'great', 'company', 'is', 'being', 'open', 'to', 'be', 'challenged', 'and', 'the', 'mark', 'of', 'good', 'corporate', 'board', 'is', 'to', 'constructively', 'challenge', 'think', 'that', 'also', 'what', 'good', 'parenting', 'is', 'about', 'last', 'one', 'be', 'skeptical', 'of', 'success', 'maybe', 'it', 'useful', 'to', 'think', 'back', 'at', 'the', 'old', 'triumph', 'marches', 'in', 'rome', 'when', 'the', 'generals', 'after', 'big', 'victory', 'were', 'given', 'their', 'celebration', 'riding', 'into', 'rome', 'on', 'the', 'carriage', 'they', 'always', 'had', 'companion', 'whispering', 'in', 'their', 'ear', 'remember', 'you', 're', 'only', 'human', 'so', 'hope', 'made', 'the', 'point', 'balancing', 'exploration', 'and', 'exploitation', 'has', 
'huge', 'payoff', 'but', 'it', 'difficult', 'and', 'we', 'need', 'to', 'be', 'conscious', 'want', 'to', 'just', 'point', 'out', 'two', 'questions', 'that', 'think', 'are', 'useful', 'first', 'question', 'is', 'looking', 'at', 'your', 'own', 'company', 'in', 'which', 'areas', 'do', 'you', 'see', 'that', 'the', 'company', 'is', 'at', 'the', 'risk', 'of', 'falling', 'into', 'success', 'traps', 'of', 'just', 'going', 'on', 'autopilot', 'and', 'what', 'can', 'you', 'do', 'to', 'challenge', 'second', 'question', 'is', 'when', 'did', 'explore', 'something', 'new', 'last', 'and', 'what', 'kind', 'of', 'effect', 'did', 'it', 'have', 'on', 'me', 'is', 'that', 'something', 'should', 'do', 'more', 'of', 'in', 'my', 'case', 'yes', 'so', 'let', 'me', 'leave', 'you', 'with', 'this', 'whether', 'you', 're', 'an', 'explorer', 'by', 'nature', 'or', 'whether', 'you', 'tend', 'to', 'exploit', 'what', 'you', 'already', 'know', 'don', 'forget', 'the', 'beauty', 'is', 'in', 'the', 'balance', 'thank', 'you', 'applause'], tags=[0])"
648 | ]
649 | },
650 | "execution_count": 33,
651 | "metadata": {},
652 | "output_type": "execute_result"
653 | }
654 | ],
655 | "source": [
656 | "ted_talk_docs[0]"
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": 34,
662 | "metadata": {},
663 | "outputs": [
664 | {
665 | "name": "stdout",
666 | "output_type": "stream",
667 | "text": [
668 | "8\n"
669 | ]
670 | }
671 | ],
672 | "source": [
673 | "cores = multiprocessing.cpu_count()\n",
674 | "print(cores)"
675 | ]
676 | },
677 | {
678 | "cell_type": "code",
679 | "execution_count": 35,
680 | "metadata": {},
681 | "outputs": [],
682 | "source": [
683 | "model = Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, epochs=5, workers=cores)"
684 | ]
685 | },
686 | {
687 | "cell_type": "code",
688 | "execution_count": 36,
689 | "metadata": {},
690 | "outputs": [
691 | {
692 | "name": "stdout",
693 | "output_type": "stream",
694 | "text": [
695 | "Wall time: 1.4 s\n"
696 | ]
697 | }
698 | ],
699 | "source": [
700 | "%time model.build_vocab(ted_talk_docs)"
701 | ]
702 | },
703 | {
704 | "cell_type": "code",
705 | "execution_count": 37,
706 | "metadata": {},
707 | "outputs": [],
708 | "source": [
709 | "sentence_1 = 'Modern medicine has changed the way we think about healthcare, life spans and by extension career and marriage'"
710 | ]
711 | },
712 | {
713 | "cell_type": "code",
714 | "execution_count": 38,
715 | "metadata": {},
716 | "outputs": [],
717 | "source": [
718 | "sentence_2 = 'Modern medicine is not just a boon to the rich, making the raw chemicals behind these is also pollutes the poorest neighborhoods'"
719 | ]
720 | },
721 | {
722 | "cell_type": "code",
723 | "execution_count": 39,
724 | "metadata": {},
725 | "outputs": [],
726 | "source": [
727 | "sentence_3 = 'Modern medicine has changed the way we think about healthcare, and increased life spans, delaying weddings'"
728 | ]
729 | },
730 | {
731 | "cell_type": "code",
732 | "execution_count": 40,
733 | "metadata": {},
734 | "outputs": [
735 | {
736 | "data": {
737 | "text/plain": [
738 | "-0.14454556996040863"
739 | ]
740 | },
741 | "execution_count": 40,
742 | "metadata": {},
743 | "output_type": "execute_result"
744 | }
745 | ],
746 | "source": [
747 | "model.docvecs.similarity_unseen_docs(model, sentence_1.split(), sentence_3.split())"
748 | ]
749 | },
750 | {
751 | "cell_type": "code",
752 | "execution_count": 41,
753 | "metadata": {},
754 | "outputs": [
755 | {
756 | "data": {
757 | "text/plain": [
758 | "-0.04978240807521571"
759 | ]
760 | },
761 | "execution_count": 41,
762 | "metadata": {},
763 | "output_type": "execute_result"
764 | }
765 | ],
766 | "source": [
767 | "model.docvecs.similarity_unseen_docs(model, sentence_1.split(), sentence_2.split())"
768 | ]
769 | },
770 | {
771 | "cell_type": "code",
772 | "execution_count": 42,
773 | "metadata": {},
774 | "outputs": [
775 | {
776 | "name": "stdout",
777 | "output_type": "stream",
778 | "text": [
779 | "Wall time: 6.77 s\n"
780 | ]
781 | }
782 | ],
783 | "source": [
784 | "%time model.train(ted_talk_docs, total_examples=model.corpus_count, epochs=model.epochs)"
785 | ]
786 | },
787 | {
788 | "cell_type": "code",
789 | "execution_count": 43,
790 | "metadata": {},
791 | "outputs": [
792 | {
793 | "data": {
794 | "text/plain": [
795 | "array([ 0.20152442, 0.07655947, 0.04110149, -0.09114903, -0.02466601,\n",
796 | " 0.10063498, -0.04590227, -0.16054891, -0.23367156, -0.07714292,\n",
797 | " -0.32246125, 0.10532021, 0.11020374, -0.02373328, -0.06048575,\n",
798 | " 0.06041928, -0.20840394, 0.11885054, -0.09653657, 0.02215091,\n",
799 | " 0.01846626, 0.06881414, -0.01988592, 0.01138998, 0.06924792,\n",
800 | " 0.11989842, 0.09510404, 0.01230403, 0.05453861, 0.05833528,\n",
801 | " 0.22496092, 0.06185873, 0.15445319, -0.13073249, 0.1320086 ,\n",
802 | " 0.15955518, 0.09083826, -0.262743 , 0.07112081, -0.12404393,\n",
803 | " -0.07876749, -0.17020509, -0.08309909, 0.20299006, -0.07867863,\n",
804 | " -0.19080839, -0.00371094, -0.2119167 , -0.11631834, -0.12984131,\n",
805 | " -0.11451794, 0.12690201, -0.02519317, 0.23437414, -0.11313629,\n",
806 | " 0.06674401, -0.0190409 , 0.3384525 , -0.13124712, -0.12843844,\n",
807 | " -0.2605964 , 0.22317892, -0.20078087, -0.05607577, -0.08431446,\n",
808 | " -0.20859231, 0.15535517, 0.0073873 , -0.11435535, 0.16722508,\n",
809 | " -0.0567028 , 0.23436148, -0.1829926 , 0.05211424, -0.14246033,\n",
810 | " 0.20756294, 0.03515876, 0.10574302, -0.05463392, 0.09465599,\n",
811 | " -0.24758984, -0.04593265, -0.13151605, 0.3317288 , -0.13002025,\n",
812 | " -0.37372893, -0.26798424, -0.27239782, -0.16636257, 0.19000524,\n",
813 | " 0.12744325, -0.14971398, -0.11483772, -0.01594907, 0.02319706,\n",
814 | " 0.03037767, -0.01439404, 0.08120204, -0.188371 , -0.21033412],\n",
815 | " dtype=float32)"
816 | ]
817 | },
818 | "execution_count": 43,
819 | "metadata": {},
820 | "output_type": "execute_result"
821 | }
822 | ],
823 | "source": [
824 | "model.infer_vector(sentence_1.split())"
825 | ]
826 | },
827 | {
828 | "cell_type": "code",
829 | "execution_count": 44,
830 | "metadata": {},
831 | "outputs": [
832 | {
833 | "data": {
834 | "text/plain": [
835 | "0.9073806748252071"
836 | ]
837 | },
838 | "execution_count": 44,
839 | "metadata": {},
840 | "output_type": "execute_result"
841 | }
842 | ],
843 | "source": [
844 | "model.docvecs.similarity_unseen_docs(model, sentence_1.split(), sentence_3.split())"
845 | ]
846 | },
847 | {
848 | "cell_type": "code",
849 | "execution_count": 45,
850 | "metadata": {},
851 | "outputs": [
852 | {
853 | "data": {
854 | "text/plain": [
855 | "0.7626341790517841"
856 | ]
857 | },
858 | "execution_count": 45,
859 | "metadata": {},
860 | "output_type": "execute_result"
861 | }
862 | ],
863 | "source": [
864 | "model.docvecs.similarity_unseen_docs(model, sentence_1.split(), sentence_2.split())"
865 | ]
866 | },
867 | {
868 | "cell_type": "code",
869 | "execution_count": 46,
870 | "metadata": {},
871 | "outputs": [
872 | {
873 | "data": {
874 | "text/plain": [
875 | "0.8026655396100536"
876 | ]
877 | },
878 | "execution_count": 46,
879 | "metadata": {},
880 | "output_type": "execute_result"
881 | }
882 | ],
883 | "source": [
884 | "model.docvecs.similarity_unseen_docs(model, sentence_2.split(), sentence_3.split())"
885 | ]
886 | },
887 | {
888 | "cell_type": "markdown",
889 | "metadata": {},
890 | "source": [
891 | "# Model Assessment"
892 | ]
893 | },
894 | {
895 | "cell_type": "code",
896 | "execution_count": 47,
897 | "metadata": {},
898 | "outputs": [],
899 | "source": [
900 | "ranks = []\n",
901 | "for idx in range(len(ted_talk_docs)):\n",
902 | " inferred_vector = model.infer_vector(ted_talk_docs[idx].words)\n",
903 | " sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))\n",
904 | " rank = [docid for docid, sim in sims].index(idx)\n",
905 | " ranks.append(rank)"
906 | ]
907 | },
908 | {
909 | "cell_type": "code",
910 | "execution_count": 48,
911 | "metadata": {},
912 | "outputs": [
913 | {
914 | "data": {
915 | "text/plain": [
916 | "Counter({0: 2080, 3: 1, 4: 1, 1: 1, 2: 1, 6: 1})"
917 | ]
918 | },
919 | "execution_count": 48,
920 | "metadata": {},
921 | "output_type": "execute_result"
922 | }
923 | ],
924 | "source": [
925 | "import collections\n",
926 | "collections.Counter(ranks) # Results vary due to random seeding + very small corpus"
927 | ]
928 | },
929 | {
930 | "cell_type": "code",
931 | "execution_count": 49,
932 | "metadata": {
933 | "scrolled": false
934 | },
935 | "outputs": [
936 | {
937 | "name": "stdout",
938 | "output_type": "stream",
939 | "text": [
940 | "Document (2084): «if you re here today and very happy that you are you ve all heard about how sustainable development will save us from ourselves however when we re not at ted we are often told that real sustainability policy agenda is just not feasible especially in large urban areas like new york city and that because most people with decision making powers in both the public and the private sector really don feel as though they re in danger the reason why here today in part is because of dog an abandoned puppy»\n",
941 | "\n",
942 | "SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)\n",
943 | "MOST (2084, 0.8938855528831482): «if you re here today and very happy that you are you ve all heard about how sustainable development will save us from ourselves however when we re not at ted we are often told that real sustainability policy agenda is just not feasible especially in large urban areas like new york city and that because most people with decision making powers in both the public and the private sector really don feel as though they re in danger the reason why here today in part is because of dog an abandoned puppy»\n",
944 | "\n",
945 | "MEDIAN (949, 0.40211787819862366): «we conventionally divide space into private and public realms and we know these legal distinctions very well because we ve become experts at protecting our private property and private space but we re less attuned to the nuances of the public what translates generic public space into qualitative space mean this is something that our studio has been working on for the past decade and we re doing this through some case studies large chunk of our work has been put into transforming this neglected i»\n",
946 | "\n",
947 | "LEAST (876, 0.11194153130054474): «so the machine going to talk you about is what call the greatest machine that never was it was machine that was never built and yet it will be built it was machine that was designed long before anyone thought about computers if you know anything about the history of computers you will know that in the and the simple computers were created that started the computer revolution we have today and you would be correct except for you have the wrong century the first computer was really designed in the»\n",
948 | "\n"
949 | ]
950 | }
951 | ],
952 | "source": [
953 | "doc_slice = ' '.join(ted_talk_docs[idx].words)[:500]\n",
954 | "print(f'Document ({idx}): «{doc_slice}»\\n')\n",
955 | "print(f'SIMILAR/DISSIMILAR DOCS PER MODEL {model}')\n",
956 | "for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:\n",
957 | " doc_slice = ' '.join(ted_talk_docs[sims[index][0]].words)[:500]\n",
958 | " print(f'{label} {sims[index]}: «{doc_slice}»\\n')"
959 | ]
960 | }
961 | ],
962 | "metadata": {
963 | "kernelspec": {
964 | "display_name": "Python [conda env:nlp]",
965 | "language": "python",
966 | "name": "conda-env-nlp-py"
967 | },
968 | "language_info": {
969 | "codemirror_mode": {
970 | "name": "ipython",
971 | "version": 3
972 | },
973 | "file_extension": ".py",
974 | "mimetype": "text/x-python",
975 | "name": "python",
976 | "nbconvert_exporter": "python",
977 | "pygments_lexer": "ipython3",
978 | "version": "3.6.6"
979 | }
980 | },
981 | "nbformat": 4,
982 | "nbformat_minor": 2
983 | }
984 |
--------------------------------------------------------------------------------
/06_DL_for_NLP.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Deep Learning for NLP"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Understanding Deep Learning"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "# Kaggle: Text Categorization Challenge\n",
22 | "\n",
23 | "# Getting the Data\n",
24 | "\n",
25 | "**Direct Download**: You can get the train and test data from the [data tab on challenge website](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data). "
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "## Exploring the Data"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 1,
38 | "metadata": {},
39 | "outputs": [
40 | {
41 | "name": "stdout",
42 | "output_type": "stream",
43 | "text": [
44 | "Solving environment: ...working... done\n",
45 | "\n",
46 | "## Package Plan ##\n",
47 | "\n",
48 | " environment location: D:\\Miniconda3\\envs\\nlp\n",
49 | "\n",
50 | " added / updated specs: \n",
51 | " - pandas\n",
52 | "\n",
53 | "\n",
54 | "The following packages will be UPDATED:\n",
55 | "\n",
56 | " pandas: 0.23.3-py36h830ac7b_0 --> 0.23.4-py36h830ac7b_0\n",
57 | "\n",
58 | "Preparing transaction: ...working... done\n",
59 | "Verifying transaction: ...working... done\n",
60 | "Executing transaction: ...working... done\n",
61 | "Solving environment: ...working... done\n",
62 | "\n",
63 | "## Package Plan ##\n",
64 | "\n",
65 | " environment location: D:\\Miniconda3\\envs\\nlp\n",
66 | "\n",
67 | " added / updated specs: \n",
68 | " - numpy\n",
69 | "\n",
70 | "\n",
71 | "The following packages will be UPDATED:\n",
72 | "\n",
73 | " numpy: 1.15.1-py36ha559c80_0 --> 1.15.4-py36ha559c80_0\n",
74 | " numpy-base: 1.15.1-py36h8128ebf_0 --> 1.15.4-py36h8128ebf_0\n",
75 | "\n",
76 | "Preparing transaction: ...working... done\n",
77 | "Verifying transaction: ...working... done\n",
78 | "Executing transaction: ...working... done\n"
79 | ]
80 | }
81 | ],
82 | "source": [
83 | "!conda install -y pandas\n",
84 | "!conda install -y numpy"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 2,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "import pandas as pd\n",
94 | "import numpy as np"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 3,
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "ename": "FileNotFoundError",
104 | "evalue": "File b'data/train.csv' does not exist",
105 | "output_type": "error",
106 | "traceback": [
107 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
108 | "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
109 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mtrain_df\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"data/train.csv\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
110 | "\u001b[1;32mD:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[0;32m 676\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[0;32m 677\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 678\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 679\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 680\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
111 | "\u001b[1;32mD:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 438\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 439\u001b[0m \u001b[1;31m# Create the parser.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 440\u001b[1;33m \u001b[0mparser\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 441\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 442\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
112 | "\u001b[1;32mD:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 785\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'has_index_names'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'has_index_names'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 786\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 787\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 788\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 789\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
113 | "\u001b[1;32mD:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[1;34m(self, engine)\u001b[0m\n\u001b[0;32m 1012\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'c'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1013\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'c'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1014\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1015\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1016\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'python'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
114 | "\u001b[1;32mD:\\Miniconda3\\envs\\nlp\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, src, **kwds)\u001b[0m\n\u001b[0;32m 1706\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'usecols'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0musecols\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1707\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1708\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparsers\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1709\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1710\u001b[0m \u001b[0mpassed_names\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnames\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
115 | "\u001b[1;32mpandas\\_libs\\parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__\u001b[1;34m()\u001b[0m\n",
116 | "\u001b[1;32mpandas\\_libs\\parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._setup_parser_source\u001b[1;34m()\u001b[0m\n",
117 | "\u001b[1;31mFileNotFoundError\u001b[0m: File b'data/train.csv' does not exist"
118 | ]
119 | }
120 | ],
121 | "source": [
122 | "train_df = pd.read_csv(\"data/train.csv\")"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "train_df.head()"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "val_df = pd.read_csv(\"data/valid.csv\")"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "val_df.head()"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "## Multiple Target Dataset!"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {},
163 | "outputs": [],
164 | "source": [
165 | "test_df = pd.read_csv(\"data/test.csv\")"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "test_df.head()"
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {},
180 | "source": [
181 | "# Why PyTorch? "
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "# PyTorch and torchtext"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "!conda install -y pytorch cuda92 -c pytorch"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": null,
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "!pip install --upgrade git+https://github.com/pytorch/text"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "import torch\n",
216 | "import torch.nn as nn\n",
217 | "import torch.nn.functional as F\n",
218 | "import torchtext"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 12,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "use_gpu = True\n",
228 | "if use_gpu:\n",
229 | " assert torch.cuda.is_available(), 'You either do not have a GPU or is not accessible to PyTorch'"
230 | ]
231 | },
232 | {
233 | "cell_type": "markdown",
234 | "metadata": {},
235 | "source": [
236 | "Let's see how many GPU devices are available to PyTorch on this machine"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 13,
242 | "metadata": {},
243 | "outputs": [
244 | {
245 | "data": {
246 | "text/plain": [
247 | "1"
248 | ]
249 | },
250 | "execution_count": 13,
251 | "metadata": {},
252 | "output_type": "execute_result"
253 | }
254 | ],
255 | "source": [
256 | "torch.cuda.device_count()"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {},
262 | "source": [
263 | "## Data Loaders with torchtext\n",
264 | "\n",
265 | "### Conventions and Style"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": 14,
271 | "metadata": {},
272 | "outputs": [],
273 | "source": [
274 | "from torchtext.data import Field "
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 15,
280 | "metadata": {},
281 | "outputs": [],
282 | "source": [
283 | "LABEL = Field(sequential=False, use_vocab=False)"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 16,
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "tokenize = lambda x: x.split()\n",
293 | "TEXT = Field(sequential=True, tokenize=tokenize, lower=True)"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": 17,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "from torchtext.data import TabularDataset"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 18,
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "tv_datafields = [(\"id\", None), # we won't be needing the id, so we pass in None as the field\n",
312 | " (\"comment_text\", TEXT), (\"toxic\", LABEL),\n",
313 | " (\"severe_toxic\", LABEL), (\"threat\", LABEL),\n",
314 | " (\"obscene\", LABEL), (\"insult\", LABEL),\n",
315 | " (\"identity_hate\", LABEL)]"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 19,
321 | "metadata": {},
322 | "outputs": [],
323 | "source": [
324 | "trn, vld = TabularDataset.splits(\n",
325 | " path=\"data\", # the root directory where the data lies\n",
326 | " train='train.csv', validation=\"valid.csv\",\n",
327 | " format='csv',\n",
328 | " skip_header=True, # if your csv file has a header row, make sure to pass this so that it doesn't get processed as data!\n",
329 | " fields=tv_datafields)"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": 20,
335 | "metadata": {},
336 | "outputs": [],
337 | "source": [
338 | "tst_datafields = [(\"id\", None), # we won't be needing the id, so we pass in None as the field\n",
339 | " (\"comment_text\", TEXT)\n",
340 | " ]"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": 21,
346 | "metadata": {},
347 | "outputs": [],
348 | "source": [
349 | "tst = TabularDataset(\n",
350 | " path=\"data/test.csv\", # the file path\n",
351 | " format='csv',\n",
352 | " skip_header=True, # if your csv file has a header row, make sure to pass this so that it doesn't get processed as data!\n",
353 | " fields=tst_datafields)"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "metadata": {},
359 | "source": [
360 | "## Exploring the Dataset Objects"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": 22,
366 | "metadata": {},
367 | "outputs": [
368 | {
369 | "data": {
370 | "text/plain": [
371 | "(,\n",
372 | " ,\n",
373 | " )"
374 | ]
375 | },
376 | "execution_count": 22,
377 | "metadata": {},
378 | "output_type": "execute_result"
379 | }
380 | ],
381 | "source": [
382 | "trn, vld, tst"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": 23,
388 | "metadata": {},
389 | "outputs": [
390 | {
391 | "data": {
392 | "text/plain": [
393 | "(,\n",
394 | " ,\n",
395 | " )"
396 | ]
397 | },
398 | "execution_count": 23,
399 | "metadata": {},
400 | "output_type": "execute_result"
401 | }
402 | ],
403 | "source": [
404 | "trn[0], vld[0], tst[0]"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": 24,
410 | "metadata": {},
411 | "outputs": [
412 | {
413 | "data": {
414 | "text/plain": [
415 | "dict_keys(['comment_text', 'toxic', 'severe_toxic', 'threat', 'obscene', 'insult', 'identity_hate'])"
416 | ]
417 | },
418 | "execution_count": 24,
419 | "metadata": {},
420 | "output_type": "execute_result"
421 | }
422 | ],
423 | "source": [
424 | "trn[0].__dict__.keys()"
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": 25,
430 | "metadata": {},
431 | "outputs": [
432 | {
433 | "data": {
434 | "text/plain": [
435 | "['explanation', 'why', 'the', 'edits', 'made']"
436 | ]
437 | },
438 | "execution_count": 25,
439 | "metadata": {},
440 | "output_type": "execute_result"
441 | }
442 | ],
443 | "source": [
444 | "trn[0].__dict__['comment_text'][:5]"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": 26,
450 | "metadata": {},
451 | "outputs": [],
452 | "source": [
453 | "TEXT.build_vocab(trn)"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": 27,
459 | "metadata": {},
460 | "outputs": [
461 | {
462 | "data": {
463 | "text/plain": [
464 | ""
465 | ]
466 | },
467 | "execution_count": 27,
468 | "metadata": {},
469 | "output_type": "execute_result"
470 | }
471 | ],
472 | "source": [
473 | "TEXT.vocab"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": 28,
479 | "metadata": {},
480 | "outputs": [
481 | {
482 | "data": {
483 | "text/plain": [
484 | "collections.Counter"
485 | ]
486 | },
487 | "execution_count": 28,
488 | "metadata": {},
489 | "output_type": "execute_result"
490 | }
491 | ],
492 | "source": [
493 | "type(TEXT.vocab.freqs)"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": 29,
499 | "metadata": {},
500 | "outputs": [
501 | {
502 | "data": {
503 | "text/plain": [
504 | "[('the', 78), ('to', 41), ('you', 33), ('of', 30), ('and', 26)]"
505 | ]
506 | },
507 | "execution_count": 29,
508 | "metadata": {},
509 | "output_type": "execute_result"
510 | }
511 | ],
512 | "source": [
513 | "TEXT.vocab.freqs.most_common(5)"
514 | ]
515 | },
516 | {
517 | "cell_type": "code",
518 | "execution_count": 30,
519 | "metadata": {},
520 | "outputs": [
521 | {
522 | "data": {
523 | "text/plain": [
524 | "(list, collections.defaultdict, 784, 784)"
525 | ]
526 | },
527 | "execution_count": 30,
528 | "metadata": {},
529 | "output_type": "execute_result"
530 | }
531 | ],
532 | "source": [
533 | "type(TEXT.vocab.itos), type(TEXT.vocab.stoi), len(TEXT.vocab.itos), len(TEXT.vocab.stoi.keys()), "
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 31,
539 | "metadata": {},
540 | "outputs": [
541 | {
542 | "data": {
543 | "text/plain": [
544 | "(7, 'and')"
545 | ]
546 | },
547 | "execution_count": 31,
548 | "metadata": {},
549 | "output_type": "execute_result"
550 | }
551 | ],
552 | "source": [
553 | "TEXT.vocab.stoi['and'], TEXT.vocab.itos[7]"
554 | ]
555 | },
556 | {
557 | "cell_type": "markdown",
558 | "metadata": {},
559 | "source": [
560 | "## Iterators!"
561 | ]
562 | },
563 | {
564 | "cell_type": "code",
565 | "execution_count": 32,
566 | "metadata": {},
567 | "outputs": [],
568 | "source": [
569 | "from torchtext.data import Iterator, BucketIterator"
570 | ]
571 | },
572 | {
573 | "cell_type": "markdown",
574 | "metadata": {},
575 | "source": [
576 | "## BucketIterator"
577 | ]
578 | },
579 | {
580 | "cell_type": "code",
581 | "execution_count": 33,
582 | "metadata": {},
583 | "outputs": [],
584 | "source": [
585 | "train_iter, val_iter = BucketIterator.splits(\n",
586 | " (trn, vld), # we pass in the datasets we want the iterator to draw data from\n",
587 | " batch_sizes=(32, 32),\n",
588 | " sort_key=lambda x: len(x.comment_text), # the BucketIterator needs to be told what function it should use to group the data.\n",
589 | " sort_within_batch=False,\n",
590 | " repeat=False # we pass repeat=False because we want to wrap this Iterator layer.\n",
591 | ")"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 34,
597 | "metadata": {},
598 | "outputs": [
599 | {
600 | "data": {
601 | "text/plain": [
602 | ""
603 | ]
604 | },
605 | "execution_count": 34,
606 | "metadata": {},
607 | "output_type": "execute_result"
608 | }
609 | ],
610 | "source": [
611 | "train_iter"
612 | ]
613 | },
614 | {
615 | "cell_type": "code",
616 | "execution_count": 35,
617 | "metadata": {},
618 | "outputs": [
619 | {
620 | "data": {
621 | "text/plain": [
622 | "\n",
623 | "[torchtext.data.batch.Batch of size 25]\n",
624 | "\t[.comment_text]:[torch.LongTensor of size 494x25]\n",
625 | "\t[.toxic]:[torch.LongTensor of size 25]\n",
626 | "\t[.severe_toxic]:[torch.LongTensor of size 25]\n",
627 | "\t[.threat]:[torch.LongTensor of size 25]\n",
628 | "\t[.obscene]:[torch.LongTensor of size 25]\n",
629 | "\t[.insult]:[torch.LongTensor of size 25]\n",
630 | "\t[.identity_hate]:[torch.LongTensor of size 25]"
631 | ]
632 | },
633 | "execution_count": 35,
634 | "metadata": {},
635 | "output_type": "execute_result"
636 | }
637 | ],
638 | "source": [
639 | "batch = next(train_iter.__iter__())\n",
640 | "batch"
641 | ]
642 | },
643 | {
644 | "cell_type": "code",
645 | "execution_count": 36,
646 | "metadata": {},
647 | "outputs": [
648 | {
649 | "data": {
650 | "text/plain": [
651 | "dict_keys(['batch_size', 'dataset', 'fields', 'comment_text', 'toxic', 'severe_toxic', 'threat', 'obscene', 'insult', 'identity_hate'])"
652 | ]
653 | },
654 | "execution_count": 36,
655 | "metadata": {},
656 | "output_type": "execute_result"
657 | }
658 | ],
659 | "source": [
660 | "batch.__dict__.keys()"
661 | ]
662 | },
663 | {
664 | "cell_type": "code",
665 | "execution_count": 37,
666 | "metadata": {},
667 | "outputs": [
668 | {
669 | "data": {
670 | "text/plain": [
671 | "(,\n",
672 | " ,\n",
673 | " True)"
674 | ]
675 | },
676 | "execution_count": 37,
677 | "metadata": {},
678 | "output_type": "execute_result"
679 | }
680 | ],
681 | "source": [
682 | "batch.__dict__['dataset'], trn, batch.__dict__['dataset']==trn"
683 | ]
684 | },
685 | {
686 | "cell_type": "code",
687 | "execution_count": 38,
688 | "metadata": {},
689 | "outputs": [],
690 | "source": [
691 | "test_iter = Iterator(tst, batch_size=64, sort=False, sort_within_batch=False, repeat=False)"
692 | ]
693 | },
694 | {
695 | "cell_type": "code",
696 | "execution_count": 39,
697 | "metadata": {},
698 | "outputs": [
699 | {
700 | "data": {
701 | "text/plain": [
702 | "\n",
703 | "[torchtext.data.batch.Batch of size 33]\n",
704 | "\t[.comment_text]:[torch.LongTensor of size 158x33]"
705 | ]
706 | },
707 | "execution_count": 39,
708 | "metadata": {},
709 | "output_type": "execute_result"
710 | }
711 | ],
712 | "source": [
713 | "next(test_iter.__iter__())"
714 | ]
715 | },
716 | {
717 | "cell_type": "markdown",
718 | "metadata": {},
719 | "source": [
720 | "## BatchWrapper"
721 | ]
722 | },
723 | {
724 | "cell_type": "code",
725 | "execution_count": 40,
726 | "metadata": {},
727 | "outputs": [],
728 | "source": [
729 | "class BatchWrapper:\n",
730 | " def __init__(self, dl, x_var, y_vars):\n",
731 | " self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y\n",
732 | " \n",
733 | " def __iter__(self):\n",
734 | " for batch in self.dl:\n",
735 | " x = getattr(batch, self.x_var) # we assume only one input in this wrapper\n",
736 | " \n",
737 | " if self.y_vars is not None: # we will concatenate y into a single tensor\n",
738 | " y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()\n",
739 | " else:\n",
740 | " y = torch.zeros((1))\n",
741 | " if use_gpu:\n",
742 | " yield (x.cuda(), y.cuda())\n",
743 | " else:\n",
744 | " yield (x, y)\n",
745 | " \n",
746 | " def __len__(self):\n",
747 | " return len(self.dl)"
748 | ]
749 | },
750 | {
751 | "cell_type": "code",
752 | "execution_count": 41,
753 | "metadata": {},
754 | "outputs": [],
755 | "source": [
756 | "train_dl = BatchWrapper(train_iter, \"comment_text\", [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"])\n",
757 | "valid_dl = BatchWrapper(val_iter, \"comment_text\", [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"])\n",
758 | "test_dl = BatchWrapper(test_iter, \"comment_text\", None)"
759 | ]
760 | },
761 | {
762 | "cell_type": "code",
763 | "execution_count": 42,
764 | "metadata": {},
765 | "outputs": [
766 | {
767 | "data": {
768 | "text/plain": [
769 | "(tensor([[ 453, 63, 15, ..., 454, 660, 778],\n",
770 | " [ 523, 4, 601, ..., 78, 11, 650],\n",
771 | " [ 30, 664, 242, ..., 8, 2, 22],\n",
772 | " ...,\n",
773 | " [ 1, 1, 1, ..., 1, 1, 1],\n",
774 | " [ 1, 1, 1, ..., 1, 1, 1],\n",
775 | " [ 1, 1, 1, ..., 1, 1, 1]], device='cuda:0'),\n",
776 | " tensor([[ 0., 0., 0., 0., 0., 0.],\n",
777 | " [ 0., 0., 0., 0., 0., 0.],\n",
778 | " [ 0., 0., 0., 0., 0., 0.],\n",
779 | " [ 0., 0., 0., 0., 0., 0.],\n",
780 | " [ 0., 0., 0., 0., 0., 0.],\n",
781 | " [ 0., 0., 0., 0., 0., 0.],\n",
782 | " [ 0., 0., 0., 0., 0., 0.],\n",
783 | " [ 1., 1., 0., 1., 1., 0.],\n",
784 | " [ 0., 0., 0., 0., 0., 0.],\n",
785 | " [ 0., 0., 0., 0., 0., 0.],\n",
786 | " [ 0., 0., 0., 0., 0., 0.],\n",
787 | " [ 0., 0., 0., 0., 0., 0.],\n",
788 | " [ 0., 0., 0., 0., 0., 0.],\n",
789 | " [ 0., 0., 0., 0., 0., 0.],\n",
790 | " [ 0., 0., 0., 0., 0., 0.],\n",
791 | " [ 0., 0., 0., 0., 0., 0.],\n",
792 | " [ 0., 0., 0., 0., 0., 0.],\n",
793 | " [ 0., 0., 0., 0., 0., 0.],\n",
794 | " [ 0., 0., 0., 0., 0., 0.],\n",
795 | " [ 0., 0., 0., 0., 0., 0.],\n",
796 | " [ 1., 0., 0., 0., 0., 0.],\n",
797 | " [ 0., 0., 0., 0., 0., 0.],\n",
798 | " [ 1., 0., 0., 0., 0., 0.],\n",
799 | " [ 0., 0., 0., 0., 0., 0.],\n",
800 | " [ 0., 0., 0., 0., 0., 0.]], device='cuda:0'))"
801 | ]
802 | },
803 | "execution_count": 42,
804 | "metadata": {},
805 | "output_type": "execute_result"
806 | }
807 | ],
808 | "source": [
809 | "next(train_dl.__iter__())"
810 | ]
811 | },
812 | {
813 | "cell_type": "markdown",
814 | "metadata": {},
815 | "source": [
816 | "## Training a Text Classifier"
817 | ]
818 | },
819 | {
820 | "cell_type": "code",
821 | "execution_count": 43,
822 | "metadata": {},
823 | "outputs": [],
824 | "source": [
825 | "class SimpleLSTMBaseline(nn.Module):\n",
826 | " def __init__(self, hidden_dim, emb_dim=300,\n",
827 | " spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=2):\n",
828 | " super().__init__() # don't forget to call this!\n",
829 | " self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)\n",
830 | " self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=num_linear, dropout=recurrent_dropout)\n",
831 | " self.linear_layers = []\n",
832 | " for _ in range(num_linear - 1):\n",
833 | " self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))\n",
834 | " self.linear_layers = nn.ModuleList(self.linear_layers)\n",
835 | " self.predictor = nn.Linear(hidden_dim, 6)\n",
836 | " \n",
837 | " def forward(self, seq):\n",
838 | " hdn, _ = self.encoder(self.embedding(seq))\n",
839 | " feature = hdn[-1, :, :]\n",
840 | " for layer in self.linear_layers:\n",
841 | " feature = layer(feature)\n",
842 | " preds = self.predictor(feature)\n",
843 | " return preds"
844 | ]
845 | },
846 | {
847 | "cell_type": "markdown",
848 | "metadata": {},
849 | "source": [
850 | "### Initializing the Model"
851 | ]
852 | },
853 | {
854 | "cell_type": "code",
855 | "execution_count": 44,
856 | "metadata": {},
857 | "outputs": [
858 | {
859 | "name": "stdout",
860 | "output_type": "stream",
861 | "text": [
862 | "SimpleLSTMBaseline(\n",
863 | " (embedding): Embedding(784, 300)\n",
864 | " (encoder): LSTM(300, 500, num_layers=2, dropout=0.1)\n",
865 | " (linear_layers): ModuleList(\n",
866 | " (0): Linear(in_features=500, out_features=500, bias=True)\n",
867 | " )\n",
868 | " (predictor): Linear(in_features=500, out_features=6, bias=True)\n",
869 | ")\n"
870 | ]
871 | }
872 | ],
873 | "source": [
874 | "em_sz = 300\n",
875 | "nh = 500\n",
876 | "model = SimpleLSTMBaseline(nh, emb_dim=em_sz)\n",
877 | "print(model)"
878 | ]
879 | },
880 | {
881 | "cell_type": "code",
882 | "execution_count": 45,
883 | "metadata": {},
884 | "outputs": [
885 | {
886 | "name": "stdout",
887 | "output_type": "stream",
888 | "text": [
889 | "4.096706 million parameters\n"
890 | ]
891 | }
892 | ],
893 | "source": [
894 | "def model_size(model: torch.nn)->int:\n",
895 | " \"\"\"\n",
896 | " Calculates the number of trainable parameters in any model\n",
897 | " \n",
898 | " Returns:\n",
899 | " params (int): the total count of all model weights\n",
900 | " \"\"\"\n",
901 | " model_parameters = filter(lambda p: p.requires_grad, model.parameters())\n",
902 | "# model_parameters = model.parameters()\n",
903 | " params = sum([np.prod(p.size()) for p in model_parameters])\n",
904 | " return params\n",
905 | "\n",
906 | "print(f'{model_size(model)/10**6} million parameters')"
907 | ]
908 | },
909 | {
910 | "cell_type": "code",
911 | "execution_count": 46,
912 | "metadata": {},
913 | "outputs": [],
914 | "source": [
915 | "if use_gpu:\n",
916 | " model = model.cuda()"
917 | ]
918 | },
919 | {
920 | "cell_type": "markdown",
921 | "metadata": {},
922 | "source": [
923 | "**Putting together the pieces again**:"
924 | ]
925 | },
926 | {
927 | "cell_type": "code",
928 | "execution_count": 47,
929 | "metadata": {},
930 | "outputs": [],
931 | "source": [
932 | "from torch import optim\n",
933 | "opt = optim.Adam(model.parameters(), lr=1e-2)\n",
934 | "loss_func = nn.BCEWithLogitsLoss().cuda()\n",
935 | "epochs = 3"
936 | ]
937 | },
938 | {
939 | "cell_type": "markdown",
940 | "metadata": {},
941 | "source": [
942 | "## Training Loop"
943 | ]
944 | },
945 | {
946 | "cell_type": "code",
947 | "execution_count": 48,
948 | "metadata": {},
949 | "outputs": [
950 | {
951 | "name": "stderr",
952 | "output_type": "stream",
953 | "text": [
954 | "100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.34it/s]\n"
955 | ]
956 | },
957 | {
958 | "name": "stdout",
959 | "output_type": "stream",
960 | "text": [
961 | "Epoch: 1, Training Loss: 13.5037, Validation Loss: 4.6498\n"
962 | ]
963 | },
964 | {
965 | "name": "stderr",
966 | "output_type": "stream",
967 | "text": [
968 | "100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 4.58it/s]\n"
969 | ]
970 | },
971 | {
972 | "name": "stdout",
973 | "output_type": "stream",
974 | "text": [
975 | "Epoch: 2, Training Loss: 7.8243, Validation Loss: 24.5401\n"
976 | ]
977 | },
978 | {
979 | "name": "stderr",
980 | "output_type": "stream",
981 | "text": [
982 | "100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3.35it/s]\n"
983 | ]
984 | },
985 | {
986 | "name": "stdout",
987 | "output_type": "stream",
988 | "text": [
989 | "Epoch: 3, Training Loss: 57.4577, Validation Loss: 4.0107\n"
990 | ]
991 | }
992 | ],
993 | "source": [
994 | "from tqdm import tqdm\n",
995 | "for epoch in range(1, epochs + 1):\n",
996 | " running_loss = 0.0\n",
997 | " running_corrects = 0\n",
998 | " model.train() # turn on training mode\n",
999 | " for x, y in tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!\n",
1000 | " opt.zero_grad()\n",
1001 | " preds = model(x)\n",
1002 | " loss = loss_func(preds, y)\n",
1003 | " loss.backward()\n",
1004 | " opt.step()\n",
1005 | " \n",
1006 | " running_loss += loss.item() * x.size(0)\n",
1007 | " \n",
1008 | " epoch_loss = running_loss / len(trn)\n",
1009 | " \n",
1010 | " # calculate the validation loss for this epoch\n",
1011 | " val_loss = 0.0\n",
1012 | " model.eval() # turn on evaluation mode\n",
1013 | " for x, y in valid_dl:\n",
1014 | " preds = model(x)\n",
1015 | " loss = loss_func(preds, y)\n",
1016 | " val_loss += loss.item() * x.size(0)\n",
1017 | "\n",
1018 | " val_loss /= len(vld)\n",
1019 | " print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))"
1020 | ]
1021 | },
1022 | {
1023 | "cell_type": "markdown",
1024 | "metadata": {},
1025 | "source": [
1026 | "## Prediction Mode"
1027 | ]
1028 | },
1029 | {
1030 | "cell_type": "code",
1031 | "execution_count": 49,
1032 | "metadata": {},
1033 | "outputs": [
1034 | {
1035 | "name": "stderr",
1036 | "output_type": "stream",
1037 | "text": [
1038 | "100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 9.64it/s]\n"
1039 | ]
1040 | }
1041 | ],
1042 | "source": [
1043 | "test_preds = []\n",
1044 | "for x, y in tqdm(test_dl):\n",
1045 | " preds = model(x)\n",
1046 | " # if your data is on the GPU, you need to move it back to the CPU\n",
1047 | " # preds = preds.data.cpu().numpy()\n",
1048 | " preds = preds.data.cpu().numpy()\n",
1049 | " # the actual outputs of the model are logits, so we need to pass these values to the sigmoid function\n",
1050 | " preds = 1 / (1 + np.exp(-preds))\n",
1051 | " test_preds.append(preds)\n",
1052 | "test_preds = np.hstack(test_preds)"
1053 | ]
1054 | },
1055 | {
1056 | "cell_type": "markdown",
1057 | "metadata": {},
1058 | "source": [
1059 | "### Convert predictions to a pandas dataframe\n",
1060 | "\n",
1061 | "This helps us convert the predictions to a more interpretable format. Let's insert the predictions into the correct columns and then preview a few rows of the dataframe: "
1062 | ]
1063 | },
1064 | {
1065 | "cell_type": "code",
1066 | "execution_count": 50,
1067 | "metadata": {},
1068 | "outputs": [],
1069 | "source": [
1070 | "test_df = pd.read_csv(\"data/test.csv\")\n",
1071 | "for i, col in enumerate([\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]):\n",
1072 | " test_df[col] = test_preds[:, i]"
1073 | ]
1074 | },
1075 | {
1076 | "cell_type": "code",
1077 | "execution_count": 51,
1078 | "metadata": {},
1079 | "outputs": [
1080 | {
1081 | "data": {
1082 | "text/html": [
1083 | "\n",
1084 | "\n",
1097 | "
\n",
1098 | " \n",
1099 | " \n",
1100 | " | \n",
1101 | " id | \n",
1102 | " comment_text | \n",
1103 | " toxic | \n",
1104 | " severe_toxic | \n",
1105 | " obscene | \n",
1106 | " threat | \n",
1107 | " insult | \n",
1108 | " identity_hate | \n",
1109 | "
\n",
1110 | " \n",
1111 | " \n",
1112 | " \n",
1113 | " | 0 | \n",
1114 | " 00001cee341fdb12 | \n",
1115 | " Yo bitch Ja Rule is more succesful then you'll... | \n",
1116 | " 0.629146 | \n",
1117 | " 0.116721 | \n",
1118 | " 0.438606 | \n",
1119 | " 0.156848 | \n",
1120 | " 0.139696 | \n",
1121 | " 0.388736 | \n",
1122 | "
\n",
1123 | " \n",
1124 | " | 1 | \n",
1125 | " 0000247867823ef7 | \n",
1126 | " == From RfC == \\r\\n\\r\\n The title is fine as i... | \n",
1127 | " 0.629146 | \n",
1128 | " 0.116721 | \n",
1129 | " 0.438606 | \n",
1130 | " 0.156848 | \n",
1131 | " 0.139696 | \n",
1132 | " 0.388736 | \n",
1133 | "
\n",
1134 | " \n",
1135 | " | 2 | \n",
1136 | " 00013b17ad220c46 | \n",
1137 | " \" \\r\\n\\r\\n == Sources == \\r\\n\\r\\n * Zawe Ashto... | \n",
1138 | " 0.629146 | \n",
1139 | " 0.116721 | \n",
1140 | " 0.438606 | \n",
1141 | " 0.156848 | \n",
1142 | " 0.139696 | \n",
1143 | " 0.388736 | \n",
1144 | "
\n",
1145 | " \n",
1146 | "
\n",
1147 | "
"
1148 | ],
1149 | "text/plain": [
1150 | " id comment_text \\\n",
1151 | "0 00001cee341fdb12 Yo bitch Ja Rule is more succesful then you'll... \n",
1152 | "1 0000247867823ef7 == From RfC == \\r\\n\\r\\n The title is fine as i... \n",
1153 | "2 00013b17ad220c46 \" \\r\\n\\r\\n == Sources == \\r\\n\\r\\n * Zawe Ashto... \n",
1154 | "\n",
1155 | " toxic severe_toxic obscene threat insult identity_hate \n",
1156 | "0 0.629146 0.116721 0.438606 0.156848 0.139696 0.388736 \n",
1157 | "1 0.629146 0.116721 0.438606 0.156848 0.139696 0.388736 \n",
1158 | "2 0.629146 0.116721 0.438606 0.156848 0.139696 0.388736 "
1159 | ]
1160 | },
1161 | "execution_count": 51,
1162 | "metadata": {},
1163 | "output_type": "execute_result"
1164 | }
1165 | ],
1166 | "source": [
1167 | "test_df.head(3)"
1168 | ]
1169 | }
1170 | ],
1171 | "metadata": {
1172 | "kernelspec": {
1173 | "display_name": "Python [conda env:nlp]",
1174 | "language": "python",
1175 | "name": "conda-env-nlp-py"
1176 | },
1177 | "language_info": {
1178 | "codemirror_mode": {
1179 | "name": "ipython",
1180 | "version": 3
1181 | },
1182 | "file_extension": ".py",
1183 | "mimetype": "text/x-python",
1184 | "name": "python",
1185 | "nbconvert_exporter": "python",
1186 | "pygments_lexer": "ipython3",
1187 | "version": "3.6.6"
1188 | }
1189 | },
1190 | "nbformat": 4,
1191 | "nbformat_minor": 2
1192 | }
1193 |
--------------------------------------------------------------------------------