├── .gitignore
├── 1 - Neural Bag of Words.ipynb
├── 2 - Recurrent Neural Networks.ipynb
├── 3 - Convolutional Neural Networks.ipynb
├── 4 - Transformers.ipynb
├── LICENSE
├── README.md
├── assets
│ ├── nbow_model.png
│ └── nbow_model.xml
├── legacy
│ ├── 1 - Simple Sentiment Analysis.ipynb
│ ├── 2 - Upgraded Sentiment Analysis.ipynb
│ ├── 3 - Faster Sentiment Analysis.ipynb
│ ├── 4 - Convolutional Sentiment Analysis.ipynb
│ ├── 5 - Multi-class Sentiment Analysis.ipynb
│ ├── 6 - Transformers for Sentiment Analysis.ipynb
│ ├── A - Using TorchText with Your Own Datasets.ipynb
│ ├── B - A Closer Look at Word Embeddings.ipynb
│ ├── C - Loading, Saving and Freezing Embeddings.ipynb
│ ├── assets
│ │ ├── nbow_model.png
│ │ ├── nbow_model.xml
│ │ ├── padding.png
│ │ ├── padding.xml
│ │ ├── sentiment1.png
│ │ ├── sentiment1.xml
│ │ ├── sentiment10.png
│ │ ├── sentiment10.xml
│ │ ├── sentiment11.png
│ │ ├── sentiment11.xml
│ │ ├── sentiment12.png
│ │ ├── sentiment12.xml
│ │ ├── sentiment13.png
│ │ ├── sentiment13.xml
│ │ ├── sentiment14.png
│ │ ├── sentiment14.xml
│ │ ├── sentiment15.png
│ │ ├── sentiment15.xml
│ │ ├── sentiment2.png
│ │ ├── sentiment2.xml
│ │ ├── sentiment3.png
│ │ ├── sentiment3.xml
│ │ ├── sentiment4.png
│ │ ├── sentiment4.xml
│ │ ├── sentiment5.png
│ │ ├── sentiment5.xml
│ │ ├── sentiment6.png
│ │ ├── sentiment6.xml
│ │ ├── sentiment7.png
│ │ ├── sentiment7.xml
│ │ ├── sentiment8.png
│ │ ├── sentiment8.xml
│ │ ├── sentiment9.png
│ │ ├── sentiment9.xml
│ │ ├── vocabulary.png
│ │ └── vocabulary.xml
│ ├── custom_embeddings
│ │ └── embeddings.txt
│ └── data
│   ├── test.csv
│   ├── test.json
│   ├── test.tsv
│   ├── train.csv
│   ├── train.json
│   ├── train.tsv
│   ├── valid.csv
│   ├── valid.json
│   └── valid.tsv
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
103 | #data
104 | .data/*
105 | .vector_cache/*
106 | saves/*
107 | *.pt
108 | .vscode/
109 | custom_embeddings/trained_embeddings.*
110 | experimental/.data/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Ben Trevett
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyTorch Sentiment Analysis
2 |
3 | This repo contains tutorials covering understanding and implementing sequence classification models using [PyTorch](https://github.com/pytorch/pytorch), with Python 3.9. Specifically, we'll train models to predict sentiment from movie reviews.
4 |
5 | **If you find any mistakes or disagree with any of the explanations, please do not hesitate to [submit an issue](https://github.com/bentrevett/pytorch-sentiment-analysis/issues/new). I welcome any feedback, positive or negative!**
6 |
7 | ## Getting Started
8 |
9 | Install the required dependencies with: `pip install -r requirements.txt --upgrade`.
10 |
11 | ## Tutorials
12 |
13 | - 1 - [Neural Bag of Words](https://github.com/bentrevett/pytorch-sentiment-analysis/blob/main/1%20-%20Neural%20Bag%20of%20Words.ipynb) [](https://colab.research.google.com/github/bentrevett/pytorch-sentiment-analysis/blob/main/1%20-%20Neural%20Bag%20of%20Words.ipynb)
14 |
15 | This tutorial covers the workflow of a sequence classification project with PyTorch. We'll cover the basics of sequence classification using a simple, but effective, neural bag-of-words model, and how to use the datasets/torchtext libraries to simplify data loading/preprocessing.
16 |
17 | - 2 - [Recurrent Neural Networks](https://github.com/bentrevett/pytorch-sentiment-analysis/blob/main/2%20-%20Recurrent%20Neural%20Networks.ipynb) [](https://colab.research.google.com/github/bentrevett/pytorch-sentiment-analysis/blob/main/2%20-%20Recurrent%20Neural%20Networks.ipynb)
18 |
19 | Now that we have the basic sequence classification workflow covered, this tutorial will focus on improving our results by switching to a recurrent neural network (RNN) model. We'll cover the theory behind RNNs, and look at an implementation of the long short-term memory (LSTM) RNN, one of the most common RNN variants.
20 |
21 | - 3 - [Convolutional Neural Networks](https://github.com/bentrevett/pytorch-sentiment-analysis/blob/main/3%20-%20Convolutional%20Neural%20Networks.ipynb) [](https://colab.research.google.com/github/bentrevett/pytorch-sentiment-analysis/blob/main/3%20-%20Convolutional%20Neural%20Networks.ipynb)
22 |
23 | Next, we'll cover convolutional neural networks (CNNs) for sentiment analysis. This model will be an implementation of [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882).
24 |
25 | - 4 - [Transformers](https://github.com/bentrevett/pytorch-sentiment-analysis/blob/main/4%20-%20Transformers.ipynb) [](https://colab.research.google.com/github/bentrevett/pytorch-sentiment-analysis/blob/main/4%20-%20Transformers.ipynb)
26 |
27 | Finally, we'll show how to use the transformers library to load a pre-trained transformer model, specifically the BERT model from [this](https://arxiv.org/abs/1810.04805) paper, and use it for sequence classification.
28 |
29 | ## Legacy Tutorials
30 |
31 | Previous versions of these tutorials used features from the torchtext library which are no longer available. These are stored in the [legacy](https://github.com/bentrevett/pytorch-sentiment-analysis/tree/main/legacy) directory.
32 |
33 | ## References
34 |
35 | Here are some things I looked at while making these tutorials. Some of it may be out of date.
36 |
37 | - http://anie.me/On-Torchtext/
38 | - http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/
39 | - https://github.com/spro/practical-pytorch
40 | - https://gist.github.com/Tushar-N/dfca335e370a2bc3bc79876e6270099e
41 | - https://gist.github.com/HarshTrivedi/f4e7293e941b17d19058f6fb90ab0fec
42 | - https://github.com/keras-team/keras/blob/master/examples/imdb_fasttext.py
43 | - https://github.com/Shawn1993/cnn-text-classification-pytorch
44 |
--------------------------------------------------------------------------------
/assets/nbow_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/assets/nbow_model.png
--------------------------------------------------------------------------------
/assets/nbow_model.xml:
--------------------------------------------------------------------------------
1 | 7VtNc5swEP0tPfjYjpGwTY6N637MpDOZSTtNTh0ZZFArkCvk2O6vrzDiU45LiRnhlFPYRQJpn97TsnJGcB7uPnC0Dj4zD9MRGHu7EXw3AsCygSX/JJ596pnBq9Thc+KpRoXjjvzGyjlW3g3xcFxpKBijgqyrTpdFEXZFxYc4Z9tqsxWj1beukY81x52LqO79RjwRpF4HzAr/R0z8IHuzNVXzC1HWWM0kDpDHtiUXXIzgnDMm0qtwN8c0CV4Wl7Tf+yfu5gPjOBJNOizQ+G34i3+//x2ugz1AX2fe3euJGpvYZxPGnpy/MhkXAfNZhOii8F5ztok8nDx1LK2izQ1ja+m0pPMHFmKvwEQbwaQrECFVd/GOiPvS9UPyqDcTZb3bqScfjH1mRILv78tGqVdiFt0OVtZvRSidM8r4YXbQQ9hZudIfC85+4tKdqevg5UreSSOShOHJQCtXzDbcxSeimy1YxH0sTrQD+XKQPMIsxHIGsh/HFAnyWB0HUgvaz9sVmMsLBfs/LAEAWqyBZ6BeIP1QBvovqBdAP2TPO4U6i4QaCLCl7W7442HFWoYAtkwirJ77iOhGvUlDvMrpbUAEvlujw9y3UtirSLbl1CPmAu9OB12PUdZhqnRUbST2WNnbkiwrV1BS5Mx39qhOB+nsUDpBQ2ZBo9IJW6yBihZdmI4aANisdIKXIJ25BvZFOmeDdHYonbAhs2yjxLIH6ewaYLPSCV+CdOZS2RfptNqkHIN2NqWW3ZBaltlP9jZlm0E8/wlhs+JpvwTxnIC+iafRcueFE8dpKo0meeNovPmkQS4XtKgiVGVBxKIEd0SJH0nTlZHC0n+dcIG4iL5VN0LieYcVcox71VVThyU/C0huxj+xcANl+BTFsbqmaInpLYuJIOzoSG5qDfIRxXIwJPK/pCt0fB46W7XPyAnU6WwfoTPojM5GS3AXTuerpvugSTpfaXQOkMADo8/EaDjuG6ONVoYunNHZkXm/a+bZKEucFgGJB06fidM26BunnYHT7Tnd9ITZaDE3G2WJ0/KDMxw4fSZOT+yecRq0ybxNFKC6pOZzM2PV9ZYR+epCv2tYwzqIqRSoXjUc82E8g8tA47KGdf9LT1O7b6Wnq2EXbE+1yyjLW3rV9tWwB55pD5xO+7YHtmH0cAhTOlxpUn1yTBIa6HntWffCleNi9+heuHQm9uRcX4S1Kk9OE1N7IdSj2E/mdMmApscp0Ozv3/QDlQvMBusMsIBpBujCEmE/xbOrhGHJhGDh/5MwaIUww/kC1H8QtQiX2PPkzKX7Bu1lsFqCX2ZVBwlkBf8y5JW1cFwtn5f01b7iTOd8UM/wbxmjBwQH5E4gB2amkZtoyN2QCCM+UK8RgFZ38inN4r8g0zJW8b+kcPEH
--------------------------------------------------------------------------------
/legacy/1 - Simple Sentiment Analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 1 - Simple Sentiment Analysis\n",
8 | "\n",
9 | "In this series we'll be building a machine learning model to detect sentiment (i.e. detect if a sentence is positive or negative) using PyTorch and TorchText. This will be done on movie reviews, using the [IMDb dataset](http://ai.stanford.edu/~amaas/data/sentiment/).\n",
10 | "\n",
11 | "In this first notebook, we'll start very simple to understand the general concepts whilst not really caring about good results. Further notebooks will build on this knowledge and we'll actually get good results.\n",
12 | "\n",
13 | "### Introduction\n",
14 | "\n",
15 | "We'll be using a **recurrent neural network** (RNN) as they are commonly used in analysing sequences. An RNN takes in sequence of words, $X=\\{x_1, ..., x_T\\}$, one at a time, and produces a _hidden state_, $h$, for each word. We use the RNN _recurrently_ by feeding in the current word $x_t$ as well as the hidden state from the previous word, $h_{t-1}$, to produce the next hidden state, $h_t$. \n",
16 | "\n",
17 | "$$h_t = \\text{RNN}(x_t, h_{t-1})$$\n",
18 | "\n",
19 | "Once we have our final hidden state, $h_T$, (from feeding in the last word in the sequence, $x_T$) we feed it through a linear layer, $f$, (also known as a fully connected layer), to receive our predicted sentiment, $\\hat{y} = f(h_T)$.\n",
20 | "\n",
21 | "Below shows an example sentence, with the RNN predicting zero, which indicates a negative sentiment. The RNN is shown in orange and the linear layer shown in silver. Note that we use the same RNN for every word, i.e. it has the same parameters. The initial hidden state, $h_0$, is a tensor initialized to all zeros. \n",
22 | "\n",
23 | "\n",
24 | "\n",
25 | "**Note:** some layers and steps have been omitted from the diagram, but these will be explained later."
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "## Preparing Data\n",
33 | "\n",
34 | "One of the main concepts of TorchText is the `Field`. These define how your data should be processed. In our sentiment classification task the data consists of both the raw string of the review and the sentiment, either \"pos\" or \"neg\".\n",
35 | "\n",
36 | "The parameters of a `Field` specify how the data should be processed. \n",
37 | "\n",
38 | "We use the `TEXT` field to define how the review should be processed, and the `LABEL` field to process the sentiment. \n",
39 | "\n",
40 | "Our `TEXT` field has `tokenize='spacy'` as an argument. This defines that the \"tokenization\" (the act of splitting the string into discrete \"tokens\") should be done using the [spaCy](https://spacy.io) tokenizer. If no `tokenize` argument is passed, the default is simply splitting the string on spaces. We also need to specify a `tokenizer_language` which tells torchtext which spaCy model to use. We use the `en_core_web_sm` model which has to be downloaded with `python -m spacy download en_core_web_sm` before you run this notebook!\n",
41 | "\n",
42 | "`LABEL` is defined by a `LabelField`, a special subset of the `Field` class specifically used for handling labels. We will explain the `dtype` argument later.\n",
43 | "\n",
44 | "For more on `Fields`, go [here](https://github.com/pytorch/text/blob/master/torchtext/data/field.py).\n",
45 | "\n",
46 | "We also set the random seeds for reproducibility. "
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 1,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "import torch\n",
56 | "from torchtext.legacy import data\n",
57 | "\n",
58 | "SEED = 1234\n",
59 | "\n",
60 | "torch.manual_seed(SEED)\n",
61 | "torch.backends.cudnn.deterministic = True\n",
62 | "\n",
63 | "TEXT = data.Field(tokenize = 'spacy',\n",
64 | " tokenizer_language = 'en_core_web_sm')\n",
65 | "LABEL = data.LabelField(dtype = torch.float)"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "Another handy feature of TorchText is that it has support for common datasets used in natural language processing (NLP). \n",
73 | "\n",
74 | "The following code automatically downloads the IMDb dataset and splits it into the canonical train/test splits as `torchtext.datasets` objects. It process the data using the `Fields` we have previously defined. The IMDb dataset consists of 50,000 movie reviews, each marked as being a positive or negative review."
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 2,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "from torchtext.legacy import datasets\n",
84 | "\n",
85 | "train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "We can see how many examples are in each split by checking their length."
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 3,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "name": "stdout",
102 | "output_type": "stream",
103 | "text": [
104 | "Number of training examples: 25000\n",
105 | "Number of testing examples: 25000\n"
106 | ]
107 | }
108 | ],
109 | "source": [
110 | "print(f'Number of training examples: {len(train_data)}')\n",
111 | "print(f'Number of testing examples: {len(test_data)}')"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "We can also check an example."
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 4,
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "name": "stdout",
128 | "output_type": "stream",
129 | "text": [
130 | "{'text': ['elvira', 'mistress', 'of', 'the', 'dark', 'is', 'one', 'of', 'my', 'fav', 'movies', ',', 'it', 'has', 'every', 'thing', 'you', 'would', 'want', 'in', 'a', 'film', ',', 'like', 'great', 'one', 'liners', ',', 'sexy', 'star', 'and', 'a', 'Outrageous', 'story', '!', 'if', 'you', 'have', 'not', 'seen', 'it', ',', 'you', 'are', 'missing', 'out', 'on', 'one', 'of', 'the', 'greatest', 'films', 'made', '.', 'i', 'ca', \"n't\", 'wait', 'till', 'her', 'new', 'movie', 'comes', 'out', '!'], 'label': 'pos'}\n"
131 | ]
132 | }
133 | ],
134 | "source": [
135 | "print(vars(train_data.examples[0]))"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "The IMDb dataset only has train/test splits, so we need to create a validation set. We can do this with the `.split()` method. \n",
143 | "\n",
144 | "By default this splits 70/30, however by passing a `split_ratio` argument, we can change the ratio of the split, i.e. a `split_ratio` of 0.8 would mean 80% of the examples make up the training set and 20% make up the validation set. \n",
145 | "\n",
146 | "We also pass our random seed to the `random_state` argument, ensuring that we get the same train/validation split each time."
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 5,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "import random\n",
156 | "\n",
157 | "train_data, valid_data = train_data.split(random_state = random.seed(SEED))"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {},
163 | "source": [
164 | "Again, we'll view how many examples are in each split."
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 6,
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "name": "stdout",
174 | "output_type": "stream",
175 | "text": [
176 | "Number of training examples: 17500\n",
177 | "Number of validation examples: 7500\n",
178 | "Number of testing examples: 25000\n"
179 | ]
180 | }
181 | ],
182 | "source": [
183 | "print(f'Number of training examples: {len(train_data)}')\n",
184 | "print(f'Number of validation examples: {len(valid_data)}')\n",
185 | "print(f'Number of testing examples: {len(test_data)}')"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "Next, we have to build a _vocabulary_. This is a effectively a look up table where every unique word in your data set has a corresponding _index_ (an integer).\n",
193 | "\n",
194 | "We do this as our machine learning model cannot operate on strings, only numbers. Each _index_ is used to construct a _one-hot_ vector for each word. A one-hot vector is a vector where all of the elements are 0, except one, which is 1, and dimensionality is the total number of unique words in your vocabulary, commonly denoted by $V$.\n",
195 | "\n",
196 | "\n",
197 | "\n",
198 | "The number of unique words in our training set is over 100,000, which means that our one-hot vectors will have over 100,000 dimensions! This will make training slow and possibly won't fit onto your GPU (if you're using one). \n",
199 | "\n",
200 | "There are two ways effectively cut down our vocabulary, we can either only take the top $n$ most common words or ignore words that appear less than $m$ times. We'll do the former, only keeping the top 25,000 words.\n",
201 | "\n",
202 | "What do we do with words that appear in examples but we have cut from the vocabulary? We replace them with a special _unknown_ or `` token. For example, if the sentence was \"This film is great and I love it\" but the word \"love\" was not in the vocabulary, it would become \"This film is great and I `` it\".\n",
203 | "\n",
204 | "The following builds the vocabulary, only keeping the most common `max_size` tokens."
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 7,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "MAX_VOCAB_SIZE = 25_000\n",
214 | "\n",
215 | "TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)\n",
216 | "LABEL.build_vocab(train_data)"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "Why do we only build the vocabulary on the training set? When testing any machine learning system you do not want to look at the test set in any way. We do not include the validation set as we want it to reflect the test set as much as possible."
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 8,
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "name": "stdout",
233 | "output_type": "stream",
234 | "text": [
235 | "Unique tokens in TEXT vocabulary: 25002\n",
236 | "Unique tokens in LABEL vocabulary: 2\n"
237 | ]
238 | }
239 | ],
240 | "source": [
241 | "print(f\"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}\")\n",
242 | "print(f\"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}\")"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {},
248 | "source": [
249 | "Why is the vocab size 25002 and not 25000? One of the addition tokens is the `` token and the other is a `` token.\n",
250 | "\n",
251 | "When we feed sentences into our model, we feed a _batch_ of them at a time, i.e. more than one at a time, and all sentences in the batch need to be the same size. Thus, to ensure each sentence in the batch is the same size, any shorter than the longest within the batch are padded.\n",
252 | "\n",
253 | "\n",
254 | "\n",
255 | "We can also view the most common words in the vocabulary and their frequencies."
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 9,
261 | "metadata": {},
262 | "outputs": [
263 | {
264 | "name": "stdout",
265 | "output_type": "stream",
266 | "text": [
267 | "[('the', 202789), (',', 192769), ('.', 165632), ('and', 109469), ('a', 109242), ('of', 100791), ('to', 93641), ('is', 76253), ('in', 61374), ('I', 54030), ('it', 53487), ('that', 49111), ('\"', 44657), (\"'s\", 43331), ('this', 42385), ('-', 36979), ('/>
291 | "['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']\n"
292 | ]
293 | }
294 | ],
295 | "source": [
296 | "print(TEXT.vocab.itos[:10])"
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "metadata": {},
302 | "source": [
303 | "We can also check the labels, ensuring 0 is for negative and 1 is for positive."
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 11,
309 | "metadata": {},
310 | "outputs": [
311 | {
312 | "name": "stdout",
313 | "output_type": "stream",
314 | "text": [
315 | "defaultdict(, {'neg': 0, 'pos': 1})\n"
316 | ]
317 | }
318 | ],
319 | "source": [
320 | "print(LABEL.vocab.stoi)"
321 | ]
322 | },
323 | {
324 | "cell_type": "markdown",
325 | "metadata": {},
326 | "source": [
327 | "The final step of preparing the data is creating the iterators. We iterate over these in the training/evaluation loop, and they return a batch of examples (indexed and converted into tensors) at each iteration.\n",
328 | "\n",
329 | "We'll use a `BucketIterator` which is a special type of iterator that will return a batch of examples where each example is of a similar length, minimizing the amount of padding per example.\n",
330 | "\n",
331 | "We also want to place the tensors returned by the iterator on the GPU (if you're using one). PyTorch handles this using `torch.device`, we then pass this device to the iterator."
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": 12,
337 | "metadata": {},
338 | "outputs": [],
339 | "source": [
340 | "BATCH_SIZE = 64\n",
341 | "\n",
342 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
343 | "\n",
344 | "train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(\n",
345 | " (train_data, valid_data, test_data), \n",
346 | " batch_size = BATCH_SIZE,\n",
347 | " device = device)"
348 | ]
349 | },
350 | {
351 | "cell_type": "markdown",
352 | "metadata": {},
353 | "source": [
354 | "## Build the Model\n",
355 | "\n",
356 | "The next stage is building the model that we'll eventually train and evaluate. \n",
357 | "\n",
358 | "There is a small amount of boilerplate code when creating models in PyTorch, note how our `RNN` class is a sub-class of `nn.Module` and the use of `super`.\n",
359 | "\n",
360 | "Within the `__init__` we define the _layers_ of the module. Our three layers are an _embedding_ layer, our RNN, and a _linear_ layer. All layers have their parameters initialized to random values, unless explicitly specified.\n",
361 | "\n",
362 | "The embedding layer is used to transform our sparse one-hot vector (sparse as most of the elements are 0) into a dense embedding vector (dense as the dimensionality is a lot smaller and all the elements are real numbers). This embedding layer is simply a single fully connected layer. As well as reducing the dimensionality of the input to the RNN, there is the theory that words which have similar impact on the sentiment of the review are mapped close together in this dense vector space. For more information about word embeddings, see [here](https://monkeylearn.com/blog/word-embeddings-transform-text-numbers/).\n",
363 | "\n",
364 | "The RNN layer is our RNN which takes in our dense vector and the previous hidden state $h_{t-1}$, which it uses to calculate the next hidden state, $h_t$.\n",
365 | "\n",
366 | "\n",
367 | "\n",
368 | "Finally, the linear layer takes the final hidden state and feeds it through a fully connected layer, $f(h_T)$, transforming it to the correct output dimension.\n",
369 | "\n",
370 | "The `forward` method is called when we feed examples into our model.\n",
371 | "\n",
372 | "Each batch, `text`, is a tensor of size _**[sentence length, batch size]**_. That is a batch of sentences, each having each word converted into a one-hot vector. \n",
373 | "\n",
374 | "You may notice that this tensor should have another dimension due to the one-hot vectors, however PyTorch conveniently stores a one-hot vector as it's index value, i.e. the tensor representing a sentence is just a tensor of the indexes for each token in that sentence. The act of converting a list of tokens into a list of indexes is commonly called *numericalizing*.\n",
375 | "\n",
376 | "The input batch is then passed through the embedding layer to get `embedded`, which gives us a dense vector representation of our sentences. `embedded` is a tensor of size _**[sentence length, batch size, embedding dim]**_.\n",
377 | "\n",
378 | "`embedded` is then fed into the RNN. In some frameworks you must feed the initial hidden state, $h_0$, into the RNN, however in PyTorch, if no initial hidden state is passed as an argument it defaults to a tensor of all zeros.\n",
379 | "\n",
380 | "The RNN returns 2 tensors, `output` of size _**[sentence length, batch size, hidden dim]**_ and `hidden` of size _**[1, batch size, hidden dim]**_. `output` is the concatenation of the hidden state from every time step, whereas `hidden` is simply the final hidden state. We verify this using the `assert` statement. Note the `squeeze` method, which is used to remove a dimension of size 1. \n",
381 | "\n",
382 | "Finally, we feed the last hidden state, `hidden`, through the linear layer, `fc`, to produce a prediction."
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": 13,
388 | "metadata": {},
389 | "outputs": [],
390 | "source": [
391 | "import torch.nn as nn\n",
392 | "\n",
393 | "class RNN(nn.Module):\n",
394 | " def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):\n",
395 | " \n",
396 | " super().__init__()\n",
397 | " \n",
398 | " self.embedding = nn.Embedding(input_dim, embedding_dim)\n",
399 | " \n",
400 | " self.rnn = nn.RNN(embedding_dim, hidden_dim)\n",
401 | " \n",
402 | " self.fc = nn.Linear(hidden_dim, output_dim)\n",
403 | " \n",
404 | " def forward(self, text):\n",
405 | "\n",
406 | " #text = [sent len, batch size]\n",
407 | " \n",
408 | " embedded = self.embedding(text)\n",
409 | " \n",
410 | " #embedded = [sent len, batch size, emb dim]\n",
411 | " \n",
412 | " output, hidden = self.rnn(embedded)\n",
413 | " \n",
414 | " #output = [sent len, batch size, hid dim]\n",
415 | " #hidden = [1, batch size, hid dim]\n",
416 | " \n",
417 | " assert torch.equal(output[-1,:,:], hidden.squeeze(0))\n",
418 | " \n",
419 | " return self.fc(hidden.squeeze(0))"
420 | ]
421 | },
422 | {
423 | "cell_type": "markdown",
424 | "metadata": {},
425 | "source": [
426 | "We now create an instance of our RNN class. \n",
427 | "\n",
428 | "The input dimension is the dimension of the one-hot vectors, which is equal to the vocabulary size. \n",
429 | "\n",
430 | "The embedding dimension is the size of the dense word vectors. This is usually around 50-250 dimensions, but depends on the size of the vocabulary.\n",
431 | "\n",
432 | "The hidden dimension is the size of the hidden states. This is usually around 100-500 dimensions, but also depends on factors such as on the vocabulary size, the size of the dense vectors and the complexity of the task.\n",
433 | "\n",
434 | "The output dimension is usually the number of classes, however in the case of only 2 classes the output value is between 0 and 1 and thus can be 1-dimensional, i.e. a single scalar real number."
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 14,
440 | "metadata": {},
441 | "outputs": [],
442 | "source": [
443 | "INPUT_DIM = len(TEXT.vocab)\n",
444 | "EMBEDDING_DIM = 100\n",
445 | "HIDDEN_DIM = 256\n",
446 | "OUTPUT_DIM = 1\n",
447 | "\n",
448 | "model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)"
449 | ]
450 | },
451 | {
452 | "cell_type": "markdown",
453 | "metadata": {},
454 | "source": [
455 | "Let's also create a function that will tell us how many trainable parameters our model has so we can compare the number of parameters across different models."
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "execution_count": 15,
461 | "metadata": {},
462 | "outputs": [
463 | {
464 | "name": "stdout",
465 | "output_type": "stream",
466 | "text": [
467 | "The model has 2,592,105 trainable parameters\n"
468 | ]
469 | }
470 | ],
471 | "source": [
472 | "def count_parameters(model):\n",
473 | " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
474 | "\n",
475 | "print(f'The model has {count_parameters(model):,} trainable parameters')"
476 | ]
477 | },
478 | {
479 | "cell_type": "markdown",
480 | "metadata": {},
481 | "source": [
482 | "## Train the Model"
483 | ]
484 | },
485 | {
486 | "cell_type": "markdown",
487 | "metadata": {},
488 | "source": [
489 | "Now we'll set up the training and then train the model.\n",
490 | "\n",
491 | "First, we'll create an optimizer. This is the algorithm we use to update the parameters of the module. Here, we'll use _stochastic gradient descent_ (SGD). The first argument is the parameters will be updated by the optimizer, the second is the learning rate, i.e. how much we'll change the parameters by when we do a parameter update."
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "execution_count": 16,
497 | "metadata": {},
498 | "outputs": [],
499 | "source": [
500 | "import torch.optim as optim\n",
501 | "\n",
502 | "optimizer = optim.SGD(model.parameters(), lr=1e-3)"
503 | ]
504 | },
505 | {
506 | "cell_type": "markdown",
507 | "metadata": {},
508 | "source": [
509 | "Next, we'll define our loss function. In PyTorch this is commonly called a criterion. \n",
510 | "\n",
511 | "The loss function here is _binary cross entropy with logits_. \n",
512 | "\n",
513 | "Our model currently outputs an unbound real number. As our labels are either 0 or 1, we want to restrict the predictions to a number between 0 and 1. We do this using the _sigmoid_ or _logit_ functions. \n",
514 | "\n",
515 | "We then use this this bound scalar to calculate the loss using binary cross entropy. \n",
516 | "\n",
517 | "The `BCEWithLogitsLoss` criterion carries out both the sigmoid and the binary cross entropy steps."
518 | ]
519 | },
520 | {
521 | "cell_type": "code",
522 | "execution_count": 17,
523 | "metadata": {},
524 | "outputs": [],
525 | "source": [
526 | "criterion = nn.BCEWithLogitsLoss()"
527 | ]
528 | },
529 | {
530 | "cell_type": "markdown",
531 | "metadata": {},
532 | "source": [
533 | "Using `.to`, we can place the model and the criterion on the GPU (if we have one). "
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 18,
539 | "metadata": {},
540 | "outputs": [],
541 | "source": [
542 | "model = model.to(device)\n",
543 | "criterion = criterion.to(device)"
544 | ]
545 | },
546 | {
547 | "cell_type": "markdown",
548 | "metadata": {},
549 | "source": [
550 | "Our criterion function calculates the loss, however we have to write our function to calculate the accuracy. \n",
551 | "\n",
552 | "This function first feeds the predictions through a sigmoid layer, squashing the values between 0 and 1, we then round them to the nearest integer. This rounds any value greater than 0.5 to 1 (a positive sentiment) and the rest to 0 (a negative sentiment).\n",
553 | "\n",
554 | "We then calculate how many rounded predictions equal the actual labels and average it across the batch."
555 | ]
556 | },
557 | {
558 | "cell_type": "code",
559 | "execution_count": 19,
560 | "metadata": {},
561 | "outputs": [],
562 | "source": [
563 | "def binary_accuracy(preds, y):\n",
564 | " \"\"\"\n",
565 | " Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8\n",
566 | " \"\"\"\n",
567 | "\n",
568 | " #round predictions to the closest integer\n",
569 | " rounded_preds = torch.round(torch.sigmoid(preds))\n",
570 | " correct = (rounded_preds == y).float() #convert into float for division \n",
571 | " acc = correct.sum() / len(correct)\n",
572 | " return acc"
573 | ]
574 | },
575 | {
576 | "cell_type": "markdown",
577 | "metadata": {},
578 | "source": [
579 | "The `train` function iterates over all examples, one batch at a time. \n",
580 | "\n",
581 | "`model.train()` is used to put the model in \"training mode\", which turns on _dropout_ and _batch normalization_. Although we aren't using them in this model, it's good practice to include it.\n",
582 | "\n",
583 | "For each batch, we first zero the gradients. Each parameter in a model has a `grad` attribute which stores the gradient calculated by the `criterion`. PyTorch does not automatically remove (or \"zero\") the gradients calculated from the last gradient calculation, so they must be manually zeroed.\n",
584 | "\n",
585 | "We then feed the batch of sentences, `batch.text`, into the model. Note, you do not need to do `model.forward(batch.text)`, simply calling the model works. The `squeeze` is needed as the predictions are initially size _**[batch size, 1]**_, and we need to remove the dimension of size 1 as PyTorch expects the predictions input to our criterion function to be of size _**[batch size]**_.\n",
586 | "\n",
587 | "The loss and accuracy are then calculated using our predictions and the labels, `batch.label`, with the loss being averaged over all examples in the batch.\n",
588 | "\n",
589 | "We calculate the gradient of each parameter with `loss.backward()`, and then update the parameters using the gradients and optimizer algorithm with `optimizer.step()`.\n",
590 | "\n",
591 | "The loss and accuracy is accumulated across the epoch, the `.item()` method is used to extract a scalar from a tensor which only contains a single value.\n",
592 | "\n",
593 | "Finally, we return the loss and accuracy, averaged across the epoch. The `len` of an iterator is the number of batches in the iterator.\n",
594 | "\n",
595 | "You may recall when initializing the `LABEL` field, we set `dtype=torch.float`. This is because TorchText sets tensors to be `LongTensor`s by default, however our criterion expects both inputs to be `FloatTensor`s. Setting the `dtype` to be `torch.float`, did this for us. The alternative method of doing this would be to do the conversion inside the `train` function by passing `batch.label.float()` instad of `batch.label` to the criterion. "
596 | ]
597 | },
598 | {
599 | "cell_type": "code",
600 | "execution_count": 20,
601 | "metadata": {},
602 | "outputs": [],
603 | "source": [
604 | "def train(model, iterator, optimizer, criterion):\n",
605 | " \n",
606 | " epoch_loss = 0\n",
607 | " epoch_acc = 0\n",
608 | " \n",
609 | " model.train()\n",
610 | " \n",
611 | " for batch in iterator:\n",
612 | " \n",
613 | " optimizer.zero_grad()\n",
614 | " \n",
615 | " predictions = model(batch.text).squeeze(1)\n",
616 | " \n",
617 | " loss = criterion(predictions, batch.label)\n",
618 | " \n",
619 | " acc = binary_accuracy(predictions, batch.label)\n",
620 | " \n",
621 | " loss.backward()\n",
622 | " \n",
623 | " optimizer.step()\n",
624 | " \n",
625 | " epoch_loss += loss.item()\n",
626 | " epoch_acc += acc.item()\n",
627 | " \n",
628 | " return epoch_loss / len(iterator), epoch_acc / len(iterator)"
629 | ]
630 | },
631 | {
632 | "cell_type": "markdown",
633 | "metadata": {},
634 | "source": [
635 | "`evaluate` is similar to `train`, with a few modifications as you don't want to update the parameters when evaluating.\n",
636 | "\n",
637 | "`model.eval()` puts the model in \"evaluation mode\", this turns off _dropout_ and _batch normalization_. Again, we are not using them in this model, but it is good practice to include them.\n",
638 | "\n",
639 | "No gradients are calculated on PyTorch operations inside the `with no_grad()` block. This causes less memory to be used and speeds up computation.\n",
640 | "\n",
641 | "The rest of the function is the same as `train`, with the removal of `optimizer.zero_grad()`, `loss.backward()` and `optimizer.step()`, as we do not update the model's parameters when evaluating."
642 | ]
643 | },
644 | {
645 | "cell_type": "code",
646 | "execution_count": 21,
647 | "metadata": {},
648 | "outputs": [],
649 | "source": [
650 | "def evaluate(model, iterator, criterion):\n",
651 | " \n",
652 | " epoch_loss = 0\n",
653 | " epoch_acc = 0\n",
654 | " \n",
655 | " model.eval()\n",
656 | " \n",
657 | " with torch.no_grad():\n",
658 | " \n",
659 | " for batch in iterator:\n",
660 | "\n",
661 | " predictions = model(batch.text).squeeze(1)\n",
662 | " \n",
663 | " loss = criterion(predictions, batch.label)\n",
664 | " \n",
665 | " acc = binary_accuracy(predictions, batch.label)\n",
666 | "\n",
667 | " epoch_loss += loss.item()\n",
668 | " epoch_acc += acc.item()\n",
669 | " \n",
670 | " return epoch_loss / len(iterator), epoch_acc / len(iterator)"
671 | ]
672 | },
673 | {
674 | "cell_type": "markdown",
675 | "metadata": {},
676 | "source": [
677 | "We'll also create a function to tell us how long an epoch takes to compare training times between models."
678 | ]
679 | },
680 | {
681 | "cell_type": "code",
682 | "execution_count": 22,
683 | "metadata": {},
684 | "outputs": [],
685 | "source": [
686 | "import time\n",
687 | "\n",
688 | "def epoch_time(start_time, end_time):\n",
689 | " elapsed_time = end_time - start_time\n",
690 | " elapsed_mins = int(elapsed_time / 60)\n",
691 | " elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n",
692 | " return elapsed_mins, elapsed_secs"
693 | ]
694 | },
695 | {
696 | "cell_type": "markdown",
697 | "metadata": {},
698 | "source": [
699 | "We then train the model through multiple epochs, an epoch being a complete pass through all examples in the training and validation sets.\n",
700 | "\n",
701 | "At each epoch, if the validation loss is the best we have seen so far, we'll save the parameters of the model and then after training has finished we'll use that model on the test set."
702 | ]
703 | },
704 | {
705 | "cell_type": "code",
706 | "execution_count": 23,
707 | "metadata": {},
708 | "outputs": [
709 | {
710 | "name": "stdout",
711 | "output_type": "stream",
712 | "text": [
713 | "Epoch: 01 | Epoch Time: 0m 17s\n",
714 | "\tTrain Loss: 0.694 | Train Acc: 50.12%\n",
715 | "\t Val. Loss: 0.696 | Val. Acc: 50.17%\n",
716 | "Epoch: 02 | Epoch Time: 0m 16s\n",
717 | "\tTrain Loss: 0.693 | Train Acc: 49.72%\n",
718 | "\t Val. Loss: 0.696 | Val. Acc: 51.01%\n",
719 | "Epoch: 03 | Epoch Time: 0m 16s\n",
720 | "\tTrain Loss: 0.693 | Train Acc: 50.22%\n",
721 | "\t Val. Loss: 0.696 | Val. Acc: 50.87%\n",
722 | "Epoch: 04 | Epoch Time: 0m 16s\n",
723 | "\tTrain Loss: 0.693 | Train Acc: 49.94%\n",
724 | "\t Val. Loss: 0.696 | Val. Acc: 49.91%\n",
725 | "Epoch: 05 | Epoch Time: 0m 17s\n",
726 | "\tTrain Loss: 0.693 | Train Acc: 50.07%\n",
727 | "\t Val. Loss: 0.696 | Val. Acc: 51.00%\n"
728 | ]
729 | }
730 | ],
731 | "source": [
732 | "N_EPOCHS = 5\n",
733 | "\n",
734 | "best_valid_loss = float('inf')\n",
735 | "\n",
736 | "for epoch in range(N_EPOCHS):\n",
737 | "\n",
738 | " start_time = time.time()\n",
739 | " \n",
740 | " train_loss, train_acc = train(model, train_iterator, optimizer, criterion)\n",
741 | " valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)\n",
742 | " \n",
743 | " end_time = time.time()\n",
744 | "\n",
745 | " epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n",
746 | " \n",
747 | " if valid_loss < best_valid_loss:\n",
748 | " best_valid_loss = valid_loss\n",
749 | " torch.save(model.state_dict(), 'tut1-model.pt')\n",
750 | " \n",
751 | " print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')\n",
752 | " print(f'\\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')\n",
753 | " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')"
754 | ]
755 | },
756 | {
757 | "cell_type": "markdown",
758 | "metadata": {},
759 | "source": [
760 | "You may have noticed the loss is not really decreasing and the accuracy is poor. This is due to several issues with the model which we'll improve in the next notebook.\n",
761 | "\n",
762 | "Finally, the metric we actually care about, the test loss and accuracy, which we get from our parameters that gave us the best validation loss."
763 | ]
764 | },
765 | {
766 | "cell_type": "code",
767 | "execution_count": 24,
768 | "metadata": {},
769 | "outputs": [
770 | {
771 | "name": "stdout",
772 | "output_type": "stream",
773 | "text": [
774 | "Test Loss: 0.708 | Test Acc: 47.87%\n"
775 | ]
776 | }
777 | ],
778 | "source": [
779 | "model.load_state_dict(torch.load('tut1-model.pt'))\n",
780 | "\n",
781 | "test_loss, test_acc = evaluate(model, test_iterator, criterion)\n",
782 | "\n",
783 | "print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')"
784 | ]
785 | },
786 | {
787 | "cell_type": "markdown",
788 | "metadata": {},
789 | "source": [
790 | "## Next Steps\n",
791 | "\n",
792 | "In the next notebook, the improvements we will make are:\n",
793 | "- packed padded sequences\n",
794 | "- pre-trained word embeddings\n",
795 | "- different RNN architecture\n",
796 | "- bidirectional RNN\n",
797 | "- multi-layer RNN\n",
798 | "- regularization\n",
799 | "- a different optimizer\n",
800 | "\n",
801 | "This will allow us to achieve ~84% accuracy."
802 | ]
803 | }
804 | ],
805 | "metadata": {
806 | "kernelspec": {
807 | "display_name": "Python 3 (ipykernel)",
808 | "language": "python",
809 | "name": "python3"
810 | },
811 | "language_info": {
812 | "codemirror_mode": {
813 | "name": "ipython",
814 | "version": 3
815 | },
816 | "file_extension": ".py",
817 | "mimetype": "text/x-python",
818 | "name": "python",
819 | "nbconvert_exporter": "python",
820 | "pygments_lexer": "ipython3",
821 | "version": "3.9.12"
822 | }
823 | },
824 | "nbformat": 4,
825 | "nbformat_minor": 2
826 | }
827 |
--------------------------------------------------------------------------------
/legacy/3 - Faster Sentiment Analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 3 - Faster Sentiment Analysis\n",
8 | "\n",
9 | "In the previous notebook we managed to achieve a decent test accuracy of ~84% using all of the common techniques used for sentiment analysis. In this notebook, we'll implement a model that gets comparable results whilst training significantly faster and using around half of the parameters. More specifically, we'll be implementing the \"FastText\" model from the paper [Bag of Tricks for Efficient Text Classification](https://arxiv.org/abs/1607.01759)."
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "## Preparing Data\n",
17 | "\n",
18 | "One of the key concepts in the FastText paper is that they calculate the n-grams of an input sentence and append them to the end of a sentence. Here, we'll use bi-grams. Briefly, a bi-gram is a pair of words/tokens that appear consecutively within a sentence. \n",
19 | "\n",
20 | "For example, in the sentence \"how are you ?\", the bi-grams are: \"how are\", \"are you\" and \"you ?\".\n",
21 | "\n",
22 | "The `generate_bigrams` function takes a sentence that has already been tokenized, calculates the bi-grams and appends them to the end of the tokenized list."
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 1,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "def generate_bigrams(x):\n",
32 | " n_grams = set(zip(*[x[i:] for i in range(2)]))\n",
33 | " for n_gram in n_grams:\n",
34 | " x.append(' '.join(n_gram))\n",
35 | " return x"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "As an example:"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "data": {
52 | "text/plain": [
53 | "['This', 'film', 'is', 'terrible', 'film is', 'This film', 'is terrible']"
54 | ]
55 | },
56 | "execution_count": 2,
57 | "metadata": {},
58 | "output_type": "execute_result"
59 | }
60 | ],
61 | "source": [
62 | "generate_bigrams(['This', 'film', 'is', 'terrible'])"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "TorchText `Field`s have a `preprocessing` argument. A function passed here will be applied to a sentence after it has been tokenized (transformed from a string into a list of tokens), but before it has been numericalized (transformed from a list of tokens to a list of indexes). This is where we'll pass our `generate_bigrams` function.\n",
70 | "\n",
71 | "As we aren't using an RNN we can't use packed padded sequences, thus we do not need to set `include_lengths = True`."
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 3,
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "name": "stderr",
81 | "output_type": "stream",
82 | "text": [
83 | "/home/ben/miniconda3/envs/pytorch17/lib/python3.8/site-packages/torchtext-0.9.0a0+c38fd42-py3.8-linux-x86_64.egg/torchtext/data/field.py:150: UserWarning: Field class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.\n",
84 | " warnings.warn('{} class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.'.format(self.__class__.__name__), UserWarning)\n",
85 | "/home/ben/miniconda3/envs/pytorch17/lib/python3.8/site-packages/torchtext-0.9.0a0+c38fd42-py3.8-linux-x86_64.egg/torchtext/data/field.py:150: UserWarning: LabelField class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.\n",
86 | " warnings.warn('{} class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.'.format(self.__class__.__name__), UserWarning)\n"
87 | ]
88 | }
89 | ],
90 | "source": [
91 | "import torch\n",
92 | "from torchtext.legacy import data\n",
93 | "from torchtext.legacy import datasets\n",
94 | "\n",
95 | "SEED = 1234\n",
96 | "\n",
97 | "torch.manual_seed(SEED)\n",
98 | "torch.backends.cudnn.deterministic = True\n",
99 | "\n",
100 | "TEXT = data.Field(tokenize = 'spacy',\n",
101 | " tokenizer_language = 'en_core_web_sm',\n",
102 | " preprocessing = generate_bigrams)\n",
103 | "\n",
104 | "LABEL = data.LabelField(dtype = torch.float)"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "As before, we load the IMDb dataset and create the splits."
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 4,
117 | "metadata": {},
118 | "outputs": [
119 | {
120 | "name": "stderr",
121 | "output_type": "stream",
122 | "text": [
123 | "/home/ben/miniconda3/envs/pytorch17/lib/python3.8/site-packages/torchtext-0.9.0a0+c38fd42-py3.8-linux-x86_64.egg/torchtext/data/example.py:78: UserWarning: Example class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.\n",
124 | " warnings.warn('Example class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.', UserWarning)\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "import random\n",
130 | "\n",
131 | "train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)\n",
132 | "\n",
133 | "train_data, valid_data = train_data.split(random_state = random.seed(SEED))"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "Build the vocab and load the pre-trained word embeddings."
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 5,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "MAX_VOCAB_SIZE = 25_000\n",
150 | "\n",
151 | "TEXT.build_vocab(train_data, \n",
152 | " max_size = MAX_VOCAB_SIZE, \n",
153 | " vectors = \"glove.6B.100d\", \n",
154 | " unk_init = torch.Tensor.normal_)\n",
155 | "\n",
156 | "LABEL.build_vocab(train_data)"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {},
162 | "source": [
163 | "And create the iterators."
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 6,
169 | "metadata": {},
170 | "outputs": [
171 | {
172 | "name": "stderr",
173 | "output_type": "stream",
174 | "text": [
175 | "/home/ben/miniconda3/envs/pytorch17/lib/python3.8/site-packages/torchtext-0.9.0a0+c38fd42-py3.8-linux-x86_64.egg/torchtext/data/iterator.py:48: UserWarning: BucketIterator class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.\n",
176 | " warnings.warn('{} class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.'.format(self.__class__.__name__), UserWarning)\n"
177 | ]
178 | }
179 | ],
180 | "source": [
181 | "BATCH_SIZE = 64\n",
182 | "\n",
183 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
184 | "\n",
185 | "train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(\n",
186 | " (train_data, valid_data, test_data), \n",
187 | " batch_size = BATCH_SIZE, \n",
188 | " device = device)"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "## Build the Model\n",
196 | "\n",
197 | "This model has far fewer parameters than the previous model as it only has 2 layers that have any parameters, the embedding layer and the linear layer. There is no RNN component in sight!\n",
198 | "\n",
199 | "Instead, it first calculates the word embedding for each word using the `Embedding` layer (blue), then calculates the average of all of the word embeddings (pink) and feeds this through the `Linear` layer (silver), and that's it!\n",
200 | "\n",
201 | "\n",
202 | "\n",
203 | "We implement the averaging with the `avg_pool2d` (average pool 2-dimensions) function. Initially, you may think using a 2-dimensional pooling seems strange, surely our sentences are 1-dimensional, not 2-dimensional? However, you can think of the word embeddings as a 2-dimensional grid, where the words are along one axis and the dimensions of the word embeddings are along the other. The image below is an example sentence after being converted into 5-dimensional word embeddings, with the words along the vertical axis and the embeddings along the horizontal axis. Each element in this [4x5] tensor is represented by a green block.\n",
204 | "\n",
205 | "\n",
206 | "\n",
207 | "The `avg_pool2d` uses a filter of size `embedded.shape[1]` (i.e. the length of the sentence) by 1. This is shown in pink in the image below.\n",
208 | "\n",
209 | "\n",
210 | "\n",
211 | "We calculate the average value of all elements covered by the filter, then the filter then slides to the right, calculating the average over the next column of embedding values for each word in the sentence. \n",
212 | "\n",
213 | "\n",
214 | "\n",
215 | "Each filter position gives us a single value, the average of all covered elements. After the filter has covered all embedding dimensions we get a [1x5] tensor. This tensor is then passed through the linear layer to produce our prediction."
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 7,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "import torch.nn as nn\n",
225 | "import torch.nn.functional as F\n",
226 | "\n",
227 | "class FastText(nn.Module):\n",
228 | " def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):\n",
229 | " \n",
230 | " super().__init__()\n",
231 | " \n",
232 | " self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)\n",
233 | " \n",
234 | " self.fc = nn.Linear(embedding_dim, output_dim)\n",
235 | " \n",
236 | " def forward(self, text):\n",
237 | " \n",
238 | " #text = [sent len, batch size]\n",
239 | " \n",
240 | " embedded = self.embedding(text)\n",
241 | " \n",
242 | " #embedded = [sent len, batch size, emb dim]\n",
243 | " \n",
244 | " embedded = embedded.permute(1, 0, 2)\n",
245 | " \n",
246 | " #embedded = [batch size, sent len, emb dim]\n",
247 | " \n",
248 | " pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1) \n",
249 | " \n",
250 | " #pooled = [batch size, embedding_dim]\n",
251 | " \n",
252 | " return self.fc(pooled)"
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {},
258 | "source": [
259 | "As previously, we'll create an instance of our `FastText` class."
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": 8,
265 | "metadata": {},
266 | "outputs": [],
267 | "source": [
268 | "INPUT_DIM = len(TEXT.vocab)\n",
269 | "EMBEDDING_DIM = 100\n",
270 | "OUTPUT_DIM = 1\n",
271 | "PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]\n",
272 | "\n",
273 | "model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)"
274 | ]
275 | },
276 | {
277 | "cell_type": "markdown",
278 | "metadata": {},
279 | "source": [
280 | "Looking at the number of parameters in our model, we see we have about the same as the standard RNN from the first notebook and half the parameters of the previous model."
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": 9,
286 | "metadata": {},
287 | "outputs": [
288 | {
289 | "name": "stdout",
290 | "output_type": "stream",
291 | "text": [
292 | "The model has 2,500,301 trainable parameters\n"
293 | ]
294 | }
295 | ],
296 | "source": [
297 | "def count_parameters(model):\n",
298 | " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
299 | "\n",
300 | "print(f'The model has {count_parameters(model):,} trainable parameters')"
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {},
306 | "source": [
307 | "And copy the pre-trained vectors to our embedding layer."
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 10,
313 | "metadata": {},
314 | "outputs": [
315 | {
316 | "data": {
317 | "text/plain": [
318 | "tensor([[-0.1117, -0.4966, 0.1631, ..., 1.2647, -0.2753, -0.1325],\n",
319 | " [-0.8555, -0.7208, 1.3755, ..., 0.0825, -1.1314, 0.3997],\n",
320 | " [-0.0382, -0.2449, 0.7281, ..., -0.1459, 0.8278, 0.2706],\n",
321 | " ...,\n",
322 | " [-0.1606, -0.7357, 0.5809, ..., 0.8704, -1.5637, -1.5724],\n",
323 | " [-1.3126, -1.6717, 0.4203, ..., 0.2348, -0.9110, 1.0914],\n",
324 | " [-1.5268, 1.5639, -1.0541, ..., 1.0045, -0.6813, -0.8846]])"
325 | ]
326 | },
327 | "execution_count": 10,
328 | "metadata": {},
329 | "output_type": "execute_result"
330 | }
331 | ],
332 | "source": [
333 | "pretrained_embeddings = TEXT.vocab.vectors\n",
334 | "\n",
335 | "model.embedding.weight.data.copy_(pretrained_embeddings)"
336 | ]
337 | },
338 | {
339 | "cell_type": "markdown",
340 | "metadata": {},
341 | "source": [
342 | "Not forgetting to zero the initial weights of our unknown and padding tokens."
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": 11,
348 | "metadata": {},
349 | "outputs": [],
350 | "source": [
351 | "UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]\n",
352 | "\n",
353 | "model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)\n",
354 | "model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)"
355 | ]
356 | },
357 | {
358 | "cell_type": "markdown",
359 | "metadata": {},
360 | "source": [
361 | "## Train the Model"
362 | ]
363 | },
364 | {
365 | "cell_type": "markdown",
366 | "metadata": {},
367 | "source": [
368 | "Training the model is the exact same as last time.\n",
369 | "\n",
370 | "We initialize our optimizer..."
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 12,
376 | "metadata": {},
377 | "outputs": [],
378 | "source": [
379 | "import torch.optim as optim\n",
380 | "\n",
381 | "optimizer = optim.Adam(model.parameters())"
382 | ]
383 | },
384 | {
385 | "cell_type": "markdown",
386 | "metadata": {},
387 | "source": [
388 | "We define the criterion and place the model and criterion on the GPU (if available)..."
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 13,
394 | "metadata": {},
395 | "outputs": [],
396 | "source": [
397 | "criterion = nn.BCEWithLogitsLoss()\n",
398 | "\n",
399 | "model = model.to(device)\n",
400 | "criterion = criterion.to(device)"
401 | ]
402 | },
403 | {
404 | "cell_type": "markdown",
405 | "metadata": {},
406 | "source": [
407 | "We implement the function to calculate accuracy..."
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 14,
413 | "metadata": {},
414 | "outputs": [],
415 | "source": [
416 | "def binary_accuracy(preds, y):\n",
417 | " \"\"\"\n",
418 | " Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8\n",
419 | " \"\"\"\n",
420 | "\n",
421 | " #round predictions to the closest integer\n",
422 | " rounded_preds = torch.round(torch.sigmoid(preds))\n",
423 | " correct = (rounded_preds == y).float() #convert into float for division \n",
424 | " acc = correct.sum() / len(correct)\n",
425 | " return acc"
426 | ]
427 | },
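{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check on a hypothetical batch of raw logits (not taken from the dataset):\n",
"\n",
"```python\n",
"preds = torch.tensor([2.5, -1.0, 0.3, -0.7]) #pre-sigmoid model outputs\n",
"y = torch.tensor([1., 0., 0., 0.])\n",
"binary_accuracy(preds, y) #tensor(0.7500), as sigmoid then round gives [1, 0, 1, 0]\n",
"```"
]
},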
428 | {
429 | "cell_type": "markdown",
430 | "metadata": {},
431 | "source": [
432 | "We define a function for training our model...\n",
433 | "\n",
434 | "**Note**: we are no longer using dropout so we do not need to use `model.train()`, but as mentioned in the 1st notebook, it is good practice to use it."
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 15,
440 | "metadata": {},
441 | "outputs": [],
442 | "source": [
443 | "def train(model, iterator, optimizer, criterion):\n",
444 | " \n",
445 | " epoch_loss = 0\n",
446 | " epoch_acc = 0\n",
447 | " \n",
448 | " model.train()\n",
449 | " \n",
450 | " for batch in iterator:\n",
451 | " \n",
452 | " optimizer.zero_grad()\n",
453 | " \n",
454 | " predictions = model(batch.text).squeeze(1)\n",
455 | " \n",
456 | " loss = criterion(predictions, batch.label)\n",
457 | " \n",
458 | " acc = binary_accuracy(predictions, batch.label)\n",
459 | " \n",
460 | " loss.backward()\n",
461 | " \n",
462 | " optimizer.step()\n",
463 | " \n",
464 | " epoch_loss += loss.item()\n",
465 | " epoch_acc += acc.item()\n",
466 | " \n",
467 | " return epoch_loss / len(iterator), epoch_acc / len(iterator)"
468 | ]
469 | },
470 | {
471 | "cell_type": "markdown",
472 | "metadata": {},
473 | "source": [
474 | "We define a function for testing our model...\n",
475 | "\n",
476 | "**Note**: again, we leave `model.eval()` even though we do not use dropout."
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": 16,
482 | "metadata": {},
483 | "outputs": [],
484 | "source": [
485 | "def evaluate(model, iterator, criterion):\n",
486 | " \n",
487 | " epoch_loss = 0\n",
488 | " epoch_acc = 0\n",
489 | " \n",
490 | " model.eval()\n",
491 | " \n",
492 | " with torch.no_grad():\n",
493 | " \n",
494 | " for batch in iterator:\n",
495 | "\n",
496 | " predictions = model(batch.text).squeeze(1)\n",
497 | " \n",
498 | " loss = criterion(predictions, batch.label)\n",
499 | " \n",
500 | " acc = binary_accuracy(predictions, batch.label)\n",
501 | "\n",
502 | " epoch_loss += loss.item()\n",
503 | " epoch_acc += acc.item()\n",
504 | " \n",
505 | " return epoch_loss / len(iterator), epoch_acc / len(iterator)"
506 | ]
507 | },
508 | {
509 | "cell_type": "markdown",
510 | "metadata": {},
511 | "source": [
512 | "As before, we'll implement a useful function to tell us how long an epoch takes."
513 | ]
514 | },
515 | {
516 | "cell_type": "code",
517 | "execution_count": 17,
518 | "metadata": {},
519 | "outputs": [],
520 | "source": [
521 | "import time\n",
522 | "\n",
523 | "def epoch_time(start_time, end_time):\n",
524 | " elapsed_time = end_time - start_time\n",
525 | " elapsed_mins = int(elapsed_time / 60)\n",
526 | " elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n",
527 | " return elapsed_mins, elapsed_secs"
528 | ]
529 | },
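{
"cell_type": "markdown",
"metadata": {},
"source": [
"For example, `epoch_time(0, 127.5)` returns `(2, 7)`, i.e. 2 minutes and 7 seconds."
]
},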
530 | {
531 | "cell_type": "markdown",
532 | "metadata": {},
533 | "source": [
534 | "Finally, we train our model."
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": 18,
540 | "metadata": {},
541 | "outputs": [
542 | {
543 | "name": "stderr",
544 | "output_type": "stream",
545 | "text": [
546 | "/home/ben/miniconda3/envs/pytorch17/lib/python3.8/site-packages/torchtext-0.9.0a0+c38fd42-py3.8-linux-x86_64.egg/torchtext/data/batch.py:23: UserWarning: Batch class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.\n",
547 | " warnings.warn('{} class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.'.format(self.__class__.__name__), UserWarning)\n"
548 | ]
549 | },
550 | {
551 | "name": "stdout",
552 | "output_type": "stream",
553 | "text": [
554 | "Epoch: 01 | Epoch Time: 0m 7s\n",
555 | "\tTrain Loss: 0.688 | Train Acc: 61.31%\n",
556 | "\t Val. Loss: 0.637 | Val. Acc: 72.46%\n",
557 | "Epoch: 02 | Epoch Time: 0m 6s\n",
558 | "\tTrain Loss: 0.651 | Train Acc: 75.04%\n",
559 | "\t Val. Loss: 0.507 | Val. Acc: 76.92%\n",
560 | "Epoch: 03 | Epoch Time: 0m 6s\n",
561 | "\tTrain Loss: 0.578 | Train Acc: 79.91%\n",
562 | "\t Val. Loss: 0.424 | Val. Acc: 80.97%\n",
563 | "Epoch: 04 | Epoch Time: 0m 6s\n",
564 | "\tTrain Loss: 0.501 | Train Acc: 83.97%\n",
565 | "\t Val. Loss: 0.377 | Val. Acc: 84.34%\n",
566 | "Epoch: 05 | Epoch Time: 0m 6s\n",
567 | "\tTrain Loss: 0.435 | Train Acc: 86.96%\n",
568 | "\t Val. Loss: 0.363 | Val. Acc: 86.18%\n"
569 | ]
570 | }
571 | ],
572 | "source": [
573 | "N_EPOCHS = 5\n",
574 | "\n",
575 | "best_valid_loss = float('inf')\n",
576 | "\n",
577 | "for epoch in range(N_EPOCHS):\n",
578 | "\n",
579 | " start_time = time.time()\n",
580 | " \n",
581 | " train_loss, train_acc = train(model, train_iterator, optimizer, criterion)\n",
582 | " valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)\n",
583 | " \n",
584 | " end_time = time.time()\n",
585 | "\n",
586 | " epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n",
587 | " \n",
588 | " if valid_loss < best_valid_loss:\n",
589 | " best_valid_loss = valid_loss\n",
590 | " torch.save(model.state_dict(), 'tut3-model.pt')\n",
591 | " \n",
592 | " print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')\n",
593 | " print(f'\\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')\n",
594 | " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')"
595 | ]
596 | },
597 | {
598 | "cell_type": "markdown",
599 | "metadata": {},
600 | "source": [
601 | "...and get the test accuracy!\n",
602 | "\n",
603 | "The results are comparable to the results in the last notebook, but training takes considerably less time!"
604 | ]
605 | },
606 | {
607 | "cell_type": "code",
608 | "execution_count": 19,
609 | "metadata": {},
610 | "outputs": [
611 | {
612 | "name": "stdout",
613 | "output_type": "stream",
614 | "text": [
615 | "Test Loss: 0.381 | Test Acc: 85.42%\n"
616 | ]
617 | }
618 | ],
619 | "source": [
620 | "model.load_state_dict(torch.load('tut3-model.pt'))\n",
621 | "\n",
622 | "test_loss, test_acc = evaluate(model, test_iterator, criterion)\n",
623 | "\n",
624 | "print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')"
625 | ]
626 | },
627 | {
628 | "cell_type": "markdown",
629 | "metadata": {},
630 | "source": [
631 | "## User Input\n",
632 | "\n",
633 | "And as before, we can test on any input the user provides making sure to generate bigrams from our tokenized sentence."
634 | ]
635 | },
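{
"cell_type": "markdown",
"metadata": {},
"source": [
"Recall that `generate_bigrams`, defined earlier in this notebook, appends each bigram to the token list as a single space-joined token, so (assuming that implementation) we would expect something like:\n",
"\n",
"```python\n",
"generate_bigrams(['This', 'film', 'is', 'terrible'])\n",
"#['This', 'film', 'is', 'terrible', 'film is', 'This film', 'is terrible']\n",
"#the order of the appended bigrams may vary, as they are built from a set\n",
"```"
]
},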
636 | {
637 | "cell_type": "code",
638 | "execution_count": 20,
639 | "metadata": {},
640 | "outputs": [],
641 | "source": [
642 | "import spacy\n",
643 | "nlp = spacy.load('en_core_web_sm')\n",
644 | "\n",
645 | "def predict_sentiment(model, sentence):\n",
646 | " model.eval()\n",
647 | " tokenized = generate_bigrams([tok.text for tok in nlp.tokenizer(sentence)])\n",
648 | " indexed = [TEXT.vocab.stoi[t] for t in tokenized]\n",
649 | " tensor = torch.LongTensor(indexed).to(device)\n",
650 | " tensor = tensor.unsqueeze(1)\n",
651 | " prediction = torch.sigmoid(model(tensor))\n",
652 | " return prediction.item()"
653 | ]
654 | },
655 | {
656 | "cell_type": "markdown",
657 | "metadata": {},
658 | "source": [
659 | "An example negative review..."
660 | ]
661 | },
662 | {
663 | "cell_type": "code",
664 | "execution_count": 21,
665 | "metadata": {},
666 | "outputs": [
667 | {
668 | "data": {
669 | "text/plain": [
670 | "2.1313092350011553e-12"
671 | ]
672 | },
673 | "execution_count": 21,
674 | "metadata": {},
675 | "output_type": "execute_result"
676 | }
677 | ],
678 | "source": [
679 | "predict_sentiment(model, \"This film is terrible\")"
680 | ]
681 | },
682 | {
683 | "cell_type": "markdown",
684 | "metadata": {},
685 | "source": [
686 | "An example positive review..."
687 | ]
688 | },
689 | {
690 | "cell_type": "code",
691 | "execution_count": 22,
692 | "metadata": {},
693 | "outputs": [
694 | {
695 | "data": {
696 | "text/plain": [
697 | "1.0"
698 | ]
699 | },
700 | "execution_count": 22,
701 | "metadata": {},
702 | "output_type": "execute_result"
703 | }
704 | ],
705 | "source": [
706 | "predict_sentiment(model, \"This film is great\")"
707 | ]
708 | },
709 | {
710 | "cell_type": "markdown",
711 | "metadata": {},
712 | "source": [
713 | "## Next Steps\n",
714 | "\n",
715 | "In the next notebook we'll use convolutional neural networks (CNNs) to perform sentiment analysis."
716 | ]
717 | }
718 | ],
719 | "metadata": {
720 | "kernelspec": {
721 | "display_name": "Python 3",
722 | "language": "python",
723 | "name": "python3"
724 | },
725 | "language_info": {
726 | "codemirror_mode": {
727 | "name": "ipython",
728 | "version": 3
729 | },
730 | "file_extension": ".py",
731 | "mimetype": "text/x-python",
732 | "name": "python",
733 | "nbconvert_exporter": "python",
734 | "pygments_lexer": "ipython3",
735 | "version": "3.9.5"
736 | }
737 | },
738 | "nbformat": 4,
739 | "nbformat_minor": 2
740 | }
741 |
--------------------------------------------------------------------------------
/legacy/5 - Multi-class Sentiment Analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 5 - Multi-class Sentiment Analysis\n",
8 | "\n",
9 | "In all of the previous notebooks we have performed sentiment analysis on a dataset with only two classes, positive or negative. When we have only two classes our output can be a single scalar, bound between 0 and 1, that indicates what class an example belongs to. When we have more than 2 examples, our output must be a $C$ dimensional vector, where $C$ is the number of classes.\n",
10 | "\n",
11 | "In this notebook, we'll be performing classification on a dataset with 6 classes. Note that this dataset isn't actually a sentiment analysis dataset, it's a dataset of questions and the task is to classify what category the question belongs to. However, everything covered in this notebook applies to any dataset with examples that contain an input sequence belonging to one of $C$ classes.\n",
12 | "\n",
13 | "Below, we setup the fields, and load the dataset. \n",
14 | "\n",
15 | "The first difference is that we do not need to set the `dtype` in the `LABEL` field. When doing a mutli-class problem, PyTorch expects the labels to be numericalized `LongTensor`s. \n",
16 | "\n",
17 | "The second different is that we use `TREC` instead of `IMDB` to load the `TREC` dataset. The `fine_grained` argument allows us to use the fine-grained labels (of which there are 50 classes) or not (in which case they'll be 6 classes). You can change this how you please."
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 1,
23 | "metadata": {},
24 | "outputs": [
25 | {
26 | "name": "stderr",
27 | "output_type": "stream",
28 | "text": [
29 | "/home/ben/miniconda3/envs/pytorch17/lib/python3.8/site-packages/torchtext-0.9.0a0+c38fd42-py3.8-linux-x86_64.egg/torchtext/data/field.py:150: UserWarning: Field class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.\n",
30 | " warnings.warn('{} class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.'.format(self.__class__.__name__), UserWarning)\n",
31 | "/home/ben/miniconda3/envs/pytorch17/lib/python3.8/site-packages/torchtext-0.9.0a0+c38fd42-py3.8-linux-x86_64.egg/torchtext/data/field.py:150: UserWarning: LabelField class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.\n",
32 | " warnings.warn('{} class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.'.format(self.__class__.__name__), UserWarning)\n",
33 | "/home/ben/miniconda3/envs/pytorch17/lib/python3.8/site-packages/torchtext-0.9.0a0+c38fd42-py3.8-linux-x86_64.egg/torchtext/data/example.py:78: UserWarning: Example class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.\n",
34 | " warnings.warn('Example class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.', UserWarning)\n"
35 | ]
36 | }
37 | ],
38 | "source": [
39 | "import torch\n",
40 | "from torchtext.legacy import data\n",
41 | "from torchtext.legacy import datasets\n",
42 | "import random\n",
43 | "\n",
44 | "SEED = 1234\n",
45 | "\n",
46 | "torch.manual_seed(SEED)\n",
47 | "torch.backends.cudnn.deterministic = True\n",
48 | "\n",
49 | "TEXT = data.Field(tokenize = 'spacy',\n",
50 | " tokenizer_language = 'en_core_web_sm')\n",
51 | "\n",
52 | "LABEL = data.LabelField()\n",
53 | "\n",
54 | "train_data, test_data = datasets.TREC.splits(TEXT, LABEL, fine_grained=False)\n",
55 | "\n",
56 | "train_data, valid_data = train_data.split(random_state = random.seed(SEED))"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "Let's look at one of the examples in the training set."
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 2,
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/plain": [
74 | "{'text': ['What', 'is', 'a', 'Cartesian', 'Diver', '?'], 'label': 'DESC'}"
75 | ]
76 | },
77 | "execution_count": 2,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "vars(train_data[-1])"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "Next, we'll build the vocabulary. As this dataset is small (only ~3800 training examples) it also has a very small vocabulary (~7500 unique tokens), this means we do not need to set a `max_size` on the vocabulary as before."
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 3,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "MAX_VOCAB_SIZE = 25_000\n",
100 | "\n",
101 | "TEXT.build_vocab(train_data, \n",
102 | " max_size = MAX_VOCAB_SIZE, \n",
103 | " vectors = \"glove.6B.100d\", \n",
104 | " unk_init = torch.Tensor.normal_)\n",
105 | "\n",
106 | "LABEL.build_vocab(train_data)"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "Next, we can check the labels.\n",
114 | "\n",
115 | "The 6 labels (for the non-fine-grained case) correspond to the 6 types of questions in the dataset:\n",
116 | "- `HUM` for questions about humans\n",
117 | "- `ENTY` for questions about entities\n",
118 | "- `DESC` for questions asking you for a description \n",
119 | "- `NUM` for questions where the answer is numerical\n",
120 | "- `LOC` for questions where the answer is a location\n",
121 | "- `ABBR` for questions asking about abbreviations"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 4,
127 | "metadata": {},
128 | "outputs": [
129 | {
130 | "name": "stdout",
131 | "output_type": "stream",
132 | "text": [
133 | "defaultdict(None, {'HUM': 0, 'ENTY': 1, 'DESC': 2, 'NUM': 3, 'LOC': 4, 'ABBR': 5})\n"
134 | ]
135 | }
136 | ],
137 | "source": [
138 | "print(LABEL.vocab.stoi)"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "As always, we set up the iterators."
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 5,
151 | "metadata": {},
152 | "outputs": [
153 | {
154 | "name": "stderr",
155 | "output_type": "stream",
156 | "text": [
157 | "/home/ben/miniconda3/envs/pytorch17/lib/python3.8/site-packages/torchtext-0.9.0a0+c38fd42-py3.8-linux-x86_64.egg/torchtext/data/iterator.py:48: UserWarning: BucketIterator class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.\n",
158 | " warnings.warn('{} class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.'.format(self.__class__.__name__), UserWarning)\n"
159 | ]
160 | }
161 | ],
162 | "source": [
163 | "BATCH_SIZE = 64\n",
164 | "\n",
165 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
166 | "\n",
167 | "train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(\n",
168 | " (train_data, valid_data, test_data), \n",
169 | " batch_size = BATCH_SIZE, \n",
170 | " device = device)"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "We'll be using the CNN model from the previous notebook, however any of the models covered in these tutorials will work on this dataset. The only difference is now the `output_dim` will be $C$ instead of $1$."
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 6,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "import torch.nn as nn\n",
187 | "import torch.nn.functional as F\n",
188 | "\n",
189 | "class CNN(nn.Module):\n",
190 | " def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, \n",
191 | " dropout, pad_idx):\n",
192 | " \n",
193 | " super().__init__()\n",
194 | " \n",
195 | " self.embedding = nn.Embedding(vocab_size, embedding_dim)\n",
196 | " \n",
197 | " self.convs = nn.ModuleList([\n",
198 | " nn.Conv2d(in_channels = 1, \n",
199 | " out_channels = n_filters, \n",
200 | " kernel_size = (fs, embedding_dim)) \n",
201 | " for fs in filter_sizes\n",
202 | " ])\n",
203 | " \n",
204 | " self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)\n",
205 | " \n",
206 | " self.dropout = nn.Dropout(dropout)\n",
207 | " \n",
208 | " def forward(self, text):\n",
209 | " \n",
210 | " #text = [sent len, batch size]\n",
211 | " \n",
212 | " text = text.permute(1, 0)\n",
213 | " \n",
214 | " #text = [batch size, sent len]\n",
215 | " \n",
216 | " embedded = self.embedding(text)\n",
217 | " \n",
218 | " #embedded = [batch size, sent len, emb dim]\n",
219 | " \n",
220 | " embedded = embedded.unsqueeze(1)\n",
221 | " \n",
222 | " #embedded = [batch size, 1, sent len, emb dim]\n",
223 | " \n",
224 | " conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]\n",
225 | " \n",
226 | " #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]\n",
227 | " \n",
228 | " pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]\n",
229 | " \n",
230 | " #pooled_n = [batch size, n_filters]\n",
231 | " \n",
232 | " cat = self.dropout(torch.cat(pooled, dim = 1))\n",
233 | "\n",
234 | " #cat = [batch size, n_filters * len(filter_sizes)]\n",
235 | " \n",
236 | " return self.fc(cat)"
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "metadata": {},
242 | "source": [
243 | "We define our model, making sure to set `OUTPUT_DIM` to $C$. We can get $C$ easily by using the size of the `LABEL` vocab, much like we used the length of the `TEXT` vocab to get the size of the vocabulary of the input.\n",
244 | "\n",
245 | "The examples in this dataset are generally a lot smaller than those in the IMDb dataset, so we'll use smaller filter sizes."
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 7,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "INPUT_DIM = len(TEXT.vocab)\n",
255 | "EMBEDDING_DIM = 100\n",
256 | "N_FILTERS = 100\n",
257 | "FILTER_SIZES = [2,3,4]\n",
258 | "OUTPUT_DIM = len(LABEL.vocab)\n",
259 | "DROPOUT = 0.5\n",
260 | "PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]\n",
261 | "\n",
262 | "model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)"
263 | ]
264 | },
265 | {
266 | "cell_type": "markdown",
267 | "metadata": {},
268 | "source": [
269 | "Checking the number of parameters, we can see how the smaller filter sizes means we have about a third of the parameters than we did for the CNN model on the IMDb dataset."
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 8,
275 | "metadata": {},
276 | "outputs": [
277 | {
278 | "name": "stdout",
279 | "output_type": "stream",
280 | "text": [
281 | "The model has 841,806 trainable parameters\n"
282 | ]
283 | }
284 | ],
285 | "source": [
286 | "def count_parameters(model):\n",
287 | " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
288 | "\n",
289 | "print(f'The model has {count_parameters(model):,} trainable parameters')"
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "metadata": {},
295 | "source": [
296 | "Next, we'll load our pre-trained embeddings."
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 9,
302 | "metadata": {},
303 | "outputs": [
304 | {
305 | "data": {
306 | "text/plain": [
307 | "tensor([[-0.1117, -0.4966, 0.1631, ..., 1.2647, -0.2753, -0.1325],\n",
308 | " [-0.8555, -0.7208, 1.3755, ..., 0.0825, -1.1314, 0.3997],\n",
309 | " [ 0.1638, 0.6046, 1.0789, ..., -0.3140, 0.1844, 0.3624],\n",
310 | " ...,\n",
311 | " [-0.3110, -0.3398, 1.0308, ..., 0.5317, 0.2836, -0.0640],\n",
312 | " [ 0.0091, 0.2810, 0.7356, ..., -0.7508, 0.8967, -0.7631],\n",
313 | " [ 0.5831, -0.2514, 0.4156, ..., -0.2735, -0.8659, -1.4063]])"
314 | ]
315 | },
316 | "execution_count": 9,
317 | "metadata": {},
318 | "output_type": "execute_result"
319 | }
320 | ],
321 | "source": [
322 | "pretrained_embeddings = TEXT.vocab.vectors\n",
323 | "\n",
324 | "model.embedding.weight.data.copy_(pretrained_embeddings)"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {},
330 | "source": [
331 | "Then zero the initial weights of the unknown and padding tokens."
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": 10,
337 | "metadata": {},
338 | "outputs": [],
339 | "source": [
340 | "UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]\n",
341 | "\n",
342 | "model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)\n",
343 | "model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)"
344 | ]
345 | },
346 | {
347 | "cell_type": "markdown",
348 | "metadata": {},
349 | "source": [
350 | "Another different to the previous notebooks is our loss function (aka criterion). Before we used `BCEWithLogitsLoss`, however now we use `CrossEntropyLoss`. Without going into too much detail, `CrossEntropyLoss` performs a *softmax* function over our model outputs and the loss is given by the *cross entropy* between that and the label.\n",
351 | "\n",
352 | "Generally:\n",
353 | "- `CrossEntropyLoss` is used when our examples exclusively belong to one of $C$ classes\n",
354 | "- `BCEWithLogitsLoss` is used when our examples exclusively belong to only 2 classes (0 and 1) and is also used in the case where our examples belong to between 0 and $C$ classes (aka multilabel classification)."
355 | ]
356 | },
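{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a minimal sketch with made-up logits, `CrossEntropyLoss` takes raw **[batch size, n classes]** outputs and a **[batch size]** `LongTensor` of class indices:\n",
"\n",
"```python\n",
"import torch\n",
"import torch.nn as nn\n",
"\n",
"logits = torch.tensor([[5.1, 0.3, 0.1, 2.1, 0.2, 0.6],\n",
"                       [0.2, 1.7, 0.4, 0.0, 3.2, 0.1]]) #[batch size = 2, n classes = 6]\n",
"labels = torch.tensor([0, 4]) #class indices, not one-hot vectors\n",
"\n",
"loss = nn.CrossEntropyLoss()(logits, labels) #softmax, then negative log-likelihood\n",
"```"
]
},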
357 | {
358 | "cell_type": "code",
359 | "execution_count": 11,
360 | "metadata": {},
361 | "outputs": [],
362 | "source": [
363 | "import torch.optim as optim\n",
364 | "\n",
365 | "optimizer = optim.Adam(model.parameters())\n",
366 | "\n",
367 | "criterion = nn.CrossEntropyLoss()\n",
368 | "\n",
369 | "model = model.to(device)\n",
370 | "criterion = criterion.to(device)"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "Before, we had a function that calculated accuracy in the binary label case, where we said if the value was over 0.5 then we would assume it is positive. In the case where we have more than 2 classes, our model outputs a $C$ dimensional vector, where the value of each element is the beleief that the example belongs to that class. \n",
378 | "\n",
379 | "For example, in our labels we have: 'HUM' = 0, 'ENTY' = 1, 'DESC' = 2, 'NUM' = 3, 'LOC' = 4 and 'ABBR' = 5. If the output of our model was something like: **[5.1, 0.3, 0.1, 2.1, 0.2, 0.6]** this means that the model strongly believes the example belongs to class 0, a question about a human, and slightly believes the example belongs to class 3, a numerical question.\n",
380 | "\n",
381 | "We calculate the accuracy by performing an `argmax` to get the index of the maximum value in the prediction for each element in the batch, and then counting how many times this equals the actual label. We then average this across the batch."
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": 12,
387 | "metadata": {},
388 | "outputs": [],
389 | "source": [
390 | "def categorical_accuracy(preds, y):\n",
391 | " \"\"\"\n",
392 | " Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8\n",
393 | " \"\"\"\n",
394 | " top_pred = preds.argmax(1, keepdim = True)\n",
395 | " correct = top_pred.eq(y.view_as(top_pred)).sum()\n",
396 | " acc = correct.float() / y.shape[0]\n",
397 | " return acc"
398 | ]
399 | },
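{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check on a hypothetical batch of two predictions:\n",
"\n",
"```python\n",
"preds = torch.tensor([[5.1, 0.3, 0.1, 2.1, 0.2, 0.6],\n",
"                      [0.2, 0.1, 3.3, 0.0, 0.4, 0.1]])\n",
"y = torch.tensor([0, 3])\n",
"categorical_accuracy(preds, y) #tensor(0.5000), only the first argmax matches\n",
"```"
]
},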
400 | {
401 | "cell_type": "markdown",
402 | "metadata": {},
403 | "source": [
404 | "The training loop is similar to before, without the need to `squeeze` the model predictions as `CrossEntropyLoss` expects the input to be **[batch size, n classes]** and the label to be **[batch size]**.\n",
405 | "\n",
406 | "The label needs to be a `LongTensor`, which it is by default as we did not set the `dtype` to a `FloatTensor` as before."
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": 13,
412 | "metadata": {},
413 | "outputs": [],
414 | "source": [
415 | "def train(model, iterator, optimizer, criterion):\n",
416 | " \n",
417 | " epoch_loss = 0\n",
418 | " epoch_acc = 0\n",
419 | " \n",
420 | " model.train()\n",
421 | " \n",
422 | " for batch in iterator:\n",
423 | " \n",
424 | " optimizer.zero_grad()\n",
425 | " \n",
426 | " predictions = model(batch.text)\n",
427 | " \n",
428 | " loss = criterion(predictions, batch.label)\n",
429 | " \n",
430 | " acc = categorical_accuracy(predictions, batch.label)\n",
431 | " \n",
432 | " loss.backward()\n",
433 | " \n",
434 | " optimizer.step()\n",
435 | " \n",
436 | " epoch_loss += loss.item()\n",
437 | " epoch_acc += acc.item()\n",
438 | " \n",
439 | " return epoch_loss / len(iterator), epoch_acc / len(iterator)"
440 | ]
441 | },
442 | {
443 | "cell_type": "markdown",
444 | "metadata": {},
445 | "source": [
446 | "The evaluation loop is, again, similar to before."
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": 14,
452 | "metadata": {},
453 | "outputs": [],
454 | "source": [
455 | "def evaluate(model, iterator, criterion):\n",
456 | " \n",
457 | " epoch_loss = 0\n",
458 | " epoch_acc = 0\n",
459 | " \n",
460 | " model.eval()\n",
461 | " \n",
462 | " with torch.no_grad():\n",
463 | " \n",
464 | " for batch in iterator:\n",
465 | "\n",
466 | " predictions = model(batch.text)\n",
467 | " \n",
468 | " loss = criterion(predictions, batch.label)\n",
469 | " \n",
470 | " acc = categorical_accuracy(predictions, batch.label)\n",
471 | "\n",
472 | " epoch_loss += loss.item()\n",
473 | " epoch_acc += acc.item()\n",
474 | " \n",
475 | " return epoch_loss / len(iterator), epoch_acc / len(iterator)"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": 15,
481 | "metadata": {},
482 | "outputs": [],
483 | "source": [
484 | "import time\n",
485 | "\n",
486 | "def epoch_time(start_time, end_time):\n",
487 | " elapsed_time = end_time - start_time\n",
488 | " elapsed_mins = int(elapsed_time / 60)\n",
489 | " elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n",
490 | " return elapsed_mins, elapsed_secs"
491 | ]
492 | },
493 | {
494 | "cell_type": "markdown",
495 | "metadata": {},
496 | "source": [
497 | "Next, we train our model."
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": 16,
503 | "metadata": {
504 | "scrolled": true
505 | },
506 | "outputs": [
507 | {
508 | "name": "stderr",
509 | "output_type": "stream",
510 | "text": [
511 | "/home/ben/miniconda3/envs/pytorch17/lib/python3.8/site-packages/torchtext-0.9.0a0+c38fd42-py3.8-linux-x86_64.egg/torchtext/data/batch.py:23: UserWarning: Batch class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.\n",
512 | " warnings.warn('{} class will be retired soon and moved to torchtext.legacy. Please see the most recent release notes for further information.'.format(self.__class__.__name__), UserWarning)\n"
513 | ]
514 | },
515 | {
516 | "name": "stdout",
517 | "output_type": "stream",
518 | "text": [
519 | "Epoch: 01 | Epoch Time: 0m 0s\n",
520 | "\tTrain Loss: 1.312 | Train Acc: 47.11%\n",
521 | "\t Val. Loss: 0.947 | Val. Acc: 66.41%\n",
522 | "Epoch: 02 | Epoch Time: 0m 0s\n",
523 | "\tTrain Loss: 0.870 | Train Acc: 69.18%\n",
524 | "\t Val. Loss: 0.741 | Val. Acc: 74.14%\n",
525 | "Epoch: 03 | Epoch Time: 0m 0s\n",
526 | "\tTrain Loss: 0.675 | Train Acc: 76.32%\n",
527 | "\t Val. Loss: 0.621 | Val. Acc: 78.49%\n",
528 | "Epoch: 04 | Epoch Time: 0m 0s\n",
529 | "\tTrain Loss: 0.506 | Train Acc: 83.97%\n",
530 | "\t Val. Loss: 0.547 | Val. Acc: 80.32%\n",
531 | "Epoch: 05 | Epoch Time: 0m 0s\n",
532 | "\tTrain Loss: 0.373 | Train Acc: 88.23%\n",
533 | "\t Val. Loss: 0.487 | Val. Acc: 82.92%\n"
534 | ]
535 | }
536 | ],
537 | "source": [
538 | "N_EPOCHS = 5\n",
539 | "\n",
540 | "best_valid_loss = float('inf')\n",
541 | "\n",
542 | "for epoch in range(N_EPOCHS):\n",
543 | "\n",
544 | " start_time = time.time()\n",
545 | " \n",
546 | " train_loss, train_acc = train(model, train_iterator, optimizer, criterion)\n",
547 | " valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)\n",
548 | " \n",
549 | " end_time = time.time()\n",
550 | "\n",
551 | " epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n",
552 | " \n",
553 | " if valid_loss < best_valid_loss:\n",
554 | " best_valid_loss = valid_loss\n",
555 | " torch.save(model.state_dict(), 'tut5-model.pt')\n",
556 | " \n",
557 | " print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')\n",
558 | " print(f'\\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')\n",
559 | " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')"
560 | ]
561 | },
562 | {
563 | "cell_type": "markdown",
564 | "metadata": {},
565 | "source": [
566 | "Finally, let's run our model on the test set!"
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "execution_count": 17,
572 | "metadata": {},
573 | "outputs": [
574 | {
575 | "name": "stdout",
576 | "output_type": "stream",
577 | "text": [
578 | "Test Loss: 0.415 | Test Acc: 86.07%\n"
579 | ]
580 | }
581 | ],
582 | "source": [
583 | "model.load_state_dict(torch.load('tut5-model.pt'))\n",
584 | "\n",
585 | "test_loss, test_acc = evaluate(model, test_iterator, criterion)\n",
586 | "\n",
587 | "print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')"
588 | ]
589 | },
590 | {
591 | "cell_type": "markdown",
592 | "metadata": {},
593 | "source": [
594 | "Similar to how we made a function to predict sentiment for any given sentences, we can now make a function that will predict the class of question given.\n",
595 | "\n",
596 | "The only difference here is that instead of using a sigmoid function to squash the input between 0 and 1, we use the `argmax` to get the highest predicted class index. We then use this index with the label vocab to get the human readable label."
597 | ]
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": 18,
602 | "metadata": {},
603 | "outputs": [],
604 | "source": [
605 | "import spacy\n",
606 | "nlp = spacy.load('en_core_web_sm')\n",
607 | "\n",
608 | "def predict_class(model, sentence, min_len = 4):\n",
609 | " model.eval()\n",
610 | " tokenized = [tok.text for tok in nlp.tokenizer(sentence)]\n",
611 | " if len(tokenized) < min_len:\n",
612 | " tokenized += [''] * (min_len - len(tokenized))\n",
613 | " indexed = [TEXT.vocab.stoi[t] for t in tokenized]\n",
614 | " tensor = torch.LongTensor(indexed).to(device)\n",
615 | " tensor = tensor.unsqueeze(1)\n",
616 | " preds = model(tensor)\n",
617 | " max_preds = preds.argmax(dim = 1)\n",
618 | " return max_preds.item()"
619 | ]
620 | },
621 | {
622 | "cell_type": "markdown",
623 | "metadata": {},
624 | "source": [
625 | "Now, let's try it out on a few different questions..."
626 | ]
627 | },
628 | {
629 | "cell_type": "code",
630 | "execution_count": 19,
631 | "metadata": {},
632 | "outputs": [
633 | {
634 | "name": "stdout",
635 | "output_type": "stream",
636 | "text": [
637 | "Predicted class is: 0 = HUM\n"
638 | ]
639 | }
640 | ],
641 | "source": [
642 | "pred_class = predict_class(model, \"Who is Keyser Söze?\")\n",
643 | "print(f'Predicted class is: {pred_class} = {LABEL.vocab.itos[pred_class]}')"
644 | ]
645 | },
646 | {
647 | "cell_type": "code",
648 | "execution_count": 20,
649 | "metadata": {},
650 | "outputs": [
651 | {
652 | "name": "stdout",
653 | "output_type": "stream",
654 | "text": [
655 | "Predicted class is: 3 = NUM\n"
656 | ]
657 | }
658 | ],
659 | "source": [
660 | "pred_class = predict_class(model, \"How many minutes are in six hundred and eighteen hours?\")\n",
661 | "print(f'Predicted class is: {pred_class} = {LABEL.vocab.itos[pred_class]}')"
662 | ]
663 | },
664 | {
665 | "cell_type": "code",
666 | "execution_count": 21,
667 | "metadata": {},
668 | "outputs": [
669 | {
670 | "name": "stdout",
671 | "output_type": "stream",
672 | "text": [
673 | "Predicted class is: 4 = LOC\n"
674 | ]
675 | }
676 | ],
677 | "source": [
678 | "pred_class = predict_class(model, \"What continent is Bulgaria in?\")\n",
679 | "print(f'Predicted class is: {pred_class} = {LABEL.vocab.itos[pred_class]}')"
680 | ]
681 | },
682 | {
683 | "cell_type": "code",
684 | "execution_count": 22,
685 | "metadata": {},
686 | "outputs": [
687 | {
688 | "name": "stdout",
689 | "output_type": "stream",
690 | "text": [
691 | "Predicted class is: 5 = ABBR\n"
692 | ]
693 | }
694 | ],
695 | "source": [
696 | "pred_class = predict_class(model, \"What does WYSIWYG stand for?\")\n",
697 | "print(f'Predicted class is: {pred_class} = {LABEL.vocab.itos[pred_class]}')"
698 | ]
699 | }
700 | ],
701 | "metadata": {
702 | "kernelspec": {
703 | "display_name": "Python 3",
704 | "language": "python",
705 | "name": "python3"
706 | },
707 | "language_info": {
708 | "codemirror_mode": {
709 | "name": "ipython",
710 | "version": 3
711 | },
712 | "file_extension": ".py",
713 | "mimetype": "text/x-python",
714 | "name": "python",
715 | "nbconvert_exporter": "python",
716 | "pygments_lexer": "ipython3",
717 | "version": "3.8.5"
718 | }
719 | },
720 | "nbformat": 4,
721 | "nbformat_minor": 2
722 | }
--------------------------------------------------------------------------------
/legacy/A - Using TorchText with Your Own Datasets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# A - Using TorchText with Your Own Datasets\n",
8 | "\n",
9 | "In this series we have used the IMDb dataset included as a dataset in TorchText. TorchText has many canonical datasets included for classification, language modelling, sequence tagging, etc. However, frequently you'll be wanting to use your own datasets. Luckily, TorchText has functions to help you to this.\n",
10 | "\n",
11 | "Recall in the series, we:\n",
12 | "- defined the `Field`s\n",
13 | "- loaded the dataset\n",
14 | "- created the splits\n",
15 | "\n",
16 | "As a reminder, the code is shown below:\n",
17 | "\n",
18 | "```python\n",
19 | "TEXT = data.Field()\n",
20 | "LABEL = data.LabelField()\n",
21 | "\n",
22 | "train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)\n",
23 | "\n",
24 | "train_data, valid_data = train_data.split()\n",
25 | "```"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "There are three data formats TorchText can read: `json`, `tsv` (tab separated values) and`csv` (comma separated values).\n",
33 | "\n",
34 | "**In my opinion, the best formatting for TorchText is `json`, which I'll explain later on.**\n",
35 | "\n",
36 | "## Reading JSON\n",
37 | "\n",
38 | "Starting with `json`, your data must be in the `json lines` format, i.e. it must be something like:\n",
39 | "\n",
40 | "```\n",
41 | "{\"name\": \"John\", \"location\": \"United Kingdom\", \"age\": 42, \"quote\": [\"i\", \"love\", \"the\", \"united kingdom\"]}\n",
42 | "{\"name\": \"Mary\", \"location\": \"United States\", \"age\": 36, \"quote\": [\"i\", \"want\", \"more\", \"telescopes\"]}\n",
43 | "```\n",
44 | "\n",
45 | "That is, each line is a `json` object. See `data/train.json` for an example.\n",
46 | "\n",
47 | "We then define the fields:"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 1,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "from torchtext.legacy import data\n",
57 | "from torchtext.legacy import datasets\n",
58 | "\n",
59 | "NAME = data.Field()\n",
60 | "SAYING = data.Field()\n",
61 | "PLACE = data.Field()"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "Next, we must tell TorchText which fields apply to which elements of the `json` object. \n",
69 | "\n",
70 | "For `json` data, we must create a dictionary where:\n",
71 | "- the key matches the key of the `json` object\n",
72 | "- the value is a tuple where:\n",
73 | " - the first element becomes the batch object's attribute name\n",
74 | " - the second element is the name of the `Field`\n",
75 | " \n",
76 | "What do we mean when we say \"becomes the batch object's attribute name\"? Recall in the previous exercises where we accessed the `TEXT` and `LABEL` fields in the train/evaluation loop by using `batch.text` and `batch.label`, this is because TorchText sets the batch object to have a `text` and `label` attribute, each being a tensor containing either the text or the label.\n",
77 | "\n",
78 | "A few notes:\n",
79 | "\n",
80 | "* The order of the keys in the `fields` dictionary does not matter, as long as its keys match the `json` data keys.\n",
81 | "\n",
82 | "- The `Field` name does not have to match the key in the `json` object, e.g. we use `PLACE` for the `\"location\"` field.\n",
83 | "\n",
84 | "- When dealing with `json` data, not all of the keys have to be used, e.g. we did not use the `\"age\"` field.\n",
85 | "\n",
86 | "- Also, if the values of `json` field are a string then the `Fields` tokenization is applied (default is to split the string on spaces), however if the values are a list then no tokenization is applied. Usually it is a good idea for the data to already be tokenized into a list, this saves time as you don't have to wait for TorchText to do it.\n",
87 | "\n",
88 | "- The value of the `json` fields do not have to be the same type. Some examples can have their `\"quote\"` as a string, and some as a list. The tokenization will only get applied to the ones with their `\"quote\"` as a string.\n",
89 | "\n",
90 | "- If you are using a `json` field, every single example must have an instance of that field, e.g. in this example all examples must have a name, location and quote. However, as we are not using the age field, it does not matter if an example does not have it."
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 2,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "fields = {'name': ('n', NAME), 'location': ('p', PLACE), 'quote': ('s', SAYING)}"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "Now, in a training loop we can iterate over the data iterator and access the name via `batch.n`, the location via `batch.p`, and the quote via `batch.s`.\n",
107 | "\n",
108 | "We then create our datasets (`train_data` and `test_data`) with the `TabularDataset.splits` function. \n",
109 | "\n",
110 | "The `path` argument specifices the top level folder common among both datasets, and the `train` and `test` arguments specify the filename of each dataset, e.g. here the train dataset is located at `data/train.json`.\n",
111 | "\n",
112 | "We tell the function we are using `json` data, and pass in our `fields` dictionary defined previously."
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 3,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "train_data, test_data = data.TabularDataset.splits(\n",
122 | " path = 'data',\n",
123 | " train = 'train.json',\n",
124 | " test = 'test.json',\n",
125 | " format = 'json',\n",
126 | " fields = fields\n",
127 | ")"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | "If you already had a validation dataset, the location of this can be passed as the `validation` argument."
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 4,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "train_data, valid_data, test_data = data.TabularDataset.splits(\n",
144 | " path = 'data',\n",
145 | " train = 'train.json',\n",
146 | " validation = 'valid.json',\n",
147 | " test = 'test.json',\n",
148 | " format = 'json',\n",
149 | " fields = fields\n",
150 | ")"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "We can then view an example to make sure it has worked correctly.\n",
158 | "\n",
159 | "Notice how the field names (`n`, `p` and `s`) match up with what was defined in the `fields` dictionary.\n",
160 | "\n",
161 | "Also notice how the word `\"United Kingdom\"` in `p` has been split by the tokenization, whereas the `\"united kingdom\"` in `s` has not. This is due to what was mentioned previously, where TorchText assumes that any `json` fields that are lists are already tokenized and no further tokenization is applied. "
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": 5,
167 | "metadata": {},
168 | "outputs": [
169 | {
170 | "name": "stdout",
171 | "output_type": "stream",
172 | "text": [
173 | "{'n': ['John'], 'p': ['United', 'Kingdom'], 's': ['i', 'love', 'the', 'united kingdom']}\n"
174 | ]
175 | }
176 | ],
177 | "source": [
178 | "print(vars(train_data[0]))"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "We can now use `train_data`, `test_data` and `valid_data` to build a vocabulary and create iterators, as in the other notebooks. We can access all attributes by using `batch.n`, `batch.p` and `batch.s` for the names, places and sayings, respectively.\n",
186 | "\n",
187 | "## Reading CSV/TSV\n",
188 | "\n",
189 | "`csv` and `tsv` are very similar, except csv has elements separated by commas and tsv by tabs.\n",
190 | "\n",
191 | "Using the same example above, our `tsv` data will be in the form of:\n",
192 | "\n",
193 | "```\n",
194 | "name\tlocation\tage\tquote\n",
195 | "John\tUnited Kingdom\t42\ti love the united kingdom\n",
196 | "Mary\tUnited States\t36\ti want more telescopes\n",
197 | "```\n",
198 | "\n",
199 | "That is, on each row the elements are separated by tabs and we have one example per row. The first row is usually a header (i.e. the name of each of the columns), but your data could have no header.\n",
200 | "\n",
201 | "You cannot have lists within `tsv` or `csv` data.\n",
202 | "\n",
203 | "The way the fields are defined is a bit different to `json`. We now use a list of tuples, where each element is also a tuple. The first element of these inner tuples will become the batch object's attribute name, second element is the `Field` name.\n",
204 | "\n",
205 | "Unlike the `json` data, the tuples have to be in the same order that they are within the `tsv` data. Due to this, when skipping a column of data a tuple of `None`s needs to be used, if not then our `SAYING` field will be applied to the `age` column of the `tsv` data and the `quote` column will not be used. \n",
206 | "\n",
207 | "However, if you only wanted to use the `name` and `age` column, you could just use two tuples as they are the first two columns.\n",
208 | "\n",
209 | "We change our `TabularDataset` to read the correct `.tsv` files, and change the `format` argument to `'tsv'`.\n",
210 | "\n",
211 | "If your data has a header, which ours does, it must be skipped by passing `skip_header = True`. If not, TorchText will think the header is an example. By default, `skip_header` will be `False`."
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 6,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "fields = [('n', NAME), ('p', PLACE), (None, None), ('s', SAYING)]"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 7,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "train_data, valid_data, test_data = data.TabularDataset.splits(\n",
230 | " path = 'data',\n",
231 | " train = 'train.tsv',\n",
232 | " validation = 'valid.tsv',\n",
233 | " test = 'test.tsv',\n",
234 | " format = 'tsv',\n",
235 | " fields = fields,\n",
236 | " skip_header = True\n",
237 | ")"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 8,
243 | "metadata": {},
244 | "outputs": [
245 | {
246 | "name": "stdout",
247 | "output_type": "stream",
248 | "text": [
249 | "{'n': ['John'], 'p': ['United', 'Kingdom'], 's': ['i', 'love', 'the', 'united', 'kingdom']}\n"
250 | ]
251 | }
252 | ],
253 | "source": [
254 | "print(vars(train_data[0]))"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {},
260 | "source": [
261 | "Finally, we'll cover `csv` files. \n",
262 | "\n",
263 | "This is pretty much the exact same as the `tsv` files, expect with the `format` argument set to `'csv'`."
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 9,
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "fields = [('n', NAME), ('p', PLACE), (None, None), ('s', SAYING)]"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 10,
278 | "metadata": {},
279 | "outputs": [],
280 | "source": [
281 | "train_data, valid_data, test_data = data.TabularDataset.splits(\n",
282 | " path = 'data',\n",
283 | " train = 'train.csv',\n",
284 | " validation = 'valid.csv',\n",
285 | " test = 'test.csv',\n",
286 | " format = 'csv',\n",
287 | " fields = fields,\n",
288 | " skip_header = True\n",
289 | ")"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 11,
295 | "metadata": {},
296 | "outputs": [
297 | {
298 | "name": "stdout",
299 | "output_type": "stream",
300 | "text": [
301 | "{'n': ['John'], 'p': ['United', 'Kingdom'], 's': ['i', 'love', 'the', 'united', 'kingdom']}\n"
302 | ]
303 | }
304 | ],
305 | "source": [
306 | "print(vars(train_data[0]))"
307 | ]
308 | },
309 | {
310 | "cell_type": "markdown",
311 | "metadata": {},
312 | "source": [
313 | "## Why JSON over CSV/TSV?\n",
314 | "\n",
315 | "1. Your `csv` or `tsv` data cannot be stored lists. This means data cannot be already be tokenized, thus everytime you run your Python script that reads this data via TorchText, it has to be tokenized. Using advanced tokenizers, such as the `spaCy` tokenizer, takes a non-negligible amount of time. Thus, it is better to tokenize your datasets and store them in the `json lines` format.\n",
316 | "\n",
317 | "2. If tabs appear in your `tsv` data, or commas appear in your `csv` data, TorchText will think they are delimiters between columns. This will cause your data to be parsed incorrectly. Worst of all TorchText will not alert you to this as it cannot tell the difference between a tab/comma in a field and a tab/comma as a delimiter. As `json` data is essentially a dictionary, you access the data within the fields via its key, so do not have to worry about \"surprise\" delimiters."
318 | ]
319 | },
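{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a minimal sketch of that first point (the file name and examples here are made up), pre-tokenizing once and writing `json lines` might look like:\n",
"\n",
"```python\n",
"import json\n",
"import spacy\n",
"\n",
"nlp = spacy.load('en_core_web_sm')\n",
"\n",
"examples = [{'name': 'John', 'location': 'United Kingdom', 'quote': 'i love the united kingdom'}]\n",
"\n",
"with open('data/train_tokenized.json', 'w') as f:\n",
"    for example in examples:\n",
"        example['quote'] = [tok.text for tok in nlp.tokenizer(example['quote'])] #tokenize once, store as a list\n",
"        f.write(json.dumps(example) + '\\n') #one json object per line\n",
"```"
]
},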
320 | {
321 | "cell_type": "markdown",
322 | "metadata": {},
323 | "source": [
324 | "## Iterators \n",
325 | "\n",
326 | "Using any of the above datasets, we can then build the vocab and create the iterators."
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 12,
332 | "metadata": {},
333 | "outputs": [],
334 | "source": [
335 | "NAME.build_vocab(train_data)\n",
336 | "SAYING.build_vocab(train_data)\n",
337 | "PLACE.build_vocab(train_data)"
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {},
343 | "source": [
344 | "Then, we can create the iterators after defining our batch size and device.\n",
345 | "\n",
346 | "By default, the train data is shuffled each epoch, but the validation/test data is sorted. However, TorchText doesn't know what to use to sort our data and it would throw an error if we don't tell it. \n",
347 | "\n",
348 | "There are two ways to handle this, you can either tell the iterator not to sort the validation/test data by passing `sort = False`, or you can tell it how to sort the data by passing a `sort_key`. A sort key is a function that returns a key on which to sort the data on. For example, `lambda x: x.s` will sort the examples by their `s` attribute, i.e their quote. Ideally, you want to use a sort key as the `BucketIterator` will then be able to sort your examples and then minimize the amount of padding within each batch.\n",
349 | "\n",
350 | "We can then iterate over our iterator to get batches of data. Note how by default TorchText has the batch dimension second."
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 13,
356 | "metadata": {},
357 | "outputs": [
358 | {
359 | "name": "stdout",
360 | "output_type": "stream",
361 | "text": [
362 | "Train:\n",
363 | "\n",
364 | "[torchtext.data.batch.Batch of size 1]\n",
365 | "\t[.n]:[torch.cuda.LongTensor of size 1x1 (GPU 0)]\n",
366 | "\t[.p]:[torch.cuda.LongTensor of size 2x1 (GPU 0)]\n",
367 | "\t[.s]:[torch.cuda.LongTensor of size 5x1 (GPU 0)]\n",
368 | "\n",
369 | "[torchtext.data.batch.Batch of size 1]\n",
370 | "\t[.n]:[torch.cuda.LongTensor of size 1x1 (GPU 0)]\n",
371 | "\t[.p]:[torch.cuda.LongTensor of size 2x1 (GPU 0)]\n",
372 | "\t[.s]:[torch.cuda.LongTensor of size 4x1 (GPU 0)]\n",
373 | "Valid:\n",
374 | "\n",
375 | "[torchtext.data.batch.Batch of size 1]\n",
376 | "\t[.n]:[torch.cuda.LongTensor of size 1x1 (GPU 0)]\n",
377 | "\t[.p]:[torch.cuda.LongTensor of size 1x1 (GPU 0)]\n",
378 | "\t[.s]:[torch.cuda.LongTensor of size 2x1 (GPU 0)]\n",
379 | "\n",
380 | "[torchtext.data.batch.Batch of size 1]\n",
381 | "\t[.n]:[torch.cuda.LongTensor of size 1x1 (GPU 0)]\n",
382 | "\t[.p]:[torch.cuda.LongTensor of size 1x1 (GPU 0)]\n",
383 | "\t[.s]:[torch.cuda.LongTensor of size 4x1 (GPU 0)]\n",
384 | "Test:\n",
385 | "\n",
386 | "[torchtext.data.batch.Batch of size 1]\n",
387 | "\t[.n]:[torch.cuda.LongTensor of size 1x1 (GPU 0)]\n",
388 | "\t[.p]:[torch.cuda.LongTensor of size 1x1 (GPU 0)]\n",
389 | "\t[.s]:[torch.cuda.LongTensor of size 3x1 (GPU 0)]\n",
390 | "\n",
391 | "[torchtext.data.batch.Batch of size 1]\n",
392 | "\t[.n]:[torch.cuda.LongTensor of size 1x1 (GPU 0)]\n",
393 | "\t[.p]:[torch.cuda.LongTensor of size 2x1 (GPU 0)]\n",
394 | "\t[.s]:[torch.cuda.LongTensor of size 3x1 (GPU 0)]\n"
395 | ]
396 | }
397 | ],
398 | "source": [
399 | "import torch\n",
400 | "\n",
401 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
402 | "\n",
403 | "BATCH_SIZE = 1\n",
404 | "\n",
405 | "train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(\n",
406 | " (train_data, valid_data, test_data),\n",
407 | " sort = False, #don't sort test/validation data\n",
408 | " batch_size=BATCH_SIZE,\n",
409 | " device=device)\n",
410 | "\n",
411 | "train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(\n",
412 | " (train_data, valid_data, test_data),\n",
413 | " sort_key = lambda x: x.s, #sort by s attribute (quote)\n",
414 | " batch_size=BATCH_SIZE,\n",
415 | " device=device)\n",
416 | "\n",
417 | "print('Train:')\n",
418 | "for batch in train_iterator:\n",
419 | " print(batch)\n",
420 | " \n",
421 | "print('Valid:')\n",
422 | "for batch in valid_iterator:\n",
423 | " print(batch)\n",
424 | " \n",
425 | "print('Test:')\n",
426 | "for batch in test_iterator:\n",
427 | " print(batch)"
428 | ]
429 | }
430 | ],
431 | "metadata": {
432 | "kernelspec": {
433 | "display_name": "Python 3",
434 | "language": "python",
435 | "name": "python3"
436 | },
437 | "language_info": {
438 | "codemirror_mode": {
439 | "name": "ipython",
440 | "version": 3
441 | },
442 | "file_extension": ".py",
443 | "mimetype": "text/x-python",
444 | "name": "python",
445 | "nbconvert_exporter": "python",
446 | "pygments_lexer": "ipython3",
447 | "version": "3.7.6"
448 | }
449 | },
450 | "nbformat": 4,
451 | "nbformat_minor": 2
452 | }
--------------------------------------------------------------------------------
/legacy/B - A Closer Look at Word Embeddings.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# B - A Closer Look at Word Embeddings\n",
8 | "\n",
9 | "We have very briefly covered how word embeddings (also known as word vectors) are used in the tutorials. In this appendix we'll have a closer look at these embeddings and find some (hopefully) interesting results.\n",
10 | "\n",
11 | "Embeddings transform a one-hot encoded vector (a vector that is 0 in elements except one, which is 1) into a much smaller dimension vector of real numbers. The one-hot encoded vector is also known as a *sparse vector*, whilst the real valued vector is known as a *dense vector*. \n",
12 | "\n",
13 | "The key concept in these word embeddings is that words that appear in similar _contexts_ appear nearby in the vector space, i.e. the Euclidean distance between these two word vectors is small. By context here, we mean the surrounding words. For example in the sentences \"I purchased some items at the shop\" and \"I purchased some items at the store\" the words 'shop' and 'store' appear in the same context and thus should be close together in vector space.\n",
14 | "\n",
15 | "You may have also heard about *word2vec*. *word2vec* is an algorithm (actually a bunch of algorithms) that calculates word vectors from a corpus. In this appendix we use *GloVe* vectors, *GloVe* being another algorithm to calculate word vectors. If you want to know how *word2vec* works, check out a two part series [here](http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/) and [here](http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/), and if you want to find out more about *GloVe*, check the website [here](https://nlp.stanford.edu/projects/glove/).\n",
16 | "\n",
17 | "In PyTorch, we use word vectors with the `nn.Embedding` layer, which takes a _**[sentence length, batch size]**_ tensor and transforms it into a _**[sentence length, batch size, embedding dimensions]**_ tensor.\n",
18 | "\n",
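19 |     "For example, a minimal sketch of this shape transformation (the sizes here are made up purely for illustration):\n",
20 |     "\n",
21 |     "```python\n",
22 |     "import torch\n",
23 |     "import torch.nn as nn\n",
24 |     "\n",
25 |     "#a hypothetical embedding layer: vocabulary of 100 tokens, 32-dimensional vectors\n",
26 |     "embedding = nn.Embedding(num_embeddings = 100, embedding_dim = 32)\n",
27 |     "\n",
28 |     "tokens = torch.zeros(7, 4, dtype = torch.long) #[sentence length, batch size]\n",
29 |     "vectors = embedding(tokens)\n",
30 |     "print(vectors.shape) #torch.Size([7, 4, 32])\n",
31 |     "```\n",
32 |     "\n",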
19 | "In tutorial 2 onwards, we also used pre-trained word embeddings (specifically the GloVe vectors) provided by TorchText. These embeddings have been trained on a gigantic corpus. We can use these pre-trained vectors within any of our models, with the idea that as they have already learned the context of each word they will give us a better starting point for our word vectors. This usually leads to faster training time and/or improved accuracy.\n",
20 | "\n",
21 | "In this appendix we won't be training any models, instead we'll be looking at the word embeddings and finding a few interesting things about them.\n",
22 | "\n",
23 | "A lot of the code from the first half of this appendix is taken from [here](https://github.com/spro/practical-pytorch/blob/master/glove-word-vectors/glove-word-vectors.ipynb). For more information about word embeddings, go [here](https://monkeylearn.com/blog/word-embeddings-transform-text-numbers/). \n",
24 | "\n",
25 | "## Loading the GloVe vectors\n",
26 | "\n",
27 |     "First, we'll load the GloVe vectors. The `name` field specifies what the vectors have been trained on, here the `6B` means a corpus of 6 billion words. The `dim` argument specifies the dimensionality of the word vectors. GloVe vectors are available in 50, 100, 200 and 300 dimensions. There are also `42B` and `840B` GloVe vectors, however they are only available in 300 dimensions."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 1,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "There are 400000 words in the vocabulary\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "import torchtext.vocab\n",
45 | "\n",
46 | "glove = torchtext.vocab.GloVe(name = '6B', dim = 100)\n",
47 | "\n",
48 | "print(f'There are {len(glove.itos)} words in the vocabulary')"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 |     "As shown above, there are 400,000 unique words in the GloVe vocabulary. These are the most common words found in the corpus the vectors were trained on. **In this set of GloVe vectors, every single word is lower-case only.**\n",
56 | "\n",
57 | "`glove.vectors` is the actual tensor containing the values of the embeddings."
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 2,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "data": {
67 | "text/plain": [
68 | "torch.Size([400000, 100])"
69 | ]
70 | },
71 | "execution_count": 2,
72 | "metadata": {},
73 | "output_type": "execute_result"
74 | }
75 | ],
76 | "source": [
77 | "glove.vectors.shape"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "We can see what word is associated with each row by checking the `itos` (int to string) list. \n",
85 | "\n",
86 |     "The output below shows that row 0 is the vector associated with the word 'the', row 1 with ',' (comma), row 2 with '.' (period), etc."
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 3,
92 | "metadata": {},
93 | "outputs": [
94 | {
95 | "data": {
96 | "text/plain": [
97 | "['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '\"', \"'s\"]"
98 | ]
99 | },
100 | "execution_count": 3,
101 | "metadata": {},
102 | "output_type": "execute_result"
103 | }
104 | ],
105 | "source": [
106 | "glove.itos[:10]"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 |     "We can also use the `stoi` (string to int) dictionary, in which we input a word and receive the associated integer/index. If you try to get the index of a word that is not in the vocabulary, you receive an error."
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 4,
119 | "metadata": {},
120 | "outputs": [
121 | {
122 | "data": {
123 | "text/plain": [
124 | "0"
125 | ]
126 | },
127 | "execution_count": 4,
128 | "metadata": {},
129 | "output_type": "execute_result"
130 | }
131 | ],
132 | "source": [
133 | "glove.stoi['the']"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "We can get the vector of a word by first getting the integer associated with it and then indexing into the word embedding tensor with that index."
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 5,
146 | "metadata": {},
147 | "outputs": [
148 | {
149 | "data": {
150 | "text/plain": [
151 | "torch.Size([100])"
152 | ]
153 | },
154 | "execution_count": 5,
155 | "metadata": {},
156 | "output_type": "execute_result"
157 | }
158 | ],
159 | "source": [
160 | "glove.vectors[glove.stoi['the']].shape"
161 | ]
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "metadata": {},
166 | "source": [
167 | "We'll be doing this a lot, so we'll create a function that takes in word embeddings and a word then returns the associated vector. It'll also throw an error if the word doesn't exist in the vocabulary."
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 6,
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "def get_vector(embeddings, word):\n",
177 | " assert word in embeddings.stoi, f'*{word}* is not in the vocab!'\n",
178 | " return embeddings.vectors[embeddings.stoi[word]]"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "As before, we use a word to get the associated vector."
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 7,
191 | "metadata": {},
192 | "outputs": [
193 | {
194 | "data": {
195 | "text/plain": [
196 | "torch.Size([100])"
197 | ]
198 | },
199 | "execution_count": 7,
200 | "metadata": {},
201 | "output_type": "execute_result"
202 | }
203 | ],
204 | "source": [
205 | "get_vector(glove, 'the').shape"
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "metadata": {},
211 | "source": [
212 | "## Similar Contexts\n",
213 | "\n",
214 | "Now to start looking at the context of different words. \n",
215 | "\n",
216 |     "If we want to find the words similar to a certain input word, we first find the vector of this input word, then we scan through our vocabulary, calculating the (Euclidean) distance between the vector of each word and our input word vector. We then sort these from closest to furthest away.\n",
217 | "\n",
218 | "The function below returns the closest 10 words to an input word vector:"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 8,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "import torch\n",
228 | "\n",
229 | "def closest_words(embeddings, vector, n = 10):\n",
230 | " \n",
231 | " distances = [(word, torch.dist(vector, get_vector(embeddings, word)).item())\n",
232 | " for word in embeddings.itos]\n",
233 | " \n",
234 | " return sorted(distances, key = lambda w: w[1])[:n]"
235 | ]
236 | },
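237 |   {
238 |    "cell_type": "markdown",
239 |    "metadata": {},
240 |    "source": [
241 |     "Note that this loops over all 400,000 words in Python, so it takes a while to run. A vectorized sketch (an alternative, not what we use in the rest of this notebook) computes every distance in a single tensor operation:\n",
242 |     "\n",
243 |     "```python\n",
244 |     "def closest_words_fast(embeddings, vector, n = 10):\n",
245 |     "    #distance from the input vector to every row of the embedding matrix at once\n",
246 |     "    distances = torch.norm(embeddings.vectors - vector, dim = 1)\n",
247 |     "    values, indexes = distances.topk(n, largest = False)\n",
248 |     "    return [(embeddings.itos[i], d.item()) for i, d in zip(indexes, values)]\n",
249 |     "```"
250 |    ]
251 |   },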
237 | {
238 | "cell_type": "markdown",
239 | "metadata": {},
240 | "source": [
241 |     "Let's try it out with 'korea'. The closest word is 'korea' itself (not very interesting), however all of the other words are related to it in some way. Pyongyang is the capital of North Korea, DPRK is the official name of North Korea, etc.\n",
242 |     "\n",
243 |     "Interestingly, we also get 'japan' and 'china', which implies that Korea, Japan and China are frequently talked about together in similar contexts. This makes sense as they are geographically situated near each other. "
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 9,
249 | "metadata": {},
250 | "outputs": [
251 | {
252 | "data": {
253 | "text/plain": [
254 | "[('korea', 0.0),\n",
255 | " ('pyongyang', 3.9039554595947266),\n",
256 | " ('korean', 4.068886756896973),\n",
257 | " ('dprk', 4.2631049156188965),\n",
258 | " ('seoul', 4.340494155883789),\n",
259 | " ('japan', 4.551243782043457),\n",
260 | " ('koreans', 4.615609169006348),\n",
261 | " ('south', 4.65822696685791),\n",
262 | " ('china', 4.839518070220947),\n",
263 | " ('north', 4.986356735229492)]"
264 | ]
265 | },
266 | "execution_count": 9,
267 | "metadata": {},
268 | "output_type": "execute_result"
269 | }
270 | ],
271 | "source": [
272 | "word_vector = get_vector(glove, 'korea')\n",
273 | "\n",
274 | "closest_words(glove, word_vector)"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 |     "Looking at another country, India, we also get nearby countries: Thailand, Malaysia and Sri Lanka (as two separate words). Geographically, Thailand and Malaysia are closer to India than Australia is, so why is Australia closer to India in vector space? This is most probably due to India and Australia appearing together in the context of [cricket](https://en.wikipedia.org/wiki/Cricket) matches."
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 10,
287 | "metadata": {},
288 | "outputs": [
289 | {
290 | "data": {
291 | "text/plain": [
292 | "[('india', 0.0),\n",
293 | " ('pakistan', 3.6954822540283203),\n",
294 | " ('indian', 4.114313125610352),\n",
295 | " ('delhi', 4.155975818634033),\n",
296 | " ('bangladesh', 4.261017799377441),\n",
297 | " ('lanka', 4.435845851898193),\n",
298 | " ('sri', 4.515716552734375),\n",
299 | " ('australia', 4.806082725524902),\n",
300 | " ('thailand', 4.994781017303467),\n",
301 | " ('malaysia', 5.009334087371826)]"
302 | ]
303 | },
304 | "execution_count": 10,
305 | "metadata": {},
306 | "output_type": "execute_result"
307 | }
308 | ],
309 | "source": [
310 | "word_vector = get_vector(glove, 'india')\n",
311 | "\n",
312 | "closest_words(glove, word_vector)"
313 | ]
314 | },
315 | {
316 | "cell_type": "markdown",
317 | "metadata": {},
318 | "source": [
319 | "We'll also create another function that will nicely print out the tuples returned by our `closest_words` function."
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 11,
325 | "metadata": {},
326 | "outputs": [],
327 | "source": [
328 | "def print_tuples(tuples):\n",
329 | " for w, d in tuples:\n",
330 | " print(f'({d:02.04f}) {w}') "
331 | ]
332 | },
333 | {
334 | "cell_type": "markdown",
335 | "metadata": {},
336 | "source": [
337 |     "A final word to look at, 'sports'. As we can see, the closest words are mostly sports themselves. "
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 12,
343 | "metadata": {},
344 | "outputs": [
345 | {
346 | "name": "stdout",
347 | "output_type": "stream",
348 | "text": [
349 | "(0.0000) sports\n",
350 | "(3.5875) sport\n",
351 | "(4.4590) soccer\n",
352 | "(4.6508) basketball\n",
353 | "(4.6561) baseball\n",
354 | "(4.8028) sporting\n",
355 | "(4.8763) football\n",
356 | "(4.9624) professional\n",
357 | "(4.9824) entertainment\n",
358 | "(5.0975) media\n"
359 | ]
360 | }
361 | ],
362 | "source": [
363 | "word_vector = get_vector(glove, 'sports')\n",
364 | "\n",
365 | "print_tuples(closest_words(glove, word_vector))"
366 | ]
367 | },
368 | {
369 | "cell_type": "markdown",
370 | "metadata": {},
371 | "source": [
372 | "## Analogies\n",
373 | "\n",
374 |     "Another property of word embeddings is that they can be operated on just like any standard vector, and doing so gives interesting results.\n",
375 | "\n",
376 | "We'll show an example of this first, and then explain it:"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": 13,
382 | "metadata": {},
383 | "outputs": [],
384 | "source": [
385 | "def analogy(embeddings, word1, word2, word3, n=5):\n",
386 | " \n",
387 | " #get vectors for each word\n",
388 | " word1_vector = get_vector(embeddings, word1)\n",
389 | " word2_vector = get_vector(embeddings, word2)\n",
390 | " word3_vector = get_vector(embeddings, word3)\n",
391 | " \n",
392 | " #calculate analogy vector\n",
393 | " analogy_vector = word2_vector - word1_vector + word3_vector\n",
394 | " \n",
395 | " #find closest words to analogy vector\n",
396 | " candidate_words = closest_words(embeddings, analogy_vector, n+3)\n",
397 | " \n",
398 | " #filter out words already in analogy\n",
399 | " candidate_words = [(word, dist) for (word, dist) in candidate_words \n",
400 | " if word not in [word1, word2, word3]][:n]\n",
401 | " \n",
402 | " print(f'{word1} is to {word2} as {word3} is to...')\n",
403 | " \n",
404 | " return candidate_words"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": 14,
410 | "metadata": {},
411 | "outputs": [
412 | {
413 | "name": "stdout",
414 | "output_type": "stream",
415 | "text": [
416 | "man is to king as woman is to...\n",
417 | "(4.0811) queen\n",
418 | "(4.6429) monarch\n",
419 | "(4.9055) throne\n",
420 | "(4.9216) elizabeth\n",
421 | "(4.9811) prince\n"
422 | ]
423 | }
424 | ],
425 | "source": [
426 | "print_tuples(analogy(glove, 'man', 'king', 'woman'))"
427 | ]
428 | },
429 | {
430 | "cell_type": "markdown",
431 | "metadata": {},
432 | "source": [
433 | "This is the canonical example which shows off this property of word embeddings. So why does it work? Why does the vector of 'woman' added to the vector of 'king' minus the vector of 'man' give us 'queen'?\n",
434 | "\n",
435 |     "If we think about it, the vector calculated from 'king' minus 'man' gives us a \"royalty vector\". This is the vector associated with traveling from a man to his royal counterpart, a king. If we add this \"royalty vector\" to 'woman', we should travel to her royal equivalent, which is a queen!\n",
436 | "\n",
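437 |     "Written out with the helpers defined above (this is exactly what the `analogy` function computes, minus the filtering of the input words):\n",
438 |     "\n",
439 |     "```python\n",
440 |     "royalty_vector = get_vector(glove, 'king') - get_vector(glove, 'man')\n",
441 |     "\n",
442 |     "print_tuples(closest_words(glove, get_vector(glove, 'woman') + royalty_vector))\n",
443 |     "```\n",
444 |     "\n",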
437 | "We can do this with other analogies too. For example, this gets an \"acting career vector\":"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": 15,
443 | "metadata": {},
444 | "outputs": [
445 | {
446 | "name": "stdout",
447 | "output_type": "stream",
448 | "text": [
449 | "man is to actor as woman is to...\n",
450 | "(2.8133) actress\n",
451 | "(5.0039) comedian\n",
452 | "(5.1399) actresses\n",
453 | "(5.2773) starred\n",
454 | "(5.3085) screenwriter\n"
455 | ]
456 | }
457 | ],
458 | "source": [
459 | "print_tuples(analogy(glove, 'man', 'actor', 'woman'))"
460 | ]
461 | },
462 | {
463 | "cell_type": "markdown",
464 | "metadata": {},
465 | "source": [
466 | "For a \"baby animal vector\":"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": 16,
472 | "metadata": {},
473 | "outputs": [
474 | {
475 | "name": "stdout",
476 | "output_type": "stream",
477 | "text": [
478 | "cat is to kitten as dog is to...\n",
479 | "(3.8146) puppy\n",
480 | "(4.2944) rottweiler\n",
481 | "(4.5888) puppies\n",
482 | "(4.6086) pooch\n",
483 | "(4.6520) pug\n"
484 | ]
485 | }
486 | ],
487 | "source": [
488 | "print_tuples(analogy(glove, 'cat', 'kitten', 'dog'))"
489 | ]
490 | },
491 | {
492 | "cell_type": "markdown",
493 | "metadata": {},
494 | "source": [
495 | "A \"capital city vector\":"
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": 17,
501 | "metadata": {},
502 | "outputs": [
503 | {
504 | "name": "stdout",
505 | "output_type": "stream",
506 | "text": [
507 | "france is to paris as england is to...\n",
508 | "(4.1426) london\n",
509 | "(4.4938) melbourne\n",
510 | "(4.7087) sydney\n",
511 | "(4.7630) perth\n",
512 | "(4.7952) birmingham\n"
513 | ]
514 | }
515 | ],
516 | "source": [
517 | "print_tuples(analogy(glove, 'france', 'paris', 'england'))"
518 | ]
519 | },
520 | {
521 | "cell_type": "markdown",
522 | "metadata": {},
523 | "source": [
524 | "A \"musician's genre vector\":"
525 | ]
526 | },
527 | {
528 | "cell_type": "code",
529 | "execution_count": 18,
530 | "metadata": {},
531 | "outputs": [
532 | {
533 | "name": "stdout",
534 | "output_type": "stream",
535 | "text": [
536 | "elvis is to rock as eminem is to...\n",
537 | "(5.6597) rap\n",
538 | "(6.2057) rappers\n",
539 | "(6.2161) rapper\n",
540 | "(6.2444) punk\n",
541 | "(6.2690) hop\n"
542 | ]
543 | }
544 | ],
545 | "source": [
546 | "print_tuples(analogy(glove, 'elvis', 'rock', 'eminem'))"
547 | ]
548 | },
549 | {
550 | "cell_type": "markdown",
551 | "metadata": {},
552 | "source": [
553 | "And an \"ingredient vector\":"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": 19,
559 | "metadata": {},
560 | "outputs": [
561 | {
562 | "name": "stdout",
563 | "output_type": "stream",
564 | "text": [
565 | "beer is to barley as wine is to...\n",
566 | "(5.6021) grape\n",
567 | "(5.6760) beans\n",
568 | "(5.8174) grapes\n",
569 | "(5.9035) lentils\n",
570 | "(5.9454) figs\n"
571 | ]
572 | }
573 | ],
574 | "source": [
575 | "print_tuples(analogy(glove, 'beer', 'barley', 'wine'))"
576 | ]
577 | },
578 | {
579 | "cell_type": "markdown",
580 | "metadata": {},
581 | "source": [
582 | "## Correcting Spelling Mistakes\n",
583 | "\n",
584 | "Another interesting property of word embeddings is that they can actually be used to correct spelling mistakes! \n",
585 | "\n",
586 |     "We'll put the findings from a fast.ai forum thread into code and briefly explain them, but to read more about this, check out the [original thread](http://forums.fast.ai/t/nlp-any-libraries-dictionaries-out-there-for-fixing-common-spelling-errors/16411) and the associated [write-up](https://blog.usejournal.com/a-simple-spell-checker-built-from-word-vectors-9f28452b6f26).\n",
587 | "\n",
588 |     "First, we need to load up the GloVe vectors with the much larger vocabulary, as the spelling mistakes do not appear in the smaller vocabulary. \n",
589 | "\n",
590 | "**Note**: these vectors are very large (~2GB), so watch out if you have a limited internet connection."
591 | ]
592 | },
593 | {
594 | "cell_type": "code",
595 | "execution_count": 20,
596 | "metadata": {},
597 | "outputs": [],
598 | "source": [
599 | "glove = torchtext.vocab.GloVe(name = '840B', dim = 300)"
600 | ]
601 | },
602 | {
603 | "cell_type": "markdown",
604 | "metadata": {},
605 | "source": [
606 | "Checking the vocabulary size of these embeddings, we can see we now have over 2 million unique words in our vocabulary!"
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": 21,
612 | "metadata": {},
613 | "outputs": [
614 | {
615 | "data": {
616 | "text/plain": [
617 | "torch.Size([2196017, 300])"
618 | ]
619 | },
620 | "execution_count": 21,
621 | "metadata": {},
622 | "output_type": "execute_result"
623 | }
624 | ],
625 | "source": [
626 | "glove.vectors.shape"
627 | ]
628 | },
629 | {
630 | "cell_type": "markdown",
631 | "metadata": {},
632 | "source": [
633 |     "As the vectors were trained with a much larger vocabulary on a larger corpus of text, the words that appear are a little different. Notice how the words 'north', 'south', 'pyongyang' and 'dprk' no longer appear in the closest words to 'korea'."
634 | ]
635 | },
636 | {
637 | "cell_type": "code",
638 | "execution_count": 22,
639 | "metadata": {},
640 | "outputs": [
641 | {
642 | "name": "stdout",
643 | "output_type": "stream",
644 | "text": [
645 | "(0.0000) korea\n",
646 | "(3.9857) taiwan\n",
647 | "(4.4022) korean\n",
648 | "(4.9016) asia\n",
649 | "(4.9593) japan\n",
650 | "(5.0721) seoul\n",
651 | "(5.4058) thailand\n",
652 | "(5.6025) singapore\n",
653 | "(5.7010) russia\n",
654 | "(5.7240) hong\n"
655 | ]
656 | }
657 | ],
658 | "source": [
659 | "word_vector = get_vector(glove, 'korea')\n",
660 | "\n",
661 | "print_tuples(closest_words(glove, word_vector))"
662 | ]
663 | },
664 | {
665 | "cell_type": "markdown",
666 | "metadata": {},
667 | "source": [
668 | "Our first step to correcting spelling mistakes is looking at the vector for a misspelling of the word 'reliable'."
669 | ]
670 | },
671 | {
672 | "cell_type": "code",
673 | "execution_count": 23,
674 | "metadata": {},
675 | "outputs": [
676 | {
677 | "name": "stdout",
678 | "output_type": "stream",
679 | "text": [
680 | "(0.0000) relieable\n",
681 | "(5.0366) relyable\n",
682 | "(5.2610) realible\n",
683 | "(5.4719) realiable\n",
684 | "(5.5402) relable\n",
685 | "(5.5917) relaible\n",
686 | "(5.6412) reliabe\n",
687 | "(5.8802) relaiable\n",
688 | "(5.9593) stabel\n",
689 | "(5.9981) consitant\n"
690 | ]
691 | }
692 | ],
693 | "source": [
694 | "word_vector = get_vector(glove, 'relieable')\n",
695 | "\n",
696 | "print_tuples(closest_words(glove, word_vector))"
697 | ]
698 | },
699 | {
700 | "cell_type": "markdown",
701 | "metadata": {},
702 | "source": [
703 | "Notice how the correct spelling, \"reliable\", does not appear in the top 10 closest words. Surely the misspellings of a word should appear next to the correct spelling of the word as they appear in the same context, right? \n",
704 | "\n",
705 |     "The hypothesis is that misspellings of words are all equally shifted away from their correct spelling. This is because text that contains spelling mistakes is usually written in an informal manner where correct spelling doesn't matter as much (such as tweets/blog posts), thus misspellings tend to appear in the same informal contexts as one another.\n",
706 | "\n",
707 | "Similar to how we created analogies before, we can create a \"correct spelling\" vector. This time, instead of using a single example to create our vector, we'll use the average of multiple examples. This will hopefully give better accuracy!\n",
708 | "\n",
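709 |     "Concretely, if $\\vec{c}$ is the vector of the correct spelling and $\\vec{m}_1, \\dots, \\vec{m}_8$ are the vectors of the misspellings, the cells below compute\n",
710 |     "\n",
711 |     "$$\\vec{v}_{misspelling} = \\frac{1}{8}\\sum_{i=1}^{8}\\left(\\vec{c} - \\vec{m}_i\\right)$$\n",
712 |     "\n",
713 |     "and a misspelled word $\\vec{m}$ is later corrected by finding the words closest to $\\vec{m} + \\vec{v}_{misspelling}$.\n",
714 |     "\n",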
709 |     "We first create a vector for the correct spelling, 'reliable', then calculate the difference between the \"reliable vector\" and each of the 8 misspellings of 'reliable'. As we are going to concatenate these 8 difference tensors together, we first `unsqueeze` a \"batch\" dimension onto each of them."
710 | ]
711 | },
712 | {
713 | "cell_type": "code",
714 | "execution_count": 24,
715 | "metadata": {},
716 | "outputs": [],
717 | "source": [
718 | "reliable_vector = get_vector(glove, 'reliable')\n",
719 | "\n",
720 | "reliable_misspellings = ['relieable', 'relyable', 'realible', 'realiable', \n",
721 | " 'relable', 'relaible', 'reliabe', 'relaiable']\n",
722 | "\n",
723 | "diff_reliable = [(reliable_vector - get_vector(glove, s)).unsqueeze(0) \n",
724 | " for s in reliable_misspellings]"
725 | ]
726 | },
727 | {
728 | "cell_type": "markdown",
729 | "metadata": {},
730 | "source": [
731 | "We take the average of these 8 'difference from reliable' vectors to get our \"misspelling vector\"."
732 | ]
733 | },
734 | {
735 | "cell_type": "code",
736 | "execution_count": 25,
737 | "metadata": {},
738 | "outputs": [],
739 | "source": [
740 | "misspelling_vector = torch.cat(diff_reliable, dim = 0).mean(dim = 0)"
741 | ]
742 | },
743 | {
744 | "cell_type": "markdown",
745 | "metadata": {},
746 | "source": [
747 | "We can now correct other spelling mistakes using this \"misspelling vector\" by finding the closest words to the sum of the vector of a misspelled word and the \"misspelling vector\".\n",
748 | "\n",
749 | "For a misspelling of \"because\":"
750 | ]
751 | },
752 | {
753 | "cell_type": "code",
754 | "execution_count": 26,
755 | "metadata": {},
756 | "outputs": [
757 | {
758 | "name": "stdout",
759 | "output_type": "stream",
760 | "text": [
761 | "(6.1090) because\n",
762 | "(6.4250) even\n",
763 | "(6.4358) fact\n",
764 | "(6.4914) sure\n",
765 | "(6.5094) though\n",
766 | "(6.5601) obviously\n",
767 | "(6.5682) reason\n",
768 | "(6.5856) if\n",
769 | "(6.6099) but\n",
770 | "(6.6415) why\n"
771 | ]
772 | }
773 | ],
774 | "source": [
775 | "word_vector = get_vector(glove, 'becuase')\n",
776 | "\n",
777 | "print_tuples(closest_words(glove, word_vector + misspelling_vector))"
778 | ]
779 | },
780 | {
781 | "cell_type": "markdown",
782 | "metadata": {},
783 | "source": [
784 | "For a misspelling of \"definitely\":"
785 | ]
786 | },
787 | {
788 | "cell_type": "code",
789 | "execution_count": 27,
790 | "metadata": {},
791 | "outputs": [
792 | {
793 | "name": "stdout",
794 | "output_type": "stream",
795 | "text": [
796 | "(5.4070) definitely\n",
797 | "(5.5643) certainly\n",
798 | "(5.7192) sure\n",
799 | "(5.8152) well\n",
800 | "(5.8588) always\n",
801 | "(5.8812) also\n",
802 | "(5.9557) simply\n",
803 | "(5.9667) consider\n",
804 | "(5.9821) probably\n",
805 | "(5.9948) definately\n"
806 | ]
807 | }
808 | ],
809 | "source": [
810 | "word_vector = get_vector(glove, 'defintiely')\n",
811 | "\n",
812 | "print_tuples(closest_words(glove, word_vector + misspelling_vector))"
813 | ]
814 | },
815 | {
816 | "cell_type": "markdown",
817 | "metadata": {},
818 | "source": [
819 | "For a misspelling of \"consistent\":"
820 | ]
821 | },
822 | {
823 | "cell_type": "code",
824 | "execution_count": 28,
825 | "metadata": {},
826 | "outputs": [
827 | {
828 | "name": "stdout",
829 | "output_type": "stream",
830 | "text": [
831 | "(5.9641) consistent\n",
832 | "(6.3674) reliable\n",
833 | "(7.0195) consistant\n",
834 | "(7.0299) consistently\n",
835 | "(7.1605) accurate\n",
836 | "(7.2737) fairly\n",
837 | "(7.3037) good\n",
838 | "(7.3520) reasonable\n",
839 | "(7.3801) dependable\n",
840 | "(7.4027) ensure\n"
841 | ]
842 | }
843 | ],
844 | "source": [
845 | "word_vector = get_vector(glove, 'consistant')\n",
846 | "\n",
847 | "print_tuples(closest_words(glove, word_vector + misspelling_vector))"
848 | ]
849 | },
850 | {
851 | "cell_type": "markdown",
852 | "metadata": {},
853 | "source": [
854 | "For a misspelling of \"package\":"
855 | ]
856 | },
857 | {
858 | "cell_type": "code",
859 | "execution_count": 29,
860 | "metadata": {},
861 | "outputs": [
862 | {
863 | "name": "stdout",
864 | "output_type": "stream",
865 | "text": [
866 | "(6.6117) package\n",
867 | "(6.9315) packages\n",
868 | "(7.0195) pakage\n",
869 | "(7.0911) comes\n",
870 | "(7.1241) provide\n",
871 | "(7.1469) offer\n",
872 | "(7.1861) reliable\n",
873 | "(7.2431) well\n",
874 | "(7.2434) choice\n",
875 | "(7.2453) offering\n"
876 | ]
877 | }
878 | ],
879 | "source": [
880 | "word_vector = get_vector(glove, 'pakage')\n",
881 | "\n",
882 | "print_tuples(closest_words(glove, word_vector + misspelling_vector))"
883 | ]
884 | },
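885 |   {
886 |    "cell_type": "markdown",
887 |    "metadata": {},
888 |    "source": [
889 |     "Putting this together, a small helper (a sketch using the `misspelling_vector` computed above) turns this into a one-line spell checker:\n",
890 |     "\n",
891 |     "```python\n",
892 |     "def correct_spelling(embeddings, word, n = 5):\n",
893 |     "    word_vector = get_vector(embeddings, word)\n",
894 |     "    return closest_words(embeddings, word_vector + misspelling_vector, n)\n",
895 |     "```"
896 |    ]
897 |   },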
885 | {
886 | "cell_type": "markdown",
887 | "metadata": {},
888 | "source": [
889 | "For a more in-depth look at this, check out the [write-up](https://blog.usejournal.com/a-simple-spell-checker-built-from-word-vectors-9f28452b6f26)."
890 | ]
891 | }
892 | ],
893 | "metadata": {
894 | "kernelspec": {
895 | "display_name": "Python 3",
896 | "language": "python",
897 | "name": "python3"
898 | },
899 | "language_info": {
900 | "codemirror_mode": {
901 | "name": "ipython",
902 | "version": 3
903 | },
904 | "file_extension": ".py",
905 | "mimetype": "text/x-python",
906 | "name": "python",
907 | "nbconvert_exporter": "python",
908 | "pygments_lexer": "ipython3",
909 | "version": "3.7.0"
910 | }
911 | },
912 | "nbformat": 4,
913 | "nbformat_minor": 2
914 | }
--------------------------------------------------------------------------------
/legacy/C - Loading, Saving and Freezing Embeddings.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# C - Loading, Saving and Freezing Embeddings\n",
8 | "\n",
9 | "This notebook will cover: how to load custom word embeddings in TorchText, how to save all the embeddings we learn during training and how to freeze/unfreeze embeddings during training. \n",
10 | "\n",
11 | "## Loading Custom Embeddings\n",
12 | "\n",
13 |     "First, let's look at loading a custom set of embeddings.\n",
14 | "\n",
15 | "Your embeddings need to be formatted so each line starts with the word followed by the values of the embedding vector, all space separated. All vectors need to have the same number of elements.\n",
16 | "\n",
17 | "Let's look at the custom embeddings provided by these tutorials. These are 20-dimensional embeddings for 7 words."
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 1,
23 | "metadata": {},
24 | "outputs": [
25 | {
26 | "name": "stdout",
27 | "output_type": "stream",
28 | "text": [
29 | "good 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0\n",
30 | "great 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0\n",
31 | "awesome 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0\n",
32 | "bad -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0\n",
33 | "terrible -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0\n",
34 | "awful -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0\n",
35 | "kwyjibo 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5\n",
36 | "\n"
37 | ]
38 | }
39 | ],
40 | "source": [
41 | "with open('custom_embeddings/embeddings.txt', 'r') as f:\n",
42 | " print(f.read())"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "Now, let's setup the fields."
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 2,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "import torch\n",
59 | "from torchtext.legacy import data\n",
60 | "\n",
61 | "SEED = 1234\n",
62 | "\n",
63 | "torch.manual_seed(SEED)\n",
64 | "torch.backends.cudnn.deterministic = True\n",
65 | "\n",
66 | "TEXT = data.Field(tokenize = 'spacy')\n",
67 | "LABEL = data.LabelField(dtype = torch.float)"
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "Then, we'll load our dataset and create the validation set."
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 3,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "from torchtext.legacy import datasets\n",
84 | "import random\n",
85 | "\n",
86 | "train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)\n",
87 | "\n",
88 | "train_data, valid_data = train_data.split(random_state = random.seed(SEED))"
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "metadata": {},
94 | "source": [
95 | "We can only load our custom embeddings after they have been turned into a `Vectors` object.\n",
96 | "\n",
97 |     "We create a `Vectors` object by passing it the location of the embeddings (`name`), a location for the cached embeddings (`cache`) and a function that will later initialize the vectors of tokens in our vocabulary that aren't within our embeddings (`unk_init`). As we have done in previous notebooks, we have initialized these to $\\mathcal{N}(0,1)$."
98 | ]
99 | },
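100 |   {
101 |    "cell_type": "markdown",
102 |    "metadata": {},
103 |    "source": [
104 |     "`unk_init` can be any function that takes in a tensor and returns a tensor of the same size. `torch.Tensor.normal_` fills a tensor in-place with samples from $\\mathcal{N}(0,1)$, e.g. a quick sketch:\n",
105 |     "\n",
106 |     "```python\n",
107 |     "torch.empty(3).normal_() #a tensor of 3 values drawn from N(0, 1)\n",
108 |     "```"
109 |    ]
110 |   },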
100 | {
101 | "cell_type": "code",
102 | "execution_count": 4,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stderr",
107 | "output_type": "stream",
108 | "text": [
109 | " 0%| | 0/7 [00:00, ?it/s]\n"
110 | ]
111 | }
112 | ],
113 | "source": [
114 | "import torchtext.vocab as vocab\n",
115 | "\n",
116 | "custom_embeddings = vocab.Vectors(name = 'custom_embeddings/embeddings.txt',\n",
117 | " cache = 'custom_embeddings',\n",
118 | " unk_init = torch.Tensor.normal_)"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "To check the embeddings have loaded correctly we can print out the words loaded from our custom embedding."
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 5,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "name": "stdout",
135 | "output_type": "stream",
136 | "text": [
137 | "{'good': 0, 'great': 1, 'awesome': 2, 'bad': 3, 'terrible': 4, 'awful': 5, 'kwyjibo': 6}\n"
138 | ]
139 | }
140 | ],
141 | "source": [
142 | "print(custom_embeddings.stoi)"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "We can also directly print out the embedding values."
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 6,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "name": "stdout",
159 | "output_type": "stream",
160 | "text": [
161 | "tensor([[ 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,\n",
162 | " 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,\n",
163 | " 1.0000, 1.0000, 1.0000, 1.0000],\n",
164 | " [ 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,\n",
165 | " 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,\n",
166 | " 1.0000, 1.0000, 1.0000, 1.0000],\n",
167 | " [ 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,\n",
168 | " 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,\n",
169 | " 1.0000, 1.0000, 1.0000, 1.0000],\n",
170 | " [-1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,\n",
171 | " -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,\n",
172 | " -1.0000, -1.0000, -1.0000, -1.0000],\n",
173 | " [-1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,\n",
174 | " -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,\n",
175 | " -1.0000, -1.0000, -1.0000, -1.0000],\n",
176 | " [-1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,\n",
177 | " -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,\n",
178 | " -1.0000, -1.0000, -1.0000, -1.0000],\n",
179 | " [ 0.5000, -0.5000, 0.5000, -0.5000, 0.5000, -0.5000, 0.5000, -0.5000,\n",
180 | " 0.5000, -0.5000, 0.5000, -0.5000, 0.5000, -0.5000, 0.5000, -0.5000,\n",
181 | " 0.5000, -0.5000, 0.5000, -0.5000]])\n"
182 | ]
183 | }
184 | ],
185 | "source": [
186 | "print(custom_embeddings.vectors)"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {},
192 | "source": [
193 | "We then build our vocabulary, passing our `Vectors` object.\n",
194 | "\n",
195 | "Note that the `unk_init` should be declared when creating our `Vectors`, and not here!"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 7,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "MAX_VOCAB_SIZE = 25_000\n",
205 | "\n",
206 | "TEXT.build_vocab(train_data, \n",
207 | " max_size = MAX_VOCAB_SIZE, \n",
208 | " vectors = custom_embeddings)\n",
209 | "\n",
210 | "LABEL.build_vocab(train_data)"
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "metadata": {},
216 | "source": [
217 | "Now our vocabulary vectors for the words in our custom embeddings should match what we loaded."
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 8,
223 | "metadata": {},
224 | "outputs": [
225 | {
226 | "data": {
227 | "text/plain": [
228 | "tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
229 | " 1., 1.])"
230 | ]
231 | },
232 | "execution_count": 8,
233 | "metadata": {},
234 | "output_type": "execute_result"
235 | }
236 | ],
237 | "source": [
238 | "TEXT.vocab.vectors[TEXT.vocab.stoi['good']]"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 9,
244 | "metadata": {},
245 | "outputs": [
246 | {
247 | "data": {
248 | "text/plain": [
249 | "tensor([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n",
250 | " -1., -1., -1., -1., -1., -1.])"
251 | ]
252 | },
253 | "execution_count": 9,
254 | "metadata": {},
255 | "output_type": "execute_result"
256 | }
257 | ],
258 | "source": [
259 | "TEXT.vocab.vectors[TEXT.vocab.stoi['bad']]"
260 | ]
261 | },
262 | {
263 | "cell_type": "markdown",
264 | "metadata": {},
265 | "source": [
266 |     "Words in our vocabulary that weren't in our custom embeddings are initialized using the `unk_init` function we passed earlier, $\\mathcal{N}(0,1)$. They are also the same size as our custom embeddings (20-dimensional)."
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 10,
272 | "metadata": {},
273 | "outputs": [
274 | {
275 | "data": {
276 | "text/plain": [
277 | "tensor([-0.1117, -0.4966, 0.1631, -0.8817, 0.2891, 0.4899, -0.3853, -0.7120,\n",
278 | " 0.6369, -0.7141, -1.0831, -0.5547, -1.3248, 0.6970, -0.6631, 1.2158,\n",
279 | " -2.5273, 1.4778, -0.1696, -0.9919])"
280 | ]
281 | },
282 | "execution_count": 10,
283 | "metadata": {},
284 | "output_type": "execute_result"
285 | }
286 | ],
287 | "source": [
288 | "TEXT.vocab.vectors[TEXT.vocab.stoi['kwjibo']]"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {},
294 | "source": [
295 | "The rest of the set-up is the same as it is when using the GloVe vectors, with the next step being to set-up the iterators."
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 11,
301 | "metadata": {},
302 | "outputs": [],
303 | "source": [
304 | "BATCH_SIZE = 64\n",
305 | "\n",
306 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
307 | "\n",
308 | "train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(\n",
309 | " (train_data, valid_data, test_data), \n",
310 | " batch_size = BATCH_SIZE,\n",
311 | " device = device)"
312 | ]
313 | },
314 | {
315 | "cell_type": "markdown",
316 | "metadata": {},
317 | "source": [
318 | "Then, we define our model."
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 12,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "import torch.nn as nn\n",
328 | "import torch.nn.functional as F\n",
329 | "\n",
330 | "class CNN(nn.Module):\n",
331 | " def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, \n",
332 | " dropout, pad_idx):\n",
333 | " super().__init__()\n",
334 | " \n",
335 | " self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)\n",
336 | " \n",
337 | " self.convs = nn.ModuleList([\n",
338 | " nn.Conv2d(in_channels = 1, \n",
339 | " out_channels = n_filters, \n",
340 | " kernel_size = (fs, embedding_dim)) \n",
341 | " for fs in filter_sizes\n",
342 | " ])\n",
343 | " \n",
344 | " self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)\n",
345 | " \n",
346 | " self.dropout = nn.Dropout(dropout)\n",
347 | " \n",
348 | " def forward(self, text):\n",
349 | " \n",
350 | " #text = [sent len, batch size]\n",
351 | " \n",
352 | " text = text.permute(1, 0)\n",
353 | " \n",
354 | " #text = [batch size, sent len]\n",
355 | " \n",
356 | " embedded = self.embedding(text)\n",
357 | " \n",
358 | " #embedded = [batch size, sent len, emb dim]\n",
359 | " \n",
360 | " embedded = embedded.unsqueeze(1)\n",
361 | " \n",
362 | " #embedded = [batch size, 1, sent len, emb dim]\n",
363 | " \n",
364 | " conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]\n",
365 | " \n",
366 | " #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]\n",
367 | " \n",
368 | " pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]\n",
369 | " \n",
370 | " #pooled_n = [batch size, n_filters]\n",
371 | " \n",
372 | " cat = self.dropout(torch.cat(pooled, dim = 1))\n",
373 | "\n",
374 | " #cat = [batch size, n_filters * len(filter_sizes)]\n",
375 | " \n",
376 | " return self.fc(cat)"
377 | ]
378 | },
379 | {
380 | "cell_type": "markdown",
381 | "metadata": {},
382 | "source": [
383 | "We then initialize our model, making sure `EMBEDDING_DIM` is the same as our custom embedding dimensionality, i.e. 20."
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 13,
389 | "metadata": {},
390 | "outputs": [],
391 | "source": [
392 | "INPUT_DIM = len(TEXT.vocab)\n",
393 | "EMBEDDING_DIM = 20\n",
394 | "N_FILTERS = 100\n",
395 | "FILTER_SIZES = [3,4,5]\n",
396 | "OUTPUT_DIM = 1\n",
397 | "DROPOUT = 0.5\n",
398 | "PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]\n",
399 | "\n",
400 | "model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)"
401 | ]
402 | },
403 | {
404 | "cell_type": "markdown",
405 | "metadata": {},
406 | "source": [
407 |     "This model has far fewer parameters than before due to the smaller embedding size used."
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 14,
413 | "metadata": {},
414 | "outputs": [
415 | {
416 | "name": "stdout",
417 | "output_type": "stream",
418 | "text": [
419 | "The model has 524,641 trainable parameters\n"
420 | ]
421 | }
422 | ],
423 | "source": [
424 | "def count_parameters(model):\n",
425 | " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
426 | "\n",
427 | "print(f'The model has {count_parameters(model):,} trainable parameters')"
428 | ]
429 | },
430 | {
431 | "cell_type": "markdown",
432 | "metadata": {},
433 | "source": [
434 | "Next, we initialize our embedding layer to use our vocabulary vectors."
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 15,
440 | "metadata": {},
441 | "outputs": [
442 | {
443 | "data": {
444 | "text/plain": [
445 | "tensor([[-0.1117, -0.4966, 0.1631, ..., 1.4778, -0.1696, -0.9919],\n",
446 | " [-0.5675, -0.2772, -2.1834, ..., 0.8504, 1.0534, 0.3692],\n",
447 | " [-0.0552, -0.6125, 0.7500, ..., -0.1261, -1.6770, 1.2068],\n",
448 | " ...,\n",
449 | " [ 0.5383, -0.1504, 1.6720, ..., -0.3857, -1.0168, 0.1849],\n",
450 | " [ 2.5640, -0.8564, -0.0219, ..., -0.3389, 0.2203, -1.6119],\n",
451 | " [ 0.1203, 1.5286, 0.6824, ..., 0.3330, -0.6704, 0.5883]])"
452 | ]
453 | },
454 | "execution_count": 15,
455 | "metadata": {},
456 | "output_type": "execute_result"
457 | }
458 | ],
459 | "source": [
460 | "embeddings = TEXT.vocab.vectors\n",
461 | "\n",
462 | "model.embedding.weight.data.copy_(embeddings)"
463 | ]
464 | },
465 | {
466 | "cell_type": "markdown",
467 | "metadata": {},
468 | "source": [
469 | "Then, we initialize the unknown and padding token embeddings to all zeros."
470 | ]
471 | },
472 | {
473 | "cell_type": "code",
474 | "execution_count": 16,
475 | "metadata": {},
476 | "outputs": [],
477 | "source": [
478 | "UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]\n",
479 | "\n",
480 | "model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)\n",
481 | "model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)"
482 | ]
483 | },
484 | {
485 | "cell_type": "markdown",
486 | "metadata": {},
487 | "source": [
488 | "Following standard procedure, we create our optimizer."
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": 17,
494 | "metadata": {},
495 | "outputs": [],
496 | "source": [
497 | "import torch.optim as optim\n",
498 | "\n",
499 | "optimizer = optim.Adam(model.parameters())"
500 | ]
501 | },
502 | {
503 | "cell_type": "markdown",
504 | "metadata": {},
505 | "source": [
506 | "Define our loss function (criterion)."
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "execution_count": 18,
512 | "metadata": {},
513 | "outputs": [],
514 | "source": [
515 | "criterion = nn.BCEWithLogitsLoss()"
516 | ]
517 | },
518 | {
519 | "cell_type": "markdown",
520 | "metadata": {},
521 | "source": [
522 | "Then place the loss function and the model on the GPU."
523 | ]
524 | },
525 | {
526 | "cell_type": "code",
527 | "execution_count": 19,
528 | "metadata": {},
529 | "outputs": [],
530 | "source": [
531 | "model = model.to(device)\n",
532 | "criterion = criterion.to(device)"
533 | ]
534 | },
535 | {
536 | "cell_type": "markdown",
537 | "metadata": {},
538 | "source": [
539 | "Create the function to calculate accuracy."
540 | ]
541 | },
542 | {
543 | "cell_type": "code",
544 | "execution_count": 20,
545 | "metadata": {},
546 | "outputs": [],
547 | "source": [
548 | "def binary_accuracy(preds, y):\n",
549 | " rounded_preds = torch.round(torch.sigmoid(preds))\n",
550 | " correct = (rounded_preds == y).float()\n",
551 | " acc = correct.sum() / len(correct)\n",
552 | " return acc"
553 | ]
554 | },
555 | {
556 | "cell_type": "markdown",
557 | "metadata": {},
558 | "source": [
559 | "Then implement our training function..."
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": 21,
565 | "metadata": {},
566 | "outputs": [],
567 | "source": [
568 | "def train(model, iterator, optimizer, criterion):\n",
569 | " \n",
570 | " epoch_loss = 0\n",
571 | " epoch_acc = 0\n",
572 | " \n",
573 | " model.train()\n",
574 | " \n",
575 | " for batch in iterator:\n",
576 | " \n",
577 | " optimizer.zero_grad()\n",
578 | " \n",
579 | " predictions = model(batch.text).squeeze(1)\n",
580 | " \n",
581 | " loss = criterion(predictions, batch.label)\n",
582 | " \n",
583 | " acc = binary_accuracy(predictions, batch.label)\n",
584 | " \n",
585 | " loss.backward()\n",
586 | " \n",
587 | " optimizer.step()\n",
588 | " \n",
589 | " epoch_loss += loss.item()\n",
590 | " epoch_acc += acc.item()\n",
591 | " \n",
592 | " return epoch_loss / len(iterator), epoch_acc / len(iterator)"
593 | ]
594 | },
595 | {
596 | "cell_type": "markdown",
597 | "metadata": {},
598 | "source": [
599 | "...evaluation function..."
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": 22,
605 | "metadata": {},
606 | "outputs": [],
607 | "source": [
608 | "def evaluate(model, iterator, criterion):\n",
609 | " \n",
610 | " epoch_loss = 0\n",
611 | " epoch_acc = 0\n",
612 | " \n",
613 | " model.eval()\n",
614 | " \n",
615 | " with torch.no_grad():\n",
616 | " \n",
617 | " for batch in iterator:\n",
618 | " \n",
619 | " predictions = model(batch.text).squeeze(1)\n",
620 | " \n",
621 | " loss = criterion(predictions, batch.label)\n",
622 | " \n",
623 | " acc = binary_accuracy(predictions, batch.label)\n",
624 | "\n",
625 | " epoch_loss += loss.item()\n",
626 | " epoch_acc += acc.item()\n",
627 | " \n",
628 | " return epoch_loss / len(iterator), epoch_acc / len(iterator)"
629 | ]
630 | },
631 | {
632 | "cell_type": "markdown",
633 | "metadata": {},
634 | "source": [
635 | "...and our helpful function that tells us how long an epoch takes."
636 | ]
637 | },
638 | {
639 | "cell_type": "code",
640 | "execution_count": 23,
641 | "metadata": {},
642 | "outputs": [],
643 | "source": [
644 | "import time\n",
645 | "\n",
646 | "def epoch_time(start_time, end_time):\n",
647 | " elapsed_time = end_time - start_time\n",
648 | " elapsed_mins = int(elapsed_time / 60)\n",
649 | " elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n",
650 | " return elapsed_mins, elapsed_secs"
651 | ]
652 | },
653 | {
654 | "cell_type": "markdown",
655 | "metadata": {},
656 | "source": [
657 | "We've finally reached training our model!\n",
658 | "\n",
659 | "## Freezing and Unfreezing Embeddings\n",
660 | "\n",
661 |     "We're going to train our model for 10 epochs. During the first 5 epochs we are going to freeze the weights (parameters) of our embedding layer. For the last 5 epochs we'll allow our embeddings to be trained. \n",
662 | "\n",
663 |     "Why would we ever want to do this? Sometimes the pre-trained word embeddings we use will already be good enough and won't need to be fine-tuned with our model. If we keep the embeddings frozen then we don't have to calculate the gradients and update the weights for these parameters, giving us faster training times. This doesn't really apply for the model used here, but we're mainly covering it to show how it's done. Another reason is that if our model has a large number of parameters it may make training difficult, so by freezing our pre-trained embeddings we reduce the number of parameters that need to be learned.\n",
664 | "\n",
665 | "To freeze the embedding weights, we set `model.embedding.weight.requires_grad` to `False`. This will cause no gradients to be calculated for the weights in the embedding layer, and thus no parameters will be updated when `optimizer.step()` is called.\n",
666 | "\n",
667 | "Then, during training we check if `FREEZE_FOR` (which we set to 5) epochs have passed. If they have then we set `model.embedding.weight.requires_grad` to `True`, telling PyTorch that we should calculate gradients in the embedding layer and update them with our optimizer."
668 | ]
669 | },
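670 |   {
671 |    "cell_type": "markdown",
672 |    "metadata": {},
673 |    "source": [
674 |     "As an aside, if we wanted the embedding layer frozen from the start, PyTorch can also build a frozen embedding layer directly from pre-trained vectors via `nn.Embedding.from_pretrained`; a sketch:\n",
675 |     "\n",
676 |     "```python\n",
677 |     "frozen_embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze = True)\n",
678 |     "```\n",
679 |     "\n",
680 |     "We toggle `requires_grad` manually below instead, as we want to unfreeze the layer part-way through training."
681 |    ]
682 |   },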
670 | {
671 | "cell_type": "code",
672 | "execution_count": 24,
673 | "metadata": {},
674 | "outputs": [
675 | {
676 | "name": "stdout",
677 | "output_type": "stream",
678 | "text": [
679 | "Epoch: 01 | Epoch Time: 0m 7s | Frozen? True\n",
680 | "\tTrain Loss: 0.724 | Train Acc: 53.68%\n",
681 | "\t Val. Loss: 0.658 | Val. Acc: 62.27%\n",
682 | "Epoch: 02 | Epoch Time: 0m 6s | Frozen? True\n",
683 | "\tTrain Loss: 0.670 | Train Acc: 59.36%\n",
684 | "\t Val. Loss: 0.626 | Val. Acc: 67.51%\n",
685 | "Epoch: 03 | Epoch Time: 0m 6s | Frozen? True\n",
686 | "\tTrain Loss: 0.636 | Train Acc: 63.62%\n",
687 | "\t Val. Loss: 0.592 | Val. Acc: 70.22%\n",
688 | "Epoch: 04 | Epoch Time: 0m 6s | Frozen? True\n",
689 | "\tTrain Loss: 0.613 | Train Acc: 66.22%\n",
690 | "\t Val. Loss: 0.573 | Val. Acc: 71.77%\n",
691 | "Epoch: 05 | Epoch Time: 0m 6s | Frozen? True\n",
692 | "\tTrain Loss: 0.599 | Train Acc: 67.40%\n",
693 | "\t Val. Loss: 0.569 | Val. Acc: 70.86%\n",
694 | "Epoch: 06 | Epoch Time: 0m 7s | Frozen? False\n",
695 | "\tTrain Loss: 0.577 | Train Acc: 69.53%\n",
696 | "\t Val. Loss: 0.520 | Val. Acc: 76.17%\n",
697 | "Epoch: 07 | Epoch Time: 0m 7s | Frozen? False\n",
698 | "\tTrain Loss: 0.544 | Train Acc: 72.21%\n",
699 | "\t Val. Loss: 0.487 | Val. Acc: 78.03%\n",
700 | "Epoch: 08 | Epoch Time: 0m 7s | Frozen? False\n",
701 | "\tTrain Loss: 0.507 | Train Acc: 74.96%\n",
702 | "\t Val. Loss: 0.450 | Val. Acc: 80.02%\n",
703 | "Epoch: 09 | Epoch Time: 0m 7s | Frozen? False\n",
704 | "\tTrain Loss: 0.469 | Train Acc: 77.72%\n",
705 | "\t Val. Loss: 0.420 | Val. Acc: 81.79%\n",
706 | "Epoch: 10 | Epoch Time: 0m 7s | Frozen? False\n",
707 | "\tTrain Loss: 0.426 | Train Acc: 80.28%\n",
708 | "\t Val. Loss: 0.392 | Val. Acc: 82.76%\n"
709 | ]
710 | }
711 | ],
712 | "source": [
713 | "N_EPOCHS = 10\n",
714 | "FREEZE_FOR = 5\n",
715 | "\n",
716 | "best_valid_loss = float('inf')\n",
717 | "\n",
718 | "#freeze embeddings\n",
719 | "model.embedding.weight.requires_grad = unfrozen = False\n",
720 | "\n",
721 | "for epoch in range(N_EPOCHS):\n",
722 | "\n",
723 | " start_time = time.time()\n",
724 | " \n",
725 | " train_loss, train_acc = train(model, train_iterator, optimizer, criterion)\n",
726 | " valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)\n",
727 | " \n",
728 | " end_time = time.time()\n",
729 | "\n",
730 | " epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n",
731 | " \n",
732 | " print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s | Frozen? {not unfrozen}')\n",
733 | " print(f'\\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')\n",
734 | " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')\n",
735 | " \n",
736 | " if valid_loss < best_valid_loss:\n",
737 | " best_valid_loss = valid_loss\n",
738 | " torch.save(model.state_dict(), 'tutC-model.pt')\n",
739 | " \n",
740 | " if (epoch + 1) >= FREEZE_FOR:\n",
741 | " #unfreeze embeddings\n",
742 | " model.embedding.weight.requires_grad = unfrozen = True"
743 | ]
744 | },
745 | {
746 | "cell_type": "markdown",
747 | "metadata": {},
748 | "source": [
749 |     "Another option would be to unfreeze the embeddings whenever the validation loss stops decreasing, using the following code snippet instead of the `FREEZE_FOR` condition:\n",
750 | " \n",
751 | "```python\n",
752 | "if valid_loss < best_valid_loss:\n",
753 | " best_valid_loss = valid_loss\n",
754 | " torch.save(model.state_dict(), 'tutC-model.pt')\n",
755 | "else:\n",
756 | " #unfreeze embeddings\n",
757 | " model.embedding.weight.requires_grad = unfrozen = True\n",
758 | "```"
759 | ]
760 | },
761 | {
762 | "cell_type": "code",
763 | "execution_count": 25,
764 | "metadata": {},
765 | "outputs": [
766 | {
767 | "name": "stdout",
768 | "output_type": "stream",
769 | "text": [
770 | "Test Loss: 0.396 | Test Acc: 82.36%\n"
771 | ]
772 | }
773 | ],
774 | "source": [
775 | "model.load_state_dict(torch.load('tutC-model.pt'))\n",
776 | "\n",
777 | "test_loss, test_acc = evaluate(model, test_iterator, criterion)\n",
778 | "\n",
779 | "print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')"
780 | ]
781 | },
782 | {
783 | "cell_type": "markdown",
784 | "metadata": {},
785 | "source": [
786 | "## Saving Embeddings\n",
787 | "\n",
788 | "We might want to re-use the embeddings we have trained here with another model. To do this, we'll write a function that will loop through our vocabulary, getting the word and embedding for each word, writing them to a text file in the same format as our custom embeddings so they can be used with TorchText again.\n",
789 | "\n",
790 |     "Currently, TorchText Vectors seem to have issues with loading certain unicode words, so we skip these by only writing words made up purely of ASCII characters (a word is pure ASCII exactly when its UTF-8 encoding has the same number of bytes as the string has characters, which is the check used below). **If you know a better solution to this, let me know.**"
791 | ]
792 | },
793 | {
794 | "cell_type": "code",
795 | "execution_count": 26,
796 | "metadata": {},
797 | "outputs": [],
798 | "source": [
799 | "from tqdm import tqdm\n",
800 | "\n",
801 | "def write_embeddings(path, embeddings, vocab):\n",
802 | " \n",
803 | " with open(path, 'w') as f:\n",
804 | " for i, embedding in enumerate(tqdm(embeddings)):\n",
805 | " word = vocab.itos[i]\n",
806 | " #skip words with unicode symbols\n",
807 | " if len(word) != len(word.encode()):\n",
808 | " continue\n",
809 | " vector = ' '.join([str(i) for i in embedding.tolist()])\n",
810 | " f.write(f'{word} {vector}\\n')"
811 | ]
812 | },
813 | {
814 | "cell_type": "markdown",
815 | "metadata": {},
816 | "source": [
817 | "We'll write our embeddings to `trained_embeddings.txt`."
818 | ]
819 | },
820 | {
821 | "cell_type": "code",
822 | "execution_count": 27,
823 | "metadata": {},
824 | "outputs": [
825 | {
826 | "name": "stderr",
827 | "output_type": "stream",
828 | "text": [
829 | "100%|██████████| 25002/25002 [00:00<00:00, 38085.03it/s]\n"
830 | ]
831 | }
832 | ],
833 | "source": [
834 | "write_embeddings('custom_embeddings/trained_embeddings.txt', \n",
835 | " model.embedding.weight.data, \n",
836 | " TEXT.vocab)"
837 | ]
838 | },
839 | {
840 | "cell_type": "markdown",
841 | "metadata": {},
842 | "source": [
843 | "To double check they've written correctly, we can load them as `Vectors`."
844 | ]
845 | },
846 | {
847 | "cell_type": "code",
848 | "execution_count": 28,
849 | "metadata": {},
850 | "outputs": [
851 | {
852 | "name": "stderr",
853 | "output_type": "stream",
854 | "text": [
855 | " 70%|███████ | 17550/24946 [00:00<00:00, 87559.48it/s]\n"
856 | ]
857 | }
858 | ],
859 | "source": [
860 | "trained_embeddings = vocab.Vectors(name = 'custom_embeddings/trained_embeddings.txt',\n",
861 | " cache = 'custom_embeddings',\n",
862 | " unk_init = torch.Tensor.normal_)"
863 | ]
864 | },
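865 |   {
866 |    "cell_type": "markdown",
867 |    "metadata": {},
868 |    "source": [
869 |     "These loaded `Vectors` can then be passed to `build_vocab` exactly as we did with the original custom embeddings, e.g.:\n",
870 |     "\n",
871 |     "```python\n",
872 |     "TEXT.build_vocab(train_data,\n",
873 |     "                 max_size = MAX_VOCAB_SIZE,\n",
874 |     "                 vectors = trained_embeddings)\n",
875 |     "```"
876 |    ]
877 |   },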
865 | {
866 | "cell_type": "markdown",
867 | "metadata": {},
868 | "source": [
869 |     "Finally, let's print out the first 5 rows of our loaded vectors and the same rows from our model's embedding weights, checking they contain the same values."
870 | ]
871 | },
872 | {
873 | "cell_type": "code",
874 | "execution_count": 29,
875 | "metadata": {},
876 | "outputs": [
877 | {
878 | "name": "stdout",
879 | "output_type": "stream",
880 | "text": [
881 | "tensor([[-0.2573, -0.2088, 0.2413, -0.1549, 0.1940, -0.1466, -0.2195, -0.1011,\n",
882 | " -0.1327, 0.1803, 0.2369, -0.2182, 0.1543, -0.2150, -0.0699, -0.0430,\n",
883 | " -0.1958, -0.0506, -0.0059, -0.0024],\n",
884 | " [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,\n",
885 | " 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,\n",
886 | " 0.0000, 0.0000, 0.0000, 0.0000],\n",
887 | " [-0.1427, -0.4414, 0.7181, -0.5751, -0.3183, 0.0552, -1.6764, -0.3177,\n",
888 | " 0.6592, 1.6143, -0.1920, -0.1881, -0.4321, -0.8578, 0.5266, 0.5243,\n",
889 | " -0.7083, -0.0048, -1.4680, 1.1425],\n",
890 | " [-0.4700, -0.0363, 0.0560, -0.7394, -0.2412, -0.4197, -1.7096, 0.9444,\n",
891 | " 0.9633, 0.3703, -0.2243, -1.5279, -1.9086, 0.5718, -0.5721, -0.6015,\n",
892 | " 0.3579, -0.3834, 0.8079, 1.0553],\n",
893 | " [-0.7055, 0.0954, 0.4646, -1.6595, 0.1138, 0.2208, -0.0220, 0.7397,\n",
894 | " -0.1153, 0.3586, 0.3040, -0.6414, -0.1579, -0.2738, -0.6942, 0.0083,\n",
895 | " 1.4097, 1.5225, 0.6409, 0.0076]])\n"
896 | ]
897 | }
898 | ],
899 | "source": [
900 | "print(trained_embeddings.vectors[:5])"
901 | ]
902 | },
903 | {
904 | "cell_type": "code",
905 | "execution_count": 30,
906 | "metadata": {},
907 | "outputs": [
908 | {
909 | "name": "stdout",
910 | "output_type": "stream",
911 | "text": [
912 | "tensor([[-0.2573, -0.2088, 0.2413, -0.1549, 0.1940, -0.1466, -0.2195, -0.1011,\n",
913 | " -0.1327, 0.1803, 0.2369, -0.2182, 0.1543, -0.2150, -0.0699, -0.0430,\n",
914 | " -0.1958, -0.0506, -0.0059, -0.0024],\n",
915 | " [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,\n",
916 | " 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,\n",
917 | " 0.0000, 0.0000, 0.0000, 0.0000],\n",
918 | " [-0.1427, -0.4414, 0.7181, -0.5751, -0.3183, 0.0552, -1.6764, -0.3177,\n",
919 | " 0.6592, 1.6143, -0.1920, -0.1881, -0.4321, -0.8578, 0.5266, 0.5243,\n",
920 | " -0.7083, -0.0048, -1.4680, 1.1425],\n",
921 | " [-0.4700, -0.0363, 0.0560, -0.7394, -0.2412, -0.4197, -1.7096, 0.9444,\n",
922 | " 0.9633, 0.3703, -0.2243, -1.5279, -1.9086, 0.5718, -0.5721, -0.6015,\n",
923 | " 0.3579, -0.3834, 0.8079, 1.0553],\n",
924 | " [-0.7055, 0.0954, 0.4646, -1.6595, 0.1138, 0.2208, -0.0220, 0.7397,\n",
925 | " -0.1153, 0.3586, 0.3040, -0.6414, -0.1579, -0.2738, -0.6942, 0.0083,\n",
926 | " 1.4097, 1.5225, 0.6409, 0.0076]], device='cuda:0')\n"
927 | ]
928 | }
929 | ],
930 | "source": [
931 | "print(model.embedding.weight.data[:5])"
932 | ]
933 | },
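934 | {
935 | "cell_type": "markdown",
936 | "metadata": {},
937 | "source": [
938 | "Rather than eyeballing the two print-outs, we can compare every saved vector against the model's weights programmatically. The following is a minimal sketch, assuming `trained_embeddings`, `model` and `TEXT` from the cells above; the tolerance is an assumption about the precision the vectors were written with:"
939 | ]
940 | },
941 | {
942 | "cell_type": "code",
943 | "execution_count": null,
944 | "metadata": {},
945 | "outputs": [],
946 | "source": [
947 | "# compare each vector loaded from disk against the corresponding model weight\n",
948 | "for word, idx in trained_embeddings.stoi.items():\n",
949 | "    saved = trained_embeddings.vectors[idx]\n",
950 | "    original = model.embedding.weight.data[TEXT.vocab.stoi[word]].cpu()\n",
951 | "    assert torch.allclose(saved, original, atol = 1e-4), f'mismatch for {word}'\n",
952 | "print('all saved embeddings match the model weights')"
953 | ]
954 | },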
934 | {
935 | "cell_type": "markdown",
936 | "metadata": {},
937 | "source": [
938 | "All looks good! The only difference between the two is the removal of the ~50 words in the vocabulary that contain unicode symbols."
939 | ]
940 | }
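941 | {
942 | "cell_type": "markdown",
943 | "metadata": {},
944 | "source": [
945 | "To reuse these embeddings in a future experiment, we could pass the loaded `Vectors` to `build_vocab` and copy them into a new model's embedding layer. This is a minimal sketch where `new_TEXT`, `new_train_data` and `new_model` are hypothetical stand-ins for a freshly defined field, dataset and model:"
946 | ]
947 | },
948 | {
949 | "cell_type": "code",
950 | "execution_count": null,
951 | "metadata": {},
952 | "outputs": [],
953 | "source": [
954 | "# `new_TEXT`, `new_train_data` and `new_model` are hypothetical placeholders\n",
955 | "new_TEXT.build_vocab(new_train_data,\n",
956 | "                     max_size = 25_000,\n",
957 | "                     vectors = trained_embeddings)\n",
958 | "\n",
959 | "# copy the loaded vectors into the new model's embedding layer\n",
960 | "new_model.embedding.weight.data.copy_(new_TEXT.vocab.vectors)"
961 | ]
962 | }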
941 | ],
942 | "metadata": {
943 | "kernelspec": {
944 | "display_name": "Python 3",
945 | "language": "python",
946 | "name": "python3"
947 | },
948 | "language_info": {
949 | "codemirror_mode": {
950 | "name": "ipython",
951 | "version": 3
952 | },
953 | "file_extension": ".py",
954 | "mimetype": "text/x-python",
955 | "name": "python",
956 | "nbconvert_exporter": "python",
957 | "pygments_lexer": "ipython3",
958 | "version": "3.6.5"
959 | }
960 | },
961 | "nbformat": 4,
962 | "nbformat_minor": 2
963 | }
--------------------------------------------------------------------------------
/legacy/assets/nbow_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/nbow_model.png
--------------------------------------------------------------------------------
/legacy/assets/nbow_model.xml:
--------------------------------------------------------------------------------
1 | 7VtNc5swEP0tPfjYjpGwTY6N637MpDOZSTtNTh0ZZFArkCvk2O6vrzDiU45LiRnhlFPYRQJpn97TsnJGcB7uPnC0Dj4zD9MRGHu7EXw3AsCygSX/JJ596pnBq9Thc+KpRoXjjvzGyjlW3g3xcFxpKBijgqyrTpdFEXZFxYc4Z9tqsxWj1beukY81x52LqO79RjwRpF4HzAr/R0z8IHuzNVXzC1HWWM0kDpDHtiUXXIzgnDMm0qtwN8c0CV4Wl7Tf+yfu5gPjOBJNOizQ+G34i3+//x2ugz1AX2fe3euJGpvYZxPGnpy/MhkXAfNZhOii8F5ztok8nDx1LK2izQ1ja+m0pPMHFmKvwEQbwaQrECFVd/GOiPvS9UPyqDcTZb3bqScfjH1mRILv78tGqVdiFt0OVtZvRSidM8r4YXbQQ9hZudIfC85+4tKdqevg5UreSSOShOHJQCtXzDbcxSeimy1YxH0sTrQD+XKQPMIsxHIGsh/HFAnyWB0HUgvaz9sVmMsLBfs/LAEAWqyBZ6BeIP1QBvovqBdAP2TPO4U6i4QaCLCl7W7442HFWoYAtkwirJ77iOhGvUlDvMrpbUAEvlujw9y3UtirSLbl1CPmAu9OB12PUdZhqnRUbST2WNnbkiwrV1BS5Mx39qhOB+nsUDpBQ2ZBo9IJW6yBihZdmI4aANisdIKXIJ25BvZFOmeDdHYonbAhs2yjxLIH6ewaYLPSCV+CdOZS2RfptNqkHIN2NqWW3ZBaltlP9jZlm0E8/wlhs+JpvwTxnIC+iafRcueFE8dpKo0meeNovPmkQS4XtKgiVGVBxKIEd0SJH0nTlZHC0n+dcIG4iL5VN0LieYcVcox71VVThyU/C0huxj+xcANl+BTFsbqmaInpLYuJIOzoSG5qDfIRxXIwJPK/pCt0fB46W7XPyAnU6WwfoTPojM5GS3AXTuerpvugSTpfaXQOkMADo8/EaDjuG6ONVoYunNHZkXm/a+bZKEucFgGJB06fidM26BunnYHT7Tnd9ITZaDE3G2WJ0/KDMxw4fSZOT+yecRq0ybxNFKC6pOZzM2PV9ZYR+epCv2tYwzqIqRSoXjUc82E8g8tA47KGdf9LT1O7b6Wnq2EXbE+1yyjLW3rV9tWwB55pD5xO+7YHtmH0cAhTOlxpUn1yTBIa6HntWffCleNi9+heuHQm9uRcX4S1Kk9OE1N7IdSj2E/mdMmApscp0Ozv3/QDlQvMBusMsIBpBujCEmE/xbOrhGHJhGDh/5MwaIUww/kC1H8QtQiX2PPkzKX7Bu1lsFqCX2ZVBwlkBf8y5JW1cFwtn5f01b7iTOd8UM/wbxmjBwQH5E4gB2amkZtoyN2QCCM+UK8RgFZ38inN4r8g0zJW8b+kcPEH
--------------------------------------------------------------------------------
/legacy/assets/padding.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/padding.png
--------------------------------------------------------------------------------
/legacy/assets/padding.xml:
--------------------------------------------------------------------------------
1 | 7ZhNb6MwEIZ/DcdKweSjHBuapNGqe0m1e3bxFKwaTI0JZH/9joMJUNJqu9JucuAEvDOMP57X2MLxgqTaKJrFj5KBcMiEVY537xDierMJXoxyqJWFS2ohUpzZpFbY8V9gRfteVHAGeS9RSyk0z/piKNMUQt3TqFKy7Ke9SNFvNaMRDIRdSMVQ/cmZjmv1lixa/QF4FDctu3O/jiS0SbYjyWPKZNmRvJXjBUpKXd8lVQDCTF4zL/V76w+ip44pSPWfvLCZPjF/7f7YZvtv4rAtgtXj/MZW2VNR2AE7ZC6w3vLZdFkf7DzM3wrTz+WLTPVNfqR0hwmun1VtEO8iez2WKP6+RI6DgjQEk9CUw6EV75tA7bnVSK9BoqEyeqwTgYJrymolXyGQQipUUpmC6RAX4p1EBY9SfAxNN1Bf7kFpjq64s4GEM2aaWZYx17DLaGjaLHENoKZkkTIw0z6x413ThAuzAgJZKI4VyeQ7lDZoTe/6pzF0oTaEsANQdSQLeQMyAa0OmGKjt9ZvdsE19itb97qNFnec22jULpjoVLj1FN5YW33BYuT6LUZGi33JYqev87V4zB94LKYaRlp9FJaWP6Q1/Z+wGvN0aOmY5yOts7Rc79K4hmcEnPJkxHUe1+LSuIb77XZkdZbV7NKovHFlfXLImF3ZtjUd0MqL8HXct87juvy+NRvwwmM8TcxUHQ/2GWUnYTzdf0Ly4lvafEDyaTwwfoTr3+1q+Nj+rjrGOj/9vNVv
--------------------------------------------------------------------------------
/legacy/assets/sentiment1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/sentiment1.png
--------------------------------------------------------------------------------
/legacy/assets/sentiment1.xml:
--------------------------------------------------------------------------------
1 | 7Vpbb5swGP01edwEGNzksc26i7RJk/qw9tEBB7wSnDlOQ/brZ8DmZmeNSCASCZUafGxj+M45X7CdCZiv0i8MraMfNMDxxLGCdAI+TRwHep74nwH7AgAzUAAhI0EB2RXwRP5iCVoS3ZIAbxoNOaUxJ+sm6NMkwT5vYIgxums2W9K4OeoahVgDnnwU6+gvEvCoQKcOrPCvmISRGtmGs6JmgfzXkNFtIsebOGCZH0X1CqlryQfdRCiguxoEHidgzijlxdkqneM4C60KW9Hv84Ha8r4ZTvhRHSRRG75Xz44DEQpZpIxHNKQJih8r9CF/PpxdwRKliK9icWqLU5wS/lw7f8mafPSyUsLZ/ln2yAtV3W/M+V5qAG05FVA17ndK1/KKS5pw2cyGWZnE8ZzGlOX3reIMHjac0Vdcq7HyQ9QUz5o94MFwSWhDt8yXrRwpQMRCLFuBkirhAExXWDyQaMJwjDh5a14dSS2GZbuKD3EiKTHTI4d+Q/EWK0G16GqSsYsIx09rlN/7TnizSZAeMsf3TSEL4AJ6sAzZG2Ycp/8Pmh4O1UGJXSaDqSzuKme5EopqplLYKfGz4U3e78kb6PJ2B5I3GIO8Hety8r67yfs9ebu6vL2B5O2OQt7Ty8l7Ooy8u0q4myBPVZ/s+pMSMUhJk+s2abKtFgGFAWSvFgflbRxFizcGWQN4OVnbWrxOkXWZm6tU/SLr+hd2Pa06vQjbbgsbtigovHUOYd8ZhA1jnklxjZIGY/DPNpui5ZH7sMlDdy8a2HCdVpXiLJSf+VUWCvimEHFHC60VU0haDr8tW9m1rjVY71xvl9+9glvSExbiTUU1fZfQBLdMKiEUkzARRV+IAgv8ITMkEZPoe1mxIkGQi9eUAJqCNivwrNMO++44izvnsLieEkdh8VPnvWaLO22m+rP4dCiLR4jj7i53bi7vOPsa1OVgnC4/dfpvdjloM9Wfy2c9uzxrfMZcwSOy6Z4rQIdcUTzALYUYZ7iDphB3nCmkn0kuaDPVXwpRO1H9vykIw6y6u9+9vSl0nPEPafNyu29UK1n2rBeXe+3pQH9LWbbpTeGktax3Vrm97E/zlKiB+SGvUMOL4zwOaK8RDrjmpeJqSKdlzrLMWfBa8lNb9rMB09N530IG3UfqlrpMu56nrl8eHW192zMyfLNbh77Zr8UQ7uX8oG9xmBg6uBZ7LQzZrVcqOCBF8CiKDi6kXQtFDphdjCJ9P8VE0cH1i2uhCDiXc5G+Hm6i6OAk81oocq2hKBLF6tefxYym+oUtePwH
--------------------------------------------------------------------------------
/legacy/assets/sentiment10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/sentiment10.png
--------------------------------------------------------------------------------
/legacy/assets/sentiment10.xml:
--------------------------------------------------------------------------------
1 | 7VtbT9swFP41fRxK4tZJHkcpDG1Mk5g29mgSN/HmxJXr3vbr5zTODQMCJjiWSF6a8/l+vuPv1FY7QfNifyHJKr8SKeWTwEv3E3Q2CQLf97D+qJBDjWAvrIFMstRU6oBr9pca0DPohqV0PaiohOCKrYZgIsqSJmqAESnFblhtKfhw1BXJqAVcJ4Tb6E+WqrxGoyDs8E+UZXkzso/juqQgTWWzknVOUrHrQWgxQXMphKrfiv2c8sp5jV/qducPlLYTk7RUT2lwMf2exuf+j8vV9jM/XG7miyv8IZqayalDs2KaagcYU0iVi0yUhC869DTZyC2tevW1IcWmTI+Wp62uwRchVqbKb6rUwVBLNkpoKFcFN6V0z9RN7/1X1dXJzFhne9Pz0Tg0Rqnk4aZv9FpVZtfsaDXtlqJU56RgvALmYiMZlXr1X+nOFJpZBlXl2jOVOx70eBORRGZUPeJmHLaE651CRUH1rHRDSTlRbDscgJiQzdp6bdNvgumhA89sryg+iVD3TKO6A7PV4vgE955wOux+rZefUNNjFzf6pTfFDjpG0zMiC6F6uC3hG7O+y0mAuXbT6a12Os5U6+Ne9Cm6V8P4WCsp/tC54EJqpBRlFYJLxvkdiHCWldpMNDuaVXS6pVIxvZM/moKCpekxfnc5U/R6RZJqzJ3WLSuMnxonftzYZgVmr1dD0/3jYWNHg2kw9QY0RsbcdfLT1Mh7ytNg90XPgNvnEhnNRol4A4kIHwiKV5UIH4FqBJ5aGpETRUeZeIFM+AG0TuBRJ95CJ2YQOhHC6sTM0gmVs/WoEy/RCQytE+GoE2+hEzGATgQ+rE5gSyf0zi5GnXiBTgQesE7g0CLTom3ovvsc3GP0WTvRorVP/SRAy2WiHytOdAnCKEZpG2T/xYmP3DoL4shlTpKU3ka3r85J6Bgn8chJe13sCCdNiLxrTmaOceK7zAlMPgG/NAgDl0mBSSjwpNhX9++OlLsZBZ4U+670/ZEyc40U+2LKIVKAcgr0BVNo3wI4RApQTgEnxenTPFBOASdlPM7bOQWcFKfP8zA5Bfwysjm8ukkKTE6BJ8XpEz1MToEnZTzRWzkFnpQx0QdB7NZ9ZOR0nofhBPxAH4953iYF+htxPOZ5i5RXTCna7P6VUP80o/tvB1r8Aw==
--------------------------------------------------------------------------------
/legacy/assets/sentiment11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/sentiment11.png
--------------------------------------------------------------------------------
/legacy/assets/sentiment11.xml:
--------------------------------------------------------------------------------
1 | 7Vtdb5swFP01PK4CnBh4XNO0q7ZOkzpt3aMDDngzOHJMQvbrZ4L5qtuq7dTYUuEl3OPve67Pja3EAYu8uuJok92wBFPHd5PKAReO73ueC+VHjRwaBLpBA6ScJKpSD9ySv1iBrkJLkuDtqKJgjAqyGYMxKwocixGGOGf7cbU1o+NRNyjFGnAbI6qjP0kisgYN/aDHP2GSZu3IHoyakhy1ldVKthlK2H4AgaUDFpwx0bzl1QLT2nmtX5p2l4+UdhPjuBDPaXA1+55El96P683uMz1cl4vlDfwQztTkxKFdMU6kA5TJuMhYygpElz16Hpd8h+tePWlwVhbJ0XKl1Tf4wthGVfmNhTgoalEpmIQykVNViisi7gbvv+quzubKuqhUz0fj0BqF4Ie7oTFoVZt9s6PVtluzQlyinNAaWLCSE8zl6r/ivSpUs/Tryo1nanc86vE2IhFPsXjCzTDoCJc7BbMcy1nJhhxTJMhuPABSIZt29bqm3xiRQ/uu2l5hdBaC/pmFTQdqq0XRGRw8wWzc/VYuP8aqxz5u5Mtgij10jKYXRBYAzXA7REu1vmvHh1S66XwlnQ5T0fl4EH0CV2IcH1vB2R+8YJRxiRSsqENwTSi9ByFK0kKasWRHsgrOd5gLInfyR1WQkyQ5xu8+IwLfblBcj7mXuqWF8XPjxItaW61A7fV6aFw9HTZ6NKgGM3dEY6jMfS8/bY1soDwt9lD0jLh9KZHhfJKIE0hE8EhQvKlEeMCoRsCZphEZEniSiVfIhOeb1gk46cQpdGJuQicCszox13RCZGQ76cRrdAKa1olg0olT6ERkQCd8z6xOQE0n5M7OJ514hU74rmGdgIFGpkbb2H0POXjA6It2okbrkHrHB3GCV+FKixNZAiCIQNIF2X9x4gG7zoIwtJmT9TqWz5tzEljGSWQzJ6fZJ911sSWctCHyrjmZW8aJN3FyP58YvzQIfJtJMZNQzJOiX91bRIqZjGKeFP2u9P2RMreNFP1i6t2RouUU0xdMgX4LYBEphnKKcVKm07yeU4yTYvVx3lBOMU7KdJ7Xcorxy8j28GonKWZyinlSphO9llPMk2L1id5MTjFPypTofT+y6z4ynPK8xonxA31kdZ43RIrpb8TRlFI0Ut4wpUiz/1dC89OM/r8dYPkP
--------------------------------------------------------------------------------
/legacy/assets/sentiment12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/sentiment12.png
--------------------------------------------------------------------------------
/legacy/assets/sentiment12.xml:
--------------------------------------------------------------------------------
1 | 7VvbcpswEP0aHusBZAN6TBw7zbTpZCadNn1UQAa1gDyyfOvXVxhhLoqdOKkRM4aHDLu671mdRavYAONkc8vQPLqnAY4N2ww2BrgxbHsELPE3U2xzhQ2lImQkyFUVxSP5i6XSlNolCfCiVpFTGnMyryt9mqbY5zUdYoyu69VmNK6POkchVhSPPopV7U8S8CjXerZb6j9jEkbFyJYD85IEFZXlShYRCui6ogITA4wZpTx/SzZjHGe2K+ySt5seKN1PjOGUv6XB7fB7AKfWj7v56ku8vVuOJ/fOJ28oJ8e3xYpxIAwgRcp4REOaonhSaq/9JVvhrFdLCIwu02AnmUIqG3yldC6r/MacbyW0aMmpUEU8iWUp3hD+VHn/lXU1GEnpZiN73gnbQkg52z5VhUqrTCyb7aSi3YymfIoSEmeKMV0ygplY/Te8loVylnZWObdMZo6DFi88ErEQ8yNmdtw94GKjYJpgMSvRkOEYcbKqD4Cky4b7evumD5SIoW1T7i4PDjxQPkMv70BuNQgHTuVxh/XuF2L5PpY9ln4jXipTLFU7bzrBswDIh1uheCnXd2fYTizMdP0sjO6EfG/jivdxvOF1/1hwRv/gMY0pE5qUppkLzkgcN1QoJmEqRF+gI1AF1yvMOBE7+UoWJCQIdv67jgjHj3PkZ2OuBW0pbvxWP7FgIcsVyL2eDY03x91G9QbZYGjWYPSkuC7pp6gRVZin0L3kPTVsTwXSG/UU0QJFuAec4qwUYQGtHOEMFY6IEMc9TbyDJixbN084PU+0wRMjHTzh6uWJkcITPCKLnifewxOObp5we55ogyegBp6wLb084Sg8IXZ20vPEO3jCNjXzhOMqYCqw1c33koEriJ60ExVYq9AbNpiJx/MUPxElwAEQBHsn+xAmFujWWdDxekwst2OYwB6Tfbq4I5gULnLRmIw6honVY9KMJ9qTBq7dg9IMKPpBUVP3FwdKM6LoB0XNlV4eKKOugaImpjoEih/gZ++59ZiiO8HkqlmAywOlGVO0g9Lp03w7oCgxRTsonT7OtwRKM6ZoB6XT53k9MUV7MrI4vF40KG7XQOn0iV5PTNEPSqdP9HpiinZQoPcKBh+5RB2Y5tAoL1IzGRjHrlJPQjRr+YAZEYbIAP3QXWxuhLdcr+YXlcfcXFr01XtYeJ57WOA0Plvshu/kE1OuWq8YQ9tKtXlWYXFknAMZw9Ib8x7/6z2u1+nv0pZyHbBb6XPYJwUVTLTnn2Cnk4ItBdsmKLoPcLBPCiqg6P8C6vSpejrN/lvq3KCAZlLwfKAIsfxlU/5ZUP48DEz+AQ==
--------------------------------------------------------------------------------
/legacy/assets/sentiment13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/sentiment13.png
--------------------------------------------------------------------------------
/legacy/assets/sentiment13.xml:
--------------------------------------------------------------------------------
1 | 7Vtbb5swFP41eVwFODH4cU2Trto6Teq0dY8uuODN4Mhxbvv1M8GEi5uqSZcYKfASzvH9fOd8jo/MAIzT9a3As+SeR4QNPCdaD8DNwPNc14HqJ9dsCg0cBoUiFjTSlSrFA/1LtNLR2gWNyLxRUXLOJJ01lSHPMhLKhg4LwVfNas+cNUed4ZgYiocQM1P7k0YyKbSB51f6T4TGSTmyC1FRkuKysl7JPMERX9VUYDIAY8G5LN7S9Ziw3HilXYp20z2lu4kJksm3NLgdfo/Q1P1xN1t+Zpu7xXhyDz8EQz05uSlXTCJlAC1yIRMe8wyzSaW9DhdiSfJeXSUIvsiireQoqWrwhfOZrvKbSLnR0OKF5EqVyJTpUrKm8rH2/ivv6mqkpZu17nkrbEohk2LzWBdqrXKxaraVynbPPJNTnFKWK8Z8ISgRavVfyUoX6ll6eeXCMrk59lq89EgsYiJfMTP0d4CrSCE8JWpWqqEgDEu6bA6AtcvGu3q7pt84VUN7jg6vAF0FoHrK0NKhhtAVrD3+sNn9XC0/JLrHym/US22KlWrrTQd4FgDFcEvMFnp9dwMPMmWm6ydldBjLnY1r3ifJWjb9Yy4F/0PGnHGhNBnPchd8poy1VJjROFNiqNBRqILrJRGSqkj+qAtSGkVb/10lVJKHGQ7zMVeKtww3fqufuKiU9Qp0rOdDk/XrbmN6g24wdBowBlpcVfRT1khqzFPqXvKeBraHAhmMeoo4A0X4e5zipBThAqscAYcGRyRYkp4mjqAJ17PNE7DniXPwxMgGT/h2eWJk8IRM6LzniWN4AtrmCb/niXPwBLLAE55rlyegwRMqstOeJ47gCc+xzBPQN8A0YGua7yUD1xA9KBINWOvQDzwQRuQpeDL8RJUACBCIdk72Lkxc0K2zIAx6TFy/Y5igHpNdurgjmJQuctGYjDqGidtlTJ7VEwTn3k+sJw18rwelvaHYB8VM3V8cKO0dxT4oZq708kAZdQ0UMzF1caAYe4rtBJNvZgEuD5T2nmIdlE6f5i3tKdZB6fRx3tKeYh2U/jxv7CnWk5Hl4fWiQfG7BkqnT/R2Ml/2Qen0id5O6ss+KJ3e6M8ECupWPhL1WRYDE+sHetRnWUxQbP8jRp3OsthhL+tbiuu4Bgj1ezz6esS77+ochNWxl3SKlfyXezfoNPdugNe6kQdbwO65WmN2BNvR3ZpLscKT3dFBnT7eTqf5taVTBzNoZ+eGJwtmJVafGBUYVh9qgck/
--------------------------------------------------------------------------------
/legacy/assets/sentiment14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/sentiment14.png
--------------------------------------------------------------------------------
/legacy/assets/sentiment14.xml:
--------------------------------------------------------------------------------
1 | 7VvbctowEP0aP5axLfDlsSGQZtp0OpNOmz4qtrDVCosR4tavr4xlfFFIAxkszWC/wK513bM6i3YWC4zn2zsGF+kDjRGxXDveWuDWcl0HuL74yDW7QuO7QaFIGI5lo0rxiP8iqbSldoVjtGw05JQSjhdNZUSzDEW8oYOM0U2z2YyS5qwLmCBF8RhBomp/4pinhTYot5XrPyGcpOXMjhcWb+awbCx3skxhTDc1FZhYYMwo5cW3+XaMSG680i5Fv+mRt4eFMZTxt3S4G36Pw6nz436x/kx296vx5MH7EAzl4viu3DGKhQGkSBlPaUIzSCaV9iZasTXKR3WEwOgqi/eSLaSqwxdKF7LJb8T5TkILV5wKVcrnRL5FW8yfat9/5UMNRlK63cqR98KuFDLOdk91odYrF6tue6nsN6MZn8I5JrliTFcMIyZ2/xVt5Eu5SjdvXFgmN8dRi5ceCVmC+Ctm9vwD4OKkIDpHYlWiI0MEcrxuTgClyyaHdoeu3ygWU7u2PF5BOAhA9Qzl0ZJHLQwHXu3xh83hl2L7EZIjVn4jvtSWWKn23nSCZwFQTLeGZCX3d2+5HhFmunkWRvcSfrBxzfs42vKmfyw5o3/QmBLKhCajWe6CM0xISwUJTjIhRgIdgSq4WSPGsTjJH+WLOY7jvf9uUszR4wJG+ZwbwVuKG7/VT5ywlOUO5FnPp0bb191G9QbZYWg3YAykuKnop2yR1pin1L3kPQ1sTwUyGPUU0QFF+Eec4qIU4QCtHOENFY5IIUc9TZxBE46rmye8nie64ImRDp7w9fLESOEJnuJlzxPn8ISnmyf8nie64IlQA0+4jl6e8BSeECd73vPEGTzh2pp5wvMVMBXYmuZ7ycA1RE86iQqsdegtF0Qxeg6eFT8Rb4AHQhAfnOxdmDjArLugF/SYOL5hmIQ9Jod0sSGYlC5y1ZiMDMPE6TFpxxPtSQPf7UFpBxT9oKip+6sDpR1R9IOi5kqvD5SRaaCoiSmDQJmJJwg6jym6E0y+mgW4PlDaMUU7KEbf5rsBRYkp2kEx+jrfESjtmKIdFKPv83piivZkZHl5vWpQfNNAMfpGryem6AfF6Bu9npiiHxSjA31Hl8fQrHxk2GdZFEy0X+hDo7MsHbFXGxTdv4jDPsuigKI9pDi2o4BQr+OR5RHvrtU5Catzi3TKgiB7EIwCq1YUZL1aEiSEb4hhYdDcMW5Pq+ApamHe4Pb/LfUJL1PqA9xWEaDX8qUj1TzqQF6bUFprKXZ4sbKg0Ogb9XSaV0pdmj9AOyF4uZ8/Qqz+1VRgWP03DEz+AQ==
--------------------------------------------------------------------------------
/legacy/assets/sentiment15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/sentiment15.png
--------------------------------------------------------------------------------
/legacy/assets/sentiment15.xml:
--------------------------------------------------------------------------------
1 | 7Vxbc9sqEP41fqxHgK6PjROnmZ6e6Uw6vTwSiVg6lYUH41t/fcFC1gXHcRz7oBnjl3gXkIBv91tYcAZoNF3fMzxLv9CE5APoJOsBuh1ACBAMxB+p2ZSaAIalYsKyRFWqFY/ZH6KUjtIusoTMWxU5pTnPZm1lTIuCxLylw4zRVbvaM83bb53hCdEUjzHOde2PLOFpqQ2rYUn9J5JN0urNwI/KkimuKquRzFOc0FVDhe4GaMQo5eW36XpEcjl51byU7cYvlO46xkjBj2lw735LojH4/jBbfs43D4vR3Rf/Q+iqzvFNNWKSiAlQImU8pRNa4Pyu1t7EC7Yk8qlACIwuimQrOUKqG/xD6UxV+Y9wvlHQ4gWnQpXyaa5KyTrjPxvff8lHDT0l3a7Vk7fCphIKzjY/m0KjlRTrZlupavdMCz7G0yyXihFdsIwwMfp/yUoVql5CWVmf4MoAMZsQfmBWfWUdciobLRUu94ROieiVqMBIjnm2bNsaViY72dXbNf1KM9EX6Cj3CqNhiOqPq1xLuVoUDf3GJ3Dbj5+L4cdEPbG2G/Gl0cVatbWmN1gWQuXrljhfqPE9DKCfi3m7eRKT7k/4bo4b1sfJmrftY84Z/U1GNKdMaApaSBN8zvK8o8J5NimEGAu4BKroZkkYz4Qnf1QF0yxJtva7SjNOHmc4lu9cCd7SzPhYOwFRJasROC8ajewNWR+0BlXqOi0YQyWuavqpaqQN5ql0+6ynhe1bgQw9SxHnp4gA9IEiADLKEb6rcUSKObE0cQJNAGiaJ3zLExfgCa8XPBGY5QlP4wmeZnPLE6fwhG+aJwLLExfgiagPPAGBWZ7wNZ4Qnj21PHECT0DHME9Um+gGmBps7enbN8ENRN/kiRqsTegHEMUJeQqfNDsRJchHEUoOOfPxmADUr72gH1pMQNAzTCKLyS5d3BNMKhO5aky8nmECLCbdeGI8aRBAC0o3oJgHRU/dXx0o3YhiHhQ9V3p9oHh9A0VPTF0dKFpMMZ1gCvQswPWB0o0pxkGxu3k9phgHxW7n9ZhiHBS7n9diivFkZLV5vWpQgr6BYnf0WkwxD4rd0WsxxTgowDl8z1OdZZ3vYHXoOHBQH65KuSo3eLwq3/iVsExMqjSO24NHruVZ5SFLV6unV89mgaMSKuc7nH2fi9plH4RRv7LTwDl8zfLcHuoMIzdseOgH51UXfbernerjp7to5B7tovDcLrr//gSqjrW7243qEWVXL3ZFIrK5Xc33jacRI5vb1UExvQ+PbG5XA6UHC1mggXCJMPm2WHdiXKtjceg1Y/Eu8u5fKZ91EVtZ+asRMvqfLhgi2Ll63A2QL9wh3BNpu4TS6culI62exxOLrh5RyHgsr2hemkJQ9yTC/Epbz1A4Q/3HC1cHjGscGH1t6gyRBcZ80D0iYTBP8Ux+jRcs39wwHP+W4eQ1eGoswYHpfh+IjHIRkKjUSfI5C0Yu9DoYeRpGcA9GAFwOpL7HGwISjwT7vCfyA4T9MyHTDfuXozUh1j/fL5cN9T9BQHd/AQ==
--------------------------------------------------------------------------------
/legacy/assets/sentiment2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/sentiment2.png
--------------------------------------------------------------------------------
/legacy/assets/sentiment2.xml:
--------------------------------------------------------------------------------
1 | 7Vtbb5swFP41kbaHTYDBTR7brLtImzSpD+seHXCBleDMcZpkv34GzN0pBIjTplBpxZ+vnPN9R/ZxNwHz5e4LRSvvB3FwMDE0ZzcBnyaGAS2L/xsB+wQAM5AALvWdBNJz4M7/hwWoCXTjO3hdasgICZi/KoM2CUNssxKGKCXbcrMHEpRnXSEX14A7GwV19JfvMC9BpwbM8a/Yd710Zh3OkpoFsh9dSjahmG9igIf4SaqXKB1LfOjaQw7ZFiBwOwFzSghL3pa7OQ4i06ZmS/p9PlCbrZvikLXqIBy1Zvv027HDTSGKhDKPuCREwW2O3sTfh6MRNF7y2DLgrzp/xTuf3Rfef0dNPlpRKWR0fy96xIW87g9mbC84gDaMcCif9zshKzHiAwmZaKbDqOwHwZwEhMbrTu0MbtaMkkdcqNHih9ck3xp94EFzCWhNNtQWrQxBQERdLFqBzFVcAZgsMf8g3oTiADH/qTw6Elx0s3a5P/iLcIncPWLqJxRscEqoirvKzth6PsN3KxSvfcu1WXZQ3WSGbctM5sAFtGBmsidMGd49b7S6OdIOKdlFMJiK4jZXlikgryCqFOtjPx2O9G6iN6jT21REb3AJ9DbOSO+rkd5N9Dbr9LYU0du8BHqDM9J7qobeXSn8nCy6kdWqk1Wf9WSr6PqT+HzizK2WVnarrlUclqxB9Kr4LFtGKzdalyAD84wy0Gv26iODjLR5aP8t6roKoT3Zi8w2TkJs3awQG1ZckOhtCGJfSYgNAxZRcYXCksfg3010pIst92Edm+6aN9DhapdX8jdX/I5HWaTAtxThK1rUWtEU2WXTb7JWeqFrAa53LraLV5/CFepxCbEyo8q6C0mIKyIVEAp8N+RFm5MCc/wmEqTPD93XomLpO05MXlkAKBNazsBBjyn6VTuJG0NIvB4SL0Lifc/Jcokb6iQ+VSVxDzHcXeXGqPKOpzWlKgeXqfK+6QK5yoE6lc9OrPKo8YCxgnn+unusAB1iRfIBYwiRnoiVhhDzMkNI35SMPISY6kJIenN1+p0CF8yyu/rNcafQ8cSvUubZ9eALzXy1F3kxu3WiVBZUl8rSZTuFXrmshqy4Ff3UNMVrYPyIEQp48gyjAOt8Oa/UrpJwmsUsTR4F30p8qtJ+pjA8DbsLUXrv1C10yW5J++YvW1tbdk165MHClG0t3nmSDYJ2aIPA16nZ7Tu8P3h8eCsCBY36tE6lz8MH2sEY03ZTKx3n+ER1bRtapuN5V/OKuW4Ow/XsZCXILtkrwxORHcjuF1842RvytYrJftxqRrIbZyQ7fH1kb0g4Kib7casZyQ7OSPbDF+wvluwN+TXFZD9uNSPZTWVk58X8r/yTTFT+PynA7X8=
--------------------------------------------------------------------------------
/legacy/assets/sentiment3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/sentiment3.png
--------------------------------------------------------------------------------
/legacy/assets/sentiment3.xml:
--------------------------------------------------------------------------------
1 | 7Ztdb6M4FIZ/DZezAhvT5LLNdHZXmpVW6ko7c7VywAF2CM6A0yT769cGm0+TRIQPNW0qtXBsY3PO857YhhpwtT3+muBd8Af1SGQA0zsa8LMBgIMQ/y0Mp9wAlzA3+Eno5SarNLyE/xFpNKV1H3okrVVklEYs3NWNLo1j4rKaDScJPdSrbWhU73WHfdIyvLg4alv/Dj0W5NYFcEr7byT0A9Wz5SzzkjV2f/gJ3ceyPwPATfbJi7dYXUv2kAbYo4fclN07fDbgKqGU5Ufb44pEwrXKbbmDvnSUFuNOSMyuaQBtOQ52UvdOPO4KeUoTFlCfxjh6Lq1P2f0RcQWTnwVsG/FDix+SY8i+SbM4/i6Of0HiLGbJ6ZuqJk7Ksn8JYyfJAN4zyk1lv18p3clmHk6DrFdxkg9bjLXzzpWL6T5xZS0IJEw48YmsBq3C7xxnQreEj47XSUiEWfhavz6WYPlFvdK5/ED6t8PXsvNXHO2JwqPh/LprD0HIyMsOZ8M/cKXV3b0Jo2hFI5pkbaGHyMKzuT1lCf1BKiULsIaOU3jtlSSMHM/7re0P2QA+SI2elGTl+aEUii1NQUUjynaTB9H7olWlviqtpj46w9Nq3QOtYDkjrc77otVs0wqWU9Fq3gWt1oy0PoxCa06o4rVKqzkrrWDZptWailbV+dum1YLz0WqNMhOwunPrDbRuaMxkNctpB0uuDzTBMrNPEaxLRGsmC1ptTAH4uHxzfwHX1bnMc9YOcs65rIV8N9+gwTeckO9R5g5vmm/Y5tuei294D3wDc0a+R5ltvGm+7TbfaC6+7bvgezEf33AxBt/nZtMl7t9lq2v51kJ2EVfUxnWy+TO6BzyhM2P6tVoOuwXPASBsJtlOB9e2F4bOj7LpnzTkvZYTQbseqWKZri6R60K2akShGMZVgXnQkO1ETLC4w3EtYs7PvXhKkHnuU5q57pFXsJzdsSzkR778m11lrQy/Kwsf0bpVK1GWY9H9vqhlVZpWzO3G1XrZ6JW5gR4XDKsTVRdeTGPSUKk04Sj0Y37qckoItz8J+YUujh5lwTb0vAxeXQaoA301gf1XEOppzCWJgyEk3s6Jb1Higy9h9RIHzUiNJ/HFVBIPMCP9VQ4+VN5zHTWpyuFdqHzwhbxe5bAZqfFUrtvhHVLlovKAuYIFYdo/V8AeuSK/gY8Uol2qTppChn0FZK4UMvheSUcKaUZqvBSirjz+TIELZttf/fbHTKHnkn9KmcPloDLv3HEdd9fp5i0mvartxgrfaTg8TzSDiHrwR78XdqeR+GlJiJc42UdeoWLPP+MAb0+3xaX8qsmeRYoy9UnvvaSjJvVoumwERnnvdJrnP1dNRybammw+4kCjzUZs3WyER9EO/jEN9GygVSIQyd6+zguuEFJC+FwFr7MKIkI7MdBs6OjJQJ+FhHhgUun8iqIismEaPTERNDE7csPY/yuL4Ce7M4Q3vf2CGp7XfJUXtqHVY1/xfI7E3mMWC55+IpymoXv+e+PcW4Q937eq7eMBvTcr7kIabynbrd/vXdG6oJPWhVDznRBztJmCrXvKVROcEMG70ZvdmEosNXrTEdRDb/y0/A+MPHLlf7nA5/8B
--------------------------------------------------------------------------------
/legacy/assets/sentiment4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/sentiment4.png
--------------------------------------------------------------------------------
/legacy/assets/sentiment4.xml:
--------------------------------------------------------------------------------
1 | 7Vtdj5s4FP01eewKMBDy2JlOP6SutNKstO3TygEP0BKcOs4ks79+bbADGKeJiDGdNIw0g68/gHvPOdjXzAzcr/YfCFxnf+IEFTPPSfYz8G7mefMoZL+54aU2gHlQG1KSJ7XJbQyP+X9IGB1h3eYJ2nQaUowLmq+7xhiXJYppxwYJwbtusydcdK+6hinqGR5jWPSt/+QJzWpr5IWN/SPK00xe2Q0Xdc0Sxt9TgreluN7MA0/VUVevoBxLXGGTwQTvalP17OBhBu4JxrQ+W+3vUcFdK91WO+j9kdrDfRNU0rM6iLhs6It8dpQwV4giJjTDKS5h8dBY76rnQ3wEh5UyuirYqctO0T6nX1rnX3mTPwJeKil5+SJ6VIWm7hui9EVgAG4pZqbmup8xXosRn3BJRTM35OW8KO5xgUl139LP4G5DCf6OWjVOdbCavndkGPCWxOL5PYE3SFIkWoHaxD3T6iY8+gHhFWIPxBoQVECaP3dBBAUW00O7Jh7sRIREHx7gjREeERIZoE54mmh9Fb3ODY/WR0O87UWWvCuu/QyLLZJ0Vbzd9eUuyyl6XMPq5ndM+br+7QPSi2MdIJNwGQbhzwD5jAhF+5+CTdYKhjtCat1QlHeNcPnClLU0S9oucaAb3tRDwTPo49mfTD3AtauHxtvewpZ3r0E9PGdC9Zjf1EPBs9/HczCZevjXrh4abwPHknf9q1CPaDr1AKOsXH4leAYaeNoif3AN8AThhC83t+ewS+BpAITqK+yogzuzKdNvH9H1L5yzqzaLGL8bqV4Ial6IXkoUDrdxVmDmGmSHBeVYXMOyE7Hwx5bnYSrPvdlUrnvLGrjhet9UsrNU/K1GWUrDJ2lhd7TstSLSsj9cfnto5ba6tsz9zu121d1LswI9RhjaRVSXeCUukcJSYYJFnpasGDOUIGa/4/TLY1i8FRWrPEkq8OoUoAvosxE4fPXrBedR3DNBcbPJmakobjy5pae4p0ZqPIpHtiieQYqGs9y7sXzgKtUqy80mUaZiufEklJ7lQI3UeCxfjMxy3tigVtAs3wzXCjBAK+oHuEmIdqlqVULMZlKmkhDjmagjEqJGajwJkdvN488UGGFWw9nv32YKA5f8Nmkuxxg7n22c5O2kk7uww/JA3bZ0lAjUymOE5bqZwkXJrBObAQH/6XGK1YTVIUZo2evDDAN8JZUS2ct5Sb9q5PSgWY5eBX8XfVJhv7AoT6Ps59jZbjtrfmIpV6nueUSjTU/AK/q8Quv701//RJow2trA1+aJLnovKO+BKEb6TY5lFPhBsynckw+N387OgFrUe2BpA39kDC76GLS2DazNYrw6DKr5OZsYjK5dI4Gjwae1j0B0K+TXh89oOnwurh6fmqmYa+0jPDkDf9X4VLMWFvHpm/1OYYJVwWm8mv9cYWDy4uRaQh3H3FrC132uylaEfvavMwse3LrQziiemfusko7H0p+XDnMyLUAQGwcuqwYcWWvuqsp5wd0seMcTAgxQGwGaVn6gQE9Ukx2gHGw8+RvnZfp3hbw3/lHoXUT6oBv7eZ/zcw3ngQnO6z4+bLDgDMaCCSD83ihw/aAHg4UZ6WfF5v/KagVp/ncPPPwP
--------------------------------------------------------------------------------
/legacy/assets/sentiment5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/sentiment5.png
--------------------------------------------------------------------------------
/legacy/assets/sentiment5.xml:
--------------------------------------------------------------------------------
1 | 7ZnNjpswFIWfhmUkMJCU5cAk06iablK1ayZ4wKrB1DGB9Ol7CQYCOJpGrUIW3iTm2Fz/fMf4Cgw7SKsXHubJK4swNZAZVYb9bCDkOAh+a+HUCMixGyHmJGokqxd25DeWoinVgkT4MGgoGKOC5ENxz7IM78VACzln5bDZO6PDXvMwxhNhtw/pVP1BIpE06ie06vXPmMRJ27O19JqaNGwby5kckjBi5YVkrw074IyJppRWAab12rXr0ty3uVLbDYzjTPzNDS/Ot8jbWN+3+fELPW2LYP26XMgox5AWcsIGWlKI57/VQxYnuQ7LX0U9Tv+dZWJxOFN6ggaWlwNov6+HUiz/z1GKf4pSMh61kWBixTg6aG+9hgZ9IYGrWk9ESkGwoHgQnP3EAaOMg5KxDNdjIZSOpJCSOIPLPawsBt0/Yi4IeOJJVqQkiupu/DIhAu/ycF/3WcIGAI2zIotwveimnOomTAmt/R+wghOIiMyvuJSV0vKW182h7g1XVylbnXdgz2GWYsFP0KS9obVbu9/kZdmb15FScuHbtlkot0vcBe4dBQVpqhsMhh7aYARIVdphNzkMme5DOcx+aIcB70UCSr3oewEe0Ga7yWzeh48za3VPt3kTtyWhwJqW8vDxZn40tEnkBS2RkIOmpaRl2XPjmmajsOSpxqXGtZob1zS322pWSlbu3KimSZKlUSnT2dnPLGeCCmlUSlTzH1juhJWtWalZzX5aLSesTM1KyWru0wpN03bD9WEOEM40R3/us6bYPHk+pmjdF6Pi5bLrS3BjmhrjFYyKfOTOGBWvcDuMY5oa4xWMqlzlzhwVL0rHHDucmuM1joo85n9xhMv+c+C57uKbqr3+Aw==
--------------------------------------------------------------------------------
/legacy/assets/sentiment6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/sentiment6.png
--------------------------------------------------------------------------------
/legacy/assets/sentiment6.xml:
--------------------------------------------------------------------------------
1 | 7ZhNb6MwEIZ/DcdKweSjHBuapNGqe0m1e3bxFKwaTI0JZH/9joMJUNJqu9JucuAEvDOMP57X2MLxgqTaKJrFj5KBcMiEVY537xDierMJXoxyqJWFS2ohUpzZpFbY8V9gRfteVHAGeS9RSyk0z/piKNMUQt3TqFKy7Ke9SNFvNaMRDIRdSMVQ/cmZjmv1lixa/QF4FDctu3O/jiS0SbYjyWPKZNmRvJXjBUpKXd8lVQDCTF4zL/V76w+ip44pSPWfvLCZPjF/7f7YZvtv4rAtgtXj/MZW2VNR2AE7ZC6w3vLZdFkf7DzM3wrTz+WLTPVNfqR0hwmun1VtEO8iez2WKP6+RI6DgjQEk9CUw6EV75tA7bnVSK9BoqEyeqwTgYJrymolXyGQQipUUpmC6RAX4p1EBY9SfAxNN1Bf7kFpjq64s4GEM2aaWZYx17DLaGjaLHENoKZkkTIw0z6x413ThAuzAgJZKI4VyeQ7lDZoTe/6pzF0oTaEsANQdSQLeQMyAa0OmGKjt9ZvdsE19itb97qNFnec22jULpjoVLj1FN5YW33BYuT6LUZGi33JYqev87V4zB94LKYaRlp9FJaWP6Q1/Z+wGvN0aOmY5yOts7Rc79K4hmcEnPJkxHUe1+LSuIb77XZkdZbV7NKovHFlfXLImF3ZtjUd0MqL8HXct87juvy+NRvwwmM8TcxUHQ/2GWUnYTzdf0Ly4lvafEDyaTwwfoTr3+1q+Nj+rjrGOj/9vNVv
--------------------------------------------------------------------------------
/legacy/assets/sentiment7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/sentiment7.png
--------------------------------------------------------------------------------
/legacy/assets/sentiment7.xml:
--------------------------------------------------------------------------------
1 | 7VnbctowEP0aHmF8BfwYLkkzbTKdptM2j8IStlrZcoUMpl9fCUu+YENIyyXt8ALWkbRa7Z7jlaBjj6PsjoEkfKAQkY5lwKxjTzqW5Zmu+JTAOgdcw8qBgGGYQ2YJPOFfSIGGQlMM0aI2kFNKOE7qoE/jGPm8hgHG6Ko+bE5JfdUEBKgBPPmANNGvGPIwR4fWoMTfIRyEemWz7+U9EdCD1U4WIYB0VYHsacceM0p5/hRlY0Rk7HRc8nm3O3oLxxiK+SET7pzP0Ls1v9wny/dkfZ+Opw/9rp1bWQKSqg13rD4R9kaLBMTSa75Woej/TKWrozmNeXexSdSNGGB6SVZ2iqdAfR/BCo1RNxSI3Flub8Z031JkmzKNi23nS9WXb8BWzRmLo0ziIY+IAEzpMGf0BxpTImzbk1h4IJ3FhGxBgOAgFk1fRB+xjT+MY8GbG9URYQjlMqNViDl6SoAv11wJkQiM0TSGSCbGULG4BREmUiNjmjIsLFrGI1qpTiUL09NttQPFLLk0ynbSwizIJkSKaIQ4W4shaoKjjCh9Opquq5Lt5kBhYYXpehxQAgsKyyUHxYOi4WsoaTTShKDQpGpSxkMa0BiQaYluBbQc84HSRGX2O+J8rSIJUk7reUcZ5t/k9J6rWs/KmHyeZNXGWjdisd/KJNl81vZko5y2aa1fmW/LKDgrI7A/wSJgwpKP9kRWp4wDFiC+b6DXThmGCOB4Wffk6ATQblZfSu5IgMKcob7M/KvX63Xcyf8g6yMoWdTWtyblwVXKJ5Kyd6CU7R2cOZOUvZ3nCxmcg04GTtvJYBrNEIQ4DiqVPre4o9LXWdUm3gqDXpXVxiuj+lrpWDYEaDj3G+8g0dP3h2g2P5L4Pbcmfttoin/Yov3hybQ/vGr/NNq39T3oRe27l9S+dnOrjPdsVcF7Xv7giKhfi3lDz2a9mBfn9Iqei4J/nmL+tu+KEMULdL0pnu+m2MbIMx8vvUuWGP38XCs37SXmYqXibyuAmvqR4s1pTTNhiwpm3+hZdSN5cVLztvJcOPLnqXcu8iPBidJ4+kLensZtRR+exkNM1Q3lVD0ZH2y3UZw+PT6+oVvAfI76fustAA68mVES5qi3ALPlJ4Cz3gKc5iEwLOp9OtPllnfNaiEuO/7xOmwdKa/mcEtebjOvZz0MOtZheb1mdd+Byn05q85xsiqa5T9N+Uu3/LvOnv4G
--------------------------------------------------------------------------------
/legacy/assets/sentiment8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/sentiment8.png
--------------------------------------------------------------------------------
/legacy/assets/sentiment8.xml:
--------------------------------------------------------------------------------
1 | 7ZpLc5swEIB/jY/NAAKBj43jPKZNpzPptM1RAQFqBXJl4Ud/fSUjDBgnIY0JdgZyCLvSCrG738oIRmCSrK44msW3LMB0ZBnBagQuRpZlmgaU/5RmnWug4eaKiJNAdyoVd+Qv1kpDazMS4Hmto2CMCjKrK32WptgXNR3inC3r3UJG61edoQg3FHc+ok3tDxKIONd6llvqrzGJ4uLKJhznLQkqOus7mccoYMuKCkxHYMIZE/lZsppgqpxX+CW3u3ykdTsxjlPRxuDK/haML83vN7PFJ7q+ySbTW/jBKSYn1sUd40A6QIuMi5hFLEV0WmrPOcvSAKthDSmVfT4zNpNKUyp/YSHWOpooE0yqYpFQ3YpXRPxU5meOlu71YOr8YlUV1oWQCr6uGCnxvhhPCaXZRirsQpaKS5QQqhQTlnGCubzhL3ipG/UsLdU5d4bywKNO1qq5HMnHT3gWAJ2tiEdYPNHRhttkkBRhlmA5fWnIMUWCLOozQTqdo22/MuLyRAf9BQlQTHOBaKYvdTOyIJUzPn+QjoKR2PqlkiQCr0Q9pnPB2W88YZRxqUlZqjIlJJTuqBAlUSpFX3pURgKcLzAXRAL3UTckJAg2abaMicB3M7Rx81KWl0bqtY2tOS5kfQc669Wl8erpUDcDow1sPYgubABqeVmWiaJLXKkQwOgoko7ZiNKA8kFQtluj7PaJst1EOUYCDzS3odm0jg1na8C5I5zttjh7veJsN3AWMZkPOLfB2TKODWcw4NwRzk5bnMe94uw0cJYAJgPOrXD2jDPbsKBnOxC4jmcfGdywEbT3i+7zZcvP+GJz6+aBSYctSXecXkmHDdKfSY999FVS4UXhaTBfrQsjCwQIe6HfKCKyBfoefgi3ITvkw7HktxWhdmeEusPyW0mSzgh1T4NQdyB094G3f0S9AdG3QNQ7DUS9AdHdh9j+ER2fCqLHgdprHzi16VdG5KXLyu3tpIWxE+8cbW21E/LtNF6B5nhAUz2QOscFJ/yf9z+11H+/i+kBoXbabjdBY38Gvc366TS3m44I0jD05bEPUgDBGAQH+okL64XSbLlP1B2izRicAqJ9ofZqgh5dP89M17JN4I6BaYB6Hd8NfseLaZGSR8qpo/72LqabQw9Z5XdzdMNv7yts8wV7sX2vvCBbQqQTGv7J1Hdd59eYLrDaci9VxT6/UdjKueTmxf7/e38jYHZR2zt7BSDF8iO9HPvyU0cw/Qc=
--------------------------------------------------------------------------------
/legacy/assets/sentiment9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/sentiment9.png
--------------------------------------------------------------------------------
/legacy/assets/sentiment9.xml:
--------------------------------------------------------------------------------
1 | 7Vtdb5swFP01PK4CnBh4XNO0q7ZOkzpt3aMDLngzOHJMQvbrZ4L5ituq7dQaKc5LuMff91yfG1vBAYu8uuJond2wBFPHd5PKAReO73ueC+VXjewbBLpBA6ScJKpSD9ySv1iBrkJLkuDNqKJgjAqyHoMxKwocixGGOGe7cbV7RsejrlGKNeA2RlRHf5JEZA0a+kGPf8IkzdqRPRg1JTlqK6uVbDKUsN0AAksHLDhjonnKqwWmtfNavzTtLh8p7SbGcSGe0+Bq9j2JLr0f1+vtZ7q/LhfLG/ghnKnJiX27YpxIByiTcZGxlBWILnv0PC75Fte9etLgrCySg+VKq2/whbG1qvIbC7FX1KJSMAllIqeqFFdE3A2ef9Vdnc2VdVGpng/GvjUKwfd3Q2PQqjb7ZgerbXfPCnGJckJrYMFKTjCXq/+Kd6pQzdKvKzeeqd3xqMfbiEQ8xeIJN8OgI1zuFMxyLGclG3JMkSDb8QBIhWza1euafmNEDu27anuF0VkI+s8sbDpQWy2KzuDgE8zG3W/k8mOseuzjRj4MpthDh2h6QWQB0Ay3RbRU67t2fEilm85X0ukwFZ2PB9EncCXG8bERnP3BC0YZl0jBijoE7wmlRxCiJC2kGUt2JKvgfIu5IHInf1QFOUmSQ/zuMiLw7RrF9Zg7qVtaGD83TryotdUK1F6vh8bV02GjR4NqMHNHNIbK3PXy09bIBsrTYg9Fz4jblxIZzq1EvINEBI8ExZtKhAeMagScaRqRIYGtTLxCJjzftE5AqxPvoRNzEzoRmNWJuaYTIiMbqxOv0QloWicCqxPvoRORAZ3wPbM6ATWdkDs7tzrxCp3wXcM6AQONTI22sfsecvCA0RftRI3WIfWOD+IEr8KVFieyBEAQgaQLsv/ixAPTOgvC0HLiBRPjJLKcdNfFE+GkDZGT5mQ+MU48y8lxPjF+aRD4lpTjhGKeFP3q/uRIOc4o5knR70pPj5T51EjRL6ZOjhQtp5i+YAr0W4DTI+U4pxgnxZ7m9ZxinBR7nNdzinFS7HleyynGLyPbw+tJkxJMjRR7otdyinlS7IleyynmSbGJ3vejad1HhjbPa5wYP9BHNs/rpJj+RRzZPK+R8oYpRZr9WwnNXzP6dzvA8h8=
--------------------------------------------------------------------------------
/legacy/assets/vocabulary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bentrevett/pytorch-sentiment-analysis/329f654b2cfaed22fee432569ebd7a2674584f79/legacy/assets/vocabulary.png
--------------------------------------------------------------------------------
/legacy/assets/vocabulary.xml:
--------------------------------------------------------------------------------
1 | 7ZnLcpswFIafhqVnQAS7LAOxXU8n3bjTrhVQQFOBKBYG9+l7MMLc5EndZkwWWiF+iaPL9yOdAcP2k2qb4yx+5iFhBjLDyrCfDIQs2zHhUiunRllZqBGinIayUSfs6W8iRflcVNCQHAYNBedM0GwoBjxNSSAGGs5zXg6bvXI27DXDEZkI+wCzqfqDhiJu1E9o1emfCY3itmdr6TY1CW4by5kcYhzysifZa8P2c85FU0oqn7B68dp1aZ7bXKm9DCwnqfibB7YP30J3Y33fZccv7LQr/PXzciGjHDEr5IQNtGQQz3uphyxOch2Wv4p6nN4rT8XicKb0CA0sN6u6SihF8noOUfx7CMF/krSNA3MqxrFBe+k0NOgJCVLVeiwSBoIFxYPIIaLPGc9BSXlK6pFQxkYSZjRK4TaARSWge0eSCwp2eJQVCQ3DuhuvjKkg+wwHdZ8lmB+0nBdpSOr1NuVENzihrLa+z4ucQkRkfiWlrJRut9zLHPo0WzQwAFL1JEl3S3hCRH6CJm1t6zT5qrW3ZefbBynFPcu2zbB8U6JL4M5MUJB+usFb6B28dQb+3/ZSRaFAqtIOu8lhyHQ+lMPsD+0w4L2IQalXOBDgAW22m8zmvrmdWat7us2duC3GgmhaysPHnXlraPPHHi0R04OmpaRl2XPjmiaisOSJxqXGtZob1zS322lWSlbO3KimSZKlUSnT2dnPrIcJKqRRKVHNf2A5E1a2ZqVmNftptZywMjUrJau5Tys0TdsNx4M5QDjTHF2cJ02x2XnepmjdF6Piu7LjSXBjmhrjFYyKfOTOGBWfcC8YxzQ1xisYVbnKnTkqPpSOOV5wao7XOCrymPfiCLfdn8BzXe9/qr3+Aw==
--------------------------------------------------------------------------------
/legacy/custom_embeddings/embeddings.txt:
--------------------------------------------------------------------------------
1 | good 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
2 | great 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
3 | awesome 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
4 | bad -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0
5 | terrible -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0
6 | awful -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0
7 | kwyjibo 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5 0.5 -0.5
8 |
--------------------------------------------------------------------------------
/legacy/data/test.csv:
--------------------------------------------------------------------------------
1 | name,location,age,quote
2 | Craig,Finland,29,go baseball team!
3 | Janet,Hong Kong,24,knowledge is great
--------------------------------------------------------------------------------
/legacy/data/test.json:
--------------------------------------------------------------------------------
1 | {"name": "Craig", "location": "Finland", "age": 29, "quote": ["go", "baseball", "team", "!"]}
2 | {"name": "Janet", "location": "Hong Kong", "age": 24, "quote": ["knowledge", "is", "great"]}
3 |
--------------------------------------------------------------------------------
/legacy/data/test.tsv:
--------------------------------------------------------------------------------
1 | name location age quote
2 | Craig Finland 29 go baseball team!
3 | Janet Hong Kong 24 knowledge is great
--------------------------------------------------------------------------------
/legacy/data/train.csv:
--------------------------------------------------------------------------------
1 | name,location,age,quote
2 | John,United Kingdom,42,i love the united kingdom
3 | Mary,United States,36,i want more telescopes
--------------------------------------------------------------------------------
/legacy/data/train.json:
--------------------------------------------------------------------------------
1 | {"name": "John", "location": "United Kingdom", "age": 42, "quote": ["i", "love", "the", "united kingdom"]}
2 | {"name": "Mary", "location": "United States", "age": 36, "quote": ["i", "want", "more", "telescopes"]}
3 |
--------------------------------------------------------------------------------
/legacy/data/train.tsv:
--------------------------------------------------------------------------------
1 | name location age quote
2 | John United Kingdom 42 i love the united kingdom
3 | Mary United States 36 i want more telescopes
--------------------------------------------------------------------------------
/legacy/data/valid.csv:
--------------------------------------------------------------------------------
1 | name,location,age,quote
2 | Fred,France,21,what am i doing?
3 | Pauline,Canada,44,hello world
--------------------------------------------------------------------------------
/legacy/data/valid.json:
--------------------------------------------------------------------------------
1 | {"name": "Fred", "location": "France", "age": 21, "quote": ["what", "am", "i", "doing", "?"]}
2 | {"name": "Pauline", "location": "Canada", "age": 44, "quote": ["hello", "world"]}
3 |
--------------------------------------------------------------------------------
/legacy/data/valid.tsv:
--------------------------------------------------------------------------------
1 | name location age quote
2 | Fred France 21 what am i doing?
3 | Pauline Canada 44 hello world
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | numpy
3 | datasets
4 | torchtext
5 | tqdm
6 | matplotlib
--------------------------------------------------------------------------------