├── docs
    ├── recipes
    │   ├── packed_sequences.rst
    │   ├── index.rst
    │   ├── compute_conv_size.rst
    │   └── load_pretrained_vectors.rst
    ├── day2
    │   ├── adventures
    │   │   ├── conv_then_rnn.rst
    │   │   ├── index.rst
    │   │   ├── lvg.rst
    │   │   ├── interpolation.rst
    │   │   └── lookups.rst
    │   ├── patterns
    │   │   ├── stacking.rst
    │   │   ├── index.rst
    │   │   ├── gating.rst
    │   │   ├── tidbits.rst
    │   │   └── attention.rst
    │   ├── warmup.rst
    │   ├── tensorfu1.rst
    │   ├── tensorfu2.rst
    │   ├── failfastprototypemode.rst
    │   └── sampling.rst
    ├── .gitignore
    ├── errata.rst
    ├── day1
    │   ├── index.rst
    │   ├── solutions.rst
    │   └── takehome.rst
    ├── migration.rst
    ├── Makefile
    ├── faq.rst
    ├── download_data.rst
    ├── index.rst
    ├── environment_setup.rst
    └── conf.py
├── README.md
├── requirements.txt
├── data
    └── README.md
├── modelzoo
    └── README.md
├── Dockerfile
├── .gitignore
├── day_1
    ├── vocabulary.py
    ├── figures
    │   └── intro_to_pytorch
    │   │   ├── computational_graph_forward.svg
    │   │   ├── pytorch_variable.svg
    │   │   └── computational_graph_backward.svg
    └── 0_Using_Pretrained_Embeddings.ipynb
└── day_2
    ├── vocabulary.py
    ├── Amazon-Reviews.ipynb
    ├── 00-Load-Vectorize-Generate-And-Sequences-as-Tensors.ipynb
    ├── 03-Char-RNN-Conditionally-Predict-Surnames.ipynb
    └── 02-Char-RNN-Predict-Surnames.ipynb


/docs/recipes/packed_sequences.rst:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docs/day2/adventures/conv_then_rnn.rst:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | _build/
2 | _static/
3 | _templates/
4 | 


--------------------------------------------------------------------------------
/docs/day2/patterns/stacking.rst:
--------------------------------------------------------------------------------
1 | Stacking
2 | ========
3 | 


--------------------------------------------------------------------------------
/docs/errata.rst:
--------------------------------------------------------------------------------
1 | Errata
2 | ======
3 | 
4 | Please check back.
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pytorch-nlp-tutorial-eu2018
2 | O'Reilly AI training - London 2018
3 | 


--------------------------------------------------------------------------------
/docs/day2/patterns/index.rst:
--------------------------------------------------------------------------------
1 | NN Patterns
2 | ===========
3 | 
4 | .. toctree::
5 | 
6 |    attention
7 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | pandas
 2 | nltk
 3 | annoy
 4 | seaborn
 5 | numpy
 6 | matplotlib
 7 | scikit-learn
 8 | tqdm
 9 | ipywidgets
10 | graphviz


--------------------------------------------------------------------------------
/docs/day1/index.rst:
--------------------------------------------------------------------------------
 1 | Day 1
 2 | =====
 3 | 
 4 | Here you will find things from Day 1!
 5 | 
 6 | .. toctree::
 7 |    :maxdepth: 2
 8 | 
 9 |    takehome
10 | 


--------------------------------------------------------------------------------
/docs/day2/adventures/index.rst:
--------------------------------------------------------------------------------
 1 | Choose Your Own Adventures
 2 | ==========================
 3 | 
 4 | .. toctree::
 5 | 
 6 |    interpolation
 7 |    lookups
 8 |    lvg
 9 | 
10 | 


--------------------------------------------------------------------------------
/docs/recipes/index.rst:
--------------------------------------------------------------------------------
 1 | Recipes and PyTorch patterns
 2 | ============================
 3 | 
 4 | 
 5 | In this section, you will find a set of recipes for doing various things with PyTorch.
 6 | 
 7 | .. toctree::
 8 | 
 9 |    load_pretrained_vectors
10 |    compute_conv_size
11 | 


--------------------------------------------------------------------------------
/docs/day2/patterns/gating.rst:
--------------------------------------------------------------------------------
 1 | Gating
 2 | ======
 3 | 
 4 | 
 5 | 
 6 | .. code-block:: python
 7 | 
 8 | 
 9 |     resetgate = F.sigmoid(i_r + h_r)
10 |     inputgate = F.sigmoid(i_i + h_i)
11 |     newgate = F.tanh(i_n + resetgate * h_n)
12 |     hy = newgate + inputgate * (hidden - newgate)
13 | 


--------------------------------------------------------------------------------
/docs/migration.rst:
--------------------------------------------------------------------------------
1 | Migrating to PyTorch 0.4.0
2 | ==========================
3 | 
4 | If you have used PyTorch before 0.4.0, some things have changed! To help you understand how to migrate, the PyTorch folks have a wonderful migration guide found `here <http://pytorch.org/2018/04/22/0_4_0-migration-guide.html>`_.
5 | 
6 | 


--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
 1 | # Data Folder
 2 | 
 3 | There are several files you will need. You can download them in zip format
 4 | from [here](https://drive.google.com/file/d/0B2hg7DTHpfLsdHhEUVhHWU5hUXc/view?usp=sharing).
 5 | 
 6 | 
 7 | The files you need are:
 8 | 
 9 | - surnames.csv
10 | - trump.csv
11 | - glove.6B.100d.txt
12 | - zhnews.csv
13 | - firstnames.csv
14 | - amazon_train_small.csv
15 | 
16 | 


--------------------------------------------------------------------------------
/docs/recipes/compute_conv_size.rst:
--------------------------------------------------------------------------------
 1 | Compute Convolution Sizes
 2 | =========================
 3 | 
 4 | 
 5 | .. code-block:: python
 6 | 
 7 |    import math
 8 | 
 9 |    def conv_shape_helper_1d(input_seq_len, kernel_size, stride=1, padding=0, dilation=1):
10 |        kernel_width = dilation * (kernel_size - 1) + 1
11 |        tensor_size = input_seq_len + 2 * padding
12 |        return math.floor((tensor_size - kernel_width) / stride + 1)
13 | 


--------------------------------------------------------------------------------
/docs/day2/patterns/tidbits.rst:
--------------------------------------------------------------------------------
 1 | Small Tidbits
 2 | =============
 3 | 
 4 | 
 5 | Set Seed Everywhere
 6 | -------------------
 7 | 
 8 | .. code-block:: python
 9 | 
10 |    import numpy as np
11 |    import torch
12 | 
13 |    def set_seed_everywhere(seed, cuda):
14 |        """Set the seed for numpy and pytorch
15 | 
16 |        Args:
17 |            seed (int): the seed to set everything to
18 |            cuda (bool): whether to set the cuda seed as well
19 |        """
20 |        np.random.seed(seed)
21 |        torch.manual_seed(seed)
22 |        if cuda:
23 |            torch.cuda.manual_seed_all(seed)
24 | 


--------------------------------------------------------------------------------
/modelzoo/README.md:
--------------------------------------------------------------------------------
 1 | # Model Zoo files
 2 | 
 3 | In this folder, you should place the saved states that we have pre-trained.
 4 | 
 5 | You can download them from [here](https://drive.google.com/file/d/0B2hg7DTHpfLsZW44aTRVd2FrbEE/view?usp=sharing)
 6 | 
 7 | 
 8 | You should have the following files:
 9 | 
10 | - trump_twitter.vocab
11 | - surnames_classify.vocab
12 | - charnn_emb16_hid64_surnames_classify.state
13 | - charnn_emb16_hid64_surnames_predict.state
14 | - charnn_emb16_hid64_surnames_conditionally_predict.state
15 | - wordrnn_emb100_hid64_trump_tweets_predict_fresh_train_8_min.state
16 | - wordrnn_emb100_hid64_trump_tweets_predict.state
17 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SPHINXPROJ    = pytorch-nlp-tutorial-sf2017
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


--------------------------------------------------------------------------------
/docs/faq.rst:
--------------------------------------------------------------------------------
 1 | Frequently Asked Questions
 2 | ==========================
 3 | 
 4 | On this page, you will find a list of questions that we either anticipate
 5 | people will ask or that we have been asked previously.  They are intended to
 6 | be the first stop for any confusion or trouble that might occur.
 7 | 
 8 | 
 9 | Do I Need to have a NVIDIA GPU enabled laptop?
10 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
11 | 
12 | Nope!  While having a NVIDIA GPU enabled laptop will make the training run
13 | faster, we provide instructions for people who do not have one.
14 | 
15 | If you plan on working on Natural Language Processing/Deep Learning in the future,
16 | a GPU enabled laptop might be a good investment.
17 | 


--------------------------------------------------------------------------------
/docs/day2/warmup.rst:
--------------------------------------------------------------------------------
 1 | Warm Up Exercise
 2 | ================
 3 | 
 4 | To get you back into the PyTorch groove, let's do some easy exercises. You will have 10 minutes.  See how far you can get.
 5 | 
 6 | 1. Use :code:`torch.randn` to create two tensors of size (29, 30, 32) and (32, 100).
 7 | 2. Use :code:`torch.matmul` to matrix multiply the two tensors.
 8 | 3. Use :code:`torch.sum` on the resulting tensor, passing the optional argument of :code:`dim=1` to sum across the 1st dimension.  Before you run this, can you predict the size?
 9 | 4. Create a new long tensor of size (3, 10) from the :code:`np.random.randint` method.
10 | 5. Use this new long tensor to index into the tensor from step 3.
11 | 6. Use :code:`torch.mean` to average across the last dimension in the tensor from step 5.
12 | 


--------------------------------------------------------------------------------
/docs/day2/tensorfu1.rst:
--------------------------------------------------------------------------------
 1 | Tensor-Fu-1
 2 | ===========
 3 | 
 4 | 
 5 | Exercise 1
 6 | ----------
 7 | 
 8 | Task: create a tensor for prototyping using :code:`torch.randn`.
 9 | 
10 | .. code-block:: python
11 | 
12 |    import torch
13 |    import torch.nn as nn
14 | 
15 | 
16 | 
17 | Exercise 2
18 | ----------
19 | 
20 | 
21 | Task: Create a linear layer which works with x2dim
22 | 
23 | .. code-block:: python
24 | 
25 |    import torch
26 |    import torch.nn as nn
27 | 
28 |    x2dim = torch.randn(9, 10)
29 | 
30 |    # required and default parameters:
31 |    # fc = nn.Linear(in_features, out_features)
32 | 
33 | 
34 | Exercise 3
35 | ----------
36 | 
37 | Task: Create a convolution which works on x3dim
38 | 
39 | .. code-block:: python
40 | 
41 |    import torch
42 |    import torch.nn as nn
43 | 
44 |    x3dim = torch.randn(9, 10, 11)
45 | 
46 |    # required and default parameters:
47 |    # conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0)
48 | 


--------------------------------------------------------------------------------
/docs/download_data.rst:
--------------------------------------------------------------------------------
 1 | Getting the Data
 2 | ================
 3 | 
 4 | In this training, there are two options of participating.
 5 | 
 6 | Option 1: Download and Setup things on your laptop
 7 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 8 | 
 9 | The first option is to download the data below, set up the environment, and download the notebooks when we make them available.
10 | If you choose this option but do not download the data before the first day, we will have several flash drives with the data on them.
11 | 
12 | Please visit `this link <https://drive.google.com/file/d/0B2hg7DTHpfLsdHhEUVhHWU5hUXc/view?usp=sharing>`_ to download the data.
13 | 
14 | 
15 | Option 2: Use O'Reilly's online resource through your browser
16 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
17 | 
18 | The second option is to use an online resource provided by O'Reilly. On the first day of this training, you will be provided with a link to a JupyterHub instance where the environment will be pre-made and ready to go!  If you choose this option, you do not have to do anything until you arrive on Sunday.
19 | You are still required to bring your laptop.
20 | 
21 | 


--------------------------------------------------------------------------------
/docs/recipes/load_pretrained_vectors.rst:
--------------------------------------------------------------------------------
 1 | Loading Pretrained Vectors
 2 | ==========================
 3 | 
 4 | It can be extremely useful to give a model an advantageous starting point.
 5 | 
 6 | To do this, we can set the values of the embedding matrix.
 7 | 
 8 | 
 9 | .. code-block:: python
10 | 
11 |    # we give an example of this function in the day 1, word vector notebook
12 |    word_to_index, word_vectors, word_vector_size = load_word_vectors()
13 | 
14 | 
15 |    # now, we want to iterate over our vocabulary items
16 |    for word, emb_index in vectorizer.word_vocab.items():
17 |        # if the word is in the loaded glove vectors
18 |        if word.lower() in word_to_index:
19 |             # get the index into the glove vectors
20 |             glove_index = word_to_index[word.lower()]
21 |             # get the glove vector itself and convert to pytorch structure
22 |             glove_vec = torch.FloatTensor(word_vectors[glove_index])
23 | 
24 |             # this only matters if using cuda :)
25 |             if settings.CUDA:
26 |                 glove_vec = glove_vec.cuda()
27 | 
28 |             # finally, if net is our network, and emb is the embedding layer:
29 |             net.emb.weight.data[emb_index, :].set_(glove_vec)
30 | 


--------------------------------------------------------------------------------
/docs/day2/tensorfu2.rst:
--------------------------------------------------------------------------------
 1 | Tensor-Fu-2
 2 | ===========
 3 | 
 4 | Exercise 1
 5 | ----------
 6 | 
 7 | 
 8 | Task: The code below is broken.  How can we fix it?  
 9 | 
10 | Hint: The input data tensor (indices) might be the wrong shape. 
11 | 
12 | .. code-block:: python
13 | 
14 |    indices = torch.from_numpy(np.random.randint(0, 10, size=(10,)))
15 | 
16 |    emb = nn.Embedding(num_embeddings=100, embedding_dim=16)
17 |    emb(indices)
18 | 
19 | Exercise 2
20 | ----------
21 | 
22 | Task: Create a MultiEmbedding class which can input two sets of indices, embed them, and concat the results!
23 | 
24 | .. code-block:: python
25 | 
26 |    class MultiEmbedding(nn.Module):
27 |        def __init__(self, num_embeddings1, num_embeddings2, embedding_dim1, embedding_dim2):
28 |            pass
29 | 
30 |        def forward(self, indices1, indices2):
31 |            # use something like
32 |            # z = torch.cat([x, y], dim=1)
33 | 
34 |            pass
35 | 
36 | 
37 |    # testing
38 | 
39 |    # use indices method from above
40 |    # the batch dimensions should agree
41 |    # indices1 = 
42 |    # indices2 = 
43 |    # multiemb = MultiEmbedding(num_emb1, num_emb2, size_emb1, size_emb2)
44 |    # output = multiemb(indices1, indices2)
45 |    # print(output.shape) # should be (batch, size_emb1 + size_emb2)


--------------------------------------------------------------------------------
/docs/day2/adventures/lvg.rst:
--------------------------------------------------------------------------------
 1 | A New Load-Vectorize-Generate
 2 | =============================
 3 | 
 4 | In this exercise, you should look into the two datasets that are not included in the exercises.  There are two datasets to work with.  The first is the Amazon Review dataset.
 5 | 
 6 | .. code-block:: python
 7 | 
 8 |    from local_settings import settings
 9 |    import pandas as pd
10 | 
11 |    data = pd.read_csv(settings.AMAZON_FILENAME, names=['rating', 'title', 'review'])
12 |    print(data.head())
13 | 
14 | The Amazon Reviews Dataset does not come with a precomputed train-test split. One important step is to select a subset of the data to serve as that split.
15 | 
16 | The other is the first names dataset.  You can load with:
17 | 
18 | .. code-block:: python
19 | 
20 |    from local_settings import settings
21 |    import pandas as pd
22 | 
23 |    data = pd.read_csv(settings.FIRSTNAMES_CSV)
24 |    print(data.head())
25 | 
26 | 
27 | For these two datasets, you should write a Raw dataset which loads the data. Then, you should write a Vectorizer which creates the relevant vocabularies from the 'fit' method and transforms a raw dataset into a vectorized dataset using the 'transform' method.  Finally, you should write a Vectorized dataset which implements the required :code:`__len__` and :code:`__getitem__` methods.
28 | 
29 | The make_generator can be reused.
30 | 


--------------------------------------------------------------------------------
/docs/day2/failfastprototypemode.rst:
--------------------------------------------------------------------------------
 1 | Fail Fast Prototype Mode
 2 | ========================
 3 | 
 4 | When building neural networks, you want things to either work or fail fast.  Long iteration loops are the truest enemy of the  machine learning practitioner.  
 5 | 
 6 | 
 7 | To that end, the following techniques will help you out. 
 8 | 
 9 | .. code-block:: python
10 | 
11 |    import torch
12 |    import torch.nn as nn
13 | 
14 |    # 2dim tensor.. aka a matrix
15 |    x = torch.randn(4, 5)
16 | 
17 |    # this is the same as:
18 |    batch_size = 4
19 |    feature_size = 5
20 |    x = torch.randn(batch_size, feature_size)
21 | 
22 |    # now let's try out some NN layer
23 |    output_size = 10
24 |    fc = nn.Linear(feature_size, output_size)
25 |    print(fc(x).shape)
26 | 
27 | 
28 | You can construct whatever prototype variables you want doing this. 
29 | 
30 | Prototyping an embedding
31 | ^^^^^^^^^^^^^^^^^^^^^^^^
32 | 
33 | 
34 | .. code-block:: python
35 | 
36 |    import torch
37 |    import torch.nn as nn
38 | 
39 |    batch_size = 4
40 |    sequence_size = 5
41 |    integer_range = 100
42 |    embedding_size = 25
43 |    # notice rand vs randn.  rand is uniform on [0, 1), and randn is standard normal (mean 0, variance 1)
44 |    random_numbers = (torch.rand(batch_size, sequence_size) * integer_range).long()
45 | 
46 |    embedder = nn.Embedding(num_embeddings=integer_range, 
47 |                            embedding_dim=embedding_size)
48 | 
49 |    print(embedder(random_numbers).shape)
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/docs/day1/solutions.rst:
--------------------------------------------------------------------------------
 1 | Solutions 
 2 | =========
 3 | 
 4 | Problem 1
 5 | ---------
 6 | 
 7 | .. code-block:: python
 8 | 
 9 |    def f(x):
10 |        if x.data[0] > 0:
11 |            return torch.sin(x)
12 |        else:
13 |            return torch.cos(x)
14 | 
15 |    x = torch.autograd.Variable(torch.FloatTensor([1]), 
16 |                                requires_grad=True)
17 | 
18 |    y = f(x)
19 |    print(y)
20 | 
21 |    y.backward()
22 | 
23 |    x.grad
24 | 
25 |    y.grad_fn
26 | 
27 | Problem 2
28 | ---------
29 | 
30 | .. code-block:: python
31 | 
32 |    def cbow(phrase):
33 |        words = phrase.split(" ")
34 |        embeddings = []
35 |        for word in words:
36 |            if word in glove.word_to_index:
37 |                embeddings.append(glove.get_embedding(word))
38 |        embeddings = np.stack(embeddings)
39 |        return np.mean(embeddings, axis=0)
40 | 
41 |    cbow("the dog flew over the moon").shape
42 | 
43 |    # >> (100,)
44 | 
45 |    def cbow_sim(phrase1, phrase2):
46 |        vec1 = cbow(phrase1)
47 |        vec2 = cbow(phrase2)
48 |        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
49 | 
50 |    cbow_sim("green apple", "green apple")
51 |    # >> 1.0
52 | 
53 |    cbow_sim("green apple", "apple green")
54 |    # >> 1.0
55 | 
56 |    cbow_sim("green apple", "red potato")
57 |    # >> 0.749
58 | 
59 |    cbow_sim("green apple", "green alien")
60 |    # >> 0.683
61 | 
62 |    cbow_sim("green apple", "blue alien")
63 |    # >> 0.5799815958114477
64 | 
65 |    cbow_sim("eat an apple", "ingest an apple")
66 |    # >> 0.9304712574359718


--------------------------------------------------------------------------------
/docs/day1/takehome.rst:
--------------------------------------------------------------------------------
 1 | Take-Home Exercises
 2 | ===================
 3 | 
 4 | Exercise 1
 5 | ----------
 6 | 
 7 | Implement Deep Continuous Bag-of-Words (CBOW).
 8 | `Here is a link to the paper! <https://cs.umd.edu/~miyyer/pubs/2015_acl_dan.pdf>`_
 9 | 
10 | 
11 | Exercise 2
12 | ----------
13 | 
14 | Implement a convnet classifier to classify surnames
15 | 
16 | At the end of class, we talked about how CNNs can be used to incrementally shrink an intermediate data tensor until a dimension of size 1 is left.  
17 | 
18 | Here is a notebook that I pieced together for you to do this assignment with: https://gist.github.com/braingineer/1d7baecf2c99013d88d4d1db77449aec
19 | 
20 | Some other points that were made:
21 | 
22 | 1. At first, the size of the data tensor is (batch, max_seq_len).  Then, after using the embedding layer, it is (batch, max_seq_len, embedding_dim).  However, as was pointed out, convolutions expect the channel dimension (the features per position in the sequence) to be in the 1st position. So, a conv1d will expect: (batch, feature_dim, max_seq_len).
23 | 
24 | 2. When a sequence/hierarchy of 1D convolutions is applied, it can eventually shrink the sequence dimension to size 1. This is the goal.  Specifically, you want (batch, feature_dim, 1) so that you can use the "squeeze" operation to remove the size-1 dimension and have a single feature vector per item in the batch.
25 | 
26 | 3. Once you have the correct sequence of convolutions and/or pooling operations to create your feature vectors, then you can add a Linear layer which will map from the feature vector to a prediction vector.  This can be modeled after the other examples. 
27 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM jupyter/minimal-notebook:ae885c0a6226
 2 | 
 3 | # launchbot-specific labels
 4 | LABEL name.launchbot.io="nlp_with_dl"
 5 | LABEL workdir.launchbot.io="/home/jovyan"
 6 | LABEL description.launchbot.io="Natural Language Processing with Deep Learning"
 7 | LABEL 8888.port.launchbot.io="Jupyter Notebook"
 8 | 
 9 | #USER root
10 | 
11 | # Install requirements
12 | COPY requirements.txt /requirements.txt
13 | RUN pip install -r /requirements.txt
14 | 
15 | # Install pytorch
16 | RUN pip install http://download.pytorch.org/whl/cpu/torch-0.4.1-cp36-cp36m-linux_x86_64.whl \
17 | 			&& pip install torchvision
18 | 
19 | RUN jupyter nbextension enable --py widgetsnbextension
20 | 
21 | # Set the working directory
22 | WORKDIR /home/jovyan
23 | 
24 | # Add files
25 | COPY data/trump.csv /home/jovyan/data/trump.csv
26 | COPY data/surnames.csv /home/jovyan/data/surnames.csv
27 | COPY data/glove.6B.100d.txt /home/jovyan/data/glove.6B.100d.txt
28 | COPY data/firstnames.csv /home/jovyan/data/firstnames.csv
29 | COPY data/amazon_train_small.csv /home/jovyan/data/amazon_train_small.csv
30 | COPY data/surnames.csv /home/jovyan/data/surnames.csv
31 | COPY data/zhnews.csv /home/jovyan/data/zhnews.csv
32 | 
33 | COPY modelzoo/ /home/jovyan/modelzoo
34 | 
35 | COPY day_1 /home/jovyan/day_1
36 | COPY day_2 /home/jovyan/day_2
37 | 
38 | USER root
39 | RUN chown -R $NB_USER /home/jovyan/day_* \
40 | 		&& chmod -R 774 /home/jovyan/day_*
41 | USER $NB_USER
42 | 
43 | # Expose the notebook port
44 | EXPOSE 8888
45 | 
46 | # Start the notebook server
47 | CMD jupyter notebook --no-browser --port 8888 --ip=* --NotebookApp.token='' --NotebookApp.disable_check_xsrf=True
48 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | day_1/data/*
  2 | # Byte-compiled / optimized / DLL files
  3 | __pycache__/
  4 | *.py[cod]
  5 | *$py.class
  6 | 
  7 | # C extensions
  8 | *.so
  9 | 
 10 | # Distribution / packaging
 11 | .Python
 12 | env/
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .coverage
 43 | .coverage.*
 44 | .cache
 45 | nosetests.xml
 46 | coverage.xml
 47 | *.cover
 48 | .hypothesis/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | .DS_Store
103 | amazon_train_small.csv
104 | glove.6B.zip
105 | names_test.csv
106 | names_test_delip_version.csv
107 | names_train.csv
108 | snli_1.0.zip
109 | trump.csv
110 | data/
111 | .ipython/
112 | .jupyter/
113 | .local/
114 | 


--------------------------------------------------------------------------------
/docs/day2/adventures/interpolation.rst:
--------------------------------------------------------------------------------
 1 | Exercise: Interpolating Between Vectors
 2 | =======================================
 3 | 
 4 | One fun option for the conditional generation code is to interpolate
 5 | between the learned hidden vectors.
 6 | 
 7 | To do this, first look at the code for sampling given a specific nationality:
 8 | 
 9 | .. code-block:: python
10 |    :linenos:
11 | 
12 |    def sample_n_for_nationality(nationality, n=10, temp=0.8):
13 |         assert nationality in vectorizer.nationality_vocab.keys(), 'not a nationality we trained on'
14 |         keys = [nationality] * n
15 |         init_vector = torch.tensor([vectorizer.nationality_vocab[key] for key in keys], dtype=torch.int64)
16 |         init_vector = net.conditional_emb(init_vector)
17 |         samples = decode_matrix(vectorizer,
18 |                                 sample(net.emb, net.rnn, net.fc,
19 |                                    init_vector,
20 |                                    make_initial_x(n, vectorizer),
21 |                                    temp=temp))
22 |         return list(zip(keys, samples))
23 | 
24 | As you can see, we create a list of keys that is the length of the number of samples we want (n).
25 | And we use that list to retrieve the correct index from the vocabulary.
26 | Finally, we use that index in the conditional embedding inside the network to get the
27 | initial hidden state for the sampler.
28 | 
29 | To do this exercise, write a function that has the following signature:
30 | 
31 | .. code-block:: python
32 | 
33 |    def interpolate_n_samples_from_two_nationalities(nationality1, nationality2, weight, n=10, temp=0.8):
34 |        print('awesome stuff here')
35 | 
36 | 
37 | This should retrieve the :code:`init_vectors` for two different nationalities. Then, using the weight, combine the init vectors as :code:`weight * init_vector1 + (1 - weight) * init_vector2`.
38 | 
39 | For fun, after you finish this function, write a for loop which loops over the weight from 0.1 to 0.9 to see how it affects the generation.
40 | 


--------------------------------------------------------------------------------
/docs/day2/patterns/attention.rst:
--------------------------------------------------------------------------------
 1 | Design Pattern: Attention
 2 | =========================
 3 | 
 4 | Attention is a useful pattern for when you want to take a collection of vectors---whether it be a sequence of vectors representing a sequence of words, or an unordered collection of vectors representing a collection of attributes---and summarize them into a single vector.  This is analogous to the CBOW examples we saw on Day 1, but instead of just averaging or using max pooling, we are learning a function which computes the weights for each of the vectors before summing them together.
 5 | 
 6 | Importantly, the weights that the attention module learns form a valid probability distribution.  This means that weighting the vectors by the values the attention module learns can additionally be seen as computing an expectation. Or, it could be seen as interpolating. In any case, attention's main use is to select 'softly' amongst a set of vectors.
 7 | 
 8 | The attention vector has several different published forms. The one below is very simple and just learns a single vector as the attention mechanism.
 9 | 
10 | Using the :code:`new_parameter` function we have been using for the RNN notebooks:
11 | 
12 | .. code-block:: python
13 | 
14 |    def new_parameter(*size):
15 |        out = Parameter(FloatTensor(*size))
16 |        torch.nn.init.xavier_normal(out)
17 |        return out
18 | 
19 | We can then do:
20 | 
21 | .. code-block:: python
22 | 
23 |    class Attention(nn.Module):
24 |        def __init__(self, attention_size):
25 |            super(Attention, self).__init__()
26 |            self.attention = new_parameter(attention_size, 1)
27 | 
28 |        def forward(self, x_in):
29 |            # after this, we have (batch, dim1) with a diff weight per each cell
30 |            attention_score = torch.matmul(x_in, self.attention).squeeze()
31 |            attention_score = F.softmax(attention_score).view(x_in.size(0), x_in.size(1), 1)
32 |            scored_x = x_in * attention_score
33 | 
34 |            # now, sum across dim 1 to get the expected feature vector
35 |            condensed_x = torch.sum(scored_x, dim=1)
36 | 
37 |            return condensed_x
38 | 
39 | 
40 | 
41 |    attn = Attention(100)
42 |    x = Variable(torch.randn(16,30,100))
43 |    attn(x).size() == (16,100)
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/docs/day2/adventures/lookups.rst:
--------------------------------------------------------------------------------
 1 | Exercise: Fast Lookups for Encoded Sequences
 2 | ============================================
 3 | 
 4 | Let's suppose that you want to embed or encode something that you want to look up at a later date.
 5 | For example, you could be embedding things that need to be identified (such as a song).  Or maybe you want to just find the neighbors of a new data point.
 6 | 
 7 | In any case, approximate nearest neighbor libraries are wonderful for this.
 8 | For this exercise, we will use Spotify's annoy library (we saw this on day 1, in the pretrained word vector notebook).  You should aim to complete the following steps:
 9 | 
10 | 1. Load the network from the Day 2, 01 notebook using the pre-trained weights.
11 |     - You could use the 02 notebook, but we want to get a single vector per each sequence.
12 |     - So, to use 02, you would need to port the :code:`column_gather` function.
13 |     - One reason why you might be interested in doing this is because the 02 objective function learned a better final vector representation.
14 | 2. Given a loaded network with pre-trained weights, write a function which does nearly exactly what the forward function does, but doesn't apply the fully connected layer.
15 |     - This is because we want the feature vector just before the fully connected.
16 |     - It is common to assume that the penultimate layer has learned more generalizable features than the final layer (which feeds the softmax computation and is thus specialized toward producing a normalized probability distribution).
17 |     - The code for this should look something like:
18 | 
19 | .. code-block:: python
20 | 
21 |    def get_penultimate(net, x_in, x_lengths=None):
22 |         x_in = net.emb(x_in)
23 |         x_mid = net.conv(x_in.permute(0, 2, 1)).permute(0, 2, 1)
24 |         y_out = net.rnn(x_in)  # NOTE(review): x_mid above is never used; confirm against net.forward whether the RNN should consume x_mid
25 | 
26 |         if x_lengths is not None:
27 |             y_out = column_gather(y_out, x_lengths)  # presumably selects one vector per sequence based on its length; confirm against column_gather
28 |         else:
29 |             y_out = y_out[:, -1, :]  # no lengths given: take the last position along dim 1
30 | 
31 |         return y_out  # returned without applying the fully connected layer
32 | 
33 | 3. As you get penultimate vectors for each datapoint, store them in spotify's annoy. This requires specifying some label for the vector.  Using :code:`vectorizer.surname_vocab.lookup` is how you can retrieve the character for each index value in the network inputs.  There are some 'decode' functions in the day 2 02 and 03 notebooks.
34 | 4. Once everything is added to spotify's annoy, you can then look up any surname and find the set of nearest neighbors!  Kind of cool!  This is one way to do the `k nearest neighbor classification rule <https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm>`_.
35 | 


--------------------------------------------------------------------------------
/docs/day2/sampling.rst:
--------------------------------------------------------------------------------
 1 | Exercise: Sampling from an RNN
 2 | ==============================
 3 | 
 4 | The goal of sampling from an RNN is to initialize the sequence in some way, feed it into the recurrent computation, and retrieve the next prediction. 
 5 | 
 6 | To start, we create the initial vectors:
 7 | 
 8 | .. code-block:: python
 9 | 
10 |    start_index = vectorizer.surname_vocab.start_index
11 |    batch_size = 2
12 |    # hidden_size = whatever hidden size the model is set to
13 | 
14 |    initial_h = torch.ones(batch_size, hidden_size)
15 |    initial_x_index = torch.ones(batch_size).long() * start_index
16 | 
17 | Then, we need to use these vectors to retrieve the next prediction:
18 | 
19 | .. code-block:: python
20 | 
21 |    # model is stored in variable called `net`
22 | 
23 |    x_t = net.emb(initial_x_index)
24 |    print(x_t.shape)
25 |    h_t = net.rnn._compute_next_hidden(x_t, initial_h)
26 | 
27 |    y_t = net.fc(h_t)
28 | 
29 | 
30 | Now that we have a prediction vector, we can create a probability distribution and sample from it.  Note we include a temperature hyper parameter for controlling how strongly we sample from the distribution (at high temperatures, everything is uniform, at low temperatures below 1, small differences are magnified).  The temperature is always greater than 0. 
31 | 
32 | .. code-block:: python
33 | 	
34 |    temperature = 1.0
35 |    y_t = F.softmax(y_t / temperature, dim=1)
36 |    x_index_t = torch.multinomial(y_t, 1)[:, 0]
37 | 
38 | 
39 | Now we can start the cycle over again:
40 | 
41 | .. code-block:: python
42 | 
43 |    x_t = net.emb(x_index_t)
44 |    h_t = net.rnn._compute_next_hidden(x_t, h_t)
45 | 
46 |    y_t = net.fc(h_t)
47 | 
48 | Write a for loop which repeats this sequence and appends each sampled :code:`x_index_t` to a list named :code:`x_indices`.
49 | 
50 | Then, we can do the following:
51 | 
52 | .. code-block:: python
53 | 
54 |    final_x_indices = torch.stack(x_indices).squeeze().permute(1, 0)
55 | 
56 |    # stop here if you don't know what cpu, data, and numpy do. Ask away!
57 |    final_x_indices = final_x_indices.cpu().detach().numpy()
58 | 
59 |    # loop over the items in the batch
60 |    results = []
61 |    for i in range(len(final_x_indices)):
62 |        tokens = []
63 |        index_vector = final_x_indices[i]
64 |        for x_index in index_vector:
65 |            if vectorizer.surname_vocab.start_index == x_index:
66 |                continue
67 |            elif vectorizer.surname_vocab.end_index == x_index:
68 |                break
69 |            else:
70 |                token = vectorizer.surname_vocab.lookup(x_index)
71 |                tokens.append(token)
72 | 
73 |    sampled_surname = "".join(tokens)
74 |    results.append(sampled_surname)
75 |    tokens = []


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | Natural Language Processing (NLP) with PyTorch
 2 | ==============================================
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 2
 6 |    :hidden:
 7 |    :caption: Extra Resources
 8 | 
 9 |    download_data
10 |    environment_setup
11 |    faq
12 |    migration
13 | 
14 | .. toctree::
15 |    :hidden:
16 |    :caption: Day 1 Materials
17 | 
18 |    day1/takehome
19 |    day1/solutions
20 | 
21 | .. toctree::
22 |    :hidden:
23 |    :maxdepth: 3
24 |    :caption: Day 2 Materials
25 | 
26 |    day2/warmup
27 |    day2/failfastprototypemode
28 |    day2/tensorfu1
29 |    day2/tensorfu2
30 |    day2/patterns/tidbits
31 |    day2/adventures/lookups
32 |    day2/adventures/interpolation
33 |    day2/sampling
34 | 
35 | 
36 | Hello! This is a directory of resources for a training tutorial to be
37 | given at the O'Reilly AI Conference in London on Monday, October 8th, and Tuesday, October 9th. 
38 | 
39 | Please read below for general information.  You can find the github repository at `this link <https://github.com/joosthub/pytorch-nlp-tutorial-eu2018>`_.  Please note that there are two ways to engage in this training (described below).
40 | 
41 | More information will be added to this site as the training progresses.
42 | Specifically, we will be adding a 'recipes' section, 'errata' section, and a 'bonus exercise' section as the training progresses!
43 | 
44 | General Information
45 | -------------------
46 | 
47 | Prerequisites:
48 | ^^^^^^^^^^^^^^
49 | 
50 | - A working knowledge of Python and the command line
51 | - Familiarity with precalc math (multiply matrices, dot products of vectors, etc.) and derivatives of simple functions (If you are new to linear algebra, this video course is handy.)
52 | - A general understanding of machine learning (setting up experiments, evaluation, etc.) (useful but not required)
53 | 
54 | Hardware and/or installation requirements:
55 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
56 | 
57 | - There are two options:
58 |    1. **Using O'Reilly's online resources**.  For this, you only need a laptop; on the first day, we will provide you with credentials and a URL to use an online computing resource (a JupyterHub instance) provided by O'Reilly.  You will be able to access Jupyter notebooks through this and they will persist until the end of the second day of training.  This option is not limited by what operating system you have. You will need to have a browser installed.
59 |    2. **Setting everything up locally**.  For this, you need a laptop with the PyTorch environment set up.  This is only recommended if you want to have the environment locally or have a laptop with a GPU. (If you have trouble following the provided instructions or if you find any mistakes, please file an issue `here <https://github.com/joosthub/pytorch-nlp-tutorial-eu2018>`_.)  
60 | 


--------------------------------------------------------------------------------
/docs/environment_setup.rst:
--------------------------------------------------------------------------------
  1 | Environment Setup
  2 | =================
  3 | 
  4 | On this page, you will find not only the list of dependencies to install
  5 | for the tutorial, but a description of how to install them. This tutorial assumes
  6 | you have a laptop with OSX or Linux. If you use Windows, you might have to install
  7 | a virtual machine to get a UNIX-like environment to continue with the rest of this
  8 | instruction. A lot of this instruction is more verbose than needed to accommodate
  9 | participants of different skill levels.
 10 | 
 11 | **Please note that these are only optional.  On the first day of this training, you will be provided with a link to a JupyterHub instance where the environment will be pre-made and ready to go!**
 12 | 
 13 | 0. Get Anaconda
 14 | ---------------
 15 | 
 16 | Anaconda is a Python (and R) distribution that aims to provide everything
 17 | needed for common scientific and machine learning situations out-of-the-box.
 18 | We chose Anaconda for this tutorial as it significantly simplifies Python
 19 | dependency management.
 20 | 
 21 | In practice, Anaconda can be used to manage different environments and packages.
 22 | This setup document will assume that you have Anaconda installed as your default
 23 | Python distribution.
 24 | 
 25 | You can download Anaconda here: https://www.continuum.io/downloads
 26 | 
 27 | After installing Anaconda, you can access its command-line interface
 28 | with the :code:`conda` command.
 29 | 
 30 | 
 31 | 1. Create a new environment
 32 | ---------------------------
 33 | 
 34 | Environments are a tool for sanitary software development.  By this, we mean that
 35 | you can install specific versions of packages without worrying that it breaks
 36 | a dependency elsewhere.
 37 | 
 38 | Here is how you can create an environment with Anaconda
 39 | 
 40 | .. code-block:: bash
 41 | 
 42 |    conda create -n dl4nlp python=3.6
 43 | 
 44 | 
 45 | 2. Install Dependencies
 46 | -----------------------
 47 | 
 48 | 2a. Activate the environment
 49 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 50 | 
 51 | After creating the environment, you need to **activate** the environment:
 52 | 
 53 | .. code-block:: bash
 54 | 
 55 |    source activate dl4nlp
 56 | 
 57 | After an environment is activated, it might prepend/append itself to your
 58 | console prompt to let you know it is active.
 59 | 
 60 | With the environment activated, any installation commands
 61 | (whether it is :code:`pip install X`, :code:`python setup.py install` or using
 62 | Anaconda's install command :code:`conda install X`) will only install inside
 63 | the environment.
 64 | 
 65 | 2b. Install IPython and Jupyter
 66 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 67 | 
 68 | Two core dependencies are IPython and Jupyter.  Let's install them first:
 69 | 
 70 | .. code-block:: bash
 71 | 
 72 |    conda install ipython
 73 |    conda install jupyter
 74 | 
 75 | To allow Jupyter notebooks to use this environment as their kernel, it
 76 | needs to be linked:
 77 | 
 78 | .. code-block:: bash
 79 | 
 80 |    python -m ipykernel install --user --name dl4nlp
 81 | 
 82 | 2c. Installing CUDA (optional)
 83 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 84 | 
 85 | NOTE: CUDA is currently not supported out of the conda package control manager.
 86 | Please refer to pytorch's github repository for compilation instructions.
 87 | 
 88 | If you have a CUDA compatible GPU, it is worthwhile to take advantage of it as
 89 | it can significantly speed up training and make your PyTorch experimentation more
 90 | enjoyable.
 91 | 
 92 | To install CUDA:
 93 | 
 94 | 1. Download CUDA appropriate to your OS/Arch from `here <https://developer.nvidia.com/cuda-downloads>`_.
 95 | 2. Follow installation steps for your architecture/OS. For Ubuntu/x86_64, see `here <http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#ubuntu-installation>`_.
 96 | 3. Download and install CUDNN from `here <https://developer.nvidia.com/cudnn>`_.
 97 | 
 98 | Make sure you have the latest CUDA and CUDNN.
 99 | 
100 | 2d. Install PyTorch
101 | ^^^^^^^^^^^^^^^^^^^
102 | 
103 | There are instructions on http://pytorch.org which detail how to install it.
104 | If you have been following along so far and have Anaconda installed with CUDA enabled, you can simply do:
105 | 
106 | 
107 | .. code-block:: bash
108 | 
109 |    conda install pytorch torchvision -c pytorch
110 | 
111 | 
112 | 2e. Clone (or Download) Repository
113 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
114 | 
115 | At this point, you may have already cloned the tutorial repository.  But if
116 | you have not, you will need it for the next step.
117 | 
118 | .. code-block:: bash
119 | 
120 |    git clone https://github.com/joosthub/pytorch-nlp-tutorial-eu2018.git
121 | 
122 | If you do not have git or do not want to use it, you can also
123 | `download the repository as a zip file <https://github.com/joosthub/pytorch-nlp-tutorial-eu2018/archive/master.zip>`_
124 | 
125 | 2f. Install Dependencies from Repository
126 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
127 | 
128 | Assuming you have cloned (or downloaded and unzipped) the repository,
129 | please navigate to the directory in your terminal.  Then, you can do the following:
130 | 
131 | .. code-block:: bash
132 | 
133 |    pip install -r requirements.txt
134 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # pytorch-nlp-tutorial documentation build configuration file, created by
  5 | # sphinx-quickstart on Sun Sep  3 13:31:44 2017.
  6 | #
  7 | # This file is execfile()d with the current directory set to its
  8 | # containing dir.
  9 | #
 10 | # Note that not all possible configuration values are present in this
 11 | # autogenerated file.
 12 | #
 13 | # All configuration values have a default; values that are commented out
 14 | # serve to show the default.
 15 | 
 16 | # If extensions (or modules to document with autodoc) are in another directory,
 17 | # add these directories to sys.path here. If the directory is relative to the
 18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 19 | #
 20 | # import os
 21 | # import sys
 22 | # sys.path.insert(0, os.path.abspath('.'))
 23 | 
 24 | 
 25 | # -- General configuration ------------------------------------------------
 26 | 
 27 | # If your documentation needs a minimal Sphinx version, state it here.
 28 | #
 29 | # needs_sphinx = '1.0'
 30 | 
 31 | # Add any Sphinx extension module names here, as strings. They can be
 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 33 | # ones.
 34 | extensions = ['sphinx.ext.mathjax']
 35 | 
 36 | # Add any paths that contain templates here, relative to this directory.
 37 | templates_path = ['_templates']
 38 | 
 39 | # The suffix(es) of source filenames.
 40 | # You can specify multiple suffix as a list of string:
 41 | #
 42 | # source_suffix = ['.rst', '.md']
 43 | source_suffix = '.rst'
 44 | 
 45 | # The master toctree document.
 46 | master_doc = 'index'
 47 | 
 48 | # General information about the project.
 49 | project = 'NLP with PyTorch'
 50 | copyright = '2018, Brian McMahan and Delip Rao'
 51 | author = 'Brian McMahan and Delip Rao'
 52 | 
 53 | # The version info for the project you're documenting, acts as replacement for
 54 | # |version| and |release|, also used in various other places throughout the
 55 | # built documents.
 56 | #
 57 | # The short X.Y version.
 58 | version = ''
 59 | # The full version, including alpha/beta/rc tags.
 60 | release = ''
 61 | 
 62 | # The language for content autogenerated by Sphinx. Refer to documentation
 63 | # for a list of supported languages.
 64 | #
 65 | # This is also used if you do content translation via gettext catalogs.
 66 | # Usually you set "language" from the command line for these cases.
 67 | language = None
 68 | 
 69 | # List of patterns, relative to source directory, that match files and
 70 | # directories to ignore when looking for source files.
 71 | # This patterns also effect to html_static_path and html_extra_path
 72 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 73 | 
 74 | # The name of the Pygments (syntax highlighting) style to use.
 75 | pygments_style = 'sphinx'
 76 | 
 77 | # If true, `todo` and `todoList` produce output, else they produce nothing.
 78 | todo_include_todos = False
 79 | 
 80 | 
 81 | # -- Options for HTML output ----------------------------------------------
 82 | 
 83 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 84 | # a list of builtin themes.
 85 | #
 86 | html_theme = 'sphinx_rtd_theme'
 87 | 
 88 | # Theme options are theme-specific and customize the look and feel of a theme
 89 | # further.  For a list of options available for each theme, see the
 90 | # documentation.
 91 | #
 92 | # html_theme_options = {}
 93 | 
 94 | # Add any paths that contain custom static files (such as style sheets) here,
 95 | # relative to this directory. They are copied after the builtin static files,
 96 | # so a file named "default.css" will overwrite the builtin "default.css".
 97 | html_static_path = ['_static']
 98 | 
 99 | 
100 | # -- Options for HTMLHelp output ------------------------------------------
101 | 
102 | # Output file base name for HTML help builder.
103 | htmlhelp_basename = 'pytorch-nlp-tutorial'
104 | 
105 | 
106 | # -- Options for LaTeX output ---------------------------------------------
107 | 
108 | latex_elements = {
109 |     # The paper size ('letterpaper' or 'a4paper').
110 |     #
111 |     # 'papersize': 'letterpaper',
112 | 
113 |     # The font size ('10pt', '11pt' or '12pt').
114 |     #
115 |     # 'pointsize': '10pt',
116 | 
117 |     # Additional stuff for the LaTeX preamble.
118 |     #
119 |     # 'preamble': '',
120 | 
121 |     # Latex figure (float) alignment
122 |     #
123 |     # 'figure_align': 'htbp',
124 | }
125 | 
126 | # Grouping the document tree into LaTeX files. List of tuples
127 | # (source start file, target name, title,
128 | #  author, documentclass [howto, manual, or own class]).
129 | latex_documents = [
130 |     (master_doc, 'pytorch-nlp-tutorial.tex', 'pytorch-nlp-tutorial Documentation',
131 |      'Brian McMahan and Delip Rao', 'manual'),
132 | ]
133 | 
134 | 
135 | # -- Options for manual page output ---------------------------------------
136 | 
137 | # One entry per manual page. List of tuples
138 | # (source start file, name, description, authors, manual section).
139 | man_pages = [
140 |     (master_doc, 'pytorch-nlp-tutorial', 'pytorch-nlp-tutorial Documentation',
141 |      [author], 1)
142 | ]
143 | 
144 | # -- Options for Texinfo output -------------------------------------------
145 | 
146 | # Grouping the document tree into Texinfo files. List of tuples
147 | # (source start file, target name, title, author,
148 | #  dir menu entry, description, category)
149 | texinfo_documents = [
150 |     (master_doc, 'pytorch-nlp-tutorial', 'pytorch-nlp-tutorial Documentation',
151 |      author, 'pytorch-nlp-tutorial', 'One line description of project.',
152 |      'Miscellaneous'),
153 | ]
154 | 
155 | 
156 | 
157 | 


--------------------------------------------------------------------------------
/day_1/vocabulary.py:
--------------------------------------------------------------------------------
  1 | from collections import Counter
  2 | 
  3 | import numpy as np
  4 | from torch.utils.data import Dataset
  5 | import six
  6 | 
  7 | import json
  8 | 
  9 | 
 10 | class Vocabulary(object):
 11 |     """
 12 |     An implementation that manages the interface between a token dataset and the
 13 |         machine learning algorithm.
 14 |     """
 15 | 
 16 |     def __init__(self, use_unks=False, unk_token="<UNK>",
 17 |                  use_mask=False, mask_token="<MASK>", use_start_end=False,
 18 |                  start_token="<START>", end_token="<END>"):
 19 |         """
 20 |         Args:
 21 |             use_unks (bool): The vocabulary will output UNK tokens for out of
 22 |                 vocabulary items.
 23 |                 [default=False]
 24 |             unk_token (str): The token used for unknown tokens.
 25 |                 If `use_unks` is True, this will be added to the vocabulary.
 26 |                 [default='<UNK>']
 27 |             use_mask (bool): The vocabulary will reserve the 0th index for a mask token.
 28 |                 This is used to handle variable lengths in sequence models.
 29 |                 [default=False]
 30 |             mask_token (str): The token used for the mask.
 31 |                 Note: mostly a placeholder; it's unlikely the token will be seen.
 32 |                 [default='<MASK>']
 33 |             use_start_end (bool): The vocabulary will reserve indices for two tokens
 34 |                 that represent the start and end of a sequence.
 35 |                 [default=False]
 36 |             start_token: The token used to indicate the start of a sequence.
 37 |                 If `use_start_end` is True, this will be added to the vocabulary.
 38 |                 [default='<START>']
 39 |             end_token: The token used to indicate the end of a sequence
 40 |                  If `use_start_end` is True, this will be added to the vocabulary.
 41 |                  [default='<END>']
 42 |         """
 43 | 
 44 |         self._mapping = {}  # str -> int
 45 |         self._flip = {}  # int -> str;
 46 |         self._counts = Counter()  # int -> int; count occurrences
 47 |         self._forced_unks = set()  # force tokens to unk (e.g. if < 5 occurrences)
 48 |         self._i = 0
 49 |         self._frozen = False
 50 |         self._frequency_threshold = -1
 51 | 
 52 |         # mask token for use in masked recurrent networks
 53 |         # usually need to be the 0th index
 54 |         self.use_mask = use_mask
 55 |         self.mask_token = mask_token
 56 |         if self.use_mask:
 57 |             self.add(self.mask_token)
 58 | 
 59 |         # unk token for out of vocabulary tokens
 60 |         self.use_unks = use_unks
 61 |         self.unk_token = unk_token
 62 |         if self.use_unks:
 63 |             self.add(self.unk_token)
 64 | 
 65 |         # start token for sequence models
 66 |         self.use_start_end = use_start_end
 67 |         self.start_token = start_token
 68 |         self.end_token = end_token
 69 |         if self.use_start_end:
 70 |             self.add(self.start_token)
 71 |             self.add(self.end_token)
 72 | 
 73 |     def iterkeys(self):
 74 |         for k in self._mapping.keys():
 75 |             if k == self.unk_token or k == self.mask_token:
 76 |                 continue
 77 |             else:
 78 |                 yield k
 79 | 
 80 |     def keys(self):
 81 |         return list(self.iterkeys())
 82 | 
 83 |     def iteritems(self):
 84 |         for key, value in self._mapping.items():
 85 |             if key == self.unk_token or key == self.mask_token:
 86 |                 continue
 87 |             yield key, value
 88 | 
 89 |     def items(self):
 90 |         return list(self.iteritems())
 91 | 
 92 |     def values(self):
 93 |         return [value for _, value in self.iteritems()]
 94 | 
 95 |     def __getitem__(self, k):
 96 |         if self._frozen:
 97 |             if k in self._mapping:
 98 |                 out_index = self._mapping[k]
 99 |             elif self.use_unks:
100 |                 out_index = self.unk_index
101 |             else:  # case: frozen, don't want unks, raise exception
102 |                 raise VocabularyException("Vocabulary is frozen. " +
103 |                                           "Key '{}' not found.".format(k))
104 |             if out_index in self._forced_unks:
105 |                 out_index = self.unk_index
106 |         elif k in self._mapping:  # case: normal
107 |             out_index = self._mapping[k]
108 |             self._counts[out_index] += 1
109 |         else:
110 |             out_index = self._mapping[k] = self._i
111 |             self._i += 1
112 |             self._flip[out_index] = k
113 |             self._counts[out_index] = 1
114 | 
115 |         return out_index
116 | 
117 |     def add(self, k):
118 |         return self.__getitem__(k)
119 | 
120 |     def add_many(self, x):
121 |         return [self.add(k) for k in x]
122 | 
123 |     def lookup(self, i):
124 |         try:
125 |             return self._flip[i]
126 |         except KeyError:
127 |             raise VocabularyException("Key {} not in Vocabulary".format(i))
128 | 
129 |     def lookup_many(self, x):
130 |         for k in x:
131 |             yield self.lookup(k)
132 | 
133 |     def map(self, sequence, include_start_end=False):
134 |         if include_start_end:
135 |             yield self.start_index
136 | 
137 |         for item in sequence:
138 |             yield self[item]
139 | 
140 |         if include_start_end:
141 |             yield self.end_index
142 | 
143 |     def freeze(self, use_unks=False, frequency_cutoff=-1):
144 |         self.use_unks = use_unks
145 |         self._frequency_cutoff = frequency_cutoff
146 | 
147 |         if use_unks and self.unk_token not in self:
148 |             self.add(self.unk_token)
149 | 
150 |         if self._frequency_cutoff > 0:
151 |             for token, count in self._counts.items():
152 |                 if count < self._frequency_cutoff:
153 |                     self._forced_unks.add(token)
154 | 
155 |         self._frozen = True
156 | 
157 |     def unfreeze(self):
158 |         self._frozen = False
159 | 
160 |     def get_counts(self):
161 |         return {self._flip[i]: count for i, count in self._counts.items()}
162 | 
163 |     def get_count(self, token=None, index=None):
164 |         if token is None and index is None:
165 |             return None
166 |         elif token is not None and index is not None:
167 |             print("Cannot do two things at once; choose one")
168 |         elif token is not None:
169 |             return self._counts[self[token]]
170 |         elif index is not None:
171 |             return self._counts[index]
172 |         else:
173 |             raise Exception("impossible condition")
174 | 
175 |     @property
176 |     def unk_index(self):
177 |         if self.unk_token not in self:
178 |             return None
179 |         return self._mapping[self.unk_token]
180 | 
181 |     @property
182 |     def mask_index(self):
183 |         if self.mask_token not in self:
184 |             return None
185 |         return self._mapping[self.mask_token]
186 | 
187 |     @property
188 |     def start_index(self):
189 |         if self.start_token not in self:
190 |             return None
191 |         return self._mapping[self.start_token]
192 | 
193 |     @property
194 |     def end_index(self):
195 |         if self.end_token not in self:
196 |             return None
197 |         return self._mapping[self.end_token]
198 | 
199 |     def __contains__(self, k):
200 |         return k in self._mapping
201 | 
202 |     def __len__(self):
203 |         return len(self._mapping)
204 | 
205 |     def __repr__(self):
206 |         return "<Vocabulary(size={},frozen={})>".format(len(self), self._frozen)
207 | 
208 | 
209 |     def get_serializable_contents(self):
210 |         """
211 |         Creats a dict containing the necessary information to recreate this instance
212 |         """
213 |         config = {"_mapping": self._mapping,
214 |                   "_flip": self._flip,
215 |                   "_frozen": self._frozen,
216 |                   "_i": self._i,
217 |                   "_counts": list(self._counts.items()),
218 |                   "_frequency_threshold": self._frequency_threshold,
219 |                   "use_unks": self.use_unks,
220 |                   "unk_token": self.unk_token,
221 |                   "use_mask": self.use_mask,
222 |                   "mask_token": self.mask_token,
223 |                   "use_start_end": self.use_start_end,
224 |                   "start_token": self.start_token,
225 |                   "end_token": self.end_token}
226 |         return config
227 | 
228 |     @classmethod
229 |     def deserialize_from_contents(cls, content):
230 |         """
231 |         Recreate a Vocabulary instance; expect same dict as output in `serialize`
232 |         """
233 |         try:
234 |             _mapping = content.pop("_mapping")
235 |             _flip = content.pop("_flip")
236 |             _i = content.pop("_i")
237 |             _frozen = content.pop("_frozen")
238 |             _counts = content.pop("_counts")
239 |             _frequency_threshold = content.pop("_frequency_threshold")
240 |         except KeyError:
241 |             raise Exception("unable to deserialize vocabulary")
242 |         if isinstance(list(_flip.keys())[0], six.string_types):
243 |             _flip = {int(k): v for k, v in _flip.items()}
244 |         out = cls(**content)
245 |         out._mapping = _mapping
246 |         out._flip = _flip
247 |         out._i = _i
248 |         out._counts = Counter(dict(_counts))
249 |         out._frequency_threshold = _frequency_threshold
250 | 
251 |         if _frozen:
252 |             out.freeze(out.use_unks)
253 | 
254 |         return out
255 | 
256 | 


--------------------------------------------------------------------------------
/day_2/vocabulary.py:
--------------------------------------------------------------------------------
  1 | from collections import Counter
  2 | 
  3 | import numpy as np
  4 | from torch.utils.data import Dataset
  5 | import six
  6 | 
  7 | import json
  8 | 
  9 | 
 10 | class Vocabulary(object):
 11 |     """
 12 |     An implementation that manages the interface between a token dataset and the
 13 |         machine learning algorithm.
 14 |     """
 15 | 
 16 |     def __init__(self, use_unks=False, unk_token="<UNK>",
 17 |                  use_mask=False, mask_token="<MASK>", use_start_end=False,
 18 |                  start_token="<START>", end_token="<END>"):
 19 |         """
 20 |         Args:
 21 |             use_unks (bool): The vocabulary will output UNK tokens for out of
 22 |                 vocabulary items.
 23 |                 [default=False]
 24 |             unk_token (str): The token used for unknown tokens.
 25 |                 If `use_unks` is True, this will be added to the vocabulary.
 26 |                 [default='<UNK>']
 27 |             use_mask (bool): The vocabulary will reserve the 0th index for a mask token.
 28 |                 This is used to handle variable lengths in sequence models.
 29 |                 [default=False]
 30 |             mask_token (str): The token used for the mask.
 31 |                 Note: mostly a placeholder; it's unlikely the token will be seen.
 32 |                 [default='<MASK>']
 33 |             use_start_end (bool): The vocabulary will reserve indices for two tokens
 34 |                 that represent the start and end of a sequence.
 35 |                 [default=False]
 36 |             start_token: The token used to indicate the start of a sequence.
 37 |                 If `use_start_end` is True, this will be added to the vocabulary.
 38 |                 [default='<START>']
 39 |             end_token: The token used to indicate the end of a sequence
 40 |                  If `use_start_end` is True, this will be added to the vocabulary.
 41 |                  [default='<END>']
 42 |         """
 43 | 
 44 |         self._mapping = {}  # str -> int
 45 |         self._flip = {}  # int -> str;
 46 |         self._counts = Counter()  # int -> int; count occurrences
 47 |         self._forced_unks = set()  # force tokens to unk (e.g. if < 5 occurrences)
 48 |         self._i = 0
 49 |         self._frozen = False
 50 |         self._frequency_threshold = -1
 51 | 
 52 |         # mask token for use in masked recurrent networks
 53 |         # usually need to be the 0th index
 54 |         self.use_mask = use_mask
 55 |         self.mask_token = mask_token
 56 |         if self.use_mask:
 57 |             self.add(self.mask_token)
 58 | 
 59 |         # unk token for out of vocabulary tokens
 60 |         self.use_unks = use_unks
 61 |         self.unk_token = unk_token
 62 |         if self.use_unks:
 63 |             self.add(self.unk_token)
 64 | 
 65 |         # start token for sequence models
 66 |         self.use_start_end = use_start_end
 67 |         self.start_token = start_token
 68 |         self.end_token = end_token
 69 |         if self.use_start_end:
 70 |             self.add(self.start_token)
 71 |             self.add(self.end_token)
 72 | 
 73 |     def iterkeys(self):
 74 |         for k in self._mapping.keys():
 75 |             if k == self.unk_token or k == self.mask_token:
 76 |                 continue
 77 |             else:
 78 |                 yield k
 79 | 
 80 |     def keys(self):
 81 |         return list(self.iterkeys())
 82 | 
 83 |     def iteritems(self):
 84 |         for key, value in self._mapping.items():
 85 |             if key == self.unk_token or key == self.mask_token:
 86 |                 continue
 87 |             yield key, value
 88 | 
 89 |     def items(self):
 90 |         return list(self.iteritems())
 91 | 
 92 |     def values(self):
 93 |         return [value for _, value in self.iteritems()]
 94 | 
 95 |     def __getitem__(self, k):
 96 |         if self._frozen:
 97 |             if k in self._mapping:
 98 |                 out_index = self._mapping[k]
 99 |             elif self.use_unks:
100 |                 out_index = self.unk_index
101 |             else:  # case: frozen, don't want unks, raise exception
102 |                 raise VocabularyException("Vocabulary is frozen. " +
103 |                                           "Key '{}' not found.".format(k))
104 |             if out_index in self._forced_unks:
105 |                 out_index = self.unk_index
106 |         elif k in self._mapping:  # case: normal
107 |             out_index = self._mapping[k]
108 |             self._counts[out_index] += 1
109 |         else:
110 |             out_index = self._mapping[k] = self._i
111 |             self._i += 1
112 |             self._flip[out_index] = k
113 |             self._counts[out_index] = 1
114 | 
115 |         return out_index
116 | 
117 |     def add(self, k):
118 |         return self.__getitem__(k)
119 | 
120 |     def add_many(self, x):
121 |         return [self.add(k) for k in x]
122 | 
123 |     def lookup(self, i):
124 |         try:
125 |             return self._flip[i]
126 |         except KeyError:
127 |             raise VocabularyException("Key {} not in Vocabulary".format(i))
128 | 
129 |     def lookup_many(self, x):
130 |         for k in x:
131 |             yield self.lookup(k)
132 | 
133 |     def map(self, sequence, include_start_end=False):
134 |         if include_start_end:
135 |             yield self.start_index
136 | 
137 |         for item in sequence:
138 |             yield self[item]
139 | 
140 |         if include_start_end:
141 |             yield self.end_index
142 | 
143 |     def freeze(self, use_unks=False, frequency_cutoff=-1):
144 |         self.use_unks = use_unks
145 |         self._frequency_cutoff = frequency_cutoff
146 | 
147 |         if use_unks and self.unk_token not in self:
148 |             self.add(self.unk_token)
149 | 
150 |         if self._frequency_cutoff > 0:
151 |             for token, count in self._counts.items():
152 |                 if count < self._frequency_cutoff:
153 |                     self._forced_unks.add(token)
154 | 
155 |         self._frozen = True
156 | 
157 |     def unfreeze(self):
158 |         self._frozen = False
159 | 
160 |     def get_counts(self):
161 |         return {self._flip[i]: count for i, count in self._counts.items()}
162 | 
163 |     def get_count(self, token=None, index=None):
164 |         if token is None and index is None:
165 |             return None
166 |         elif token is not None and index is not None:
167 |             print("Cannot do two things at once; choose one")
168 |         elif token is not None:
169 |             return self._counts[self[token]]
170 |         elif index is not None:
171 |             return self._counts[index]
172 |         else:
173 |             raise Exception("impossible condition")
174 | 
175 |     @property
176 |     def unk_index(self):
177 |         if self.unk_token not in self:
178 |             return None
179 |         return self._mapping[self.unk_token]
180 | 
181 |     @property
182 |     def mask_index(self):
183 |         if self.mask_token not in self:
184 |             return None
185 |         return self._mapping[self.mask_token]
186 | 
187 |     @property
188 |     def start_index(self):
189 |         if self.start_token not in self:
190 |             return None
191 |         return self._mapping[self.start_token]
192 | 
193 |     @property
194 |     def end_index(self):
195 |         if self.end_token not in self:
196 |             return None
197 |         return self._mapping[self.end_token]
198 | 
199 |     def __contains__(self, k):
200 |         return k in self._mapping
201 | 
202 |     def __len__(self):
203 |         return len(self._mapping)
204 | 
205 |     def __repr__(self):
206 |         return "<Vocabulary(size={},frozen={})>".format(len(self), self._frozen)
207 | 
208 | 
209 |     def get_serializable_contents(self):
210 |         """
211 |         Creats a dict containing the necessary information to recreate this instance
212 |         """
213 |         config = {"_mapping": self._mapping,
214 |                   "_flip": self._flip,
215 |                   "_frozen": self._frozen,
216 |                   "_i": self._i,
217 |                   "_counts": list(self._counts.items()),
218 |                   "_frequency_threshold": self._frequency_threshold,
219 |                   "use_unks": self.use_unks,
220 |                   "unk_token": self.unk_token,
221 |                   "use_mask": self.use_mask,
222 |                   "mask_token": self.mask_token,
223 |                   "use_start_end": self.use_start_end,
224 |                   "start_token": self.start_token,
225 |                   "end_token": self.end_token}
226 |         return config
227 | 
228 |     @classmethod
229 |     def deserialize_from_contents(cls, content):
230 |         """
231 |         Recreate a Vocabulary instance; expect same dict as output in `serialize`
232 |         """
233 |         try:
234 |             _mapping = content.pop("_mapping")
235 |             _flip = content.pop("_flip")
236 |             _i = content.pop("_i")
237 |             _frozen = content.pop("_frozen")
238 |             _counts = content.pop("_counts")
239 |             _frequency_threshold = content.pop("_frequency_threshold")
240 |         except KeyError:
241 |             raise Exception("unable to deserialize vocabulary")
242 |         if isinstance(list(_flip.keys())[0], six.string_types):
243 |             _flip = {int(k): v for k, v in _flip.items()}
244 |         out = cls(**content)
245 |         out._mapping = _mapping
246 |         out._flip = _flip
247 |         out._i = _i
248 |         out._counts = Counter(dict(_counts))
249 |         out._frequency_threshold = _frequency_threshold
250 | 
251 |         if _frozen:
252 |             out.freeze(out.use_unks)
253 | 
254 |         return out
255 | 
256 | 


--------------------------------------------------------------------------------
/day_1/figures/intro_to_pytorch/computational_graph_forward.svg:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" standalone="yes"?>
2 | 
3 | <svg version="1.1" viewBox="0.0 0.0 683.2939632545932 354.2782152230971" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><clipPath id="p.0"><path d="m0 0l683.29395 0l0 354.27823l-683.29395 0l0 -354.27823z" clip-rule="nonzero"></path></clipPath><g clip-path="url(#p.0)"><path fill="#ffffff" d="m0 0l683.29395 0l0 354.27823l-683.29395 0z" fill-rule="evenodd"></path><path fill="#f05732" d="m40.440945 165.88452l40.440945 0l0 39.653534l-40.440945 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m40.440945 165.88452l40.440945 0l0 39.653534l-40.440945 0z" fill-rule="evenodd"></path><path fill="#ffffff" d="m63.83916 190.27191l1.9218712 -7.671875l1.71875 0l-2.921875 10.03125l-1.3906212 0l-2.4375 -7.609375l-2.375 7.609375l-1.390625 0l-2.90625 -10.03125l1.703125 0l1.96875 7.5l2.34375 -7.5l1.375 0l2.390625 7.671875z" fill-rule="nonzero"></path><path fill="#f05732" d="m40.440945 39.65354l40.440945 0l0 39.65354l-40.440945 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m40.440945 39.65354l40.440945 0l0 39.65354l-40.440945 0z" fill-rule="evenodd"></path><path fill="#ffffff" d="m60.710224 60.02531l2.21875 -3.65625l1.9999962 0l-3.2812462 4.953125l3.3906212 5.078129l-1.9843712 0l-2.328125 -3.7500038l-2.3125 3.7500038l-2.0 0l3.390625 -5.078129l-3.28125 -4.953125l1.984375 0l2.203125 3.65625z" fill-rule="nonzero"></path><path fill="#f05732" d="m40.440945 273.86353l40.440945 0l0 39.653534l-40.440945 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m40.440945 273.86353l40.440945 0l0 39.653534l-40.440945 0z" fill-rule="evenodd"></path><path fill="#ffffff" d="m65.224884 295.70404q0 2.296875 -1.0625 3.703125q-1.0468712 1.390625 -2.8281212 1.390625q-1.90625 0 -2.953125 
-1.34375l-0.078125 1.15625l-1.578125 0l0 -14.25l1.71875 0l0 5.3125q1.03125 -1.28125 2.859375 -1.28125q1.828125 0 2.8749962 1.390625q1.046875 1.375 1.046875 3.765625l0 0.15625zm-1.7187462 -0.203125q0 -1.75 -0.6875 -2.703125q-0.671875 -0.953125 -1.9375 -0.953125q-1.703125 0 -2.4375 1.578125l0 4.34375q0.78125 1.578125 2.453125 1.578125q1.234375 0 1.921875 -0.953125q0.6875 -0.96875 0.6875 -2.890625z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m80.88189 59.480316l88.472435 0.40944672" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m80.88189 59.480316l88.472435 0.40944672" fill-rule="evenodd"></path><path fill="#942174" d="m441.7979 180.84514l0 0c0 -13.637512 11.668793 -24.692902 26.062988 -24.692902l0 0c6.912323 0 13.541565 2.6015625 18.429321 7.232376c4.8877563 4.630829 7.633667 10.91156 7.633667 17.460526l0 0c0 13.637527 -11.668793 24.692917 -26.062988 24.692917l0 0c-14.394196 0 -26.062988 -11.055389 -26.062988 -24.692917z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.7979 180.84514l0 0c0 -13.637512 11.668793 -24.692902 26.062988 -24.692902l0 0c6.912323 0 13.541565 2.6015625 18.429321 7.232376c4.8877563 4.630829 7.633667 10.91156 7.633667 17.460526l0 0c0 13.637527 -11.668793 24.692917 -26.062988 24.692917l0 0c-14.394196 0 -26.062988 -11.055389 -26.062988 -24.692917z" fill-rule="evenodd"></path><path fill="#ffffff" d="m469.13336 180.14827l5.359375 0l0 2.3125l-5.359375 0l0 6.0625l-2.453125 0l0 -6.0625l-5.34375 0l0 -2.3125l5.34375 0l0 -5.609375l2.453125 0l0 5.609375z" fill-rule="nonzero"></path><path fill="#942174" d="m214.51443 116.46719l0 0c0 -13.63752 11.668793 -24.692917 26.062988 -24.692917l0 0c6.9123383 0 13.541565 2.6015701 18.429321 7.2323914c4.8877563 4.6308136 7.633667 10.911552 7.633667 17.460526l0 0c0 13.637512 -11.668793 24.692917 -26.062988 24.692917l0 0c-14.394196 0 -26.062988 
-11.055405 -26.062988 -24.692917z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m214.51443 116.46719l0 0c0 -13.63752 11.668793 -24.692917 26.062988 -24.692917l0 0c6.9123383 0 13.541565 2.6015701 18.429321 7.2323914c4.8877563 4.6308136 7.633667 10.911552 7.633667 17.460526l0 0c0 13.637512 -11.668793 24.692917 -26.062988 24.692917l0 0c-14.394196 0 -26.062988 -11.055405 -26.062988 -24.692917z" fill-rule="evenodd"></path><path fill="#ffffff" d="m239.18123 117.447395l-3.96875 -1.1875l0.59375 -2.0l3.984375 1.484375l-0.109375 -4.546875l2.015625 0l-0.140625 4.609375l3.921875 -1.453125l0.609375 2.0l-4.046875 1.203125l2.609375 3.578125l-1.640625 1.234375l-2.453125 -3.796875l-2.375 3.703125l-1.640625 -1.203125l2.640625 -3.625z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m169.34908 59.90026l45.16536 56.566933" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m169.34908 59.90026l41.42166 51.87816" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m209.47995 112.80901l4.1223297 2.5157547l-1.5407867 -4.5769577z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m170.26247 185.31758l44.25197 -68.85039" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m170.26247 185.31758l41.00789 -63.803024" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m212.65984 122.40762l1.0641785 -4.71064l-3.8431396 2.9245224z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m82.2021 185.50656l88.47244 0.40945435" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m82.2021 185.50656l88.47244 0.40945435" fill-rule="evenodd"></path><path fill="#000000" 
fill-opacity="0.0" d="m82.2021 293.48557l130.4567 0.50393677" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m82.2021 293.48557l130.4567 0.50393677" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m396.63254 116.46719l45.165375 64.377945" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m396.63254 116.46719l41.71942 59.466187" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m436.99982 176.88199l3.9584656 2.7664032l-1.2541504 -4.6636505z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m212.66405 294.0105l229.13387 -113.16536" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m212.66405 294.0105l223.75417 -110.50842" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m437.14966 184.98303l3.337494 -3.4905243l-4.8003235 0.528595z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m88.973755 20.128609l27.401573 0l0 33.574802l-27.401573 0z" fill-rule="evenodd"></path><path fill="#35524a" d="m107.942505 47.048607l-8.84375 0l0 -1.234375l4.671875 -5.1875q1.046875 -1.1875 1.4375 -1.921875q0.390625 -0.734375 0.390625 -1.53125q0 -1.046875 -0.640625 -1.71875q-0.640625 -0.6875 -1.703125 -0.6875q-1.28125 0 -2.0 0.734375q-0.703125 0.71875 -0.703125 2.015625l-1.71875 0q0 -1.859375 1.203125 -3.0q1.203125 -1.15625 3.21875 -1.15625q1.875 0 2.96875 0.984375q1.109375 0.984375 1.109375 2.625q0 2.0 -2.546875 4.75l-3.625 3.921875l6.78125 0l0 1.40625z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m88.973755 147.06561l27.401573 0l0 33.574814l-27.401573 0z" fill-rule="evenodd"></path><path fill="#35524a" d="m101.598755 166.39186l1.28125 0q1.21875 -0.015625 1.90625 -0.625q0.703125 
-0.625 0.703125 -1.6875q0 -2.375 -2.359375 -2.375q-1.125 0 -1.796875 0.640625q-0.65625 0.625 -0.65625 1.671875l-1.703125 0q0 -1.59375 1.171875 -2.65625q1.171875 -1.0625 2.984375 -1.0625q1.90625 0 2.984375 1.015625q1.09375 1.0 1.09375 2.796875q0 0.890625 -0.578125 1.71875q-0.5625 0.8125 -1.546875 1.21875q1.109375 0.359375 1.71875 1.1875q0.609375 0.8125 0.609375 1.984375q0 1.8125 -1.1875 2.890625q-1.1875 1.0625 -3.09375 1.0625q-1.90625 0 -3.09375 -1.03125q-1.1875 -1.03125 -1.1875 -2.71875l1.71875 0q0 1.0625 0.6875 1.703125q0.703125 0.640625 1.875 0.640625q1.25 0 1.90625 -0.640625q0.65625 -0.65625 0.65625 -1.875q0 -1.171875 -0.734375 -1.796875q-0.71875 -0.640625 -2.078125 -0.65625l-1.28125 0l0 -1.40625z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m88.973755 256.2021l27.401573 0l0 33.57483l-27.401573 0z" fill-rule="evenodd"></path><path fill="#35524a" d="m104.73938 283.1221l-1.734375 0l0 -11.4375l-3.453125 1.265625l0 -1.5625l4.921875 -1.84375l0.265625 0l0 13.578125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m332.0735 79.30708l27.40158 0l0 33.574806l-27.40158 0z" fill-rule="evenodd"></path><path fill="#35524a" d="m348.93286 92.71146l0 1.453125l-0.3125 0q-2.015625 0.03125 -3.203125 1.1875q-1.1875 1.15625 -1.359375 3.234375q1.0625 -1.21875 2.90625 -1.21875q1.765625 0 2.8125 1.25q1.0625 1.234375 1.0625 3.203125q0 2.09375 -1.140625 3.34375q-1.140625 1.25 -3.046875 1.25q-1.9375 0 -3.15625 -1.484375q-1.203125 -1.5 -1.203125 -3.84375l0 -0.65625q0 -3.734375 1.59375 -5.703125q1.59375 -1.96875 4.734375 -2.015625l0.3125 0zm-2.25 6.078125q-0.890625 0 -1.640625 0.53125q-0.734375 0.53125 -1.015625 1.328125l0 0.640625q0 1.65625 0.75 2.6875q0.75 1.015625 1.875 1.015625q1.15625 0 1.8125 -0.84375q0.671875 -0.859375 0.671875 -2.25q0 -1.390625 -0.671875 -2.25q-0.671875 -0.859375 -1.78125 -0.859375z" fill-rule="nonzero"></path><path fill="#f05732" d="m285.35303 96.64042l40.44095 0l0 39.65354l-40.44095 0z" 
fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m285.35303 96.64042l40.44095 0l0 39.65354l-40.44095 0z" fill-rule="evenodd"></path><path fill="#ffffff" d="m303.85666 121.98094l5.875 0l0 1.40625l-7.96875 0l0 -1.265625l5.546875 -7.359375l-5.453125 0l0 -1.40625l7.59375 0l0 1.203125l-5.59375 7.421875z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m266.6404 116.46719l18.70868 0" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m266.6404 116.46719l12.708679 0" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m279.3491 118.11893l4.538086 -1.6517334l-4.538086 -1.6517334z" fill-rule="evenodd"></path><path fill="#f05732" d="m512.63257 161.01837l40.440918 0l0 39.65355l-40.440918 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m512.63257 161.01837l40.440918 0l0 39.65355l-40.440918 0z" fill-rule="evenodd"></path><path fill="#ffffff" d="m533.0176 185.24951l2.34375 -7.515625l1.84375 0l-4.046875 11.578125q-0.9375 2.5 -2.96875 2.5l-0.328125 -0.015625l-0.640625 -0.125l0 -1.390625l0.46875 0.03125q0.859375 0 1.34375 -0.359375q0.5 -0.34375 0.8125 -1.28125l0.375 -1.015625l-3.578125 -9.921875l1.875 0l2.5 7.515625z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m493.9239 180.84514l18.70868 0" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m493.9239 180.84514l12.708649 0" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m506.63254 182.49687l4.5381165 -1.6517334l-4.5381165 -1.6517334z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m560.9816 141.16011l27.40155 0l0 33.5748l-27.40155 0z" fill-rule="evenodd"></path><path 
fill="#35524a" d="m579.8254 155.53323l-5.59375 12.546875l-1.796875 0l5.578125 -12.09375l-7.3125 0l0 -1.421875l9.125 0l0 0.96875z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m553.0735 180.64043l88.47247 0.4094391" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m553.0735 180.64043l88.47247 0.4094391" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m325.80054 116.26247l71.68503 0.37795258" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m325.80054 116.26247l71.68503 0.37795258" fill-rule="evenodd"></path></g></svg>
4 | 
5 | 


--------------------------------------------------------------------------------
/day_1/figures/intro_to_pytorch/pytorch_variable.svg:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" standalone="yes"?>
2 | 
3 | <svg version="1.1" viewBox="0.0 0.0 456.90813648293965 299.8897637795276" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><clipPath id="p.0"><path d="m0 0l456.90814 0l0 299.88977l-456.90814 0l0 -299.88977z" clip-rule="nonzero"></path></clipPath><g clip-path="url(#p.0)"><path fill="#ffffff" d="m0 0l456.90814 0l0 299.88977l-456.90814 0z" fill-rule="evenodd"></path><path fill="#f05732" d="m119.03674 55.07874l218.83466 0l0 181.88977l-218.83466 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m119.03674 55.07874l218.83466 0l0 181.88977l-218.83466 0z" fill-rule="evenodd"></path><path fill="#ffffff" d="m200.55896 79.639366l3.859375 -11.15625l1.9375 0l-5.0 13.515625l-1.578125 0l-5.0 -13.515625l1.9375 0l3.84375 11.15625zm13.3228 2.359375q-0.15625 -0.296875 -0.25 -1.0625q-1.1875 1.25 -2.859375 1.25q-1.484375 0 -2.4375 -0.84375q-0.9375 -0.84375 -0.9375 -2.125q0 -1.5625 1.1875 -2.4375q1.1875 -0.875 3.359375 -0.875l1.671875 0l0 -0.78125q0 -0.90625 -0.546875 -1.4375q-0.53125 -0.53125 -1.578125 -0.53125q-0.921875 0 -1.546875 0.46875q-0.625 0.46875 -0.625 1.125l-1.71875 0q0 -0.75 0.53125 -1.453125q0.53125 -0.703125 1.4375 -1.109375q0.921875 -0.40625 2.015625 -0.40625q1.734375 0 2.71875 0.875q0.984375 0.859375 1.015625 2.375l0 4.625q0 1.375 0.359375 2.1875l0 0.15625l-1.796875 0zm-2.859375 -1.3125q0.8125 0 1.53125 -0.40625q0.734375 -0.421875 1.0625 -1.09375l0 -2.0625l-1.359375 0q-3.140625 0 -3.140625 1.84375q0 0.8125 0.53125 1.265625q0.53125 0.453125 1.375 0.453125zm11.663605 -7.1875q-0.390625 -0.0625 -0.84375 -0.0625q-1.6875 0 -2.296875 1.4375l0 7.125l-1.71875 0l0 -10.03125l1.671875 0l0.03125 1.15625q0.84375 -1.34375 2.390625 -1.34375q0.5 0 0.765625 0.125l0 1.59375zm3.3128815 8.5l-1.703125 0l0 -10.03125l1.703125 0l0 10.03125zm-1.84375 -12.703125q0 -0.421875 0.25 -0.703125q0.25 -0.28125 0.75 
-0.28125q0.515625 0 0.765625 0.28125q0.265625 0.28125 0.265625 0.703125q0 0.421875 -0.265625 0.703125q-0.25 0.265625 -0.765625 0.265625q-0.5 0 -0.75 -0.265625q-0.25 -0.28125 -0.25 -0.703125zm10.71492 12.703125q-0.15625 -0.296875 -0.25 -1.0625q-1.1875 1.25 -2.859375 1.25q-1.484375 0 -2.4375 -0.84375q-0.9375 -0.84375 -0.9375 -2.125q0 -1.5625 1.1875 -2.4375q1.1875 -0.875 3.359375 -0.875l1.671875 0l0 -0.78125q0 -0.90625 -0.546875 -1.4375q-0.53125 -0.53125 -1.578125 -0.53125q-0.921875 0 -1.546875 0.46875q-0.625 0.46875 -0.625 1.125l-1.71875 0q0 -0.75 0.53125 -1.453125q0.53125 -0.703125 1.4375 -1.109375q0.921875 -0.40625 2.015625 -0.40625q1.734375 0 2.71875 0.875q0.984375 0.859375 1.015625 2.375l0 4.625q0 1.375 0.359375 2.1875l0 0.15625l-1.796875 0zm-2.859375 -1.3125q0.8125 0 1.53125 -0.40625q0.734375 -0.421875 1.0625 -1.09375l0 -2.0625l-1.359375 0q-3.140625 0 -3.140625 1.84375q0 0.8125 0.53125 1.265625q0.53125 0.453125 1.375 0.453125zm15.3042145 -3.59375q0 2.296875 -1.0624847 3.703125q-1.046875 1.390625 -2.828125 1.390625q-1.90625 0 -2.953125 -1.34375l-0.078125 1.15625l-1.578125 0l0 -14.25l1.71875 0l0 5.3125q1.03125 -1.28125 2.859375 -1.28125q1.828125 0 2.875 1.390625q1.0468597 1.375 1.0468597 3.765625l0 0.15625zm-1.7187347 -0.203125q0 -1.75 -0.6875 -2.703125q-0.671875 -0.953125 -1.9375 -0.953125q-1.703125 0 -2.4375 1.578125l0 4.34375q0.78125 1.578125 2.453125 1.578125q1.234375 0 1.921875 -0.953125q0.6875 -0.96875 0.6875 -2.890625zm5.544937 5.109375l-1.703125 0l0 -14.25l1.703125 0l0 14.25zm6.839905 0.1875q-2.046875 0 -3.328125 -1.34375q-1.28125 -1.34375 -1.28125 -3.578125l0 -0.328125q0 -1.484375 0.5625 -2.65625q0.578125 -1.171875 1.59375 -1.828125q1.03125 -0.671875 2.234375 -0.671875q1.953125 0 3.03125 1.296875q1.09375 1.28125 1.09375 3.6875l0 0.703125l-6.796875 0q0.03125 1.484375 0.859375 2.40625q0.84375 0.90625 2.125 0.90625q0.90625 0 1.53125 -0.359375q0.640625 -0.375 1.109375 -0.984375l1.046875 0.8125q-1.265625 1.9375 -3.78125 1.9375zm-0.21875 -9.0q-1.03125 0 -1.75 
0.765625q-0.703125 0.75 -0.859375 2.109375l5.015625 0l0 -0.125q-0.0625 -1.3125 -0.703125 -2.03125q-0.625 -0.71875 -1.703125 -0.71875z" fill-rule="nonzero"></path><path fill="#942174" d="m186.67998 89.8771l83.52757 0l0 38.677162l-83.52757 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m186.67998 89.8771l83.52757 0l0 38.677162l-83.52757 0z" fill-rule="evenodd"></path><path fill="#ffffff" d="m210.86292 111.026306q0 -2.296875 1.09375 -3.703125q1.109375 -1.40625 2.875 -1.40625q1.765625 0 2.796875 1.203125l0 -5.234375l1.71875 0l0 14.25l-1.578125 0l-0.09375 -1.078125q-1.03125 1.265625 -2.859375 1.265625q-1.75 0 -2.859375 -1.421875q-1.09375 -1.4375 -1.09375 -3.734375l0 -0.140625zm1.71875 0.203125q0 1.703125 0.703125 2.671875q0.71875 0.96875 1.953125 0.96875q1.640625 0 2.390625 -1.46875l0 -4.609375q-0.78125 -1.421875 -2.375 -1.421875q-1.25 0 -1.96875 0.984375q-0.703125 0.96875 -0.703125 2.875zm15.427719 4.90625q-0.15625 -0.296875 -0.25 -1.0625q-1.1875 1.25 -2.859375 1.25q-1.484375 0 -2.4375 -0.84375q-0.9375 -0.84375 -0.9375 -2.125q0 -1.5625 1.1875 -2.4375q1.1875 -0.875 3.359375 -0.875l1.671875 0l0 -0.78125q0 -0.90625 -0.546875 -1.4375q-0.53125 -0.53125 -1.578125 -0.53125q-0.921875 0 -1.546875 0.46875q-0.625 0.46875 -0.625 1.125l-1.71875 0q0 -0.75 0.53125 -1.453125q0.53125 -0.703125 1.4375 -1.109375q0.921875 -0.40625 2.015625 -0.40625q1.734375 0 2.71875 0.875q0.984375 0.859375 1.015625 2.375l0 4.625q0 1.375 0.359375 2.1875l0 0.15625l-1.796875 0zm-2.859375 -1.3125q0.8125 0 1.53125 -0.40625q0.734375 -0.421875 1.0625 -1.09375l0 -2.0625l-1.359375 0q-3.140625 0 -3.140625 1.84375q0 0.8125 0.53125 1.265625q0.53125 0.453125 1.375 0.453125zm9.132355 -11.15625l0 2.4375l1.875 0l0 1.3125l-1.875 0l0 6.234375q0 0.59375 0.25 0.90625q0.25 0.296875 0.859375 0.296875q0.296875 0 0.8125 -0.109375l0 1.390625q-0.671875 0.1875 -1.3125 0.1875q-1.15625 0 -1.75 -0.6875q-0.578125 -0.703125 -0.578125 -1.984375l0 
-6.234375l-1.828125 0l0 -1.3125l1.828125 0l0 -2.4375l1.71875 0zm9.969254 12.46875q-0.15625 -0.296875 -0.25 -1.0625q-1.1875 1.25 -2.859375 1.25q-1.484375 0 -2.4375 -0.84375q-0.9375 -0.84375 -0.9375 -2.125q0 -1.5625 1.1875 -2.4375q1.1875 -0.875 3.359375 -0.875l1.671875 0l0 -0.78125q0 -0.90625 -0.546875 -1.4375q-0.53125 -0.53125 -1.578125 -0.53125q-0.921875 0 -1.546875 0.46875q-0.625 0.46875 -0.625 1.125l-1.71875 0q0 -0.75 0.53125 -1.453125q0.53125 -0.703125 1.4375 -1.109375q0.921875 -0.40625 2.015625 -0.40625q1.734375 0 2.71875 0.875q0.984375 0.859375 1.015625 2.375l0 4.625q0 1.375 0.359375 2.1875l0 0.15625l-1.796875 0zm-2.859375 -1.3125q0.8125 0 1.53125 -0.40625q0.734375 -0.421875 1.0625 -1.09375l0 -2.0625l-1.359375 0q-3.140625 0 -3.140625 1.84375q0 0.8125 0.53125 1.265625q0.53125 0.453125 1.375 0.453125z" fill-rule="nonzero"></path><path fill="#942174" d="m186.67998 136.97548l83.52757 0l0 38.67717l-83.52757 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m186.67998 136.97548l83.52757 0l0 38.67717l-83.52757 0z" fill-rule="evenodd"></path><path fill="#ffffff" d="m210.60982 158.12468q0 -2.34375 1.078125 -3.71875q1.09375 -1.390625 2.890625 -1.390625q1.828125 0 2.859375 1.296875l0.078125 -1.109375l1.578125 0l0 9.796875q0 1.9375 -1.15625 3.0625q-1.15625 1.125 -3.109375 1.125q-1.078125 0 -2.125 -0.46875q-1.03125 -0.453125 -1.578125 -1.265625l0.890625 -1.03125q1.09375 1.359375 2.6875 1.359375q1.265625 0 1.953125 -0.703125q0.703125 -0.703125 0.703125 -1.984375l0 -0.859375q-1.03125 1.1875 -2.8125 1.1875q-1.75 0 -2.84375 -1.421875q-1.09375 -1.421875 -1.09375 -3.875zm1.71875 0.203125q0 1.703125 0.703125 2.671875q0.703125 0.96875 1.953125 0.96875q1.625 0 2.375 -1.484375l0 -4.578125q-0.78125 -1.4375 -2.359375 -1.4375q-1.25 0 -1.96875 0.984375q-0.703125 0.96875 -0.703125 2.875zm14.013687 -3.59375q-0.390625 -0.0625 -0.84375 -0.0625q-1.6875 0 -2.296875 1.4375l0 7.125l-1.71875 0l0 -10.03125l1.671875 0l0.03125 
1.15625q0.84375 -1.34375 2.390625 -1.34375q0.5 0 0.765625 0.125l0 1.59375zm7.6566315 8.5q-0.15625 -0.296875 -0.25 -1.0625q-1.1875 1.25 -2.859375 1.25q-1.484375 0 -2.4375 -0.84375q-0.9375 -0.84375 -0.9375 -2.125q0 -1.5625 1.1875 -2.4375q1.1875 -0.875 3.359375 -0.875l1.671875 0l0 -0.78125q0 -0.90625 -0.546875 -1.4375q-0.53125 -0.53125 -1.578125 -0.53125q-0.921875 0 -1.546875 0.46875q-0.625 0.46875 -0.625 1.125l-1.71875 0q0 -0.75 0.53125 -1.453125q0.53125 -0.703125 1.4375 -1.109375q0.921875 -0.40625 2.015625 -0.40625q1.734375 0 2.71875 0.875q0.984375 0.859375 1.015625 2.375l0 4.625q0 1.375 0.359375 2.1875l0 0.15625l-1.796875 0zm-2.859375 -1.3125q0.8125 0 1.53125 -0.40625q0.734375 -0.421875 1.0625 -1.09375l0 -2.0625l-1.359375 0q-3.140625 0 -3.140625 1.84375q0 0.8125 0.53125 1.265625q0.53125 0.453125 1.375 0.453125zm6.3823547 -3.796875q0 -2.296875 1.09375 -3.703125q1.109375 -1.40625 2.875 -1.40625q1.765625 0 2.796875 1.203125l0 -5.234375l1.71875 0l0 14.25l-1.578125 0l-0.09375 -1.078125q-1.03125 1.265625 -2.859375 1.265625q-1.75 0 -2.859375 -1.421875q-1.09375 -1.4375 -1.09375 -3.734375l0 -0.140625zm1.71875 0.203125q0 1.703125 0.703125 2.671875q0.71875 0.96875 1.953125 0.96875q1.640625 0 2.390625 -1.46875l0 -4.609375q-0.78125 -1.421875 -2.375 -1.421875q-1.25 0 -1.96875 0.984375q-0.703125 0.96875 -0.703125 2.875z" fill-rule="nonzero"></path><path fill="#942174" d="m186.67998 184.07385l83.52757 0l0 38.67717l-83.52757 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m186.67998 184.07385l83.52757 0l0 38.67717l-83.52757 0z" fill-rule="evenodd"></path><path fill="#ffffff" d="m204.19334 209.1137q0.921875 0 1.59375 -0.546875q0.6875 -0.5625 0.765625 -1.390625l1.625 0q-0.046875 0.859375 -0.59375 1.640625q-0.546875 0.765625 -1.46875 1.234375q-0.90625 0.46875 -1.921875 0.46875q-2.046875 0 -3.265625 -1.359375q-1.203125 -1.375 -1.203125 -3.75l0 -0.296875q0 -1.46875 0.53125 -2.609375q0.546875 -1.140625 1.546875 
-1.765625q1.0 -0.625 2.375 -0.625q1.6875 0 2.796875 1.015625q1.125 1.0 1.203125 2.609375l-1.625 0q-0.078125 -0.96875 -0.734375 -1.59375q-0.65625 -0.625 -1.640625 -0.625q-1.296875 0 -2.03125 0.9375q-0.71875 0.9375 -0.71875 2.71875l0 0.328125q0 1.734375 0.71875 2.671875q0.71875 0.9375 2.046875 0.9375zm10.593506 -7.28125q-0.390625 -0.0625 -0.84375 -0.0625q-1.6875 0 -2.296875 1.4375l0 7.125l-1.71875 0l0 -10.03125l1.671875 0l0.03125 1.15625q0.84375 -1.34375 2.390625 -1.34375q0.5 0 0.765625 0.125l0 1.59375zm5.6253815 8.6875q-2.046875 0 -3.328125 -1.34375q-1.28125 -1.34375 -1.28125 -3.578125l0 -0.328125q0 -1.484375 0.5625 -2.65625q0.578125 -1.171875 1.59375 -1.828125q1.03125 -0.671875 2.234375 -0.671875q1.953125 0 3.03125 1.296875q1.09375 1.28125 1.09375 3.6875l0 0.703125l-6.796875 0q0.03125 1.484375 0.859375 2.40625q0.84375 0.90625 2.125 0.90625q0.90625 0 1.53125 -0.359375q0.640625 -0.375 1.109375 -0.984375l1.046875 0.8125q-1.265625 1.9375 -3.78125 1.9375zm-0.21875 -9.0q-1.03125 0 -1.75 0.765625q-0.703125 0.75 -0.859375 2.109375l5.015625 0l0 -0.125q-0.0625 -1.3125 -0.703125 -2.03125q-0.625 -0.71875 -1.703125 -0.71875zm12.133804 8.8125q-0.15625 -0.296875 -0.25 -1.0625q-1.1875 1.25 -2.859375 1.25q-1.484375 0 -2.4375 -0.84375q-0.9375 -0.84375 -0.9375 -2.125q0 -1.5625 1.1875 -2.4375q1.1875 -0.875 3.359375 -0.875l1.671875 0l0 -0.78125q0 -0.90625 -0.546875 -1.4375q-0.53125 -0.53125 -1.578125 -0.53125q-0.921875 0 -1.546875 0.46875q-0.625 0.46875 -0.625 1.125l-1.71875 0q0 -0.75 0.53125 -1.453125q0.53125 -0.703125 1.4375 -1.109375q0.921875 -0.40625 2.015625 -0.40625q1.734375 0 2.71875 0.875q0.984375 0.859375 1.015625 2.375l0 4.625q0 1.375 0.359375 2.1875l0 0.15625l-1.796875 0zm-2.859375 -1.3125q0.8125 0 1.53125 -0.40625q0.734375 -0.421875 1.0625 -1.09375l0 -2.0625l-1.359375 0q-3.140625 0 -3.140625 1.84375q0 0.8125 0.53125 1.265625q0.53125 0.453125 1.375 0.453125zm9.132355 -11.15625l0 2.4375l1.875 0l0 1.3125l-1.875 0l0 6.234375q0 0.59375 0.25 0.90625q0.25 0.296875 0.859375 
0.296875q0.296875 0 0.8125 -0.109375l0 1.390625q-0.671875 0.1875 -1.3125 0.1875q-1.15625 0 -1.75 -0.6875q-0.578125 -0.703125 -0.578125 -1.984375l0 -6.234375l-1.828125 0l0 -1.3125l1.828125 0l0 -2.4375l1.71875 0zm3.3130035 7.359375q0 -1.484375 0.578125 -2.65625q0.578125 -1.171875 1.609375 -1.8125q1.046875 -0.640625 2.375 -0.640625q2.046875 0 3.3125 1.421875q1.265625 1.40625 1.265625 3.765625l0 0.125q0 1.46875 -0.5625 2.640625q-0.5625 1.15625 -1.609375 1.8125q-1.046875 0.640625 -2.390625 0.640625q-2.046875 0 -3.3125 -1.421875q-1.265625 -1.421875 -1.265625 -3.75l0 -0.125zm1.71875 0.203125q0 1.671875 0.78125 2.6875q0.78125 1.0 2.078125 1.0q1.296875 0 2.0625 -1.015625q0.78125 -1.03125 0.78125 -2.875q0 -1.65625 -0.78125 -2.671875q-0.78125 -1.03125 -2.078125 -1.03125q-1.28125 0 -2.0625 1.015625q-0.78125 1.0 -0.78125 2.890625zm14.233643 -3.59375q-0.390625 -0.0625 -0.84375 -0.0625q-1.6875 0 -2.296875 1.4375l0 7.125l-1.71875 0l0 -10.03125l1.671875 0l0.03125 1.15625q0.84375 -1.34375 2.390625 -1.34375q0.5 0 0.765625 0.125l0 1.59375z" fill-rule="nonzero"></path></g></svg>
4 | 
5 | 


--------------------------------------------------------------------------------
/day_1/0_Using_Pretrained_Embeddings.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from annoy import AnnoyIndex\n",
 10 |     "import numpy as np\n",
 11 |     "import torch\n",
 12 |     "from tqdm import tqdm_notebook\n",
 13 |     "from argparse import Namespace"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": 2,
 19 |    "metadata": {},
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "args = Namespace(\n",
 23 |     "    glove_filename='../data/glove.6B.100d.txt'\n",
 24 |     ")"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": 3,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "def load_word_vectors(filename):\n",
 34 |     "    '''Read a GloVe-style text file: one `word v1 v2 ...` entry per line.\n",
 35 |     "\n",
 36 |     "    Returns (word_to_index, word_vectors): a dict of word -> int and a list of numpy arrays.\n",
 37 |     "    '''\n",
 38 |     "    word_to_index, word_vectors = {}, []\n",
 39 |     "    # Vector files such as GloVe are UTF-8; do not rely on the platform default encoding.\n",
 40 |     "    with open(filename, encoding='utf-8') as fp:\n",
 41 |     "        for line in tqdm_notebook(fp.readlines(), leave=False):\n",
 42 |     "            line = line.rstrip().split(\" \")\n",
 43 |     "            if len(line) < 2:  # blank or vector-less line: nothing to load\n",
 44 |     "                continue\n",
 45 |     "            word_to_index[line[0]] = len(word_to_index)\n",
 46 |     "            word_vectors.append(np.array([float(x) for x in line[1:]]))\n",
 47 |     "    return word_to_index, word_vectors"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 4,
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "class PreTrainedEmbeddings(object):\n",
 57 |     "    '''Pretrained word vectors plus an Annoy index for nearest-neighbor lookup.'''\n",
 58 |     "    def __init__(self, glove_filename):\n",
 59 |     "        self.word_to_index, self.word_vectors = load_word_vectors(glove_filename)\n",
 60 |     "        self.word_vector_size = len(self.word_vectors[0])\n",
 61 |     "        self.index_to_word = {v: k for k, v in self.word_to_index.items()}\n",
 62 |     "        self.index = AnnoyIndex(self.word_vector_size, metric='euclidean')\n",
 63 |     "        print('Building Index')\n",
 64 |     "        for i in tqdm_notebook(self.word_to_index.values(), leave=False):\n",
 65 |     "            self.index.add_item(i, self.word_vectors[i])\n",
 66 |     "        self.index.build(50)\n",
 67 |     "        print('Finished!')\n",
 68 |     "\n",
 69 |     "    def get_embedding(self, word):\n",
 70 |     "        # Plain dict lookup, so an unknown word raises KeyError.\n",
 71 |     "        return self.word_vectors[self.word_to_index[word]]\n",
 72 |     "\n",
 73 |     "    def closest(self, word, n=1):\n",
 74 |     "        '''Words for the n index entries nearest to the embedding of `word`.'''\n",
 75 |     "        return self.closest_v(self.get_embedding(word), n)\n",
 76 |     "\n",
 77 |     "    def closest_v(self, vector, n=1):\n",
 78 |     "        '''Words for the n index entries nearest to `vector`.'''\n",
 79 |     "        return [self.index_to_word[i] for i in self.index.get_nns_by_vector(vector, n)]\n",
 80 |     "\n",
 81 |     "    def sim(self, w1, w2):\n",
 82 |     "        return np.dot(self.get_embedding(w1), self.get_embedding(w2))  # raw dot product, not cosine"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": 5,
 88 |    "metadata": {},
 89 |    "outputs": [
 90 |     {
 91 |      "data": {
 92 |       "application/vnd.jupyter.widget-view+json": {
 93 |        "model_id": "",
 94 |        "version_major": 2,
 95 |        "version_minor": 0
 96 |       },
 97 |       "text/plain": [
 98 |        "HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))"
 99 |       ]
100 |      },
101 |      "metadata": {},
102 |      "output_type": "display_data"
103 |     },
104 |     {
105 |      "name": "stdout",
106 |      "output_type": "stream",
107 |      "text": [
108 |       "\r",
109 |       "Building Index\n"
110 |      ]
111 |     },
112 |     {
113 |      "data": {
114 |       "application/vnd.jupyter.widget-view+json": {
115 |        "model_id": "",
116 |        "version_major": 2,
117 |        "version_minor": 0
118 |       },
119 |       "text/plain": [
120 |        "HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))"
121 |       ]
122 |      },
123 |      "metadata": {},
124 |      "output_type": "display_data"
125 |     },
126 |     {
127 |      "name": "stdout",
128 |      "output_type": "stream",
129 |      "text": [
130 |       "Finished!\n"
131 |      ]
132 |     }
133 |    ],
134 |    "source": [
135 |     "glove = PreTrainedEmbeddings(args.glove_filename)"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 6,
141 |    "metadata": {
142 |     "scrolled": true
143 |    },
144 |    "outputs": [
145 |     {
146 |      "data": {
147 |       "text/plain": [
148 |        "['apple', 'microsoft', 'dell', 'pc', 'compaq']"
149 |       ]
150 |      },
151 |      "execution_count": 6,
152 |      "metadata": {},
153 |      "output_type": "execute_result"
154 |     }
155 |    ],
156 |    "source": [
157 |     "glove.closest('apple', n=5)"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 7,
163 |    "metadata": {
164 |     "scrolled": true
165 |    },
166 |    "outputs": [
167 |     {
168 |      "data": {
169 |       "text/plain": [
170 |        "['plane', 'airplane', 'jet', 'flight', 'crashed']"
171 |       ]
172 |      },
173 |      "execution_count": 7,
174 |      "metadata": {},
175 |      "output_type": "execute_result"
176 |     }
177 |    ],
178 |    "source": [
179 |     "glove.closest('plane', n=5)"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 8,
185 |    "metadata": {},
186 |    "outputs": [
187 |     {
188 |      "data": {
189 |       "text/plain": [
190 |        "(26.873448266652, 16.501491855324)"
191 |       ]
192 |      },
193 |      "execution_count": 8,
194 |      "metadata": {},
195 |      "output_type": "execute_result"
196 |     }
197 |    ],
198 |    "source": [
199 |     "glove.sim('beer', 'wine'), glove.sim('beer', 'gasoline')"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "markdown",
204 |    "metadata": {},
205 |    "source": [
206 |     "**Lexical relationships uncovered by word embeddings**"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": 9,
212 |    "metadata": {},
213 |    "outputs": [],
214 |    "source": [
215 |     "def SAT_analogy(w1, w2, w3):\n",
216 |     "    '''\n",
217 |     "    Solves problems of the type  w1 : w2 :: w3 : __  by printing the word\n",
218 |     "    nearest to w3 + (w2 - w1) that is not one of the inputs; prints :-(\n",
219 |     "    when an input is out of vocabulary or no such candidate is found.\n",
220 |     "    '''\n",
221 |     "    closest_words = []\n",
222 |     "    try:\n",
223 |     "        w1v = glove.get_embedding(w1)\n",
224 |     "        w2v = glove.get_embedding(w2)\n",
225 |     "        w3v = glove.get_embedding(w3)\n",
226 |     "        w4v = w3v + (w2v - w1v)\n",
227 |     "        closest_words = glove.closest_v(w4v, n=5)\n",
228 |     "        closest_words = [w for w in closest_words if w not in [w1, w2, w3]]\n",
229 |     "    except KeyError:  # out-of-vocabulary word; a bare except would hide every other error\n",
230 |     "        pass\n",
231 |     "    if len(closest_words) == 0:\n",
232 |     "        print(':-(')\n",
233 |     "    else:\n",
234 |     "        print('{} : {} :: {} : {}'.format(w1, w2, w3, closest_words[0]))"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "markdown",
239 |    "metadata": {},
240 |    "source": [
241 |     "**Pronouns**"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": 10,
247 |    "metadata": {},
248 |    "outputs": [
249 |     {
250 |      "name": "stdout",
251 |      "output_type": "stream",
252 |      "text": [
253 |       "man : he :: woman : she\n"
254 |      ]
255 |     }
256 |    ],
257 |    "source": [
258 |     "SAT_analogy('man', 'he', 'woman')"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "markdown",
263 |    "metadata": {},
264 |    "source": [
265 |     "**Verb-Noun relationships**"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": 11,
271 |    "metadata": {},
272 |    "outputs": [
273 |     {
274 |      "name": "stdout",
275 |      "output_type": "stream",
276 |      "text": [
277 |       "fly : plane :: sail : ship\n"
278 |      ]
279 |     }
280 |    ],
281 |    "source": [
282 |     "SAT_analogy('fly', 'plane', 'sail')"
283 |    ]
284 |   },
285 |   {
286 |    "cell_type": "markdown",
287 |    "metadata": {},
288 |    "source": [
289 |     "**Noun-Noun relationships**"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "code",
294 |    "execution_count": 12,
295 |    "metadata": {},
296 |    "outputs": [
297 |     {
298 |      "name": "stdout",
299 |      "output_type": "stream",
300 |      "text": [
301 |       "cat : kitten :: dog : pug\n"
302 |      ]
303 |     }
304 |    ],
305 |    "source": [
306 |     "SAT_analogy('cat', 'kitten', 'dog')"
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "code",
311 |    "execution_count": 13,
312 |    "metadata": {},
313 |    "outputs": [
314 |     {
315 |      "name": "stdout",
316 |      "output_type": "stream",
317 |      "text": [
318 |       "human : baby :: dog : puppy\n"
319 |      ]
320 |     }
321 |    ],
322 |    "source": [
323 |     "SAT_analogy('human', 'baby', 'dog')"
324 |    ]
325 |   },
326 |   {
327 |    "cell_type": "code",
328 |    "execution_count": 14,
329 |    "metadata": {},
330 |    "outputs": [
331 |     {
332 |      "name": "stdout",
333 |      "output_type": "stream",
334 |      "text": [
335 |       "human : babies :: dog : puppies\n"
336 |      ]
337 |     }
338 |    ],
339 |    "source": [
340 |     "SAT_analogy('human', 'babies', 'dog')"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "markdown",
345 |    "metadata": {},
346 |    "source": [
347 |     "**Hypernymy**"
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "code",
352 |    "execution_count": 15,
353 |    "metadata": {},
354 |    "outputs": [
355 |     {
356 |      "name": "stdout",
357 |      "output_type": "stream",
358 |      "text": [
359 |       "blue : color :: dog : animal\n"
360 |      ]
361 |     }
362 |    ],
363 |    "source": [
364 |     "SAT_analogy('blue', 'color', 'dog')"
365 |    ]
366 |   },
367 |   {
368 |    "cell_type": "markdown",
369 |    "metadata": {
370 |     "collapsed": true
371 |    },
372 |    "source": [
373 |     "**Meronymy**"
374 |    ]
375 |   },
376 |   {
377 |    "cell_type": "code",
378 |    "execution_count": 16,
379 |    "metadata": {},
380 |    "outputs": [
381 |     {
382 |      "name": "stdout",
383 |      "output_type": "stream",
384 |      "text": [
385 |       "leg : legs :: hand : hands\n"
386 |      ]
387 |     }
388 |    ],
389 |    "source": [
390 |     "SAT_analogy('leg', 'legs', 'hand')"
391 |    ]
392 |   },
393 |   {
394 |    "cell_type": "markdown",
395 |    "metadata": {},
396 |    "source": [
397 |     "**Troponymy**"
398 |    ]
399 |   },
400 |   {
401 |    "cell_type": "code",
402 |    "execution_count": 17,
403 |    "metadata": {},
404 |    "outputs": [
405 |     {
406 |      "name": "stdout",
407 |      "output_type": "stream",
408 |      "text": [
409 |       "talk : communicate :: read : correctly\n"
410 |      ]
411 |     }
412 |    ],
413 |    "source": [
414 |     "SAT_analogy('talk', 'communicate', 'read')"
415 |    ]
416 |   },
417 |   {
418 |    "cell_type": "markdown",
419 |    "metadata": {},
420 |    "source": [
421 |     "**Metonymy**"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "code",
426 |    "execution_count": 18,
427 |    "metadata": {},
428 |    "outputs": [
429 |     {
430 |      "name": "stdout",
431 |      "output_type": "stream",
432 |      "text": [
433 |       "blue : democrat :: red : republican\n"
434 |      ]
435 |     }
436 |    ],
437 |    "source": [
438 |     "SAT_analogy('blue', 'democrat', 'red')"
439 |    ]
440 |   },
441 |   {
442 |    "cell_type": "markdown",
443 |    "metadata": {},
444 |    "source": [
445 |     "**Misc**"
446 |    ]
447 |   },
448 |   {
449 |    "cell_type": "code",
450 |    "execution_count": 19,
451 |    "metadata": {},
452 |    "outputs": [
453 |     {
454 |      "name": "stdout",
455 |      "output_type": "stream",
456 |      "text": [
457 |       "man : doctor :: woman : nurse\n"
458 |      ]
459 |     }
460 |    ],
461 |    "source": [
462 |     "SAT_analogy('man', 'doctor', 'woman')"
463 |    ]
464 |   },
465 |   {
466 |    "cell_type": "code",
467 |    "execution_count": 20,
468 |    "metadata": {},
469 |    "outputs": [
470 |     {
471 |      "name": "stdout",
472 |      "output_type": "stream",
473 |      "text": [
474 |       "man : leader :: woman : opposition\n"
475 |      ]
476 |     }
477 |    ],
478 |    "source": [
479 |     "SAT_analogy('man', 'leader', 'woman')"
480 |    ]
481 |   },
482 |   {
483 |    "cell_type": "code",
484 |    "execution_count": null,
485 |    "metadata": {},
486 |    "outputs": [],
487 |    "source": []
488 |   },
489 |   {
490 |    "cell_type": "code",
491 |    "execution_count": null,
492 |    "metadata": {},
493 |    "outputs": [],
494 |    "source": []
495 |   }
496 |  ],
497 |  "metadata": {
498 |   "kernelspec": {
499 |    "display_name": "pytorch04",
500 |    "language": "python",
501 |    "name": "pytorch04"
502 |   },
503 |   "language_info": {
504 |    "codemirror_mode": {
505 |     "name": "ipython",
506 |     "version": 3
507 |    },
508 |    "file_extension": ".py",
509 |    "mimetype": "text/x-python",
510 |    "name": "python",
511 |    "nbconvert_exporter": "python",
512 |    "pygments_lexer": "ipython3",
513 |    "version": "3.6.6"
514 |   }
515 |  },
516 |  "nbformat": 4,
517 |  "nbformat_minor": 2
518 | }
519 | 


--------------------------------------------------------------------------------
/day_1/figures/intro_to_pytorch/computational_graph_backward.svg:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" standalone="yes"?>
2 | 
3 | <svg version="1.1" viewBox="0.0 0.0 666.0472440944882 353.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><clipPath id="p.0"><path d="m0 0l666.04724 0l0 353.0l-666.04724 0l0 -353.0z" clip-rule="nonzero"></path></clipPath><g clip-path="url(#p.0)"><path fill="#ffffff" d="m0 0l666.04724 0l0 353.0l-666.04724 0z" fill-rule="evenodd"></path><path fill="#f05732" d="m40.439632 165.88452l40.440945 0l0 39.653534l-40.440945 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m40.439632 165.88452l40.440945 0l0 39.653534l-40.440945 0z" fill-rule="evenodd"></path><path fill="#ffffff" d="m63.83785 190.27191l1.9218712 -7.671875l1.71875 0l-2.921875 10.03125l-1.3906212 0l-2.4375 -7.609375l-2.375 7.609375l-1.390625 0l-2.90625 -10.03125l1.703125 0l1.96875 7.5l2.34375 -7.5l1.375 0l2.390625 7.671875z" fill-rule="nonzero"></path><path fill="#f05732" d="m40.439632 39.65354l40.440945 0l0 39.65354l-40.440945 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m40.439632 39.65354l40.440945 0l0 39.65354l-40.440945 0z" fill-rule="evenodd"></path><path fill="#ffffff" d="m60.70891 60.02531l2.21875 -3.65625l1.9999962 0l-3.2812462 4.953125l3.3906212 5.078129l-1.9843712 0l-2.328125 -3.7500038l-2.3125 3.7500038l-2.0 0l3.390625 -5.078129l-3.28125 -4.953125l1.984375 0l2.203125 3.65625z" fill-rule="nonzero"></path><path fill="#f05732" d="m40.439632 273.86353l40.440945 0l0 39.653534l-40.440945 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m40.439632 273.86353l40.440945 0l0 39.653534l-40.440945 0z" fill-rule="evenodd"></path><path fill="#ffffff" d="m65.22357 295.70404q0 2.296875 -1.0625 3.703125q-1.0468712 1.390625 -2.8281212 1.390625q-1.90625 0 -2.953125 -1.34375l-0.078125 
1.15625l-1.578125 0l0 -14.25l1.71875 0l0 5.3125q1.03125 -1.28125 2.859375 -1.28125q1.828125 0 2.8749962 1.390625q1.046875 1.375 1.046875 3.765625l0 0.15625zm-1.7187462 -0.203125q0 -1.75 -0.6875 -2.703125q-0.671875 -0.953125 -1.9375 -0.953125q-1.703125 0 -2.4375 1.578125l0 4.34375q0.78125 1.578125 2.453125 1.578125q1.234375 0 1.921875 -0.953125q0.6875 -0.96875 0.6875 -2.890625z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m80.88058 59.480316l88.472435 0.40944672" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m80.88058 59.480316l88.472435 0.40944672" fill-rule="evenodd"></path><path fill="#942174" d="m441.7966 180.84514l0 0c0 -13.637512 11.668793 -24.692902 26.062988 -24.692902l0 0c6.912323 0 13.541565 2.6015625 18.429321 7.232376c4.8877563 4.630829 7.633667 10.91156 7.633667 17.460526l0 0c0 13.637527 -11.668793 24.692917 -26.062988 24.692917l0 0c-14.394196 0 -26.062988 -11.055389 -26.062988 -24.692917z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.7966 180.84514l0 0c0 -13.637512 11.668793 -24.692902 26.062988 -24.692902l0 0c6.912323 0 13.541565 2.6015625 18.429321 7.232376c4.8877563 4.630829 7.633667 10.91156 7.633667 17.460526l0 0c0 13.637527 -11.668793 24.692917 -26.062988 24.692917l0 0c-14.394196 0 -26.062988 -11.055389 -26.062988 -24.692917z" fill-rule="evenodd"></path><path fill="#ffffff" d="m469.13205 180.14827l5.359375 0l0 2.3125l-5.359375 0l0 6.0625l-2.453125 0l0 -6.0625l-5.34375 0l0 -2.3125l5.34375 0l0 -5.609375l2.453125 0l0 5.609375z" fill-rule="nonzero"></path><path fill="#942174" d="m214.51312 116.46719l0 0c0 -13.63752 11.668793 -24.692917 26.062988 -24.692917l0 0c6.9123383 0 13.541565 2.6015701 18.429321 7.2323914c4.8877563 4.6308136 7.633667 10.911552 7.633667 17.460526l0 0c0 13.637512 -11.668793 24.692917 -26.062988 24.692917l0 0c-14.394196 0 -26.062988 -11.055405 -26.062988 
-24.692917z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m214.51312 116.46719l0 0c0 -13.63752 11.668793 -24.692917 26.062988 -24.692917l0 0c6.9123383 0 13.541565 2.6015701 18.429321 7.2323914c4.8877563 4.6308136 7.633667 10.911552 7.633667 17.460526l0 0c0 13.637512 -11.668793 24.692917 -26.062988 24.692917l0 0c-14.394196 0 -26.062988 -11.055405 -26.062988 -24.692917z" fill-rule="evenodd"></path><path fill="#ffffff" d="m239.17992 117.447395l-3.96875 -1.1875l0.59375 -2.0l3.984375 1.484375l-0.109375 -4.546875l2.015625 0l-0.140625 4.609375l3.921875 -1.453125l0.609375 2.0l-4.046875 1.203125l2.609375 3.578125l-1.640625 1.234375l-2.453125 -3.796875l-2.375 3.703125l-1.640625 -1.203125l2.640625 -3.625z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m169.34776 59.90026l45.16536 56.566933" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m169.34776 59.90026l41.42166 51.87816" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m209.47864 112.80901l4.1223297 2.5157547l-1.5407867 -4.5769577z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m170.26115 185.31758l44.25197 -68.85039" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m170.26115 185.31758l41.00789 -63.803024" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m212.65852 122.40762l1.0641785 -4.71064l-3.8431396 2.9245224z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m82.20079 185.50656l88.47244 0.40945435" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m82.20079 185.50656l88.47244 0.40945435" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" 
d="m82.20079 293.48557l130.4567 0.50393677" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m82.20079 293.48557l130.4567 0.50393677" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m396.63123 116.46719l45.165375 64.377945" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m396.63123 116.46719l41.71942 59.466187" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m436.9985 176.88199l3.9584656 2.7664032l-1.2541504 -4.6636505z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m212.66273 294.0105l229.13387 -113.16536" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m212.66273 294.0105l223.75417 -110.50842" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m437.14835 184.98303l3.337494 -3.4905243l-4.8003235 0.528595z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m88.97592 20.128609l72.37796 0l0 33.574802l-72.37796 0z" fill-rule="evenodd"></path><path fill="#35524a" d="m107.94467 47.048607l-8.84375 0l0 -1.234375l4.671875 -5.1875q1.046875 -1.1875 1.4375 -1.921875q0.390625 -0.734375 0.390625 -1.53125q0 -1.046875 -0.640625 -1.71875q-0.640625 -0.6875 -1.703125 -0.6875q-1.28125 0 -2.0 0.734375q-0.703125 0.71875 -0.703125 2.015625l-1.71875 0q0 -1.859375 1.203125 -3.0q1.203125 -1.15625 3.21875 -1.15625q1.875 0 2.96875 0.984375q1.109375 0.984375 1.109375 2.625q0 2.0 -2.546875 4.75l-3.625 3.921875l6.78125 0l0 1.40625z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m88.97244 147.06561l27.401573 0l0 33.574814l-27.401573 0z" fill-rule="evenodd"></path><path fill="#35524a" d="m101.59744 166.39186l1.28125 0q1.21875 -0.015625 1.90625 -0.625q0.703125 -0.625 0.703125 -1.6875q0 
-2.375 -2.359375 -2.375q-1.125 0 -1.796875 0.640625q-0.65625 0.625 -0.65625 1.671875l-1.703125 0q0 -1.59375 1.171875 -2.65625q1.171875 -1.0625 2.984375 -1.0625q1.90625 0 2.984375 1.015625q1.09375 1.0 1.09375 2.796875q0 0.890625 -0.578125 1.71875q-0.5625 0.8125 -1.546875 1.21875q1.109375 0.359375 1.71875 1.1875q0.609375 0.8125 0.609375 1.984375q0 1.8125 -1.1875 2.890625q-1.1875 1.0625 -3.09375 1.0625q-1.90625 0 -3.09375 -1.03125q-1.1875 -1.03125 -1.1875 -2.71875l1.71875 0q0 1.0625 0.6875 1.703125q0.703125 0.640625 1.875 0.640625q1.25 0 1.90625 -0.640625q0.65625 -0.65625 0.65625 -1.875q0 -1.171875 -0.734375 -1.796875q-0.71875 -0.640625 -2.078125 -0.65625l-1.28125 0l0 -1.40625z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m88.97244 256.2021l27.401573 0l0 33.57483l-27.401573 0z" fill-rule="evenodd"></path><path fill="#35524a" d="m104.73807 283.1221l-1.734375 0l0 -11.4375l-3.453125 1.265625l0 -1.5625l4.921875 -1.84375l0.265625 0l0 13.578125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m332.07217 79.30708l27.40158 0l0 33.574806l-27.40158 0z" fill-rule="evenodd"></path><path fill="#35524a" d="m348.93155 92.71146l0 1.453125l-0.3125 0q-2.015625 0.03125 -3.203125 1.1875q-1.1875 1.15625 -1.359375 3.234375q1.0625 -1.21875 2.90625 -1.21875q1.765625 0 2.8125 1.25q1.0625 1.234375 1.0625 3.203125q0 2.09375 -1.140625 3.34375q-1.140625 1.25 -3.046875 1.25q-1.9375 0 -3.15625 -1.484375q-1.203125 -1.5 -1.203125 -3.84375l0 -0.65625q0 -3.734375 1.59375 -5.703125q1.59375 -1.96875 4.734375 -2.015625l0.3125 0zm-2.25 6.078125q-0.890625 0 -1.640625 0.53125q-0.734375 0.53125 -1.015625 1.328125l0 0.640625q0 1.65625 0.75 2.6875q0.75 1.015625 1.875 1.015625q1.15625 0 1.8125 -0.84375q0.671875 -0.859375 0.671875 -2.25q0 -1.390625 -0.671875 -2.25q-0.671875 -0.859375 -1.78125 -0.859375z" fill-rule="nonzero"></path><path fill="#f05732" d="m285.3517 96.64042l40.44095 0l0 39.65354l-40.44095 0z" fill-rule="evenodd"></path><path stroke="#000000" 
stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m285.3517 96.64042l40.44095 0l0 39.65354l-40.44095 0z" fill-rule="evenodd"></path><path fill="#ffffff" d="m303.85535 121.98094l5.875 0l0 1.40625l-7.96875 0l0 -1.265625l5.546875 -7.359375l-5.453125 0l0 -1.40625l7.59375 0l0 1.203125l-5.59375 7.421875z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m266.6391 116.46719l18.70868 0" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m266.6391 116.46719l12.708679 0" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m279.34778 118.11893l4.538086 -1.6517334l-4.538086 -1.6517334z" fill-rule="evenodd"></path><path fill="#f05732" d="m512.6312 161.01837l40.44098 0l0 39.65355l-40.44098 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m512.6312 161.01837l40.44098 0l0 39.65355l-40.44098 0z" fill-rule="evenodd"></path><path fill="#ffffff" d="m533.0163 185.24951l2.34375 -7.515625l1.84375 0l-4.046875 11.578125q-0.9375 2.5 -2.96875 2.5l-0.328125 -0.015625l-0.640625 -0.125l0 -1.390625l0.46875 0.03125q0.859375 0 1.34375 -0.359375q0.5 -0.34375 0.8125 -1.28125l0.375 -1.015625l-3.578125 -9.921875l1.875 0l2.5 7.515625z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m325.79266 116.46719l72.37793 0.8503952" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m325.79266 116.46719l72.37793 0.8503952" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m493.92258 180.84514l18.708649 0" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m493.92258 180.84514l12.708649 0" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m506.63123 
182.49687l4.5381165 -1.6517334l-4.5381165 -1.6517334z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m560.9803 141.16011l27.401611 0l0 33.5748l-27.401611 0z" fill-rule="evenodd"></path><path fill="#35524a" d="m579.82404 155.53323l-5.59375 12.546875l-1.796875 0l5.578125 -12.09375l-7.3125 0l0 -1.421875l9.125 0l0 0.96875z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m88.97244 293.48557l27.401573 0l0 33.5748l-27.401573 0z" fill-rule="evenodd"></path><path fill="#e06666" d="m104.73807 320.40555l-1.734375 0l0 -11.4375l-3.453125 1.265625l0 -1.5625l4.921875 -1.84375l0.265625 0l0 13.578125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m88.97244 59.479004l27.401573 0l0 33.5748l-27.401573 0z" fill-rule="evenodd"></path><path fill="#e06666" d="m101.59744 78.80525l1.28125 0q1.21875 -0.015625 1.90625 -0.625q0.703125 -0.625 0.703125 -1.6875q0 -2.375 -2.359375 -2.375q-1.125 0 -1.796875 0.640625q-0.65625 0.625 -0.65625 1.671875l-1.703125 0q0 -1.59375 1.171875 -2.65625q1.171875 -1.0625 2.984375 -1.0625q1.90625 0 2.984375 1.015625q1.09375 1.0 1.09375 2.796875q0 0.890625 -0.578125 1.71875q-0.5625 0.8125 -1.546875 1.21875q1.109375 0.359375 1.71875 1.1875q0.609375 0.8125 0.609375 1.984375q0 1.8125 -1.1875 2.890625q-1.1875 1.0625 -3.09375 1.0625q-1.90625 0 -3.09375 -1.03125q-1.1875 -1.03125 -1.1875 -2.71875l1.71875 0q0 1.0625 0.6875 1.703125q0.703125 0.640625 1.875 0.640625q1.25 0 1.90625 -0.640625q0.65625 -0.65625 0.65625 -1.875q0 -1.171875 -0.734375 -1.796875q-0.71875 -0.640625 -2.078125 -0.65625l-1.28125 0l0 -1.40625z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m88.97244 190.78215l27.401573 0l0 33.5748l-27.401573 0z" fill-rule="evenodd"></path><path fill="#e06666" d="m107.94119 217.70215l-8.84375 0l0 -1.234375l4.671875 -5.1875q1.046875 -1.1875 1.4375 -1.921875q0.390625 -0.734375 0.390625 -1.53125q0 -1.046875 -0.640625 -1.71875q-0.640625 -0.6875 -1.703125 -0.6875q-1.28125 0 -2.0 
0.734375q-0.703125 0.71875 -0.703125 2.015625l-1.71875 0q0 -1.859375 1.203125 -3.0q1.203125 -1.15625 3.21875 -1.15625q1.875 0 2.96875 0.984375q1.109375 0.984375 1.109375 2.625q0 2.0 -2.546875 4.75l-3.625 3.921875l6.78125 0l0 1.40625z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m560.19727 186.95538l32.472412 0l0 33.5748l-32.472412 0z" fill-rule="evenodd"></path><path fill="#e06666" d="m575.9629 213.87538l-1.734375 0l0 -11.4375l-3.453125 1.265625l0 -1.5625l4.921875 -1.84375l0.265625 0l0 13.578125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m329.5378 120.902885l32.472443 0l0 33.574806l-32.472443 0z" fill-rule="evenodd"></path><path fill="#e06666" d="m347.3017 147.82289l-1.734375 0l0 -11.4375l-3.453125 1.265625l0 -1.5625l4.921875 -1.84375l0.265625 0l0 13.578125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m553.0722 180.64043l88.47241 0.4094391" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m553.0722 180.64043l88.47241 0.4094391" fill-rule="evenodd"></path></g></svg>
4 | 
5 | 


--------------------------------------------------------------------------------
/day_2/Amazon-Reviews.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 28,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from argparse import Namespace\n",
 10 |     "import collections\n",
 11 |     "import json\n",
 12 |     "import os\n",
 13 |     "import re\n",
 14 |     "\n",
 15 |     "import matplotlib.pyplot as plt\n",
 16 |     "import numpy as np\n",
 17 |     "import pandas as pd\n",
 18 |     "import seaborn as sns\n",
 19 |     "import torch\n",
 20 |     "import torch.nn as nn\n",
 21 |     "import torch.nn.functional as F\n",
 22 |     "import torch.optim as optim\n",
 23 |     "from torch.utils.data import Dataset, DataLoader\n",
 24 |     "from tqdm import tqdm_notebook\n",
 25 |     "\n",
 26 |     "from vocabulary import Vocabulary\n",
 27 |     "\n",
 28 |     "%matplotlib inline\n",
 29 |     "\n",
 30 |     "plt.style.use('fivethirtyeight')\n",
 31 |     "plt.rcParams['figure.figsize'] = (14, 6)\n",
 32 |     "\n",
 33 |     "START_TOKEN = \"^\"\n",
 34 |     "END_TOKEN = \"_\""
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 29,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "SEED = 0\n",
 44 |     "TRAIN_PROP = 0.7\n",
 45 |     "VAL_PROP = 0.15\n",
 46 |     "TEST_PROP = 0.15"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 30,
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "df = pd.read_csv(\"../data/amazon_train_small.csv\", header=None)\n",
 56 |     "df.columns = ['label', 'title', 'body']"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": 31,
 62 |    "metadata": {},
 63 |    "outputs": [
 64 |     {
 65 |      "data": {
 66 |       "text/html": [
 67 |        "<div>\n",
 68 |        "<style scoped>\n",
 69 |        "    .dataframe tbody tr th:only-of-type {\n",
 70 |        "        vertical-align: middle;\n",
 71 |        "    }\n",
 72 |        "\n",
 73 |        "    .dataframe tbody tr th {\n",
 74 |        "        vertical-align: top;\n",
 75 |        "    }\n",
 76 |        "\n",
 77 |        "    .dataframe thead th {\n",
 78 |        "        text-align: right;\n",
 79 |        "    }\n",
 80 |        "</style>\n",
 81 |        "<table border=\"1\" class=\"dataframe\">\n",
 82 |        "  <thead>\n",
 83 |        "    <tr style=\"text-align: right;\">\n",
 84 |        "      <th></th>\n",
 85 |        "      <th>label</th>\n",
 86 |        "      <th>title</th>\n",
 87 |        "      <th>body</th>\n",
 88 |        "    </tr>\n",
 89 |        "  </thead>\n",
 90 |        "  <tbody>\n",
 91 |        "    <tr>\n",
 92 |        "      <th>0</th>\n",
 93 |        "      <td>2</td>\n",
 94 |        "      <td>Right on the money</td>\n",
 95 |        "      <td>We are using the this book to get 100+ certifi...</td>\n",
 96 |        "    </tr>\n",
 97 |        "    <tr>\n",
 98 |        "      <th>1</th>\n",
 99 |        "      <td>2</td>\n",
100 |        "      <td>Serves its Purpose!</td>\n",
101 |        "      <td>Couldn't go without it. My 3 1/2 year still we...</td>\n",
102 |        "    </tr>\n",
103 |        "    <tr>\n",
104 |        "      <th>2</th>\n",
105 |        "      <td>2</td>\n",
106 |        "      <td>Trailer Park Bwoys!!!</td>\n",
107 |        "      <td>we get to see it on paramount in ol' LND UK an...</td>\n",
108 |        "    </tr>\n",
109 |        "    <tr>\n",
110 |        "      <th>3</th>\n",
111 |        "      <td>1</td>\n",
112 |        "      <td>buyer beware</td>\n",
113 |        "      <td>There are companies selling Bosch knock-offs o...</td>\n",
114 |        "    </tr>\n",
115 |        "    <tr>\n",
116 |        "      <th>4</th>\n",
117 |        "      <td>2</td>\n",
118 |        "      <td>Great for those cold winters</td>\n",
119 |        "      <td>If you are looking to keep your water liquifie...</td>\n",
120 |        "    </tr>\n",
121 |        "    <tr>\n",
122 |        "      <th>5</th>\n",
123 |        "      <td>1</td>\n",
124 |        "      <td>keeps breaking!</td>\n",
125 |        "      <td>I own a Nomad II 64 MP3 player and it has brok...</td>\n",
126 |        "    </tr>\n",
127 |        "    <tr>\n",
128 |        "      <th>6</th>\n",
129 |        "      <td>1</td>\n",
130 |        "      <td>Not Happy</td>\n",
131 |        "      <td>Thought this was in English but it is in Germa...</td>\n",
132 |        "    </tr>\n",
133 |        "    <tr>\n",
134 |        "      <th>7</th>\n",
135 |        "      <td>1</td>\n",
136 |        "      <td>mount doesn't stay put</td>\n",
137 |        "      <td>I saw quite a few very positive reviews for th...</td>\n",
138 |        "    </tr>\n",
139 |        "    <tr>\n",
140 |        "      <th>8</th>\n",
141 |        "      <td>2</td>\n",
142 |        "      <td>Finally , Some Common Sense!</td>\n",
143 |        "      <td>I was afraid this book would just bash media, ...</td>\n",
144 |        "    </tr>\n",
145 |        "    <tr>\n",
146 |        "      <th>9</th>\n",
147 |        "      <td>2</td>\n",
148 |        "      <td>Good value, time saver</td>\n",
149 |        "      <td>My wife is a lifelong weightwatcher. She has b...</td>\n",
150 |        "    </tr>\n",
151 |        "    <tr>\n",
152 |        "      <th>10</th>\n",
153 |        "      <td>1</td>\n",
154 |        "      <td>Shyamalan bested!</td>\n",
155 |        "      <td>Shyamalan was at some point being credited wit...</td>\n",
156 |        "    </tr>\n",
157 |        "    <tr>\n",
158 |        "      <th>11</th>\n",
159 |        "      <td>2</td>\n",
160 |        "      <td>My first Sunn O))) experience</td>\n",
161 |        "      <td>This is the first CD I've bought from this ban...</td>\n",
162 |        "    </tr>\n",
163 |        "    <tr>\n",
164 |        "      <th>12</th>\n",
165 |        "      <td>1</td>\n",
166 |        "      <td>Cannot get this product set up.</td>\n",
167 |        "      <td>I received this router today and have spent 3 ...</td>\n",
168 |        "    </tr>\n",
169 |        "    <tr>\n",
170 |        "      <th>13</th>\n",
171 |        "      <td>1</td>\n",
172 |        "      <td>Could be great but...</td>\n",
173 |        "      <td>Who would have thought that someone could cram...</td>\n",
174 |        "    </tr>\n",
175 |        "    <tr>\n",
176 |        "      <th>14</th>\n",
177 |        "      <td>1</td>\n",
178 |        "      <td>Not good at all</td>\n",
179 |        "      <td>This is probably the worst design for an egg p...</td>\n",
180 |        "    </tr>\n",
181 |        "    <tr>\n",
182 |        "      <th>15</th>\n",
183 |        "      <td>2</td>\n",
184 |        "      <td>Disturbing and compelling</td>\n",
185 |        "      <td>Like a rotted tooth or a troubling sore I retu...</td>\n",
186 |        "    </tr>\n",
187 |        "    <tr>\n",
188 |        "      <th>16</th>\n",
189 |        "      <td>1</td>\n",
190 |        "      <td>Unintentional camp and a very bad film!</td>\n",
191 |        "      <td>Now we all know at this point that Tom Cruise ...</td>\n",
192 |        "    </tr>\n",
193 |        "    <tr>\n",
194 |        "      <th>17</th>\n",
195 |        "      <td>2</td>\n",
196 |        "      <td>bitchen</td>\n",
197 |        "      <td>My dad found this book for 3 payments of 28 do...</td>\n",
198 |        "    </tr>\n",
199 |        "    <tr>\n",
200 |        "      <th>18</th>\n",
201 |        "      <td>2</td>\n",
202 |        "      <td>Interesting Book</td>\n",
203 |        "      <td>The internet sites out there that talk about w...</td>\n",
204 |        "    </tr>\n",
205 |        "    <tr>\n",
206 |        "      <th>19</th>\n",
207 |        "      <td>2</td>\n",
208 |        "      <td>Alice!!!! What happened to YOU AND ME?</td>\n",
209 |        "      <td>Wonderful sound, excellent video.... but, but,...</td>\n",
210 |        "    </tr>\n",
211 |        "  </tbody>\n",
212 |        "</table>\n",
213 |        "</div>"
214 |       ],
215 |       "text/plain": [
216 |        "    label                                    title  \\\n",
217 |        "0       2                       Right on the money   \n",
218 |        "1       2                      Serves its Purpose!   \n",
219 |        "2       2                    Trailer Park Bwoys!!!   \n",
220 |        "3       1                             buyer beware   \n",
221 |        "4       2             Great for those cold winters   \n",
222 |        "5       1                          keeps breaking!   \n",
223 |        "6       1                                Not Happy   \n",
224 |        "7       1                   mount doesn't stay put   \n",
225 |        "8       2             Finally , Some Common Sense!   \n",
226 |        "9       2                   Good value, time saver   \n",
227 |        "10      1                        Shyamalan bested!   \n",
228 |        "11      2            My first Sunn O))) experience   \n",
229 |        "12      1          Cannot get this product set up.   \n",
230 |        "13      1                    Could be great but...   \n",
231 |        "14      1                          Not good at all   \n",
232 |        "15      2                Disturbing and compelling   \n",
233 |        "16      1  Unintentional camp and a very bad film!   \n",
234 |        "17      2                                  bitchen   \n",
235 |        "18      2                         Interesting Book   \n",
236 |        "19      2   Alice!!!! What happened to YOU AND ME?   \n",
237 |        "\n",
238 |        "                                                 body  \n",
239 |        "0   We are using the this book to get 100+ certifi...  \n",
240 |        "1   Couldn't go without it. My 3 1/2 year still we...  \n",
241 |        "2   we get to see it on paramount in ol' LND UK an...  \n",
242 |        "3   There are companies selling Bosch knock-offs o...  \n",
243 |        "4   If you are looking to keep your water liquifie...  \n",
244 |        "5   I own a Nomad II 64 MP3 player and it has brok...  \n",
245 |        "6   Thought this was in English but it is in Germa...  \n",
246 |        "7   I saw quite a few very positive reviews for th...  \n",
247 |        "8   I was afraid this book would just bash media, ...  \n",
248 |        "9   My wife is a lifelong weightwatcher. She has b...  \n",
249 |        "10  Shyamalan was at some point being credited wit...  \n",
250 |        "11  This is the first CD I've bought from this ban...  \n",
251 |        "12  I received this router today and have spent 3 ...  \n",
252 |        "13  Who would have thought that someone could cram...  \n",
253 |        "14  This is probably the worst design for an egg p...  \n",
254 |        "15  Like a rotted tooth or a troubling sore I retu...  \n",
255 |        "16  Now we all know at this point that Tom Cruise ...  \n",
256 |        "17  My dad found this book for 3 payments of 28 do...  \n",
257 |        "18  The internet sites out there that talk about w...  \n",
258 |        "19  Wonderful sound, excellent video.... but, but,...  "
259 |       ]
260 |      },
261 |      "execution_count": 31,
262 |      "metadata": {},
263 |      "output_type": "execute_result"
264 |     }
265 |    ],
266 |    "source": [
267 |     "df.head(n=20)"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "code",
272 |    "execution_count": 50,
273 |    "metadata": {},
274 |    "outputs": [],
275 |    "source": [
276 |     "class AmazonReviewsDataset(object):\n",
277 |     "    \"\"\"Amazon Reviews text dataset for language modeling.\n",
278 |     "    \n",
279 |     "       Args:\n",
280 |     "        data_path (str): Path to Amazon reviews data file.\n",
281 |     "        num_samples (int): Number of amazon reviews to load.\n",
282 |     "        max_review_length (int): Filters reviews longer than specified length.\n",
283 |     "            [default=400]\n",
284 |     "        max_sequence_length (int): Max length of sequences for use in training language model.\n",
285 |     "            [default=40]\n",
286 |     "        sentiment (int): sentiment of reviews to select, 1 (negative) or 2 (positive).\n",
287 |     "            [default=2]\n",
288 |     "    \"\"\"\n",
289 |     "\n",
290 |     "    def __init__(self, data_path):\n",
291 |     "        data = pd.read_csv(data_path, names=['sentiment', 'title', 'review'])\n",
292 |     "        self.data = self.preprocess(data)\n",
293 |     "        \n",
294 |     "    def preprocess(self, review_df):\n",
295 |     "        def _preprocess_func(text):\n",
296 |     "            text = text.lower()\n",
297 |     "            text = re.sub(r\"([.,!?])\", r\" \\1 \", text)\n",
298 |     "            text = re.sub(r\"[^a-zA-Z.,!?]+\", r\" \", text)\n",
299 |     "            return text\n",
300 |     "        \n",
301 |     "        # Splitting the subset by sentiment to create our new train, val, and test splits\n",
302 |     "        by_sentiment = collections.defaultdict(list)\n",
303 |     "        for _, row in review_df.iterrows():\n",
304 |     "            by_sentiment[row.sentiment].append(row.to_dict())\n",
305 |     "\n",
306 |     "        final_list = []\n",
307 |     "        np.random.seed(SEED)\n",
308 |     "\n",
309 |     "        for _, item_list in sorted(by_sentiment.items()):\n",
310 |     "\n",
311 |     "            np.random.shuffle(item_list)\n",
312 |     "\n",
313 |     "            n_total = len(item_list)\n",
314 |     "            n_train = int(TRAIN_PROP * n_total)\n",
315 |     "            n_val = int(VAL_PROP * n_total)\n",
316 |     "            n_test = int(TEST_PROP * n_total)\n",
317 |     "\n",
318 |     "            # Give data point a split attribute\n",
319 |     "            for item in item_list[:n_train]:\n",
320 |     "                item['split'] = 'train'\n",
321 |     "\n",
322 |     "            for item in item_list[n_train:n_train+n_val]:\n",
323 |     "                item['split'] = 'val'\n",
324 |     "\n",
325 |     "            for item in item_list[n_train+n_val:n_train+n_val+n_test]:\n",
326 |     "                item['split'] = 'test'\n",
327 |     "            \n",
328 |     "            # Add to final list\n",
329 |     "            final_list.extend(item_list)\n",
330 |     "        \n",
331 |     "        output_df = pd.DataFrame(final_list)\n",
332 |     "        output_df['review'] = output_df.review.apply(_preprocess_func)\n",
333 |     "        return output_df\n",
334 |     "        \n",
335 |     "    def get_data(self):\n",
336 |     "        return self.data\n",
337 |     "\n",
338 |     "class AmazonReviewsVectorizer(object):\n",
339 |     "    def __init__(self, word_vocab, max_seq_length):\n",
340 |     "        self.word_vocab = word_vocab\n",
341 |     "        self.max_seq_length = max_seq_length\n",
342 |     "\n",
343 |     "    @classmethod\n",
344 |     "    def fit(cls, review_df):\n",
345 |     "        \"\"\"\n",
346 |     "        \"\"\"\n",
347 |     "        vocab = Vocabulary(use_unks=False,\n",
348 |     "                           use_start_end=True,\n",
349 |     "                           use_mask=True,\n",
350 |     "                           start_token=START_TOKEN,\n",
351 |     "                           end_token=END_TOKEN)\n",
352 |     "        max_seq_length = 0\n",
353 |     "        for review in review_df['review'].values:\n",
354 |     "            review_split = review.split(\" \")\n",
355 |     "            for word in review_split:\n",
356 |     "                vocab.add(word)\n",
357 |     "            if len(review_split) > max_seq_length:\n",
358 |     "                max_seq_length = len(review_split)\n",
359 |     "        max_seq_length += 2\n",
360 |     "        return cls(vocab, max_seq_length)\n",
361 |     "\n",
362 |     "    def transform(self, review_df, split='train'):\n",
363 |     "        review_df = review_df[review_df.split==split].reset_index()\n",
364 |     "        num_data = len(review_df)\n",
365 |     "        \n",
366 |     "        x_words = np.zeros((num_data, self.max_seq_length), dtype=np.int64)\n",
367 |     "        y_sentiment = np.zeros(num_data, dtype=np.int64)\n",
368 |     "\n",
369 |     "        for index, row in review_df.iterrows():\n",
370 |     "            x_indices = list(self.word_vocab.map(row['review'].split(' '), include_start_end=True))\n",
371 |     "            x_words[index, :len(x_indices)] = x_indices \n",
372 |     "            y_sentiment[index] = row['sentiment']\n",
373 |     "            \n",
374 |     "        return VectorizedAmazonReviews(x_words, y_sentiment)\n",
375 |     "\n",
376 |     "class VectorizedAmazonReviews(Dataset):\n",
377 |     "    def __init__(self, x_input, y_target):\n",
378 |     "        self.x_input = x_input\n",
379 |     "        self.y_target = y_target\n",
380 |     "\n",
381 |     "    def __len__(self):\n",
382 |     "        return len(self.x_input)\n",
383 |     "\n",
384 |     "    def __getitem__(self, index):\n",
385 |     "        return {'x_input': self.x_input[index],\n",
386 |     "                'y_target': self.y_target[index],\n",
387 |     "                'x_lengths': len(self.x_input[index].nonzero()[0])}\n",
388 |     "    \n",
389 |     "\n",
390 |     "def generate_batches(dataset, batch_size, shuffle=True,\n",
391 |     "                     drop_last=True, device=\"cpu\"): \n",
392 |     "    \"\"\"\n",
393 |     "    A generator function which wraps the PyTorch DataLoader. It will \n",
394 |     "      ensure each tensor is on the right device location.\n",
395 |     "    \"\"\"\n",
396 |     "    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,\n",
397 |     "                            shuffle=shuffle, drop_last=drop_last)\n",
398 |     "\n",
399 |     "    for data_dict in dataloader:\n",
400 |     "        out_data_dict = {}\n",
401 |     "        for name, tensor in data_dict.items():\n",
402 |     "            out_data_dict[name] = data_dict[name].to(device)\n",
403 |     "        yield out_data_dict"
404 |    ]
405 |   },
406 |   {
407 |    "cell_type": "code",
408 |    "execution_count": 51,
409 |    "metadata": {},
410 |    "outputs": [],
411 |    "source": [
412 |     "dataset = AmazonReviewsDataset(\"../data/amazon_train_small.csv\")"
413 |    ]
414 |   },
415 |   {
416 |    "cell_type": "code",
417 |    "execution_count": 52,
418 |    "metadata": {},
419 |    "outputs": [],
420 |    "source": [
421 |     "vectorizer = AmazonReviewsVectorizer.fit(dataset.get_data())"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "code",
426 |    "execution_count": 53,
427 |    "metadata": {},
428 |    "outputs": [],
429 |    "source": [
430 |     "train_dataset = vectorizer.transform(dataset.get_data(), split='train')"
431 |    ]
432 |   },
433 |   {
434 |    "cell_type": "code",
435 |    "execution_count": 54,
436 |    "metadata": {},
437 |    "outputs": [],
438 |    "source": [
439 |     "val_dataset = vectorizer.transform(dataset.get_data(), split='val')"
440 |    ]
441 |   },
442 |   {
443 |    "cell_type": "code",
444 |    "execution_count": 56,
445 |    "metadata": {},
446 |    "outputs": [],
447 |    "source": [
448 |     "batch_dict = next(generate_batches(train_dataset, 8))"
449 |    ]
450 |   },
451 |   {
452 |    "cell_type": "markdown",
453 |    "metadata": {},
454 |    "source": [
455 |     "## TASK\n",
456 |     "\n",
457 |     "1. create an embedding layer and get it to work with the batch_dict above\n",
458 |     "2. use either:\n",
459 |     "    1. a deep averaging network\n",
460 |     "    2. a convnet\n",
461 |     "    3. an RNN"
462 |    ]
463 |   }
464 |  ],
465 |  "metadata": {
466 |   "kernelspec": {
467 |    "display_name": "pytorch04",
468 |    "language": "python",
469 |    "name": "pytorch04"
470 |   },
471 |   "language_info": {
472 |    "codemirror_mode": {
473 |     "name": "ipython",
474 |     "version": 3
475 |    },
476 |    "file_extension": ".py",
477 |    "mimetype": "text/x-python",
478 |    "name": "python",
479 |    "nbconvert_exporter": "python",
480 |    "pygments_lexer": "ipython3",
481 |    "version": "3.6.6"
482 |   }
483 |  },
484 |  "nbformat": 4,
485 |  "nbformat_minor": 2
486 | }
487 | 


--------------------------------------------------------------------------------
/day_2/00-Load-Vectorize-Generate-And-Sequences-as-Tensors.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 2,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from argparse import Namespace\n",
 10 |     "import json\n",
 11 |     "\n",
 12 |     "import numpy as np\n",
 13 |     "import pandas as pd\n",
 14 |     "import torch\n",
 15 |     "import torch.nn as nn\n",
 16 |     "import torch.nn.functional as F\n",
 17 |     "import torch.optim as optim\n",
 18 |     "from torch.utils.data import Dataset, DataLoader\n",
 19 |     "from tqdm import tqdm_notebook\n",
 20 |     "\n",
 21 |     "from vocabulary import Vocabulary"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "markdown",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "## Data Structures\n",
 29 |     "\n",
 30 |     "For the notebooks presented today, we will be using a pattern that we have employed many times.  For this, we break the machine learning data pipeline into 4 distinct parts:\n",
 31 |     "\n",
 32 |     "1. Raw Data\n",
 33 |     "2. Vectorized Data\n",
 34 |     "3. A Vectorizer\n",
 35 |     "4. A (python) generator\n",
 36 |     "\n",
 37 |     "To give it a name, I'll call it Load-Vectorize-Generate (LVG).\n",
 38 |     "\n",
 39 |     "This pipeline turns letters or words into integers and then batches them to yield matrices of integers.  Since language is variable length, there are also 0-valued positions in the matrix. We will see how we tell PyTorch to treat these 0s as ignore-values.  \n",
 40 |     "\n",
 41 |     "After I introduce LVG, I will quickly show how to use the data generated from LVG (a matrix of integers). First, it is embedded so a vector of numbers is associated with each integer, then the batch is put on the 0th dimension so that it can be iterated over. "
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 3,
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "args = Namespace(\n",
 51 |     "    surname_csv=\"../data/surnames.csv\"\n",
 52 |     ")\n",
 53 |     "\n",
 54 |     "START_TOKEN = \"^\"\n",
 55 |     "END_TOKEN = \"_\""
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "markdown",
 60 |    "metadata": {},
 61 |    "source": [
 62 |     "### Load\n",
 63 |     "\n",
 64 |     "Loading the raw data from disk should be relatively quick.  Preferably, all munging should have already happened, and the form that is loaded should have precomputed things like the split (train/test/eval or fold #).  "
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 4,
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "class RawSurnames(object):\n",
 74 |     "    def __init__(self, data_path, delimiter=\",\"):\n",
 75 |     "        self.data = pd.read_csv(data_path, delimiter=delimiter)\n",
 76 |     "\n",
 77 |     "    def get_data(self, filter_to_nationality=None):\n",
 78 |     "        if filter_to_nationality is not None:\n",
 79 |     "            return self.data[self.data.nationality.isin(filter_to_nationality)]\n",
 80 |     "        return self.data"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "markdown",
 85 |    "metadata": {},
 86 |    "source": [
 87 |     "### Vectorize\n",
 88 |     "\n",
 89 |     "The first class here is for managing the vectorized data structure.  It subclasses PyTorch's `Dataset` class, which is supposed to implement two functions: `__len__` and `__getitem__`.  Our assumption with this is that no data processing is happening here; it is given the final tensors at init time and it just provides them through `__getitem__`.  PyTorch has things available to use this for sophisticated data queueing with the `DataLoader` class.  The `DataLoader` class will also convert these structures into PyTorch tensors, so we don't have to do that conversion. \n",
 90 |     "\n",
 91 |     "Some additional things: we also are returning the lengths of the sequences so that we can use them in the model.  "
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 5,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "class VectorizedSurnames(Dataset):\n",
101 |     "    def __init__(self, x_surnames, y_nationalities):\n",
102 |     "        self.x_surnames = x_surnames\n",
103 |     "        self.y_nationalities = y_nationalities\n",
104 |     "\n",
105 |     "    def __len__(self):\n",
106 |     "        return len(self.x_surnames)\n",
107 |     "\n",
108 |     "    def __getitem__(self, index):\n",
109 |     "        return {'x_surnames': self.x_surnames[index],\n",
110 |     "                'y_nationalities': self.y_nationalities[index],\n",
111 |     "                'x_lengths': len(self.x_surnames[index].nonzero()[0])}"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "metadata": {},
117 |    "source": [
118 |     "#### Vectorizer\n",
119 |     "\n",
120 |     "The actual vectorizer has a lot of responsibility.  \n",
121 |     "\n",
122 |     "Primarily, it manages the Vocabulary object, saving and loading it, and applying it to a dataset to create a vectorized form. "
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": 6,
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "class SurnamesVectorizer(object):\n",
132 |     "    def __init__(self, surname_vocab, nationality_vocab, max_seq_length):\n",
133 |     "        self.surname_vocab = surname_vocab\n",
134 |     "        self.nationality_vocab = nationality_vocab\n",
135 |     "        self.max_seq_length = max_seq_length\n",
136 |     "        \n",
137 |     "    def save(self, filename):\n",
138 |     "        vec_dict = {\"surname_vocab\": self.surname_vocab.get_serializable_contents(),\n",
139 |     "                    \"nationality_vocab\": self.nationality_vocab.get_serializable_contents(),\n",
140 |     "                    'max_seq_length': self.max_seq_length}\n",
141 |     "\n",
142 |     "        with open(filename, \"w\") as fp:\n",
143 |     "            json.dump(vec_dict, fp)\n",
144 |     "        \n",
145 |     "    @classmethod\n",
146 |     "    def load(cls, filename):\n",
147 |     "        with open(filename, \"r\") as fp:\n",
148 |     "            vec_dict = json.load(fp)\n",
149 |     "\n",
150 |     "        vec_dict[\"surname_vocab\"] = Vocabulary.deserialize_from_contents(vec_dict[\"surname_vocab\"])\n",
151 |     "        vec_dict[\"nationality_vocab\"] = Vocabulary.deserialize_from_contents(vec_dict[\"nationality_vocab\"])\n",
152 |     "        return cls(**vec_dict)\n",
153 |     "\n",
154 |     "    @classmethod\n",
155 |     "    def fit(cls, surname_df):\n",
156 |     "        surname_vocab = Vocabulary(use_unks=False,\n",
157 |     "                                   use_mask=True,\n",
158 |     "                                   use_start_end=True,\n",
159 |     "                                   start_token=START_TOKEN,\n",
160 |     "                                   end_token=END_TOKEN)\n",
161 |     "\n",
162 |     "        nationality_vocab = Vocabulary(use_unks=False, use_start_end=False, use_mask=False)\n",
163 |     "\n",
164 |     "        max_seq_length = 0\n",
165 |     "        for index, row in surname_df.iterrows():\n",
166 |     "            surname_vocab.add_many(row.surname)\n",
167 |     "            nationality_vocab.add(row.nationality)\n",
168 |     "\n",
169 |     "            if len(row.surname) > max_seq_length:\n",
170 |     "                max_seq_length = len(row.surname)\n",
171 |     "        max_seq_length = max_seq_length + 2\n",
172 |     "\n",
173 |     "        return cls(surname_vocab, nationality_vocab, max_seq_length)\n",
174 |     "\n",
175 |     "    @classmethod\n",
176 |     "    def fit_transform(cls, surname_df, split='train'):\n",
177 |     "        vectorizer = cls.fit(surname_df)\n",
178 |     "        return vectorizer, vectorizer.transform(surname_df, split)\n",
179 |     "\n",
180 |     "    def transform(self, surname_df, split='train'):\n",
181 |     "\n",
182 |     "        df = surname_df[surname_df.split==split].reset_index()\n",
183 |     "        n_data = len(df)\n",
184 |     "        \n",
185 |     "        x_surnames = np.zeros((n_data, self.max_seq_length), dtype=np.int64)\n",
186 |     "        y_nationalities = np.zeros(n_data, dtype=np.int64)\n",
187 |     "\n",
188 |     "        for index, row in df.iterrows():\n",
189 |     "            vectorized_surname = list(self.surname_vocab.map(row.surname, \n",
190 |     "                                                             include_start_end=True))\n",
191 |     "            x_surnames[index, :len(vectorized_surname)] = vectorized_surname\n",
192 |     "            y_nationalities[index] = self.nationality_vocab[row.nationality]\n",
193 |     "\n",
194 |     "        return VectorizedSurnames(x_surnames, y_nationalities)\n"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "markdown",
199 |    "metadata": {},
200 |    "source": [
201 |     "### Generate\n",
202 |     "\n",
203 |     "Finally, `generate_batches` interacts with PyTorch's `DataLoader` and acts as a generator. It basically just iterates over the `DataLoader` and moves each tensor in the batch to the requested device before yielding it. It's mostly a cheap and easy function that can be written in many ways. "
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 7,
209 |    "metadata": {},
210 |    "outputs": [],
211 |    "source": [
212 |     "def generate_batches(dataset, batch_size, shuffle=True,\n",
213 |     "                     drop_last=True, device=\"cpu\"): \n",
214 |     "    \"\"\"\n",
215 |     "    A generator function which wraps the PyTorch DataLoader. It will \n",
216 |     "      ensure each tensor is on the right device location.\n",
217 |     "    \"\"\"\n",
218 |     "    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,\n",
219 |     "                            shuffle=shuffle, drop_last=drop_last)\n",
220 |     "\n",
221 |     "    for data_dict in dataloader:\n",
222 |     "        out_data_dict = {}\n",
223 |     "        for name, tensor in data_dict.items():\n",
224 |     "            out_data_dict[name] = data_dict[name].to(device)\n",
225 |     "        yield out_data_dict"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": 8,
231 |    "metadata": {},
232 |    "outputs": [],
233 |    "source": [
234 |     "raw_data = RawSurnames(args.surname_csv).get_data()"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": 9,
240 |    "metadata": {},
241 |    "outputs": [
242 |     {
243 |      "data": {
244 |       "text/html": [
245 |        "<div>\n",
246 |        "<style scoped>\n",
247 |        "    .dataframe tbody tr th:only-of-type {\n",
248 |        "        vertical-align: middle;\n",
249 |        "    }\n",
250 |        "\n",
251 |        "    .dataframe tbody tr th {\n",
252 |        "        vertical-align: top;\n",
253 |        "    }\n",
254 |        "\n",
255 |        "    .dataframe thead th {\n",
256 |        "        text-align: right;\n",
257 |        "    }\n",
258 |        "</style>\n",
259 |        "<table border=\"1\" class=\"dataframe\">\n",
260 |        "  <thead>\n",
261 |        "    <tr style=\"text-align: right;\">\n",
262 |        "      <th></th>\n",
263 |        "      <th>split</th>\n",
264 |        "      <th>surname</th>\n",
265 |        "      <th>nationality</th>\n",
266 |        "    </tr>\n",
267 |        "  </thead>\n",
268 |        "  <tbody>\n",
269 |        "    <tr>\n",
270 |        "      <th>0</th>\n",
271 |        "      <td>train</td>\n",
272 |        "      <td>Hadad</td>\n",
273 |        "      <td>arabic</td>\n",
274 |        "    </tr>\n",
275 |        "    <tr>\n",
276 |        "      <th>1</th>\n",
277 |        "      <td>train</td>\n",
278 |        "      <td>Prikazchikov</td>\n",
279 |        "      <td>russian</td>\n",
280 |        "    </tr>\n",
281 |        "    <tr>\n",
282 |        "      <th>2</th>\n",
283 |        "      <td>train</td>\n",
284 |        "      <td>Bajov</td>\n",
285 |        "      <td>russian</td>\n",
286 |        "    </tr>\n",
287 |        "    <tr>\n",
288 |        "      <th>3</th>\n",
289 |        "      <td>train</td>\n",
290 |        "      <td>Awduewsky</td>\n",
291 |        "      <td>russian</td>\n",
292 |        "    </tr>\n",
293 |        "    <tr>\n",
294 |        "      <th>4</th>\n",
295 |        "      <td>train</td>\n",
296 |        "      <td>Jablochkov</td>\n",
297 |        "      <td>russian</td>\n",
298 |        "    </tr>\n",
299 |        "  </tbody>\n",
300 |        "</table>\n",
301 |        "</div>"
302 |       ],
303 |       "text/plain": [
304 |        "   split       surname nationality\n",
305 |        "0  train         Hadad      arabic\n",
306 |        "1  train  Prikazchikov     russian\n",
307 |        "2  train         Bajov     russian\n",
308 |        "3  train     Awduewsky     russian\n",
309 |        "4  train    Jablochkov     russian"
310 |       ]
311 |      },
312 |      "execution_count": 9,
313 |      "metadata": {},
314 |      "output_type": "execute_result"
315 |     }
316 |    ],
317 |    "source": [
318 |     "raw_data.head()"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "code",
323 |    "execution_count": 10,
324 |    "metadata": {},
325 |    "outputs": [
326 |     {
327 |      "data": {
328 |       "text/plain": [
329 |        "(<Vocabulary(size=18,frozen=False)>, <Vocabulary(size=90,frozen=False)>)"
330 |       ]
331 |      },
332 |      "execution_count": 10,
333 |      "metadata": {},
334 |      "output_type": "execute_result"
335 |     }
336 |    ],
337 |    "source": [
338 |     "vectorizer = SurnamesVectorizer.fit(raw_data)\n",
339 |     "\n",
340 |     "vectorizer.nationality_vocab, vectorizer.surname_vocab"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "code",
345 |    "execution_count": 11,
346 |    "metadata": {},
347 |    "outputs": [
348 |     {
349 |      "data": {
350 |       "text/plain": [
351 |        "True"
352 |       ]
353 |      },
354 |      "execution_count": 11,
355 |      "metadata": {},
356 |      "output_type": "execute_result"
357 |     }
358 |    ],
359 |    "source": [
360 |     "vec_train = vectorizer.transform(raw_data, split='train')\n",
361 |     "isinstance(vec_train, Dataset)"
362 |    ]
363 |   },
364 |   {
365 |    "cell_type": "code",
366 |    "execution_count": 12,
367 |    "metadata": {},
368 |    "outputs": [
369 |     {
370 |      "data": {
371 |       "text/plain": [
372 |        "(array([[ 1,  3,  4, ...,  0,  0,  0],\n",
373 |        "        [ 1,  6,  7, ...,  0,  0,  0],\n",
374 |        "        [ 1, 15,  4, ...,  0,  0,  0],\n",
375 |        "        ...,\n",
376 |        "        [ 1, 44, 12, ...,  0,  0,  0],\n",
377 |        "        [ 1, 17, 26, ...,  0,  0,  0],\n",
378 |        "        [ 1, 32,  4, ...,  0,  0,  0]]), (16059, 22))"
379 |       ]
380 |      },
381 |      "execution_count": 12,
382 |      "metadata": {},
383 |      "output_type": "execute_result"
384 |     }
385 |    ],
386 |    "source": [
387 |     "vec_train.x_surnames, vec_train.x_surnames.shape"
388 |    ]
389 |   },
390 |   {
391 |    "cell_type": "code",
392 |    "execution_count": 13,
393 |    "metadata": {},
394 |    "outputs": [
395 |     {
396 |      "data": {
397 |       "text/plain": [
398 |        "(array([0, 1, 1, ..., 1, 1, 1]), (16059,))"
399 |       ]
400 |      },
401 |      "execution_count": 13,
402 |      "metadata": {},
403 |      "output_type": "execute_result"
404 |     }
405 |    ],
406 |    "source": [
407 |     "vec_train.y_nationalities, vec_train.y_nationalities.shape"
408 |    ]
409 |   },
410 |   {
411 |    "cell_type": "code",
412 |    "execution_count": 14,
413 |    "metadata": {},
414 |    "outputs": [
415 |     {
416 |      "data": {
417 |       "text/plain": [
418 |        "dict_keys(['x_surnames', 'y_nationalities', 'x_lengths'])"
419 |       ]
420 |      },
421 |      "execution_count": 14,
422 |      "metadata": {},
423 |      "output_type": "execute_result"
424 |     }
425 |    ],
426 |    "source": [
427 |     "# let's say we are making a randomized batch. \n",
428 |     "batch_generator = generate_batches(vec_train, batch_size=16)\n",
429 |     "\n",
430 |     "batch_dict = next(batch_generator)\n",
431 |     "batch_dict.keys()"
432 |    ]
433 |   },
434 |   {
435 |    "cell_type": "code",
436 |    "execution_count": 15,
437 |    "metadata": {},
438 |    "outputs": [
439 |     {
440 |      "data": {
441 |       "text/plain": [
442 |        "torch.Size([16, 22])"
443 |       ]
444 |      },
445 |      "execution_count": 15,
446 |      "metadata": {},
447 |      "output_type": "execute_result"
448 |     }
449 |    ],
450 |    "source": [
451 |     "batch_dict['x_surnames'].shape"
452 |    ]
453 |   },
454 |   {
455 |    "cell_type": "markdown",
456 |    "metadata": {},
457 |    "source": [
458 |     "## Embedding sequences\n",
459 |     "\n",
460 |     "Let's take a look at how sequences are embedded"
461 |    ]
462 |   },
463 |   {
464 |    "cell_type": "code",
465 |    "execution_count": 16,
466 |    "metadata": {},
467 |    "outputs": [
468 |     {
469 |      "data": {
470 |       "text/plain": [
471 |        "{'<MASK>': 0,\n",
472 |        " '^': 1,\n",
473 |        " '_': 2,\n",
474 |        " 'H': 3,\n",
475 |        " 'a': 4,\n",
476 |        " 'd': 5,\n",
477 |        " 'P': 6,\n",
478 |        " 'r': 7,\n",
479 |        " 'i': 8,\n",
480 |        " 'k': 9,\n",
481 |        " 'z': 10,\n",
482 |        " 'c': 11,\n",
483 |        " 'h': 12,\n",
484 |        " 'o': 13,\n",
485 |        " 'v': 14,\n",
486 |        " 'B': 15,\n",
487 |        " 'j': 16,\n",
488 |        " 'A': 17,\n",
489 |        " 'w': 18,\n",
490 |        " 'u': 19,\n",
491 |        " 'e': 20,\n",
492 |        " 's': 21,\n",
493 |        " 'y': 22,\n",
494 |        " 'J': 23,\n",
495 |        " 'b': 24,\n",
496 |        " 'l': 25,\n",
497 |        " 'n': 26,\n",
498 |        " 'I': 27,\n",
499 |        " 'm': 28,\n",
500 |        " 'K': 29,\n",
501 |        " 't': 30,\n",
502 |        " 'M': 31,\n",
503 |        " 'G': 32,\n",
504 |        " 'f': 33,\n",
505 |        " 'W': 34,\n",
506 |        " 'T': 35,\n",
507 |        " 'F': 36,\n",
508 |        " 'g': 37,\n",
509 |        " 'L': 38,\n",
510 |        " 'S': 39,\n",
511 |        " 'p': 40,\n",
512 |        " 'E': 41,\n",
513 |        " 'R': 42,\n",
514 |        " 'O': 43,\n",
515 |        " 'Z': 44,\n",
516 |        " 'V': 45,\n",
517 |        " 'C': 46,\n",
518 |        " 'Y': 47,\n",
519 |        " 'N': 48,\n",
520 |        " 'D': 49,\n",
521 |        " 'x': 50,\n",
522 |        " 'é': 51,\n",
523 |        " 'U': 52,\n",
524 |        " ' ': 53,\n",
525 |        " 'à': 54,\n",
526 |        " 'q': 55,\n",
527 |        " \"'\": 56,\n",
528 |        " 'ó': 57,\n",
529 |        " 'ö': 58,\n",
530 |        " 'ü': 59,\n",
531 |        " 'Q': 60,\n",
532 |        " 'X': 61,\n",
533 |        " 'í': 62,\n",
534 |        " '-': 63,\n",
535 |        " 'ń': 64,\n",
536 |        " 'ä': 65,\n",
537 |        " 'ê': 66,\n",
538 |        " 'ú': 67,\n",
539 |        " 'ñ': 68,\n",
540 |        " 'á': 69,\n",
541 |        " ',': 70,\n",
542 |        " 'Ś': 71,\n",
543 |        " 'ą': 72,\n",
544 |        " 'ò': 73,\n",
545 |        " 'ã': 74,\n",
546 |        " 'ß': 75,\n",
547 |        " 'ù': 76,\n",
548 |        " 'õ': 77,\n",
549 |        " 'ì': 78,\n",
550 |        " 'è': 79,\n",
551 |        " '/': 80,\n",
552 |        " 'ł': 81,\n",
553 |        " '1': 82,\n",
554 |        " 'Á': 83,\n",
555 |        " 'ż': 84,\n",
556 |        " ':': 85,\n",
557 |        " 'ç': 86,\n",
558 |        " 'Ż': 87,\n",
559 |        " '\\xa0': 88,\n",
560 |        " 'É': 89}"
561 |       ]
562 |      },
563 |      "execution_count": 16,
564 |      "metadata": {},
565 |      "output_type": "execute_result"
566 |     }
567 |    ],
568 |    "source": [
569 |     "vectorizer.surname_vocab._mapping"
570 |    ]
571 |   },
572 |   {
573 |    "cell_type": "code",
574 |    "execution_count": 15,
575 |    "metadata": {},
576 |    "outputs": [],
577 |    "source": [
578 |     "n_surname_characters = len(vectorizer.surname_vocab)\n",
579 |     "# padding_idx is very important!\n",
580 |     "emb = nn.Embedding(embedding_dim=8, num_embeddings=n_surname_characters, padding_idx=0)"
581 |    ]
582 |   },
583 |   {
584 |    "cell_type": "code",
585 |    "execution_count": 16,
586 |    "metadata": {},
587 |    "outputs": [
588 |     {
589 |      "data": {
590 |       "text/plain": [
591 |        "torch.Size([16, 22, 8])"
592 |       ]
593 |      },
594 |      "execution_count": 16,
595 |      "metadata": {},
596 |      "output_type": "execute_result"
597 |     }
598 |    ],
599 |    "source": [
600 |     "x_seq = emb(batch_dict['x_surnames'])\n",
601 |     "x_seq.size()"
602 |    ]
603 |   },
604 |   {
605 |    "cell_type": "markdown",
606 |    "metadata": {},
607 |    "source": [
608 |     "### Common Pattern: putting sequence dimension on dimension 0\n",
609 |     "\n",
610 |     "Because dimension 0 is faster to index and easier to write code for, the dimensions are often permuted to put the sequence on the first dimension. This is done as follows:"
611 |    ]
612 |   },
613 |   {
614 |    "cell_type": "code",
615 |    "execution_count": 17,
616 |    "metadata": {},
617 |    "outputs": [
618 |     {
619 |      "data": {
620 |       "text/plain": [
621 |        "torch.Size([22, 16, 8])"
622 |       ]
623 |      },
624 |      "execution_count": 17,
625 |      "metadata": {},
626 |      "output_type": "execute_result"
627 |     }
628 |    ],
629 |    "source": [
630 |     "# where this swaps 1 and 0. if we did it twice, it would swap back. \n",
631 |     "x_seq_on_dim0 = x_seq.permute(1, 0, 2)\n",
632 |     "x_seq_on_dim0.size()"
633 |    ]
634 |   },
635 |   {
636 |    "cell_type": "markdown",
637 |    "metadata": {},
638 |    "source": [
639 |     "So, later, when we want to get the 5th item in the sequence, we can simply index into the first dimension:"
640 |    ]
641 |   },
642 |   {
643 |    "cell_type": "code",
644 |    "execution_count": 18,
645 |    "metadata": {},
646 |    "outputs": [
647 |     {
648 |      "data": {
649 |       "text/plain": [
650 |        "torch.Size([16, 8])"
651 |       ]
652 |      },
653 |      "execution_count": 18,
654 |      "metadata": {},
655 |      "output_type": "execute_result"
656 |     }
657 |    ],
658 |    "source": [
659 |     "x_5th_step = x_seq_on_dim0[4, :, :]\n",
660 |     "x_5th_step.size()"
661 |    ]
662 |   },
663 |   {
664 |    "cell_type": "markdown",
665 |    "metadata": {},
666 |    "source": [
667 |     "So, this is the gist of how we will be using sequences as tensors. We construct a matrix of integer indices, use an embedding module to retrieve their corresponding vectors, and then move the sequence to the first dimension so we can index into it more easily and quickly. "
668 |    ]
669 |   }
670 |  ],
671 |  "metadata": {
672 |   "kernelspec": {
673 |    "display_name": "pytorch04",
674 |    "language": "python",
675 |    "name": "pytorch04"
676 |   },
677 |   "language_info": {
678 |    "codemirror_mode": {
679 |     "name": "ipython",
680 |     "version": 3
681 |    },
682 |    "file_extension": ".py",
683 |    "mimetype": "text/x-python",
684 |    "name": "python",
685 |    "nbconvert_exporter": "python",
686 |    "pygments_lexer": "ipython3",
687 |    "version": "3.6.6"
688 |   }
689 |  },
690 |  "nbformat": 4,
691 |  "nbformat_minor": 2
692 | }
693 | 


--------------------------------------------------------------------------------
/day_2/03-Char-RNN-Conditionally-Predict-Surnames.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from argparse import Namespace\n",
 10 |     "import json\n",
 11 |     "import os\n",
 12 |     "\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "import numpy as np\n",
 15 |     "import pandas as pd\n",
 16 |     "import seaborn as sns\n",
 17 |     "import torch\n",
 18 |     "import torch.nn as nn\n",
 19 |     "import torch.nn.functional as F\n",
 20 |     "import torch.optim as optim\n",
 21 |     "from torch.utils.data import Dataset, DataLoader\n",
 22 |     "from tqdm import tqdm_notebook\n",
 23 |     "\n",
 24 |     "from vocabulary import Vocabulary\n",
 25 |     "\n",
 26 |     "%matplotlib inline\n",
 27 |     "\n",
 28 |     "plt.style.use('fivethirtyeight')\n",
 29 |     "plt.rcParams['figure.figsize'] = (14, 6)\n",
 30 |     "\n",
 31 |     "START_TOKEN = \"^\"\n",
 32 |     "END_TOKEN = \"_\"\n",
 33 |     "IGNORE_INDEX_VALUE = -1"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## Class Definitions \n",
 41 |     "\n",
 42 |     "Data Model:\n",
 43 |     "- Raw data\n",
 44 |     "- Vectorizer\n",
 45 |     "- Vectorized Data\n",
 46 |     "- Data generator"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 2,
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "class RawSurnames(object):\n",
 56 |     "    def __init__(self, data_path, delimiter=\",\"):\n",
 57 |     "        self.data = pd.read_csv(data_path, delimiter=delimiter)\n",
 58 |     "\n",
 59 |     "    def get_data(self, filter_to_nationality=None):\n",
 60 |     "        if filter_to_nationality is not None:\n",
 61 |     "            return self.data[self.data.nationality.isin(filter_to_nationality)]\n",
 62 |     "        return self.data\n",
 63 |     "\n",
 64 |     "# vectorizer\n",
 65 |     "\n",
 66 |     "class SurnamesVectorizer(object):\n",
 67 |     "    def __init__(self, surname_vocab, nationality_vocab, max_seq_length):\n",
 68 |     "        self.surname_vocab = surname_vocab\n",
 69 |     "        self.nationality_vocab = nationality_vocab\n",
 70 |     "        self.max_seq_length = max_seq_length\n",
 71 |     "        \n",
 72 |     "    def save(self, filename):\n",
 73 |     "        vec_dict = {\"surname_vocab\": self.surname_vocab.get_serializable_contents(),\n",
 74 |     "                    \"nationality_vocab\": self.nationality_vocab.get_serializable_contents(),\n",
 75 |     "                    'max_seq_length': self.max_seq_length}\n",
 76 |     "\n",
 77 |     "        with open(filename, \"w\") as fp:\n",
 78 |     "            json.dump(vec_dict, fp)\n",
 79 |     "        \n",
 80 |     "    @classmethod\n",
 81 |     "    def load(cls, filename):\n",
 82 |     "        with open(filename, \"r\") as fp:\n",
 83 |     "            vec_dict = json.load(fp)\n",
 84 |     "\n",
 85 |     "        vec_dict[\"surname_vocab\"] = Vocabulary.deserialize_from_contents(vec_dict[\"surname_vocab\"])\n",
 86 |     "        vec_dict[\"nationality_vocab\"] = Vocabulary.deserialize_from_contents(vec_dict[\"nationality_vocab\"])\n",
 87 |     "        return cls(**vec_dict)\n",
 88 |     "\n",
 89 |     "    @classmethod\n",
 90 |     "    def fit(cls, surname_df):\n",
 91 |     "        surname_vocab = Vocabulary(use_unks=False,\n",
 92 |     "                                   use_mask=True,\n",
 93 |     "                                   use_start_end=True,\n",
 94 |     "                                   start_token=START_TOKEN,\n",
 95 |     "                                   end_token=END_TOKEN)\n",
 96 |     "\n",
 97 |     "        nationality_vocab = Vocabulary(use_unks=False, use_start_end=False, use_mask=False)\n",
 98 |     "\n",
 99 |     "        max_seq_length = 0\n",
100 |     "        for index, row in surname_df.iterrows():\n",
101 |     "            surname_vocab.add_many(row.surname)\n",
102 |     "            nationality_vocab.add(row.nationality)\n",
103 |     "\n",
104 |     "            if len(row.surname) > max_seq_length:\n",
105 |     "                max_seq_length = len(row.surname)\n",
106 |     "        max_seq_length = max_seq_length + 2\n",
107 |     "\n",
108 |     "        return cls(surname_vocab, nationality_vocab, max_seq_length)\n",
109 |     "\n",
110 |     "    @classmethod\n",
111 |     "    def fit_transform(cls, surname_df, split='train'):\n",
112 |     "        vectorizer = cls.fit(surname_df)\n",
113 |     "        return vectorizer, vectorizer.transform(surname_df, split)\n",
114 |     "    \n",
115 |     "    def transform(self, surname_df, split='train'):\n",
116 |     "\n",
117 |     "        df = surname_df[surname_df.split==split].reset_index()\n",
118 |     "        n_data = len(df)\n",
119 |     "        \n",
120 |     "        x_surnames = np.zeros((n_data, self.max_seq_length), dtype=np.int64)\n",
121 |     "        y_surnames = np.ones((n_data, self.max_seq_length), dtype=np.int64) * IGNORE_INDEX_VALUE\n",
122 |     "        x_nationalities = np.zeros(n_data, dtype=np.int64)\n",
123 |     "\n",
124 |     "        for index, row in df.iterrows():\n",
125 |     "            vectorized_surname = list(self.surname_vocab.map(row.surname, \n",
126 |     "                                                             include_start_end=True))\n",
127 |     "            x_part = vectorized_surname[:-1]\n",
128 |     "            y_part = vectorized_surname[1:]\n",
129 |     "            x_surnames[index, :len(x_part)] = x_part\n",
130 |     "            y_surnames[index, :len(y_part)] = y_part\n",
131 |     "            x_nationalities[index] = self.nationality_vocab[row.nationality]\n",
132 |     "\n",
133 |     "        return VectorizedSurnames(x_surnames, x_nationalities, y_surnames)\n",
134 |     "\n",
135 |     "# vec data\n",
136 |     "\n",
137 |     "class VectorizedSurnames(Dataset):\n",
138 |     "    def __init__(self, x_surnames, x_nationalities, y_surnames):\n",
139 |     "        self.x_surnames = x_surnames\n",
140 |     "        self.x_nationalities = x_nationalities\n",
141 |     "        self.y_surnames = y_surnames\n",
142 |     "\n",
143 |     "    def __len__(self):\n",
144 |     "        return len(self.x_surnames)\n",
145 |     "\n",
146 |     "    def __getitem__(self, index):\n",
147 |     "        return {'x_surnames': self.x_surnames[index],\n",
148 |     "                'x_nationalities': self.x_nationalities[index],\n",
149 |     "                'y_surnames': self.y_surnames[index],\n",
150 |     "                'x_lengths': len(self.x_surnames[index].nonzero()[0])}\n",
151 |     "\n",
152 |     "# data generator\n",
153 |     "\n",
154 |     "def generate_batches(dataset, batch_size, shuffle=True,\n",
155 |     "                     drop_last=True, device=\"cpu\"): \n",
156 |     "    \"\"\"\n",
157 |     "    A generator function which wraps the PyTorch DataLoader. It will \n",
158 |     "      ensure each tensor is on the right device location.\n",
159 |     "    \"\"\"\n",
160 |     "    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,\n",
161 |     "                            shuffle=shuffle, drop_last=drop_last)\n",
162 |     "\n",
163 |     "    for data_dict in dataloader:\n",
164 |     "        out_data_dict = {}\n",
165 |     "        for name, tensor in data_dict.items():\n",
166 |     "            out_data_dict[name] = data_dict[name].to(device)\n",
167 |     "        yield out_data_dict"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "markdown",
172 |    "metadata": {},
173 |    "source": [
174 |     "## Class definitions for the model"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": 3,
180 |    "metadata": {},
181 |    "outputs": [],
182 |    "source": [
183 |     "def new_parameter(*size):\n",
184 |     "    out = torch.randn(*size, requires_grad=True, dtype=torch.float32)\n",
185 |     "    torch.nn.init.xavier_normal_(out)\n",
186 |     "    return nn.Parameter(out)\n",
187 |     "\n",
188 |     "def column_gather(y_out, x_lengths):\n",
189 |     "    '''Get a specific vector from each batch datapoint in `y_out`.\n",
190 |     "\n",
191 |     "    More precisely, iterate over batch row indices, get the vector that's at\n",
192 |     "    the position indicated by the corresponding value in `x_lengths` at the row\n",
193 |     "    index.\n",
194 |     "\n",
195 |     "    Args:\n",
196 |     "        y_out (torch.FloatTensor, torch.cuda.FloatTensor)\n",
197 |     "            shape: (batch, sequence, feature)\n",
198 |     "        x_lengths (torch.LongTensor, torch.cuda.LongTensor)\n",
199 |     "            shape: (batch,)\n",
200 |     "\n",
201 |     "    Returns:\n",
202 |     "        y_out (torch.FloatTensor, torch.cuda.FloatTensor)\n",
203 |     "            shape: (batch, feature)\n",
204 |     "    '''\n",
205 |     "    x_lengths = x_lengths.long().detach().cpu().numpy() - 1\n",
206 |     "\n",
207 |     "    out = []\n",
208 |     "    for batch_index, column_index in enumerate(x_lengths):\n",
209 |     "        out.append(y_out[batch_index, column_index])\n",
210 |     "\n",
211 |     "    return torch.stack(out)\n",
212 |     "\n",
213 |     "\n",
214 |     "class ExplicitRNN(nn.Module):\n",
215 |     "    def __init__(self, input_size, hidden_size, batch_first=False):\n",
216 |     "        super(ExplicitRNN, self).__init__()\n",
217 |     "        self.W_in2hid = new_parameter(input_size, hidden_size)\n",
218 |     "        self.W_hid2hid = new_parameter(hidden_size, hidden_size)\n",
219 |     "            \n",
220 |     "        self.b_hid = new_parameter(1, hidden_size)\n",
221 |     "        \n",
222 |     "        self.hidden_size = hidden_size\n",
223 |     "\n",
224 |     "        self.batch_first = batch_first\n",
225 |     "    \n",
226 |     "    def _compute_next_hidden(self, x, h):\n",
227 |     "        return F.tanh(x.matmul(self.W_in2hid) + \n",
228 |     "                      h.matmul(self.W_hid2hid) + \n",
229 |     "                      self.b_hid)\n",
230 |     "\n",
231 |     "    def forward(self, x_in, hid_t=None):\n",
232 |     "        if self.batch_first:\n",
233 |     "            batch_size, seq_size, feat_size = x_in.size()\n",
234 |     "            x_in = x_in.permute(1, 0, 2)\n",
235 |     "        else:\n",
236 |     "            seq_size, batch_size, feat_size = x_in.size()\n",
237 |     "\n",
238 |     "        hiddens = []\n",
239 |     "        if hid_t is None:\n",
240 |     "            hid_t = torch.ones((batch_size, self.hidden_size))\n",
241 |     "        \n",
242 |     "        if x_in.is_cuda:\n",
243 |     "            hid_t = hid_t.cuda()\n",
244 |     "            \n",
245 |     "        for t in range(seq_size):\n",
246 |     "            x_t = x_in[t]\n",
247 |     "            hid_t = self._compute_next_hidden(x_t, hid_t)\n",
248 |     "            \n",
249 |     "            hiddens.append(hid_t)\n",
250 |     "        hiddens = torch.stack(hiddens)\n",
251 |     "\n",
252 |     "        if self.batch_first:\n",
253 |     "            hiddens = hiddens.permute(1, 0, 2)\n",
254 |     "\n",
255 |     "        return hiddens\n",
256 |     "    \n",
257 |     "    \n",
258 |     "class CharNN(nn.Module):\n",
259 |     "    def __init__(self, embedding_size, in_vocab_size, out_vocab_size, hidden_size, num_conditioning_states,\n",
260 |     "                 batch_first=False):\n",
261 |     "        super(CharNN, self).__init__()\n",
262 |     "        \n",
263 |     "        self.emb = nn.Embedding(embedding_dim=embedding_size, \n",
264 |     "                                num_embeddings=in_vocab_size, \n",
265 |     "                                padding_idx=0)\n",
266 |     "        self.conditional_emb = nn.Embedding(embedding_dim=hidden_size, \n",
267 |     "                                            num_embeddings=num_conditioning_states)\n",
268 |     "        self.fc = nn.Linear(in_features=hidden_size, out_features=out_vocab_size)\n",
269 |     "        self.rnn = ExplicitRNN(input_size=embedding_size, hidden_size=hidden_size, \n",
270 |     "                               batch_first=batch_first)\n",
271 |     "    \n",
272 |     "    def forward(self, x_in, state_in, x_lengths=None, apply_softmax=False):\n",
273 |     "        x_in = self.emb(x_in)\n",
274 |     "        state_in = self.conditional_emb(state_in)\n",
275 |     "        y_out = self.rnn(x_in, state_in)\n",
276 |     "\n",
277 |     "        dim0, dim1, dim2 = y_out.size()\n",
278 |     "        y_out = y_out.contiguous().view(-1, dim2)\n",
279 |     "\n",
280 |     "        y_out = self.fc(y_out)\n",
281 |     "\n",
282 |     "        # optionally apply the softmax\n",
283 |     "        if apply_softmax:\n",
284 |     "            y_out = F.softmax(y_out, dim=1)\n",
285 |     "\n",
286 |     "        y_out = y_out.view(dim0, dim1, -1)\n",
287 |     "        \n",
288 |     "        return y_out\n",
289 |     "    \n",
290 |     "def normalize_sizes(net_output, y_true):\n",
291 |     "    net_output = net_output.cpu()\n",
292 |     "    y_true = y_true.cpu()\n",
293 |     "    if len(net_output.size()) == 3:\n",
294 |     "        net_output.contiguous()\n",
295 |     "        net_output = net_output.view(-1, net_output.size(2))\n",
296 |     "    if len(y_true.size()) == 2:\n",
297 |     "        y_true.contiguous()\n",
298 |     "        y_true = y_true.view(-1)\n",
299 |     "    return net_output, y_true\n",
300 |     "\n",
301 |     "def compute_accuracy(y_pred, y_true, mask_index):\n",
302 |     "    y_pred, y_true = normalize_sizes(y_pred, y_true)\n",
303 |     "\n",
304 |     "    _, y_pred_indices = y_pred.max(dim=1)\n",
305 |     "    \n",
306 |     "    correct_indices = torch.eq(y_pred_indices, y_true).float()\n",
307 |     "    valid_indices = torch.ne(y_true, mask_index).float()\n",
308 |     "    \n",
309 |     "    n_correct = (correct_indices * valid_indices).sum().item()\n",
310 |     "    n_valid = valid_indices.sum().item()\n",
311 |     "\n",
312 |     "    return n_correct / n_valid * 100"
313 |    ]
314 |   },
315 |   {
316 |    "cell_type": "code",
317 |    "execution_count": 4,
318 |    "metadata": {},
319 |    "outputs": [
320 |     {
321 |      "name": "stdout",
322 |      "output_type": "stream",
323 |      "text": [
324 |       "Using CUDA: False\n"
325 |      ]
326 |     },
327 |     {
328 |      "data": {
329 |       "text/plain": [
330 |        "device(type='cpu')"
331 |       ]
332 |      },
333 |      "execution_count": 4,
334 |      "metadata": {},
335 |      "output_type": "execute_result"
336 |     }
337 |    ],
338 |    "source": [
339 |     "args = Namespace(\n",
340 |     "    surname_csv=\"../data/surnames.csv\",\n",
341 |     "    batch_size = 128,\n",
342 |     "    cuda=False,\n",
343 |     "    learning_rate=0.001,\n",
344 |     "    num_epochs=100,\n",
345 |     "    load_zoo_model=True,\n",
346 |     "    zoo={\n",
347 |     "        'filename': '../modelzoo/charnn_emb16_hid64_surnames_conditionally_predict.state',\n",
348 |     "        'vocab': '../modelzoo/surnames_classify.vocab',\n",
349 |     "        'comments': 'pre-trained surname conditioned sequence prediction (& conditioned generation)',\n",
350 |     "        'parameters': {\n",
351 |     "            'embedding_size': 16,\n",
352 |     "            'hidden_size': 64\n",
353 |     "        }\n",
354 |     "    }\n",
355 |     ")\n",
356 |     "\n",
357 |     "\n",
358 |     "\n",
359 |     "# Check CUDA\n",
360 |     "if not torch.cuda.is_available():\n",
361 |     "    args.cuda = False\n",
362 |     "\n",
363 |     "print(\"Using CUDA: {}\".format(args.cuda))\n",
364 |     "\n",
365 |     "args.device = torch.device(\"cuda\" if args.cuda else \"cpu\")\n",
366 |     "args.device"
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "code",
371 |    "execution_count": 5,
372 |    "metadata": {},
373 |    "outputs": [],
374 |    "source": [
375 |     "# optional: set this to false to learn from scratch!\n",
376 |     "# args.load_zoo_model = False"
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "code",
381 |    "execution_count": 6,
382 |    "metadata": {},
383 |    "outputs": [
384 |     {
385 |      "name": "stdout",
386 |      "output_type": "stream",
387 |      "text": [
388 |       "Loading vectorizer!\n",
389 |       "Loading state dict!\n"
390 |      ]
391 |     }
392 |    ],
393 |    "source": [
394 |     "raw_data = RawSurnames(args.surname_csv).get_data()\n",
395 |     "\n",
396 |     "if os.path.exists(args.zoo['vocab']):\n",
397 |     "    vectorizer = SurnamesVectorizer.load(args.zoo['vocab'])\n",
398 |     "    print(\"Loading vectorizer!\")\n",
399 |     "else:\n",
400 |     "    vectorizer = SurnamesVectorizer.fit(raw_data)\n",
401 |     "    print(\"Creating a new vectorizer.\")\n",
402 |     "    \n",
403 |     "    \n",
404 |     "train_dataset = vectorizer.transform(raw_data, split='train')\n",
405 |     "test_dataset = vectorizer.transform(raw_data, split='test')\n",
406 |     "\n",
407 |     "\n",
408 |     "zoo_params = args.zoo['parameters']\n",
409 |     "\n",
410 |     "net = CharNN(embedding_size=zoo_params['embedding_size'], \n",
411 |     "             hidden_size=zoo_params['hidden_size'],\n",
412 |     "             in_vocab_size=len(vectorizer.surname_vocab), \n",
413 |     "             out_vocab_size=len(vectorizer.surname_vocab), \n",
414 |     "             num_conditioning_states=len(vectorizer.nationality_vocab),\n",
415 |     "             batch_first=True)\n",
416 |     "\n",
417 |     "if args.load_zoo_model and os.path.exists(args.zoo['filename']):\n",
418 |     "    print(\"Loading state dict!\")\n",
419 |     "    net.load_state_dict(torch.load(args.zoo['filename'], \n",
420 |     "                                   map_location=lambda storage, loc: storage))\n",
421 |     "else:\n",
422 |     "    print(\"Using newly initiated network!\")"
423 |    ]
424 |   },
425 |   {
426 |    "cell_type": "code",
427 |    "execution_count": 7,
428 |    "metadata": {},
429 |    "outputs": [
430 |     {
431 |      "data": {
432 |       "application/vnd.jupyter.widget-view+json": {
433 |        "model_id": "b88870b8058346958583d6341f90616c",
434 |        "version_major": 2,
435 |        "version_minor": 0
436 |       },
437 |       "text/plain": [
438 |        "HBox(children=(IntProgress(value=0, description='epochs'), HTML(value='')))"
439 |       ]
440 |      },
441 |      "metadata": {},
442 |      "output_type": "display_data"
443 |     },
444 |     {
445 |      "data": {
446 |       "application/vnd.jupyter.widget-view+json": {
447 |        "model_id": "6b834ec74cda4fc8bd1fccd1decf8320",
448 |        "version_major": 2,
449 |        "version_minor": 0
450 |       },
451 |       "text/plain": [
452 |        "HBox(children=(IntProgress(value=0, description='training', max=125), HTML(value='')))"
453 |       ]
454 |      },
455 |      "metadata": {},
456 |      "output_type": "display_data"
457 |     },
458 |     {
459 |      "data": {
460 |       "application/vnd.jupyter.widget-view+json": {
461 |        "model_id": "34359d0f78374df28762205749ab1429",
462 |        "version_major": 2,
463 |        "version_minor": 0
464 |       },
465 |       "text/plain": [
466 |        "HBox(children=(IntProgress(value=0, description='test', max=31), HTML(value='')))"
467 |       ]
468 |      },
469 |      "metadata": {},
470 |      "output_type": "display_data"
471 |     },
472 |     {
473 |      "name": "stdout",
474 |      "output_type": "stream",
475 |      "text": [
476 |       "...\n"
477 |      ]
478 |     }
479 |    ],
480 |    "source": [
481 |     "net = net.to(args.device)\n",
482 |     "    \n",
483 |     "# optimizer \n",
484 |     "\n",
485 |     "optimizer = optim.Adam(net.parameters(), lr=args.learning_rate)\n",
486 |     "\n",
487 |     "# loss function\n",
488 |     "\n",
489 |     "def sequence_loss(y_pred, y_true, mask_index):\n",
490 |     "    y_pred, y_true = normalize_sizes(y_pred, y_true)\n",
491 |     "    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)\n",
492 |     "\n",
493 |     "# progress bars\n",
494 |     "\n",
495 |     "epoch_bar = tqdm_notebook(desc='epochs', total=args.num_epochs, position=1)\n",
496 |     "\n",
497 |     "num_train_batches = len(train_dataset) // args.batch_size\n",
498 |     "train_bar = tqdm_notebook(desc='training', total=num_train_batches, position=2)\n",
499 |     "\n",
500 |     "num_test_batches = len(test_dataset) // args.batch_size\n",
501 |     "test_bar = tqdm_notebook(desc='test', total=num_test_batches, position=3)\n",
502 |     "\n",
503 |     "# history\n",
504 |     "\n",
505 |     "train_loss_history = []\n",
506 |     "train_accuracy_history = []\n",
507 |     "\n",
508 |     "test_loss_history = []\n",
509 |     "test_accuracy_history = []\n",
510 |     "\n",
511 |     "\n",
512 |     "try:\n",
513 |     "    for _ in range(args.num_epochs):\n",
514 |     "        batch_generator = generate_batches(train_dataset, batch_size=args.batch_size,\n",
515 |     "                                           device=args.device)\n",
516 |     "        \n",
517 |     "        per_epoch_loss = []\n",
518 |     "        per_epoch_accuracy = []\n",
519 |     "        \n",
520 |     "        net.train()\n",
521 |     "            \n",
522 |     "        for batch_dict in batch_generator:\n",
523 |     "            # step 1\n",
524 |     "            optimizer.zero_grad()\n",
525 |     "            \n",
526 |     "            # step 2\n",
527 |     "            y_pred = net(batch_dict['x_surnames'], \n",
528 |     "                         batch_dict['x_nationalities'],\n",
529 |     "                         batch_dict['x_lengths'])\n",
530 |     "            y_target = batch_dict['y_surnames'] \n",
531 |     "            \n",
532 |     "            # step 3\n",
533 |     "            loss = sequence_loss(y_pred, y_target, IGNORE_INDEX_VALUE)\n",
534 |     "\n",
535 |     "            # step 4\n",
536 |     "            loss.backward()\n",
537 |     "            optimizer.step()\n",
538 |     "\n",
539 |     "            # bonus steps: bookkeeping\n",
540 |     "            per_epoch_loss.append(loss.item())        \n",
541 |     "        \n",
542 |     "            accuracy = compute_accuracy(y_pred, batch_dict['y_surnames'], IGNORE_INDEX_VALUE)\n",
543 |     "            per_epoch_accuracy.append(accuracy)\n",
544 |     "\n",
545 |     "            train_bar.update()\n",
546 |     "            \n",
547 |     "            train_bar.set_postfix(loss=per_epoch_loss[-1], \n",
548 |     "                                  accuracy=per_epoch_accuracy[-1])\n",
549 |     "            \n",
550 |     "        train_loss_history.append(np.mean(per_epoch_loss))\n",
551 |     "        train_accuracy_history.append(np.mean(per_epoch_accuracy))\n",
552 |     "        \n",
553 |     "        # loop over test dataset\n",
554 |     "        \n",
555 |     "        batch_generator = generate_batches(test_dataset, batch_size=args.batch_size, \n",
556 |     "                                           device=args.device)\n",
557 |     "        \n",
558 |     "        per_epoch_loss = []\n",
559 |     "        per_epoch_accuracy = []\n",
560 |     "            \n",
561 |     "        # set it to eval mode; this turns stochastic functions off\n",
562 |     "        net.eval()\n",
563 |     "            \n",
564 |     "        for batch_dict in batch_generator:\n",
565 |     "            # step 1: compute output\n",
566 |     "            y_pred = net(batch_dict['x_surnames'], \n",
567 |     "                         batch_dict['x_nationalities'],\n",
568 |     "                         batch_dict['x_lengths'])\n",
569 |     "            y_target = batch_dict['y_surnames'] \n",
570 |     "            \n",
571 |     "            # step 2: compute metrics\n",
572 |     "            loss = sequence_loss(y_pred, y_target, IGNORE_INDEX_VALUE)\n",
573 |     "            per_epoch_loss.append(loss.item())\n",
574 |     "          \n",
575 |     "            accuracy = compute_accuracy(y_pred, batch_dict['y_surnames'], IGNORE_INDEX_VALUE)\n",
576 |     "            per_epoch_accuracy.append(accuracy)\n",
577 |     "\n",
578 |     "            test_bar.update()\n",
579 |     "            \n",
580 |     "            test_bar.set_postfix(loss=per_epoch_loss[-1], \n",
581 |     "                                 accuracy=per_epoch_accuracy[-1])\n",
582 |     "            \n",
583 |     "        test_loss_history.append(np.mean(per_epoch_loss))\n",
584 |     "        test_accuracy_history.append(np.mean(per_epoch_accuracy))\n",
585 |     "        \n",
586 |     "        # update bars\n",
587 |     "        \n",
588 |     "        epoch_bar.set_postfix(train_loss=train_loss_history[-1], \n",
589 |     "                              train_accuracy=train_accuracy_history[-1],\n",
590 |     "                              test_loss=test_loss_history[-1],\n",
591 |     "                              test_accuracy=test_accuracy_history[-1])\n",
592 |     "        epoch_bar.update()\n",
593 |     "        test_bar.n = 0\n",
594 |     "        train_bar.n = 0\n",
595 |     "        \n",
596 |     "except KeyboardInterrupt:\n",
597 |     "    print(\"...\")"
598 |    ]
599 |   },
600 |   {
601 |    "cell_type": "markdown",
602 |    "metadata": {},
603 |    "source": [
604 |     "## Sampling\n",
605 |     "\n",
606 |     "In the plain prediction notebook, the exercise was to create the sampling routine. Here the sampling code is already written, so you can see how the conditioning is handled: the nationality embedding is used as the initial hidden state of the RNN.  \n",
607 |     "\n",
608 |     "As an exercise, you should consider what it would mean to interpolate between two conditioning vectors!  \n",
609 |     "\n",
610 |     "For instance, you could take the Irish and Chinese embeddings and average them (which is the same thing as multiplying each vector by 0.5 and adding them together!). \n",
611 |     "\n",
612 |     "Or you could weight them 0.3 and 0.7 and then add them together.  This is referred to as interpolation.  The weights should add up to 1 to be a valid interpolation. "
613 |    ]
614 |   },
615 |   {
616 |    "cell_type": "code",
617 |    "execution_count": 8,
618 |    "metadata": {},
619 |    "outputs": [],
620 |    "source": [
621 |     "def sample(emb, rnn, fc, h_t=None, idx_t=None, n=20, temp=1):\n",
622 |     "    hiddens = [h_t]\n",
623 |     "    indices = [idx_t]\n",
624 |     "    out_dists = []\n",
625 |     "    \n",
626 |     "    for t in range(n):\n",
627 |     "        x_t = emb(idx_t)\n",
628 |     "        h_t = rnn._compute_next_hidden(x_t, h_t)\n",
629 |     "        \n",
630 |     "        y_t = fc(h_t)\n",
631 |     "        y_t = F.softmax( y_t / temp, dim=1)\n",
632 |     "        idx_t = torch.multinomial(y_t, 1)[:, 0]\n",
633 |     "    \n",
634 |     "        hiddens.append(h_t)\n",
635 |     "        indices.append(idx_t)\n",
636 |     "        out_dists.append(y_t)\n",
637 |     "     \n",
638 |     "    indices = torch.stack(indices).squeeze().permute(1, 0)\n",
639 |     "    \n",
640 |     "    return indices\n",
641 |     "\n",
642 |     "def make_initial_hidden(batch_size, hidden_size):\n",
643 |     "    return torch.ones(batch_size, hidden_size)\n",
644 |     "    \n",
645 |     "def make_initial_x(batch_size, vectorizer):\n",
646 |     "    return torch.ones(batch_size, dtype=torch.int64) * vectorizer.surname_vocab.start_index\n",
647 |     "    \n",
648 |     "def decode_one(vectorizer, seq):\n",
649 |     "    out = []\n",
650 |     "    for i in seq:\n",
651 |     "        if vectorizer.surname_vocab.start_index == i:\n",
652 |     "            continue\n",
653 |     "        if vectorizer.surname_vocab.end_index == i:\n",
654 |     "            return ''.join(out)\n",
655 |     "        out.append(vectorizer.surname_vocab.lookup(i))\n",
656 |     "    return ''.join(out)\n",
657 |     "            \n",
658 |     "def decode_matrix(vectorizer, mat):\n",
659 |     "    mat = mat.cpu().detach().numpy()\n",
660 |     "    return [decode_one(vectorizer, mat[i]) for i in range(len(mat))]\n",
661 |     "\n",
662 |     "def n_random_nationalities(n):\n",
663 |     "    keys = np.random.choice(vectorizer.nationality_vocab.keys(), size=n, replace=True)\n",
664 |     "    indices = torch.tensor([vectorizer.nationality_vocab[key] for key in keys], dtype=torch.int64)\n",
665 |     "    return keys, indices\n",
666 |     "\n",
667 |     "def sample_n(n=10, temp=0.8):\n",
668 |     "    init_names, init_vector = n_random_nationalities(n)\n",
669 |     "    init_vector = net.conditional_emb(init_vector)\n",
670 |     "    samples = decode_matrix(vectorizer, \n",
671 |     "                            sample(net.emb, net.rnn, net.fc, \n",
672 |     "                                   init_vector, \n",
673 |     "                                   make_initial_x(n, vectorizer),\n",
674 |     "                                   temp=temp))\n",
675 |     "    return list(zip(init_names, samples))\n",
676 |     "\n",
677 |     "def sample_n_for_nationality(nationality, n=10, temp=0.8):\n",
678 |     "    assert nationality in vectorizer.nationality_vocab.keys(), 'not a nationality we trained on'\n",
679 |     "    keys = [nationality] * n\n",
680 |     "    init_vector = torch.tensor([vectorizer.nationality_vocab[key] for key in keys], dtype=torch.int64)\n",
681 |     "    init_vector = net.conditional_emb(init_vector)\n",
682 |     "    samples = decode_matrix(vectorizer, \n",
683 |     "                        sample(net.emb, net.rnn, net.fc, \n",
684 |     "                               init_vector, \n",
685 |     "                               make_initial_x(n, vectorizer),\n",
686 |     "                               temp=temp))\n",
687 |     "    return list(zip(keys, samples))"
688 |    ]
689 |   },
690 |   {
691 |    "cell_type": "code",
692 |    "execution_count": 20,
693 |    "metadata": {},
694 |    "outputs": [
695 |     {
696 |      "data": {
697 |       "text/plain": [
698 |        "[('irish', \"O'Haler\"),\n",
699 |        " ('irish', 'Sankins'),\n",
700 |        " ('irish', 'Lowgon'),\n",
701 |        " ('irish', \"D'uest\"),\n",
702 |        " ('irish', 'Maclerson'),\n",
703 |        " ('irish', 'Gale'),\n",
704 |        " ('irish', 'Cahrifoud'),\n",
705 |        " ('irish', 'Nawad'),\n",
706 |        " ('irish', 'Malist'),\n",
707 |        " ('irish', \"O'Hardsis\")]"
708 |       ]
709 |      },
710 |      "execution_count": 20,
711 |      "metadata": {},
712 |      "output_type": "execute_result"
713 |     }
714 |    ],
715 |    "source": [
716 |     "sample_n_for_nationality('irish', n=10)"
717 |    ]
718 |   },
719 |   {
720 |    "cell_type": "code",
721 |    "execution_count": 19,
722 |    "metadata": {},
723 |    "outputs": [
724 |     {
725 |      "data": {
726 |       "text/plain": [
727 |        "[('vietnamese', 'Tras'),\n",
728 |        " ('korean', 'Pon'),\n",
729 |        " ('dutch', 'Rear'),\n",
730 |        " ('irish', 'Molder'),\n",
731 |        " ('italian', 'Bacama'),\n",
732 |        " ('vietnamese', 'Jugh'),\n",
733 |        " ('english', 'Peimer'),\n",
734 |        " ('arabic', 'Boury'),\n",
735 |        " ('scottish', 'Gorran'),\n",
736 |        " ('german', 'Gollann')]"
737 |       ]
738 |      },
739 |      "execution_count": 19,
740 |      "metadata": {},
741 |      "output_type": "execute_result"
742 |     }
743 |    ],
744 |    "source": [
745 |     "sample_n()"
746 |    ]
747 |   },
748 |   {
749 |    "cell_type": "code",
750 |    "execution_count": 11,
751 |    "metadata": {},
752 |    "outputs": [
753 |     {
754 |      "data": {
755 |       "text/plain": [
756 |        "[('scottish', 'Stlars'),\n",
757 |        " ('german', 'Wenstin'),\n",
758 |        " ('japanese', 'Yihno'),\n",
759 |        " ('spanish', 'Abanno'),\n",
760 |        " ('german', 'Wattala'),\n",
761 |        " ('russian', 'Dubrorov'),\n",
762 |        " ('german', 'Maresen'),\n",
763 |        " ('dutch', 'Deeber'),\n",
764 |        " ('russian', 'Bakhanovsky'),\n",
765 |        " ('korean', 'Ob'),\n",
766 |        " ('italian', 'Petela'),\n",
767 |        " ('spanish', 'Sereetsi'),\n",
768 |        " ('greek', 'Chellis'),\n",
769 |        " ('scottish', 'Caunet'),\n",
770 |        " ('portuguese', 'Sancour'),\n",
771 |        " ('vietnamese', 'Tris'),\n",
772 |        " ('french', 'Banezer'),\n",
773 |        " ('portuguese', 'Gelrine'),\n",
774 |        " ('korean', 'Chon'),\n",
775 |        " ('korean', 'Rher'),\n",
776 |        " ('russian', 'Jukovenko'),\n",
777 |        " ('polish', 'Kadili'),\n",
778 |        " ('chinese', 'Han'),\n",
779 |        " ('greek', 'Pichanama'),\n",
780 |        " ('chinese', 'Yia'),\n",
781 |        " ('chinese', 'Hing'),\n",
782 |        " ('scottish', 'Wlellice'),\n",
783 |        " ('greek', 'Kalosa'),\n",
784 |        " ('scottish', 'Cogley'),\n",
785 |        " ('polish', 'Provensky')]"
786 |       ]
787 |      },
788 |      "execution_count": 11,
789 |      "metadata": {},
790 |      "output_type": "execute_result"
791 |     }
792 |    ],
793 |    "source": [
794 |     "sample_n(30, 0.8)"
795 |    ]
796 |   },
797 |   {
798 |    "cell_type": "markdown",
799 |    "metadata": {
800 |     "collapsed": true
801 |    },
802 |    "source": [
803 |     "## Exercise!\n",
804 |     "\n",
805 |     "Can you figure out how to take the embedding for TWO nationalities, average them, and use that to generate a new surname? "
806 |    ]
807 |   }
808 |  ],
809 |  "metadata": {
810 |   "kernelspec": {
811 |    "display_name": "pytorch04",
812 |    "language": "python",
813 |    "name": "pytorch04"
814 |   },
815 |   "language_info": {
816 |    "codemirror_mode": {
817 |     "name": "ipython",
818 |     "version": 3
819 |    },
820 |    "file_extension": ".py",
821 |    "mimetype": "text/x-python",
822 |    "name": "python",
823 |    "nbconvert_exporter": "python",
824 |    "pygments_lexer": "ipython3",
825 |    "version": "3.6.6"
826 |   }
827 |  },
828 |  "nbformat": 4,
829 |  "nbformat_minor": 2
830 | }
831 | 


--------------------------------------------------------------------------------
/day_2/02-Char-RNN-Predict-Surnames.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 2,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from argparse import Namespace\n",
 10 |     "import json\n",
 11 |     "import os\n",
 12 |     "\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "import numpy as np\n",
 15 |     "import pandas as pd\n",
 16 |     "import seaborn as sns\n",
 17 |     "import torch\n",
 18 |     "import torch.nn as nn\n",
 19 |     "import torch.nn.functional as F\n",
 20 |     "import torch.optim as optim\n",
 21 |     "from torch.utils.data import Dataset, DataLoader\n",
 22 |     "from tqdm import tqdm_notebook\n",
 23 |     "\n",
 24 |     "from vocabulary import Vocabulary\n",
 25 |     "\n",
 26 |     "%matplotlib inline\n",
 27 |     "\n",
 28 |     "plt.style.use('fivethirtyeight')\n",
 29 |     "plt.rcParams['figure.figsize'] = (14, 6)\n",
 30 |     "\n",
 31 |     "START_TOKEN = \"^\"\n",
 32 |     "END_TOKEN = \"_\"\n",
 33 |     "\n",
 34 |     "IGNORE_INDEX_VALUE = -1"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "## Class Definitions \n",
 42 |     "\n",
 43 |     "Data Model:\n",
 44 |     "- Raw data\n",
 45 |     "- Vectorizer\n",
 46 |     "- Vectorized Data\n",
 47 |     "- Data generator"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 3,
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "class RawSurnames(object):\n",
 57 |     "    def __init__(self, data_path, delimiter=\",\"):\n",
 58 |     "        self.data = pd.read_csv(data_path, delimiter=delimiter)\n",
 59 |     "\n",
 60 |     "    def get_data(self, filter_to_nationality=None):\n",
 61 |     "        if filter_to_nationality is not None:\n",
 62 |     "            return self.data[self.data.nationality.isin(filter_to_nationality)]\n",
 63 |     "        return self.data\n",
 64 |     "\n",
 65 |     "# vectorizer\n",
 66 |     "\n",
 67 |     "class SurnamesVectorizer(object):\n",
 68 |     "    def __init__(self, surname_vocab, nationality_vocab, max_seq_length):\n",
 69 |     "        self.surname_vocab = surname_vocab\n",
 70 |     "        self.nationality_vocab = nationality_vocab\n",
 71 |     "        self.max_seq_length = max_seq_length\n",
 72 |     "        \n",
 73 |     "    def save(self, filename):\n",
 74 |     "        vec_dict = {\"surname_vocab\": self.surname_vocab.get_serializable_contents(),\n",
 75 |     "                    \"nationality_vocab\": self.nationality_vocab.get_serializable_contents(),\n",
 76 |     "                    'max_seq_length': self.max_seq_length}\n",
 77 |     "\n",
 78 |     "        with open(filename, \"w\") as fp:\n",
 79 |     "            json.dump(vec_dict, fp)\n",
 80 |     "        \n",
 81 |     "    @classmethod\n",
 82 |     "    def load(cls, filename):\n",
 83 |     "        with open(filename, \"r\") as fp:\n",
 84 |     "            vec_dict = json.load(fp)\n",
 85 |     "\n",
 86 |     "        vec_dict[\"surname_vocab\"] = Vocabulary.deserialize_from_contents(vec_dict[\"surname_vocab\"])\n",
 87 |     "        vec_dict[\"nationality_vocab\"] = Vocabulary.deserialize_from_contents(vec_dict[\"nationality_vocab\"])\n",
 88 |     "        return cls(**vec_dict)\n",
 89 |     "\n",
 90 |     "    @classmethod\n",
 91 |     "    def fit(cls, surname_df):\n",
 92 |     "        surname_vocab = Vocabulary(use_unks=False,\n",
 93 |     "                                   use_mask=True,\n",
 94 |     "                                   use_start_end=True,\n",
 95 |     "                                   start_token=START_TOKEN,\n",
 96 |     "                                   end_token=END_TOKEN)\n",
 97 |     "\n",
 98 |     "        nationality_vocab = Vocabulary(use_unks=False, use_start_end=False, use_mask=False)\n",
 99 |     "\n",
100 |     "        max_seq_length = 0\n",
101 |     "        for index, row in surname_df.iterrows():\n",
102 |     "            surname_vocab.add_many(row.surname)\n",
103 |     "            nationality_vocab.add(row.nationality)\n",
104 |     "\n",
105 |     "            if len(row.surname) > max_seq_length:\n",
106 |     "                max_seq_length = len(row.surname)\n",
107 |     "        max_seq_length = max_seq_length + 2\n",
108 |     "\n",
109 |     "        return cls(surname_vocab, nationality_vocab, max_seq_length)\n",
110 |     "\n",
111 |     "    @classmethod\n",
112 |     "    def fit_transform(cls, surname_df, split='train'):\n",
113 |     "        vectorizer = cls.fit(surname_df)\n",
114 |     "        return vectorizer, vectorizer.transform(surname_df, split)\n",
115 |     "\n",
116 |     "    def transform(self, surname_df, split='train'):\n",
117 |     "\n",
118 |     "        df = surname_df[surname_df.split==split].reset_index()\n",
119 |     "        n_data = len(df)\n",
120 |     "        \n",
121 |     "        x_surnames = np.zeros((n_data, self.max_seq_length), dtype=np.int64)\n",
122 |     "        y_surnames = np.ones((n_data, self.max_seq_length), dtype=np.int64) * IGNORE_INDEX_VALUE\n",
123 |     "\n",
124 |     "        for index, row in df.iterrows():\n",
125 |     "            vectorized_surname = list(self.surname_vocab.map(row.surname, \n",
126 |     "                                                             include_start_end=True))\n",
127 |     "            # ^McMahan_\n",
128 |     "            # ^ -> M\n",
129 |     "            # M -> c\n",
130 |     "            x_part = vectorized_surname[:-1]\n",
131 |     "            y_part = vectorized_surname[1:]\n",
132 |     "            x_surnames[index, :len(x_part)] = x_part\n",
133 |     "            y_surnames[index, :len(y_part)] = y_part\n",
134 |     "\n",
135 |     "        return VectorizedSurnames(x_surnames, y_surnames)\n",
136 |     "\n",
137 |     "# vec data\n",
138 |     "\n",
139 |     "class VectorizedSurnames(Dataset):\n",
140 |     "    def __init__(self, x_surnames, y_surnames):\n",
141 |     "        self.x_surnames = x_surnames\n",
142 |     "        self.y_surnames = y_surnames\n",
143 |     "\n",
144 |     "    def __len__(self):\n",
145 |     "        return len(self.x_surnames)\n",
146 |     "\n",
147 |     "    def __getitem__(self, index):\n",
148 |     "        return {'x_surnames': self.x_surnames[index],\n",
149 |     "                'y_surnames': self.y_surnames[index],\n",
150 |     "                'x_lengths': len(self.x_surnames[index].nonzero()[0])}\n",
151 |     "\n",
152 |     "# data generator\n",
153 |     "\n",
154 |     "def generate_batches(dataset, batch_size, shuffle=True,\n",
155 |     "                     drop_last=True, device=\"cpu\"): \n",
156 |     "    \"\"\"\n",
157 |     "    A generator function which wraps the PyTorch DataLoader. It will \n",
158 |     "      ensure each tensor is on the right device location.\n",
159 |     "    \"\"\"\n",
160 |     "    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,\n",
161 |     "                            shuffle=shuffle, drop_last=drop_last)\n",
162 |     "\n",
163 |     "    for data_dict in dataloader:\n",
164 |     "        out_data_dict = {}\n",
165 |     "        for name, tensor in data_dict.items():\n",
166 |     "            out_data_dict[name] = data_dict[name].to(device)\n",
167 |     "        yield out_data_dict"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "markdown",
172 |    "metadata": {},
173 |    "source": [
174 |     "## Class definitions for the model"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": 4,
180 |    "metadata": {},
181 |    "outputs": [],
182 |    "source": [
183 |     "def new_parameter(*size):\n",
184 |     "    out = torch.randn(*size, requires_grad=True, dtype=torch.float32)\n",
185 |     "    torch.nn.init.xavier_normal_(out)\n",
186 |     "    return nn.Parameter(out)\n",
187 |     "\n",
188 |     "def column_gather(y_out, x_lengths):\n",
189 |     "    '''Get a specific vector from each batch datapoint in `y_out`.\n",
190 |     "\n",
191 |     "    More precisely, for each batch row index, get the vector at sequence\n",
192 |     "    position `x_lengths[row] - 1` (lengths are converted to 0-based\n",
193 |     "    positions by subtracting 1).\n",
194 |     "\n",
195 |     "    Args:\n",
196 |     "        y_out (torch.FloatTensor, torch.cuda.FloatTensor)\n",
197 |     "            shape: (batch, sequence, feature)\n",
198 |     "        x_lengths (torch.LongTensor, torch.cuda.LongTensor)\n",
199 |     "            shape: (batch,)\n",
200 |     "\n",
201 |     "    Returns:\n",
202 |     "        y_out (torch.FloatTensor, torch.cuda.FloatTensor)\n",
203 |     "            shape: (batch, feature)\n",
204 |     "    '''\n",
205 |     "    x_lengths = x_lengths.long().detach().cpu().numpy() - 1\n",
206 |     "\n",
207 |     "    out = []\n",
208 |     "    for batch_index, column_index in enumerate(x_lengths):\n",
209 |     "        out.append(y_out[batch_index, column_index])\n",
210 |     "\n",
211 |     "    return torch.stack(out)\n",
212 |     "\n",
213 |     "\n",
214 |     "class ExplicitRNN(nn.Module):\n",
215 |     "    def __init__(self, input_size, hidden_size, batch_first=False):\n",
216 |     "        super(ExplicitRNN, self).__init__()\n",
217 |     "        self.W_in2hid = new_parameter(input_size, hidden_size)\n",
218 |     "        self.W_hid2hid = new_parameter(hidden_size, hidden_size)\n",
219 |     "            \n",
220 |     "        self.b_hid = new_parameter(1, hidden_size)\n",
221 |     "        \n",
222 |     "        self.hidden_size = hidden_size\n",
223 |     "\n",
224 |     "        self.batch_first = batch_first\n",
225 |     "    \n",
226 |     "    def _compute_next_hidden(self, x, h):\n",
227 |     "        return F.tanh(x.matmul(self.W_in2hid) + \n",
228 |     "                      h.matmul(self.W_hid2hid) + \n",
229 |     "                      self.b_hid)\n",
230 |     "\n",
231 |     "    def forward(self, x_in, hid_t=None):\n",
232 |     "        if self.batch_first:\n",
233 |     "            batch_size, seq_size, feat_size = x_in.size()\n",
234 |     "            x_in = x_in.permute(1, 0, 2)\n",
235 |     "        else:\n",
236 |     "            seq_size, batch_size, feat_size = x_in.size()\n",
237 |     "\n",
238 |     "        hiddens = []\n",
239 |     "        if hid_t is None:\n",
240 |     "            hid_t = torch.ones((batch_size, self.hidden_size))\n",
241 |     "        \n",
242 |     "        if x_in.is_cuda:\n",
243 |     "            hid_t = hid_t.cuda()\n",
244 |     "            \n",
245 |     "        for t in range(seq_size):\n",
246 |     "            x_t = x_in[t]\n",
247 |     "            hid_t = self._compute_next_hidden(x_t, hid_t)\n",
248 |     "            \n",
249 |     "            hiddens.append(hid_t)\n",
250 |     "        hiddens = torch.stack(hiddens)\n",
251 |     "\n",
252 |     "        if self.batch_first:\n",
253 |     "            hiddens = hiddens.permute(1, 0, 2)\n",
254 |     "\n",
255 |     "        return hiddens\n",
256 |     "    \n",
257 |     "    \n",
258 |     "    \n",
259 |     "class CharNN(nn.Module):\n",
260 |     "    def __init__(self, embedding_size, in_vocab_size, out_vocab_size, hidden_size, \n",
261 |     "                 batch_first=False):\n",
262 |     "        super(CharNN, self).__init__()\n",
263 |     "        \n",
264 |     "        self.emb = nn.Embedding(embedding_dim=embedding_size, \n",
265 |     "                                num_embeddings=in_vocab_size,\n",
266 |     "                                padding_idx=0)\n",
267 |     "        self.fc = nn.Linear(in_features=hidden_size, out_features=out_vocab_size)\n",
268 |     "        self.rnn = ExplicitRNN(input_size=embedding_size, \n",
269 |     "                               hidden_size=hidden_size, \n",
270 |     "                               batch_first=batch_first)\n",
271 |     "    \n",
272 |     "    def forward(self, x_in, x_lengths=None, apply_softmax=False):\n",
273 |     "        # x_in.shape == (batch_size, max_seq_length)\n",
274 |     "        x_in = self.emb(x_in)\n",
275 |     "        # x_in.shape == (batch_size, max_seq_length, embedding_size)\n",
276 |     "        y_out = self.rnn(x_in)\n",
277 |     "        # y_out.shape == (batch_size, max_seq_length, hidden_size)\n",
278 |     "\n",
279 |     "        # reshape into a matrix so we can apply a linear layer\n",
280 |     "        dim0, dim1, dim2 = y_out.size()\n",
281 |     "        y_out = y_out.contiguous().view(dim0 * dim1, dim2)\n",
282 |     "\n",
283 |     "        # now that it's a matrix, can apply linear layer\n",
284 |     "        y_out = self.fc(y_out) \n",
285 |     "        \n",
286 |     "        # y_out.shape == (batch_size * max_seq_length, character_vocab_size)\n",
287 |     "\n",
288 |     "        # optionally apply the softmax\n",
289 |     "        if apply_softmax:\n",
290 |     "            y_out = F.softmax(y_out, dim=1)\n",
291 |     "\n",
292 |     "        y_out = y_out.view(dim0, dim1, -1)\n",
293 |     "        # y_out.shape == (batch_size, max_seq_length, character_vocab_size)\n",
294 |     "\n",
295 |     "        return y_out\n",
296 |     "    \n",
297 |     "def normalize_sizes(net_output, y_true):\n",
298 |     "    net_output = net_output.cpu()\n",
299 |     "    y_true = y_true.cpu()\n",
300 |     "    if len(net_output.size()) == 3:\n",
301 |     "        net_output.contiguous()\n",
302 |     "        net_output = net_output.view(-1, net_output.size(2))\n",
303 |     "    if len(y_true.size()) == 2:\n",
304 |     "        y_true.contiguous()\n",
305 |     "        y_true = y_true.view(-1)\n",
306 |     "    return net_output, y_true\n",
307 |     "\n",
308 |     "def compute_accuracy(y_pred, y_true, mask_index):\n",
309 |     "    y_pred, y_true = normalize_sizes(y_pred, y_true)\n",
310 |     "\n",
311 |     "    _, y_pred_indices = y_pred.max(dim=1)\n",
312 |     "    \n",
313 |     "    correct_indices = torch.eq(y_pred_indices, y_true).float()\n",
314 |     "    valid_indices = torch.ne(y_true, mask_index).float()\n",
315 |     "    \n",
316 |     "    n_correct = (correct_indices * valid_indices).sum().item()\n",
317 |     "    n_valid = valid_indices.sum().item()\n",
318 |     "\n",
319 |     "    return n_correct / n_valid * 100\n"
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "markdown",
324 |    "metadata": {},
325 |    "source": [
326 |     "## Make, Train, and Eval"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 6,
332 |    "metadata": {},
333 |    "outputs": [
334 |     {
335 |      "name": "stdout",
336 |      "output_type": "stream",
337 |      "text": [
338 |       "Using CUDA: True\n"
339 |      ]
340 |     },
341 |     {
342 |      "data": {
343 |       "text/plain": [
344 |        "device(type='cuda')"
345 |       ]
346 |      },
347 |      "execution_count": 6,
348 |      "metadata": {},
349 |      "output_type": "execute_result"
350 |     }
351 |    ],
352 |    "source": [
353 |     "args = Namespace(\n",
354 |     "    surname_csv=\"../data/surnames.csv\",\n",
355 |     "    batch_size = 128,\n",
356 |     "    cuda=True,\n",
357 |     "    learning_rate=0.001,\n",
358 |     "    num_epochs=100,\n",
359 |     "    load_zoo_model=True,\n",
360 |     "    zoo={\n",
361 |     "        'filename': '../modelzoo/charnn_emb16_hid64_surnames_predict.state',\n",
362 |     "        'vocab': '../modelzoo/surnames_classify.vocab',\n",
363 |     "        'comments': 'pre-trained surname sequence prediction (& generation model)',\n",
364 |     "        'parameters': {\n",
365 |     "            'embedding_size': 16,\n",
366 |     "            'hidden_size': 64\n",
367 |     "        }\n",
368 |     "    }\n",
369 |     ")\n",
370 |     "# Check CUDA\n",
371 |     "if not torch.cuda.is_available():\n",
372 |     "    args.cuda = False\n",
373 |     "\n",
374 |     "print(\"Using CUDA: {}\".format(args.cuda))\n",
375 |     "\n",
376 |     "args.device = torch.device(\"cuda\" if args.cuda else \"cpu\")\n",
377 |     "args.device"
378 |    ]
379 |   },
380 |   {
381 |    "cell_type": "code",
382 |    "execution_count": 11,
383 |    "metadata": {},
384 |    "outputs": [],
385 |    "source": [
386 |     "# optional: uncomment the line below (sets it to False) to train from scratch!\n",
387 |     "# args.load_zoo_model = False"
388 |    ]
389 |   },
390 |   {
391 |    "cell_type": "code",
392 |    "execution_count": 7,
393 |    "metadata": {},
394 |    "outputs": [
395 |     {
396 |      "name": "stdout",
397 |      "output_type": "stream",
398 |      "text": [
399 |       "Loading vectorizer!\n",
400 |       "Loading state dict!\n"
401 |      ]
402 |     }
403 |    ],
404 |    "source": [
405 |     "raw_data = RawSurnames(args.surname_csv).get_data()\n",
406 |     "\n",
407 |     "if os.path.exists(args.zoo['vocab']):\n",
408 |     "    vectorizer = SurnamesVectorizer.load(args.zoo['vocab'])\n",
409 |     "    print(\"Loading vectorizer!\")\n",
410 |     "else:\n",
411 |     "    vectorizer = SurnamesVectorizer.fit(raw_data)\n",
412 |     "    print(\"Creating a new vectorizer.\")\n",
413 |     "    \n",
414 |     "train_dataset = vectorizer.transform(raw_data, split='train')\n",
415 |     "test_dataset = vectorizer.transform(raw_data, split='test')\n",
416 |     "\n",
417 |     "zoo_params = args.zoo['parameters']\n",
418 |     "\n",
419 |     "net = CharNN(embedding_size=zoo_params['embedding_size'], \n",
420 |     "             hidden_size=zoo_params['hidden_size'],\n",
421 |     "             in_vocab_size=len(vectorizer.surname_vocab), \n",
422 |     "             out_vocab_size=len(vectorizer.surname_vocab), \n",
423 |     "             batch_first=True)\n",
424 |     "\n",
425 |     "if args.load_zoo_model and os.path.exists(args.zoo['filename']):\n",
426 |     "    print(\"Loading state dict!\")\n",
427 |     "    net.load_state_dict(torch.load(args.zoo['filename'], \n",
428 |     "                                   map_location=lambda storage, loc: storage))\n",
429 |     "else:\n",
430 |     "    print(\"Using newly initiated network!\")"
431 |    ]
432 |   },
433 |   {
434 |    "cell_type": "code",
435 |    "execution_count": 8,
436 |    "metadata": {},
437 |    "outputs": [
438 |     {
439 |      "data": {
440 |       "text/plain": [
441 |        "-1"
442 |       ]
443 |      },
444 |      "execution_count": 8,
445 |      "metadata": {},
446 |      "output_type": "execute_result"
447 |     }
448 |    ],
449 |    "source": [
450 |     "IGNORE_INDEX_VALUE"
451 |    ]
452 |   },
453 |   {
454 |    "cell_type": "code",
455 |    "execution_count": 9,
456 |    "metadata": {},
457 |    "outputs": [
458 |     {
459 |      "data": {
460 |       "application/vnd.jupyter.widget-view+json": {
461 |        "model_id": "59f459522d524b9d81f5d1fbaea5f4df",
462 |        "version_major": 2,
463 |        "version_minor": 0
464 |       },
465 |       "text/plain": [
466 |        "HBox(children=(IntProgress(value=0, description='epochs'), HTML(value='')))"
467 |       ]
468 |      },
469 |      "metadata": {},
470 |      "output_type": "display_data"
471 |     },
472 |     {
473 |      "data": {
474 |       "application/vnd.jupyter.widget-view+json": {
475 |        "model_id": "e36604c8085b4e15978fb5c241f4f58f",
476 |        "version_major": 2,
477 |        "version_minor": 0
478 |       },
479 |       "text/plain": [
480 |        "HBox(children=(IntProgress(value=0, description='training', max=125), HTML(value='')))"
481 |       ]
482 |      },
483 |      "metadata": {},
484 |      "output_type": "display_data"
485 |     },
486 |     {
487 |      "data": {
488 |       "application/vnd.jupyter.widget-view+json": {
489 |        "model_id": "29ec7c1dd6d6441cab61827a29a161eb",
490 |        "version_major": 2,
491 |        "version_minor": 0
492 |       },
493 |       "text/plain": [
494 |        "HBox(children=(IntProgress(value=0, description='test', max=31), HTML(value='')))"
495 |       ]
496 |      },
497 |      "metadata": {},
498 |      "output_type": "display_data"
499 |     },
500 |     {
501 |      "name": "stdout",
502 |      "output_type": "stream",
503 |      "text": [
504 |       "...\n"
505 |      ]
506 |     }
507 |    ],
508 |    "source": [
509 |     "net = net.to(args.device)\n",
510 |     "    \n",
511 |     "optimizer = optim.Adam(net.parameters(), lr=args.learning_rate)\n",
512 |     "\n",
513 |     "# loss function\n",
514 |     "\n",
515 |     "def sequence_loss(y_pred, y_true, mask_index):\n",
516 |     "    y_pred, y_true = normalize_sizes(y_pred, y_true)\n",
517 |     "    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)\n",
518 |     "\n",
519 |     "# progress bars\n",
520 |     "\n",
521 |     "epoch_bar = tqdm_notebook(desc='epochs', total=args.num_epochs, position=1)\n",
522 |     "\n",
523 |     "num_train_batches = len(train_dataset) // args.batch_size\n",
524 |     "train_bar = tqdm_notebook(desc='training', total=num_train_batches, position=2)\n",
525 |     "\n",
526 |     "num_test_batches = len(test_dataset) // args.batch_size\n",
527 |     "test_bar = tqdm_notebook(desc='test', total=num_test_batches, position=3)\n",
528 |     "\n",
529 |     "# history\n",
530 |     "\n",
531 |     "train_loss_history = []\n",
532 |     "train_accuracy_history = []\n",
533 |     "\n",
534 |     "test_loss_history = []\n",
535 |     "test_accuracy_history = []\n",
536 |     "\n",
537 |     "\n",
538 |     "try:\n",
539 |     "    for _ in range(args.num_epochs):\n",
540 |     "        batch_generator = generate_batches(train_dataset, batch_size=args.batch_size,\n",
541 |     "                                           device=args.device)\n",
542 |     "        \n",
543 |     "        per_epoch_loss = []\n",
544 |     "        per_epoch_accuracy = []\n",
545 |     "        \n",
546 |     "        net.train()\n",
547 |     "            \n",
548 |     "        for batch_dict in batch_generator:\n",
549 |     "            # step 1\n",
550 |     "            optimizer.zero_grad()\n",
551 |     "\n",
552 |     "            # step 2\n",
553 |     "            y_pred = net(batch_dict['x_surnames'], batch_dict['x_lengths'])\n",
554 |     "            y_target = batch_dict['y_surnames']\n",
555 |     "            \n",
556 |     "            # step 3\n",
557 |     "            loss = sequence_loss(y_pred, y_target, IGNORE_INDEX_VALUE)\n",
558 |     "            \n",
559 |     "            # step 4\n",
560 |     "            loss.backward()\n",
561 |     "            optimizer.step()\n",
562 |     "          \n",
563 |     "            # bonus steps: bookkeeping\n",
564 |     "            \n",
565 |     "            per_epoch_loss.append(loss.item())\n",
566 |     "            \n",
567 |     "            accuracy = compute_accuracy(y_pred, batch_dict['y_surnames'], IGNORE_INDEX_VALUE)\n",
568 |     "            per_epoch_accuracy.append(accuracy)\n",
569 |     "\n",
570 |     "            train_bar.update()\n",
571 |     "            \n",
572 |     "            train_bar.set_postfix(loss=per_epoch_loss[-1], \n",
573 |     "                                  accuracy=per_epoch_accuracy[-1])\n",
574 |     "            \n",
575 |     "        train_loss_history.append(np.mean(per_epoch_loss))\n",
576 |     "        train_accuracy_history.append(np.mean(per_epoch_accuracy))\n",
577 |     "        \n",
578 |     "        # loop over test dataset\n",
579 |     "        \n",
580 |     "        batch_generator = generate_batches(test_dataset, batch_size=args.batch_size, \n",
581 |     "                                           device=args.device)\n",
582 |     "        \n",
583 |     "        per_epoch_loss = []\n",
584 |     "        per_epoch_accuracy = []\n",
585 |     "            \n",
586 |     "        # set it to eval mode; this turns stochastic functions off\n",
587 |     "        net.eval()\n",
588 |     "            \n",
589 |     "        for batch_dict in batch_generator:\n",
590 |     "            \n",
591 |     "            # step 1: compute output\n",
592 |     "            y_pred = net(batch_dict['x_surnames'], batch_dict['x_lengths'])\n",
593 |     "            y_target = batch_dict['y_surnames']\n",
594 |     "            \n",
595 |     "            # step 2: compute metrics\n",
596 |     "            \n",
597 |     "            loss = sequence_loss(y_pred, y_target, IGNORE_INDEX_VALUE)\n",
598 |     "            per_epoch_loss.append(loss.item())\n",
599 |     "          \n",
600 |     "            accuracy = compute_accuracy(y_pred, batch_dict['y_surnames'], IGNORE_INDEX_VALUE)\n",
601 |     "            per_epoch_accuracy.append(accuracy)\n",
602 |     "\n",
603 |     "            test_bar.update()\n",
604 |     "            \n",
605 |     "            test_bar.set_postfix(loss=per_epoch_loss[-1], \n",
606 |     "                                 accuracy=per_epoch_accuracy[-1])\n",
607 |     "            \n",
608 |     "        test_loss_history.append(np.mean(per_epoch_loss))\n",
609 |     "        test_accuracy_history.append(np.mean(per_epoch_accuracy))\n",
610 |     "        \n",
611 |     "        # update bars\n",
612 |     "        \n",
613 |     "        epoch_bar.set_postfix(train_loss=train_loss_history[-1], \n",
614 |     "                              train_accuracy=train_accuracy_history[-1],\n",
615 |     "                              test_loss=test_loss_history[-1],\n",
616 |     "                              test_accuracy=test_accuracy_history[-1])\n",
617 |     "        epoch_bar.update()\n",
618 |     "        test_bar.n = 0\n",
619 |     "        train_bar.n = 0\n",
620 |     "        \n",
621 |     "except KeyboardInterrupt:\n",
622 |     "    print(\"...\")"
623 |    ]
624 |   },
625 |   {
626 |    "cell_type": "markdown",
627 |    "metadata": {},
628 |    "source": [
629 |     "##  Exercise!\n",
630 |     "\n",
631 |     "Now that we have a model which was trained to predict sequences, let's make our own sampler!\n",
632 |     "\n",
633 |     "The sampler will walk through the generation procedure, selecting one character at a time.  The result is something like this: \n",
634 |     "\n",
635 |     "```\n",
636 |     "['Poldtoff',\n",
637 |     " 'Schestars',\n",
638 |     " 'Gordoud',\n",
639 |     " 'Kinsen',\n",
640 |     " 'Venzey',\n",
641 |     " 'Tumali',\n",
642 |     " 'Pets',\n",
643 |     " 'Aänchekin',\n",
644 |     " 'GDigkov',\n",
645 |     " 'Shadonov',\n",
646 |     " 'Boulyanson',\n",
647 |     " 'Gwae',\n",
648 |     " 'Zgerege',\n",
649 |     " 'Foxchevtsev',\n",
650 |     " 'Progkin',\n",
651 |     " 'Ussin']\n",
652 |     "```\n",
653 |     "\n"
654 |    ]
655 |   },
656 |   {
657 |    "cell_type": "code",
658 |    "execution_count": null,
659 |    "metadata": {},
660 |    "outputs": [],
661 |    "source": [
662 |     "def forward(self, x_in, x_lengths=None, apply_softmax=False):\n",
663 |     "        # x_in.shape == (batch_size, max_seq_length)\n",
664 |     "        x_in = self.emb(x_in)\n",
665 |     "        # x_in.shape == (batch_size, max_seq_length, embedding_size)\n",
666 |     "        y_out = self.rnn(x_in)\n",
667 |     "        # y_out.shape == (batch_size, max_seq_length, hidden_size)\n",
668 |     "\n",
669 |     "        # reshape into a matrix so we can apply a linear layer\n",
670 |     "        dim0, dim1, dim2 = y_out.size()\n",
671 |     "        y_out = y_out.contiguous().view(dim0 * dim1, dim2)\n",
672 |     "\n",
673 |     "        # now that it's a matrix, can apply linear layer\n",
674 |     "        y_out = self.fc(y_out) \n",
675 |     "        \n",
676 |     "        # y_out.shape == (batch_size * max_seq_length, character_vocab_size)\n",
677 |     "\n",
678 |     "        # optionally apply the softmax\n",
679 |     "        if apply_softmax:\n",
680 |     "            y_out = F.softmax(y_out, dim=1)\n",
681 |     "\n",
682 |     "        y_out = y_out.view(dim0, dim1, -1)\n",
683 |     "        # y_out.shape == (batch_size, max_seq_length, character_vocab_size)\n",
684 |     "\n",
685 |     "        return y_out\n",
686 |     "    "
687 |    ]
688 |   },
689 |   {
690 |    "cell_type": "code",
691 |    "execution_count": 10,
692 |    "metadata": {},
693 |    "outputs": [
694 |     {
695 |      "data": {
696 |       "text/plain": [
697 |        "CharNN(\n",
698 |        "  (emb): Embedding(90, 16, padding_idx=0)\n",
699 |        "  (fc): Linear(in_features=64, out_features=90, bias=True)\n",
700 |        "  (rnn): ExplicitRNN()\n",
701 |        ")"
702 |       ]
703 |      },
704 |      "execution_count": 10,
705 |      "metadata": {},
706 |      "output_type": "execute_result"
707 |     }
708 |    ],
709 |    "source": [
710 |     "net"
711 |    ]
712 |   },
713 |   {
714 |    "cell_type": "code",
715 |    "execution_count": 22,
716 |    "metadata": {},
717 |    "outputs": [
718 |     {
719 |      "data": {
720 |       "text/plain": [
721 |        "tensor([  1,  31,  11,  31,   4,  12,   4,  26])"
722 |       ]
723 |      },
724 |      "execution_count": 22,
725 |      "metadata": {},
726 |      "output_type": "execute_result"
727 |     }
728 |    ],
729 |    "source": [
730 |     "def long_tensor_from_list(indices_list):\n",
731 |     "    return torch.LongTensor(indices_list)\n",
732 |     "\n",
733 |     "def long_tensor_from_string(some_string):\n",
734 |     "    indices = [vectorizer.surname_vocab.start_index] + list(vectorizer.surname_vocab.map(some_string))\n",
735 |     "    return long_tensor_from_list(indices)\n",
736 |     "\n",
737 |     "long_tensor_from_string(\"McMahan\")"
738 |    ]
739 |   },
740 |   {
741 |    "cell_type": "code",
742 |    "execution_count": 29,
743 |    "metadata": {},
744 |    "outputs": [
745 |     {
746 |      "data": {
747 |       "text/plain": [
748 |        "90"
749 |       ]
750 |      },
751 |      "execution_count": 29,
752 |      "metadata": {},
753 |      "output_type": "execute_result"
754 |     }
755 |    ],
756 |    "source": [
757 |     "len(vectorizer.surname_vocab)"
758 |    ]
759 |   },
760 |   {
761 |    "cell_type": "markdown",
762 |    "metadata": {},
763 |    "source": [
764 |     "## Generation task\n",
765 |     "\n",
766 |     "1. get the indices for the string generated so far (start token first)\n",
767 |     "2. call the network\n",
768 |     "3. get the new index from the last timestep of the predictions\n",
769 |     "4. add the character corresponding to that index to the string\n",
770 |     "5. repeat until the end token is predicted"
771 |    ]
772 |   },
773 |   {
774 |    "cell_type": "code",
775 |    "execution_count": 34,
776 |    "metadata": {},
777 |    "outputs": [
778 |     {
779 |      "name": "stdout",
780 |      "output_type": "stream",
781 |      "text": [
782 |       "torch.Size([1, 2])\n"
783 |      ]
784 |     }
785 |    ],
786 |    "source": [
787 |     "# step 1\n",
788 |     "indices = long_tensor_from_string(\"M\").unsqueeze(dim=0)\n",
789 |     "print(indices.shape)\n",
790 |     "\n",
791 |     "# step 2\n",
792 |     "predictions = net(indices, apply_softmax=True)\n",
793 |     "# predictions.shape = (1, 2, 90)\n",
794 |     "\n",
795 |     "# step 3\n",
796 |     "_, max_index = predictions[0, -1].max(dim=0)\n",
797 |     "vectorizer.surname_vocab.lookup(max_index.item())"
798 |    ]
799 |   },
800 |   {
801 |    "cell_type": "code",
802 |    "execution_count": 43,
803 |    "metadata": {},
804 |    "outputs": [],
805 |    "source": [
806 |     "torch.multinomial?"
807 |    ]
808 |   },
809 |   {
810 |    "cell_type": "code",
811 |    "execution_count": 42,
812 |    "metadata": {},
813 |    "outputs": [
814 |     {
815 |      "data": {
816 |       "text/plain": [
817 |        "'a'"
818 |       ]
819 |      },
820 |      "execution_count": 42,
821 |      "metadata": {},
822 |      "output_type": "execute_result"
823 |     }
824 |    ],
825 |    "source": []
826 |   },
827 |   {
828 |    "cell_type": "code",
829 |    "execution_count": 32,
830 |    "metadata": {},
831 |    "outputs": [
832 |     {
833 |      "data": {
834 |       "text/plain": [
835 |        "torch.Size([1, 2, 90])"
836 |       ]
837 |      },
838 |      "execution_count": 32,
839 |      "metadata": {},
840 |      "output_type": "execute_result"
841 |     }
842 |    ],
843 |    "source": []
844 |   },
845 |   {
846 |    "cell_type": "code",
847 |    "execution_count": 11,
848 |    "metadata": {},
849 |    "outputs": [
850 |     {
851 |      "data": {
852 |       "text/plain": [
853 |        "<__main__.SurnamesVectorizer at 0x7f976cac0940>"
854 |       ]
855 |      },
856 |      "execution_count": 11,
857 |      "metadata": {},
858 |      "output_type": "execute_result"
859 |     }
860 |    ],
861 |    "source": [
862 |     "vectorizer"
863 |    ]
864 |   },
865 |   {
866 |    "cell_type": "code",
867 |    "execution_count": 14,
868 |    "metadata": {},
869 |    "outputs": [
870 |     {
871 |      "data": {
872 |       "text/plain": [
873 |        "1"
874 |       ]
875 |      },
876 |      "execution_count": 14,
877 |      "metadata": {},
878 |      "output_type": "execute_result"
879 |     }
880 |    ],
881 |    "source": [
882 |     "vectorizer.surname_vocab.start_index"
883 |    ]
884 |   },
885 |   {
886 |    "cell_type": "code",
887 |    "execution_count": null,
888 |    "metadata": {},
889 |    "outputs": [],
890 |    "source": [
891 |     "def predict_next_character(some_string):\n",
892 |     "    return next_character"
893 |    ]
894 |   },
895 |   {
896 |    "cell_type": "code",
897 |    "execution_count": null,
898 |    "metadata": {},
899 |    "outputs": [],
900 |    "source": []
901 |   },
902 |   {
903 |    "cell_type": "code",
904 |    "execution_count": null,
905 |    "metadata": {},
906 |    "outputs": [],
907 |    "source": []
908 |   },
909 |   {
910 |    "cell_type": "code",
911 |    "execution_count": null,
912 |    "metadata": {},
913 |    "outputs": [],
914 |    "source": []
915 |   },
916 |   {
917 |    "cell_type": "code",
918 |    "execution_count": null,
919 |    "metadata": {},
920 |    "outputs": [],
921 |    "source": []
922 |   },
923 |   {
924 |    "cell_type": "code",
925 |    "execution_count": null,
926 |    "metadata": {},
927 |    "outputs": [],
928 |    "source": []
929 |   }
930 |  ],
931 |  "metadata": {
932 |   "kernelspec": {
933 |    "display_name": "pytorch04",
934 |    "language": "python",
935 |    "name": "pytorch04"
936 |   },
937 |   "language_info": {
938 |    "codemirror_mode": {
939 |     "name": "ipython",
940 |     "version": 3
941 |    },
942 |    "file_extension": ".py",
943 |    "mimetype": "text/x-python",
944 |    "name": "python",
945 |    "nbconvert_exporter": "python",
946 |    "pygments_lexer": "ipython3",
947 |    "version": "3.6.6"
948 |   }
949 |  },
950 |  "nbformat": 4,
951 |  "nbformat_minor": 2
952 | }
953 | 


--------------------------------------------------------------------------------