├── .gitignore ├── AWS-tutorial.md ├── Docker-tutorial.md ├── README.md ├── common ├── README.md ├── __init__.py ├── download_utils.py ├── requirements_colab.txt └── tqdm_utils.py ├── docker ├── Dockerfile ├── requirements.txt └── welcome_message.txt ├── honor ├── README.md ├── datasets.py ├── download_cornell.sh ├── download_opensubs.sh └── example.py ├── optional ├── README.md └── telegram_bot │ ├── README.md │ ├── dialogue_manager.py │ ├── main_bot.py │ ├── utils.py │ └── week5-project.ipynb ├── setup_google_colab.py ├── week1 ├── grader.py ├── lemmatization_demo.ipynb ├── metrics.py ├── tfidf_demo.ipynb └── week1-MultilabelClassification.ipynb ├── week2 ├── evaluation.py └── week2-NER.ipynb ├── week3 ├── grader.py ├── util.py └── week3-Embeddings.ipynb ├── week4 ├── encoder-decoder-pic.png └── week4-seq2seq.ipynb └── week5 ├── dialogue_manager.py ├── utils.py └── week5-project.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # Data for assignments 104 | data/ 105 | 106 | .idea -------------------------------------------------------------------------------- /AWS-tutorial.md: -------------------------------------------------------------------------------- 1 | # Tutorial for setting up an AWS Virtual Machine 2 | 3 | This tutorial will teach you how to set up an AWS Virtual Machine for the final project of our course. 4 | 5 | ### 1. Register with AWS and launch an EC2 instance 6 | 7 | First, you need to perform several preparatory steps (if you have already done this before, you can skip them): 8 | - [Sign up for AWS](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#sign-up-for-aws). You will need to specify your credit card details, but for our project we will use Free Tier instances only, so you should not be charged. 
9 | - [Create a key pair for authentication](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#create-a-key-pair). If you use Windows, you will also need to install [PuTTY](https://www.chiark.greenend.org.uk/~sgtatham/putty/) to use SSH. 10 | - [Create security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#create-a-base-security-group). You must add rules to a security group to allow you to connect to your future instance from your IP address using SSH. You might want to allow SSH access from all IPv4 addresses (set to 0.0.0.0/0), because your IP might change. 11 | 12 | Next, you are ready to create your first EC2 instance: 13 | - [Launch a free tier instance](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance). For Amazon Machine Image (AMI) choose **Ubuntu Server 16.04 LTS**. 14 | - [Connect to your instance](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-connect-to-instance-linux) using SSH. If you have problems connecting to the instance, try following this [troubleshooting guide](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/TroubleshootingInstancesConnecting.html). 15 | - Later on you can [start and stop](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Stop_Start.html) your instance when needed, and [terminate](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-clean-up-your-instance) it in the end. 16 | 17 | ### 2. Set up dependencies and run your project 18 | 19 | - Install Docker container for Ubuntu with course dependencies. Follow our Docker instructions. 20 | 21 | - To be able to access IPython notebooks running on AWS, you might want to SSH with port tunneling: 22 | ```sh 23 | ssh -L 8080:localhost:8080 -i path/to/private_key ubuntu@ec2-XX-XXX-X-XX.us-east-2.compute.amazonaws.com 24 | ``` 25 | Then you will be able to see the notebooks on *localhost:8080* from your browser on the local machine. 26 | 27 | If you're using PuTTY, before you start your SSH connection, go to the PuTTY Tunnels panel. Make sure the «Local» and «Auto» radio buttons are set. Enter the local port 8080 number into the «Source port» box. Enter the destination host name and port number into the «Destination» box, separated by a colon ubuntu@ec2-XX-XXX-X-XX.us-east-2.compute.amazonaws.com:8080. 28 | For more details see [this guide](https://www.akadia.com/services/ssh_putty.html). 29 | 30 | - Bring code and data to AWS instance, e.g. 31 | ```sh 32 | scp -i path/to/your_key.pem -r path/to/local_directory ubuntu@ec2-XX-XXX-X-XX.us-east-2.compute.amazonaws.com:path/to/remote_file 33 | ``` 34 | You might need to install [WinSCP](https://winscp.net/eng/docs/lang:ru) for data transfer if you are using Windows. 35 | 36 | - It is also a good practice to use [tmux](https://medium.com/@peterxjang/a-minimalist-guide-to-tmux-13675fb160fa) to keep your remote session running even if you disconnect from the machine, e.g. by closing your laptop. 37 | 38 | Thus, to run your scripts on the machine, we suggest that you run: ssh -> tmux -> Docker -> Python. 39 | -------------------------------------------------------------------------------- /Docker-tutorial.md: -------------------------------------------------------------------------------- 1 | # Docker container with course dependencies 2 | 3 | This file describes how to use a Docker container with Jupyter notebook and 4 | all dependencies required for the course. 
5 | 6 | The image is located at https://hub.docker.com/r/akashin/coursera-aml-nlp/. 7 | 8 | ## Install Stable Docker Community Edition (CE) 9 | 10 | - For Mac: 11 | https://docs.docker.com/docker-for-mac/install/ 12 | 13 | - For Ubuntu: 14 | https://docs.docker.com/engine/installation/linux/docker-ce/ubuntu/ (see also other Linux distributions in the menu). 15 | 16 | - For Windows (64bit Windows 10 Pro, Enterprise and Education): 17 | https://docs.docker.com/docker-for-windows/install/ 18 | 19 | - For Windows (older versions): 20 | https://docs.docker.com/toolbox/toolbox_install_windows/ 21 | 22 | 23 | 24 | ## Get container image 25 | 26 | To get the latest version of the container image run: 27 | ```sh 28 | docker pull akashin/coursera-aml-nlp 29 | ``` 30 | It contains the Ubuntu 16.04 Linux distribution and all dependencies that you need for our course. The downloaded image takes approximately 2.3GB. 31 | 32 | **Note:** If you are getting an error "Got permission denied while trying to connect to the Docker daemon socket...", you need to add the current user to the docker group: 33 | ```sh 34 | sudo usermod -a -G docker $USER 35 | sudo service docker restart 36 | ``` 37 | Then you need to log out and log in again (disconnect and reconnect to your AWS instance if you are setting up Docker there). 38 | 39 | 40 | ## Run container for the first time 41 | 42 | Now you can start a new container from this image with the following command: 43 | ```sh 44 | docker run -it -p 8080:8080 --name coursera-aml-nlp -v path_on_your_machine:path_within_docker akashin/coursera-aml-nlp 45 | ``` 46 | This will start the Ubuntu instance and give you access to its command line. You can type `run_notebook` to launch the IPython notebook server. 47 | 48 | Note that we are using the `-p 8080:8080` argument to set up port forwarding to make the IPython notebook accessible at http://localhost:8080. If you're using AWS, make sure that you've [set up the port forwarding](https://github.com/hse-aml/natural-language-processing/blob/master/AWS-tutorial.md#2-set-up-dependencies-and-run-your-project) there as well. 49 | 50 | **Important:** The Docker image only contains system dependencies for the project (e.g. TensorFlow, Starspace). 51 | All other project-related files (e.g. input data) need to be exposed to the container manually through [Docker volumes](https://docs.docker.com/storage/volumes/). To do this, we mount a directory from your machine into the container using the `-v` option. 52 | 53 | On Linux and OSX, an example command looks like: 54 | ```sh 55 | docker run -it -p 8080:8080 --name coursera-aml-nlp -v $PWD:/root/coursera akashin/coursera-aml-nlp 56 | ``` 57 | This uses the shell variable `$PWD` to mount the current directory to the folder `/root/coursera` in the container. Alternatively, you can mount an arbitrary directory by replacing `$PWD` with a custom path. 58 | 59 | **On Windows**, there are some extra [steps](https://rominirani.com/docker-on-windows-mounting-host-directories-d96f3f056a2c) involved, and the launch command looks like 60 | ```sh 61 | docker run -it -p 8080:8080 --name coursera-aml-nlp --user root -v /c/Users/$YOUR_USERNAME:/root/coursera akashin/coursera-aml-nlp 62 | ``` 63 | where `/c/Users/$YOUR_USERNAME` is the path to your user's home folder. 64 | 65 | If you're using Docker Toolbox on Windows, the command given above might not work because of the additional VirtualBox layer involved.
Instead, we recommend that you follow the guidance in http://blog.shahinrostami.com/2017/11/docker-toolbox-windows-7-shared-volumes/. 66 | 67 | ## Stop and resume container 68 | 69 | To stop the container use: 70 | ```sh 71 | docker stop coursera-aml-nlp 72 | ``` 73 | All the changes that were made within the container will be saved. 74 | 75 | To resume the stopped container use: 76 | ```sh 77 | docker start -i coursera-aml-nlp 78 | ``` 79 | ## Other operations on the container 80 | 81 | There are many other operations that you can perform on the container. To show all of them, run: 82 | ```sh 83 | docker container 84 | ``` 85 | Some particularly useful ones are **showing a list of containers** and **removing a container**. 86 | 87 | To show currently running and stopped containers with their status: 88 | ```sh 89 | docker ps -a 90 | ``` 91 | 92 | To connect to a Bash shell in the already running container with the name `coursera-aml-nlp` run: 93 | ``` 94 | docker exec -it coursera-aml-nlp bash 95 | ``` 96 | This will drop you into the standard Linux Bash shell that supports common commands like `ls`, `wget` or `python3`. 97 | 98 | To remove the container and all data associated with it: 99 | ```sh 100 | docker rm coursera-aml-nlp 101 | ``` 102 | Note that this will remove all the internal data of the container (e.g. installed packages), but the data written to your locally mounted folder (`-v` option) will not be affected. 103 | 104 | ## Install more packages 105 | 106 | You can install more packages in the container if needed: 107 | ```sh 108 | docker exec coursera-aml-nlp pip3 install PACKAGE_NAME 109 | ``` 110 | 111 | ## Change RAM limits of the container 112 | 113 | Your container might have memory limits that are different from the actual limits of your physical machine, which might lead to a crash of your code due to memory shortage. 114 | 115 | - If you're running Windows or OSX, the default limit is 2GB, but you can change it by following these tutorials: 116 | - For Windows: https://docs.docker.com/docker-for-windows/#advanced 117 | - For Mac OSX: https://docs.docker.com/docker-for-mac/#advanced 118 | 119 | - If you're running Linux, you're all set, as the memory limits are the same as the physical memory of your machine. 120 | 121 | 122 | ## Further reading 123 | 124 | If you are interested in learning more about Docker, check out these articles: 125 | - Using Jupyter notebook from Docker: https://www.dataquest.io/blog/docker-data-science/ 126 | - General introduction to Docker: https://docker-curriculum.com/ 127 | 128 | ## Troubleshooting 129 | 130 | ### Verify your Docker installation by running the "Hello World" application 131 | - Run `docker pull hello-world`. You should see a message that ends with 132 | “Status: Downloaded newer image for hello-world:latest”. 133 | - Run `docker run hello-world`. You should see a message that starts with 134 | “Hello from Docker! 135 | This message shows that your installation appears to be working correctly.” 136 | 137 | If you see any errors, follow the relevant troubleshooting steps. 138 | 139 | ### “Unauthorized: authentication required” when trying to pull Docker image 140 | Run `docker logout` and try pulling again. If this doesn't help, make sure the system date is set correctly and try again. If this still doesn't help, reinstall Docker and try again.
141 | 142 | ### Can't open Jupyter notebook in the browser 143 | If you try to open "http://localhost:8080" or "http://127.0.0.1:8080" in your browser while the `run_notebook` command is running and you can't access your notebooks, here are some tips: 144 | - If you're using Docker Toolbox on Windows, try accessing "http://192.168.99.100:8080" instead. If this doesn't work, follow the instructions [on official Docker docs](https://docs.docker.com/docker-for-windows/troubleshoot/#limitations-of-windows-containers-for-localhost-and-published-ports) and on [Stackoverflow](https://stackoverflow.com/questions/42866013/docker-toolbox-localhost-not-working). 145 | - Make sure that you're running the container with the `-p` flag as described [here](#run-container-for-the-first-time) and that the output of `docker ps` contains a line like this: 146 | ``` 147 | CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 148 | e5b7bcd85a1b akashin/coursera-aml-nlp "/bin/bash" Less than a second ago Up 2 seconds 8080/tcp peaceful_lamarr 149 | ``` 150 | If the part about `PORTS` differs, remove the current container following the [instructions](#other-operations-on-the-container) and start it again. 151 | - Make sure that browser proxy settings don't interfere with accessing local websites. 152 | - If you're running Docker on AWS, make sure you've set up port forwarding as described [here](https://github.com/hse-aml/natural-language-processing/blob/master/AWS-tutorial.md#2-set-up-dependencies-and-run-your-project). 153 | 154 | ### How do I load data into the Docker container? 155 | To access the data in the container, we recommend using the `-v` flag described [here](#run-container-for-the-first-time) to mount a local directory from your computer into the container filesystem. For more details, read the [Docker documentation](https://docs.docker.com/storage/volumes/). 156 | 157 | Alternatively, you can download data using the Jupyter "Upload" button or the `wget` command in the [Bash shell](#other-operations-on-the-container) of the container. 158 | 159 | ### Can't run `run_notebook` or `starspace` command 160 | Make sure that you're executing it in the context of the Docker container as described [here](#run-container-for-the-first-time). 161 | 162 | ### "Name is already in use by container" when trying to run the container 163 | This means that a container with this name already exists. You can connect to this container or remove it by following the [instructions](#other-operations-on-the-container). 164 | 165 | ### StarSpace/Jupyter notebook crashes in Docker 166 | This usually happens due to the low default 2GB memory limit on Windows and OSX. Follow these [instructions](#change-ram-limits-of-the-container) to fix this. 167 | 168 | ### "This computer doesn't have VT-X/AMD-v enabled", when trying to run the container 169 | This usually happens if you're using Docker Toolbox, which needs VirtualBox support and hence hardware virtualization enabled in BIOS. 170 | Try to turn on the VT-X support in BIOS as described in the [Microsoft documentation](https://blogs.technet.microsoft.com/canitpro/2015/09/08/step-by-step-enabling-hyper-v-for-use-on-windows-10/) or on [GitHub](https://github.com/docker/machine/issues/4271). 171 | 172 | ## Reporting the issue to the Coursera forum 173 | Before reporting the issue to the Coursera forum, please make sure that you've checked the [troubleshooting](#troubleshooting) steps.
Only if they don't help, create a post with all relevant error messages, troubleshooting results, and the following information: 174 | 175 | - Your operating system (e.g. Windows 7, Ubuntu Linux, OSX 10.13.3) 176 | - Your Docker version (e.g. Docker Toolbox, Docker for Windows, output of `docker --version`) 177 | - Output of `docker ps -a`, `docker info`, `docker version -f "{{ .Server.Os }}"` (share through https://gist.github.com/ or https://pastebin.com/) 178 | - Output of `wget http://localhost:8080` (or `wget http://192.168.99.100:8080` for Docker Toolbox), executed from within the Docker container and outside of it 179 | 180 | ## Credits 181 | 182 | The template for this Dockerfile was taken from https://github.com/ZEMUSHKA/coursera-aml-docker 183 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Natural Language Processing course resources 2 | This repository contains practical assignments for the Natural Language Processing course by the Higher School of Economics: 3 | https://www.coursera.org/learn/language-processing. 4 | In this course you will learn how to solve common NLP problems using classical and deep learning approaches. 5 | 6 | On the practical side, we expect familiarity with Python, since we will use it for all assignments in the course. Two of the assignments will also involve TensorFlow. You will work with many other libraries, including NLTK, Scikit-learn, and Gensim. You have several options for setting everything up. 7 | 8 | ## 1. Running on Google Colab 9 | Google has released its own flavour of Jupyter called Colab, which has free GPUs! 10 | 11 | Here's how you can use it: 12 | 1. Open https://colab.research.google.com, click **Sign in** in the upper right corner, and use your Google credentials to sign in. 13 | 2. Click the **GITHUB** tab, paste https://github.com/hse-aml/natural-language-processing and press Enter. 14 | 3. Choose the notebook you want to open, e.g. week1/week1-MultilabelClassification.ipynb. 15 | 4. Click **File -> Save a copy in Drive...** to save your progress in Google Drive. 16 | 5. _If you need a GPU_, click **Runtime -> Change runtime type** and select **GPU** in the Hardware accelerator box. 17 | 6. **Execute** the following code, which downloads dependencies, in the first cell (change for your week number): 18 | ```python 19 | ! wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py -O setup_google_colab.py 20 | import setup_google_colab 21 | # please, uncomment the week you're working on 22 | # setup_google_colab.setup_week1() 23 | # setup_google_colab.setup_week2() 24 | # setup_google_colab.setup_week3() 25 | # setup_google_colab.setup_week4() 26 | # setup_google_colab.setup_project() 27 | # setup_google_colab.setup_honor() 28 | ``` 29 | 7. If you run many notebooks on Colab, they can continue to eat up memory. 30 | You can kill them with `! pkill -9 python3` and check with `! nvidia-smi` that GPU memory is freed. 31 | 32 | **Known issues:** 33 | * No support for `ipywidgets`, so we cannot use fancy `tqdm` progress bars. 34 | For now, we use a simplified version of a progress bar suitable for Colab. 35 | * Blinking animation with `IPython.display.clear_output()`. 36 | It's usable, but we are still looking for a workaround.
37 | * If you see an error "No module named 'common'", make sure you've uncommented the assignment-specific line in step 6, then restart your kernel and execute all cells again. 38 | 39 | ## 2. Running locally 40 | 41 | Two options here: 42 | 43 | 1. Use the Docker container of our course. It already has all the libraries that you will need. The setup is very simple: install the Docker application for your OS, download our container image, and run everything within the container. Please see this [detailed Docker tutorial](Docker-tutorial.md). 44 | 45 | 2. Manually install all the libraries depending on your OS (each task contains a list of the needed libraries at the very beginning). If you use Windows/macOS, you might find the Anaconda distribution useful, as it allows you to easily install most of the needed libraries. However, some tools, like StarSpace for week 2, are not compatible with Windows, so it's likely that you will have to use Docker anyway if you go for these tasks. 46 | 47 | It might take a significant amount of time and resources to run the assignments' code, but we expect that an average laptop is enough to accomplish the tasks. All assignments were tested in Docker on a Mac with 8GB RAM. If you run into memory errors, they could be caused by untested configurations or inefficient code. Consider reporting these cases or double-checking your code. 48 | 49 | If you want to run the code of the course on an AWS machine, we've prepared the [AWS tutorial here](AWS-tutorial.md). 50 | -------------------------------------------------------------------------------- /common/README.md: -------------------------------------------------------------------------------- 1 | # Common utils 2 | 3 | This folder stores a collection of functions that are shared across different assignments. 4 | 5 | - `download_utils.py`: Functions for downloading data for the assignments.
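These helpers are meant to be imported from the week folders. A minimal usage sketch (assuming it is run from a `weekN/` directory with internet access; the exact `download_week*_resources` function depends on the assignment):

```python
import sys
sys.path.append("..")  # make the top-level `common` package importable from a weekN/ folder

from common.download_utils import download_week1_resources
from common import tqdm_utils

# Downloads train.tsv, validation.tsv, test.tsv, ... into a local data/ folder,
# skipping files that were already downloaded.
download_week1_resources()

# tqdm_notebook_failsafe falls back to a simple text-based progress bar
# when the ipywidgets-based tqdm is unavailable (e.g. on Google Colab).
for _ in tqdm_utils.tqdm_notebook_failsafe(range(100)):
    pass
```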
6 | -------------------------------------------------------------------------------- /common/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | -------------------------------------------------------------------------------- /common/download_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import shutil 5 | import requests 6 | from common import tqdm_utils 7 | 8 | 9 | REPOSITORY_PATH = "https://github.com/hse-aml/natural-language-processing" 10 | 11 | 12 | def download_file(url, file_path): 13 | r = requests.get(url, stream=True) 14 | total_size = int(r.headers.get('content-length')) 15 | try: 16 | with open(file_path, 'wb', buffering=16*1024*1024) as f: 17 | bar = tqdm_utils.tqdm_notebook_failsafe(total=total_size, unit='B', unit_scale=True) 18 | bar.set_description(os.path.split(file_path)[-1]) 19 | for chunk in r.iter_content(32 * 1024): 20 | f.write(chunk) 21 | bar.update(len(chunk)) 22 | bar.close() 23 | except Exception: 24 | print("Download failed") 25 | finally: 26 | if os.path.getsize(file_path) != total_size: 27 | os.remove(file_path) 28 | print("Removed incomplete download") 29 | 30 | 31 | def download_from_github(version, fn, target_dir, force=False): 32 | url = REPOSITORY_PATH + "/releases/download/{0}/{1}".format(version, fn) 33 | file_path = os.path.join(target_dir, fn) 34 | if os.path.exists(file_path) and not force: 35 | print("File {} is already downloaded.".format(file_path)) 36 | return 37 | download_file(url, file_path) 38 | 39 | 40 | def sequential_downloader(version, fns, target_dir, force=False): 41 | os.makedirs(target_dir, exist_ok=True) 42 | for fn in fns: 43 | download_from_github(version, fn, target_dir, force=force) 44 | 45 | 46 | def download_week1_resources(force=False): 47 | sequential_downloader( 48 | "week1", 49 | [ 50 | "train.tsv", 51 | "validation.tsv", 52 | "test.tsv", 53 | "text_prepare_tests.tsv", 54 | ], 55 | "data", 56 | force=force 57 | ) 58 | 59 | 60 | def download_week2_resources(force=False): 61 | sequential_downloader( 62 | "week2", 63 | [ 64 | "train.txt", 65 | "validation.txt", 66 | "test.txt", 67 | ], 68 | "data", 69 | force=force 70 | ) 71 | 72 | 73 | def download_week3_resources(force=False): 74 | sequential_downloader( 75 | "week3", 76 | [ 77 | "train.tsv", 78 | "validation.tsv", 79 | "test.tsv", 80 | "test_embeddings.tsv", 81 | ], 82 | "data", 83 | force=force 84 | ) 85 | print("Downloading GoogleNews-vectors-negative300.bin.gz (1.5G) for you, it will take a while...") 86 | download_file("https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz", 87 | "GoogleNews-vectors-negative300.bin.gz") 88 | 89 | 90 | def download_project_resources(force=False): 91 | sequential_downloader( 92 | "project", 93 | [ 94 | "dialogues.tsv", 95 | "tagged_posts.tsv", 96 | ], 97 | "data", 98 | force=force 99 | ) 100 | -------------------------------------------------------------------------------- /common/requirements_colab.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | backports.weakref==1.0.post1 3 | ChatterBot==0.7.6 4 | enum34==1.1.6 5 | funcsigs==1.0.2 6 | gensim==3.8.0 7 | jedi==0.11.0 8 | libarchive==0.4.4 9 | mock==2.0.0 10 | parso==0.1.0 11 | pbr==3.1.1 12 | regex==2017.11.9 13 | -------------------------------------------------------------------------------- 
/common/tqdm_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import print_function 4 | 5 | 6 | class SimpleTqdm(): 7 | def __init__(self, iterable=None, total=None, **kwargs): 8 | self.iterable = list(iterable) if iterable is not None else None 9 | self.total = len(self.iterable) if self.iterable is not None else total 10 | assert self.iterable is not None or self.total is not None 11 | self.current_step = 0 12 | self.print_frequency = max(self.total // 50, 1) 13 | self.desc = "" 14 | 15 | def set_description_str(self, desc): 16 | self.desc = desc 17 | 18 | def set_description(self, desc): 19 | self.desc = desc 20 | 21 | def update(self, steps): 22 | last_print_step = (self.current_step // self.print_frequency) * self.print_frequency 23 | i = 1 24 | while last_print_step + i * self.print_frequency <= self.current_step + steps: 25 | print("*", end='') 26 | i += 1 27 | self.current_step += steps 28 | 29 | def close(self): 30 | print("\n" + self.desc) 31 | 32 | def __iter__(self): 33 | assert self.iterable is not None 34 | self.index = 0 35 | return self 36 | 37 | def __next__(self): 38 | if self.index < self.total: 39 | element = self.iterable[self.index] 40 | self.update(1) 41 | self.index += 1 42 | return element 43 | else: 44 | self.close() 45 | raise StopIteration 46 | 47 | 48 | def tqdm_notebook_failsafe(*args, **kwargs): 49 | try: 50 | import tqdm 51 | tqdm.monitor_interval = 0 # workaround for https://github.com/tqdm/tqdm/issues/481 52 | return tqdm.tqdm_notebook(*args, **kwargs) 53 | except: 54 | # tqdm is broken on Google Colab 55 | return SimpleTqdm(*args, **kwargs) 56 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | LABEL maintainer="Andrei Kashin " 3 | 4 | RUN apt-get update && apt-get install -yq \ 5 | python3 python3-pip htop nano git wget \ 6 | libglib2.0-0 autoconf automake \ 7 | libtool build-essential unzip \ 8 | libarchive-dev vim 9 | 10 | # Install Starspace. 11 | RUN wget https://dl.bintray.com/boostorg/release/1.63.0/source/boost_1_63_0.zip && \ 12 | unzip boost_1_63_0.zip && \ 13 | mv boost_1_63_0 /usr/local/bin 14 | 15 | RUN git clone https://github.com/facebookresearch/Starspace.git && \ 16 | cd Starspace && \ 17 | make && \ 18 | cp -Rf starspace /usr/local/bin 19 | 20 | # Install Python dependencies. 21 | ADD requirements.txt / 22 | RUN pip3 install --upgrade pip 23 | RUN pip3 install -r requirements.txt 24 | 25 | # Install Jupyter. 26 | RUN jupyter nbextension enable --py --sys-prefix widgetsnbextension 27 | RUN jupyter contrib nbextension install 28 | RUN jupyter nbextension enable codefolding/main 29 | RUN echo "c.NotebookApp.ip = '*'" >> /root/.jupyter/jupyter_notebook_config.py 30 | RUN echo "c.NotebookApp.port = 8080" >> /root/.jupyter/jupyter_notebook_config.py 31 | RUN echo "c.NotebookApp.token = ''" >> /root/.jupyter/jupyter_notebook_config.py 32 | RUN echo "jupyter notebook --no-browser --allow-root" >> /usr/local/bin/run_notebook && chmod +x /usr/local/bin/run_notebook 33 | 34 | # Welcome message. 35 | ADD welcome_message.txt / 36 | RUN echo '[ ! 
-z "$TERM" -a -r /etc/motd ] && cat /etc/motd' \ 37 | >> /etc/bash.bashrc \ 38 | ; cat welcome_message.txt > /etc/motd 39 | 40 | WORKDIR /root 41 | EXPOSE 8080 42 | -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | backports.weakref==1.0.post1 2 | bleach==1.5.0 3 | certifi==2017.11.5 4 | chardet==3.0.4 5 | ChatterBot==0.7.6 6 | decorator==4.1.2 7 | entrypoints==0.2.3 8 | enum34==1.1.6 9 | funcsigs==1.0.2 10 | gensim==3.1.0 11 | html5lib==0.9999999 12 | idna==2.6 13 | ipykernel==4.6.1 14 | ipython==6.2.1 15 | ipython-genutils==0.2.0 16 | ipywidgets==7.0.5 17 | jedi==0.11.0 18 | Jinja2==2.10 19 | jsonschema==2.6.0 20 | jupyter==1.0.0 21 | jupyter-client==5.1.0 22 | jupyter-console==5.2.0 23 | jupyter-contrib-core==0.3.3 24 | jupyter-contrib-nbextensions==0.3.3 25 | jupyter-core==4.4.0 26 | jupyter-highlight-selected-word==0.1.0 27 | jupyter-latex-envs==1.3.8.4 28 | jupyter-nbextensions-configurator==0.2.8 29 | libarchive==0.4.4 30 | Markdown==2.6.9 31 | MarkupSafe==1.0 32 | matplotlib==2.1.0 33 | mistune==0.8.1 34 | mock==2.0.0 35 | nbconvert==5.3.1 36 | nbformat==4.4.0 37 | nltk==3.4.5 38 | notebook==5.7.8 39 | numpy==1.13.3 40 | pandas==0.21.0 41 | pandocfilters==1.4.2 42 | parso==0.1.0 43 | pbr==3.1.1 44 | pexpect==4.3.0 45 | pickleshare==0.7.4 46 | prompt-toolkit==1.0.15 47 | protobuf==3.5.0.post1 48 | ptyprocess==0.5.2 49 | Pygments==2.2.0 50 | python-dateutil==2.6.1 51 | pyzmq==16.0.3 52 | qtconsole==4.3.1 53 | regex==2017.11.9 54 | requests==2.18.4 55 | scikit-learn==0.19.1 56 | scipy==1.0.0 57 | simplegeneric==0.8.1 58 | six==1.11.0 59 | tensorflow==1.15.0 60 | tensorflow-tensorboard==0.4.0rc3 61 | terminado==0.7 62 | testpath==0.3.1 63 | tornado==4.5.2 64 | tqdm==4.19.4 65 | traitlets==4.3.2 66 | urllib3==1.22 67 | wcwidth==0.1.7 68 | Werkzeug==0.12.2 69 | widgetsnbextension==3.0.8 70 | -------------------------------------------------------------------------------- /docker/welcome_message.txt: -------------------------------------------------------------------------------- 1 | 2 | =================================================================== 3 | Welcome to the Docker container for the Coursera NLP course. 4 | 5 | This container contains dependencies that you might need 6 | to complete course assignments. 
7 | 8 | You can also install any additional system dependencies with 9 | > apt-get install PACKAGE_NAME 10 | 11 | And Python dependencies with 12 | > pip3 install PACKAGE_NAME 13 | 14 | To run Jupyter Notebook in the container just type 15 | > run_notebook 16 | =================================================================== 17 | 18 | -------------------------------------------------------------------------------- /honor/README.md: -------------------------------------------------------------------------------- 1 | # Utils to download and read data for chat-bot training 2 | 3 | This folder contains scripts for downloading, reading and preprocessing data for chat-bot training: 4 | - `download_cornell.sh` - downloads Cornell movie dialogues dataset (small size) 5 | - `download_opensubs.sh` - downloads Opensubs movie subtitles dataset (large size) 6 | - `datasets.py` - module to be imported in your scripts, that exports functions for reading a dataset 7 | - `example.py` - example of reading the dataset 8 | -------------------------------------------------------------------------------- /honor/datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 Conchylicultor. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import ast 17 | import os 18 | import random 19 | import re 20 | from time import time 21 | 22 | import nltk 23 | from tqdm import tqdm 24 | 25 | """ 26 | Load the cornell movie dialog corpus. 27 | 28 | Available from here: 29 | http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html 30 | 31 | """ 32 | 33 | class CornellData: 34 | """ 35 | 36 | """ 37 | 38 | def __init__(self, dirName): 39 | """ 40 | Args: 41 | dirName (string): directory where to load the corpus 42 | """ 43 | self.lines = {} 44 | self.conversations = [] 45 | 46 | MOVIE_LINES_FIELDS = ["lineID","characterID","movieID","character","text"] 47 | MOVIE_CONVERSATIONS_FIELDS = ["character1ID","character2ID","movieID","utteranceIDs"] 48 | 49 | self.lines = self.loadLines(os.path.join(dirName, "movie_lines.txt"), MOVIE_LINES_FIELDS) 50 | self.conversations = self.loadConversations(os.path.join(dirName, "movie_conversations.txt"), MOVIE_CONVERSATIONS_FIELDS) 51 | 52 | # TODO: Cleaner program (merge copy-paste) !! 53 | 54 | def loadLines(self, fileName, fields): 55 | """ 56 | Args: 57 | fileName (str): file to load 58 | field (set): fields to extract 59 | Return: 60 | dict>: the extracted fields for each line 61 | """ 62 | lines = {} 63 | 64 | with open(fileName, 'r', encoding='iso-8859-1') as f: # TODO: Solve Iso encoding pb ! 
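            # Each line of movie_lines.txt is one " +++$+++ "-separated record whose
            # fields follow MOVIE_LINES_FIELDS above (lineID, characterID, movieID,
            # character, text); an illustrative example:
            #   L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!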
65 | for line in f: 66 | values = line.split(" +++$+++ ") 67 | 68 | # Extract fields 69 | lineObj = {} 70 | for i, field in enumerate(fields): 71 | lineObj[field] = values[i] 72 | 73 | lines[lineObj['lineID']] = lineObj 74 | 75 | return lines 76 | 77 | def loadConversations(self, fileName, fields): 78 | """ 79 | Args: 80 | fileName (str): file to load 81 | field (set): fields to extract 82 | Return: 83 | list>: the extracted fields for each line 84 | """ 85 | conversations = [] 86 | 87 | with open(fileName, 'r', encoding='iso-8859-1') as f: # TODO: Solve Iso encoding pb ! 88 | for line in f: 89 | values = line.split(" +++$+++ ") 90 | 91 | # Extract fields 92 | convObj = {} 93 | for i, field in enumerate(fields): 94 | convObj[field] = values[i] 95 | 96 | # Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]") 97 | lineIds = ast.literal_eval(convObj["utteranceIDs"]) 98 | 99 | # Reassemble lines 100 | convObj["lines"] = [] 101 | for lineId in lineIds: 102 | convObj["lines"].append(self.lines[lineId]) 103 | 104 | conversations.append(convObj) 105 | 106 | return conversations 107 | 108 | def getConversations(self): 109 | return self.conversations 110 | 111 | 112 | # Based on code from https://github.com/AlJohri/OpenSubtitles 113 | # by Al Johri 114 | 115 | import xml.etree.ElementTree as ET 116 | import datetime 117 | import os 118 | import sys 119 | import json 120 | import re 121 | import pprint 122 | 123 | from gzip import GzipFile 124 | 125 | """ 126 | Load the opensubtitles dialog corpus. 127 | """ 128 | 129 | class OpensubsData: 130 | """ 131 | """ 132 | 133 | def __init__(self, dirName): 134 | """ 135 | Args: 136 | dirName (string): directory where to load the corpus 137 | """ 138 | 139 | # Hack this to filter on subset of Opensubtitles 140 | # dirName = "%s/en/Action" % dirName 141 | 142 | print("Loading OpenSubtitles conversations in %s." % dirName) 143 | self.conversations = [] 144 | self.tag_re = re.compile(r'(|<[^>]*>)') 145 | self.conversations = self.loadConversations(dirName) 146 | 147 | def loadConversations(self, dirName): 148 | """ 149 | Args: 150 | dirName (str): folder to load 151 | Return: 152 | array(question, answer): the extracted QA pairs 153 | """ 154 | conversations = [] 155 | dirList = self.filesInDir(dirName) 156 | for filepath in tqdm(dirList, "OpenSubtitles data files"): 157 | if filepath.endswith('gz'): 158 | try: 159 | doc = self.getXML(filepath) 160 | conversations.extend(self.genList(doc)) 161 | except ValueError: 162 | tqdm.write("Skipping file %s with errors." 
% filepath) 163 | except: 164 | print("Unexpected error:", sys.exc_info()[0]) 165 | raise 166 | return conversations 167 | 168 | def getConversations(self): 169 | return self.conversations 170 | 171 | def genList(self, tree): 172 | root = tree.getroot() 173 | 174 | timeFormat = '%H:%M:%S' 175 | maxDelta = datetime.timedelta(seconds=1) 176 | 177 | startTime = datetime.datetime.min 178 | strbuf = '' 179 | sentList = [] 180 | 181 | for child in root: 182 | for elem in child: 183 | if elem.tag == 'time': 184 | elemID = elem.attrib['id'] 185 | elemVal = elem.attrib['value'][:-4] 186 | if elemID[-1] == 'S': 187 | startTime = datetime.datetime.strptime(elemVal, timeFormat) 188 | else: 189 | sentList.append((strbuf.strip(), startTime, datetime.datetime.strptime(elemVal, timeFormat))) 190 | strbuf = '' 191 | else: 192 | try: 193 | strbuf = strbuf + " " + elem.text 194 | except: 195 | pass 196 | 197 | conversations = [] 198 | for idx in range(0, len(sentList) - 1): 199 | cur = sentList[idx] 200 | nxt = sentList[idx + 1] 201 | if nxt[1] - cur[2] <= maxDelta and cur and nxt: 202 | tmp = {} 203 | tmp["lines"] = [] 204 | tmp["lines"].append(self.getLine(cur[0])) 205 | tmp["lines"].append(self.getLine(nxt[0])) 206 | if self.filter(tmp): 207 | conversations.append(tmp) 208 | 209 | return conversations 210 | 211 | def getLine(self, sentence): 212 | line = {} 213 | line["text"] = self.tag_re.sub('', sentence).replace('\\\'','\'').strip().lower() 214 | return line 215 | 216 | def filter(self, lines): 217 | # Use the followint to customize filtering of QA pairs 218 | # 219 | # startwords = ("what", "how", "when", "why", "where", "do", "did", "is", "are", "can", "could", "would", "will") 220 | # question = lines["lines"][0]["text"] 221 | # if not question.endswith('?'): 222 | # return False 223 | # if not question.split(' ')[0] in startwords: 224 | # return False 225 | # 226 | return True 227 | 228 | def getXML(self, filepath): 229 | fext = os.path.splitext(filepath)[1] 230 | if fext == '.gz': 231 | tmp = GzipFile(filename=filepath) 232 | return ET.parse(tmp) 233 | else: 234 | return ET.parse(filepath) 235 | 236 | def filesInDir(self, dirname): 237 | result = [] 238 | for dirpath, dirs, files in os.walk(dirname): 239 | for filename in files: 240 | fname = os.path.join(dirpath, filename) 241 | result.append(fname) 242 | return result 243 | 244 | 245 | def extractText(line, fast_preprocessing=True): 246 | if fast_preprocessing: 247 | GOOD_SYMBOLS_RE = re.compile('[^0-9a-z ]') 248 | REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;#+_]') 249 | REPLACE_SEVERAL_SPACES = re.compile('\s+') 250 | 251 | line = line.lower() 252 | line = REPLACE_BY_SPACE_RE.sub(' ', line) 253 | line = GOOD_SYMBOLS_RE.sub('', line) 254 | line = REPLACE_SEVERAL_SPACES.sub(' ', line) 255 | return line.strip() 256 | else: 257 | return nltk.word_tokenize(line) 258 | 259 | 260 | def splitConversations(conversations, max_len=20, fast_preprocessing=True): 261 | data = [] 262 | for i, conversation in enumerate(tqdm(conversations)): 263 | lines = conversation['lines'] 264 | for i in range(len(lines) - 1): 265 | request = extractText(lines[i]['text']) 266 | reply = extractText(lines[i + 1]['text']) 267 | if 0 < len(request) <= max_len and 0 < len(reply) <= max_len: 268 | data += [(request, reply)] 269 | return data 270 | 271 | 272 | def readCornellData(path, max_len=20, fast_preprocessing=True): 273 | dataset = CornellData(path) 274 | conversations = dataset.getConversations() 275 | return splitConversations(conversations, max_len=max_len, 
fast_preprocessing=fast_preprocessing) 276 | 277 | 278 | def readOpensubsData(path, max_len=20, fast_preprocessing=True): 279 | dataset = OpensubsData(path) 280 | conversations = dataset.getConversations() 281 | return splitConversations(conversations, max_len=max_len, fast_preprocessing=fast_preprocessing) 282 | -------------------------------------------------------------------------------- /honor/download_cornell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p data/cornell 4 | cd data/cornell 5 | wget https://github.com/Conchylicultor/DeepQA/raw/master/data/cornell/movie_conversations.txt 6 | wget https://github.com/Conchylicultor/DeepQA/raw/master/data/cornell/movie_lines.txt 7 | -------------------------------------------------------------------------------- /honor/download_opensubs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p data/opensubs 4 | cd data/opensubs 5 | wget -O en.tar.gz http://opus.lingfil.uu.se/download.php?f=OpenSubtitles/en.tar.gz 6 | tar -xf en.tar.gz 7 | rm en.tar.gz 8 | -------------------------------------------------------------------------------- /honor/example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import datasets 4 | import argparse 5 | import os 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("dataset", choices=["cornell", "opensubs"], help="Name of the dataset.") 10 | parser.add_argument("--max_len", type=int, default=10, help="Max length of sentences to consider.") 11 | args = parser.parse_args() 12 | 13 | dataset_path = os.path.join("data", args.dataset) 14 | if args.dataset == "cornell": 15 | data = datasets.readCornellData(dataset_path, max_len=args.max_len) 16 | elif args.dataset == "opensubs": 17 | data = datasets.readOpensubsData(dataset_path, max_len=args.max_len) 18 | else: 19 | raise ValueError("Unrecognized dataset: {!r}".format(args.dataset)) 20 | 21 | print("Size of dataset: {}".format(len(data))) 22 | print("First 10 training pairs:") 23 | for item in data[:10]: 24 | print(item) 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /optional/README.md: -------------------------------------------------------------------------------- 1 | # Optional projects 2 | 3 | This folder contains optional projects available in this course. 4 | -------------------------------------------------------------------------------- /optional/telegram_bot/README.md: -------------------------------------------------------------------------------- 1 | # [Optional] Telegram bot 2 | 3 | This folder contains the starting code for the optional Telegram bot extension of the project. 4 | 5 | If you want to permanently host your bot, you can follow our [AWS tutorial](../../AWS-tutorial.md). 6 | 7 | ## Troubleshooting 8 | 9 | ### Bot crashes with the unicode error 10 | 11 | If your bot code crashes with the error that ends with `UnicodeEncodeError: 'ascii' codec can't encode character`, 12 | your terminal likely has problems showing unicode symbols. 
To fix this you can change your terminal local by adding 13 | the following lines to you `~/.bashrc` file (or any other shell configuration): 14 | 15 | ``` 16 | export LC_ALL=en_US.UTF-8 17 | export LANG=en_US.UTF-8 18 | export LANGUAGE=en_US.UTF-8 19 | ``` 20 | 21 | To verify the effect, you can run the following command end check that it outputs 'utf-8' 22 | ```python 23 | > python -c 'import locale; print(locale.getpreferredencoding())' 24 | utf-8 25 | ``` 26 | 27 | You can find more details in this [article](https://perlgeek.de/en/article/set-up-a-clean-utf8-environment). 28 | 29 | If this doesn't work, you can explicitly specify the encoding when opening files: 30 | ```python 31 | with open(filename, 'r', encoding="utf-8") as file: 32 | ... 33 | ``` 34 | -------------------------------------------------------------------------------- /optional/telegram_bot/dialogue_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sklearn.metrics.pairwise import pairwise_distances_argmin 3 | 4 | from chatterbot import ChatBot 5 | from utils import * 6 | 7 | 8 | class ThreadRanker(object): 9 | def __init__(self, paths): 10 | self.word_embeddings, self.embeddings_dim = load_embeddings(paths['WORD_EMBEDDINGS']) 11 | self.thread_embeddings_folder = paths['THREAD_EMBEDDINGS_FOLDER'] 12 | 13 | def __load_embeddings_by_tag(self, tag_name): 14 | embeddings_path = os.path.join(self.thread_embeddings_folder, tag_name + ".pkl") 15 | thread_ids, thread_embeddings = unpickle_file(embeddings_path) 16 | return thread_ids, thread_embeddings 17 | 18 | def get_best_thread(self, question, tag_name): 19 | """ Returns id of the most similar thread for the question. 20 | The search is performed across the threads with a given tag. 21 | """ 22 | thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name) 23 | 24 | # HINT: you have already implemented a similar routine in the 3rd assignment. 25 | 26 | question_vec = #### YOUR CODE HERE #### 27 | best_thread = #### YOUR CODE HERE #### 28 | 29 | return thread_ids[best_thread] 30 | 31 | 32 | class DialogueManager(object): 33 | def __init__(self, paths): 34 | print("Loading resources...") 35 | 36 | # Intent recognition: 37 | self.intent_recognizer = unpickle_file(paths['INTENT_RECOGNIZER']) 38 | self.tfidf_vectorizer = unpickle_file(paths['TFIDF_VECTORIZER']) 39 | 40 | self.ANSWER_TEMPLATE = 'I think its about %s\nThis thread might help you: https://stackoverflow.com/questions/%s' 41 | 42 | # Goal-oriented part: 43 | self.tag_classifier = unpickle_file(paths['TAG_CLASSIFIER']) 44 | self.thread_ranker = ThreadRanker(paths) 45 | 46 | def create_chitchat_bot(self): 47 | """Initializes self.chitchat_bot with some conversational model.""" 48 | 49 | # Hint: you might want to create and train chatterbot.ChatBot here. 50 | # It could be done by creating ChatBot with the *trainer* parameter equals 51 | # "chatterbot.trainers.ChatterBotCorpusTrainer" 52 | # and then calling *train* function with "chatterbot.corpus.english" param 53 | 54 | ######################## 55 | #### YOUR CODE HERE #### 56 | ######################## 57 | 58 | def generate_answer(self, question): 59 | """Combines stackoverflow and chitchat parts using intent recognition.""" 60 | 61 | # Recognize intent of the question using `intent_recognizer`. 62 | # Don't forget to prepare question and calculate features for the question. 
63 | 64 | prepared_question = #### YOUR CODE HERE #### 65 | features = #### YOUR CODE HERE #### 66 | intent = #### YOUR CODE HERE #### 67 | 68 | # Chit-chat part: 69 | if intent == 'dialogue': 70 | # Pass question to chitchat_bot to generate a response. 71 | response = #### YOUR CODE HERE #### 72 | return response 73 | 74 | # Goal-oriented part: 75 | else: 76 | # Pass features to tag_classifier to get predictions. 77 | tag = #### YOUR CODE HERE #### 78 | 79 | # Pass prepared_question to thread_ranker to get predictions. 80 | thread_id = #### YOUR CODE HERE #### 81 | 82 | return self.ANSWER_TEMPLATE % (tag, thread_id) 83 | 84 | -------------------------------------------------------------------------------- /optional/telegram_bot/main_bot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import requests 4 | import time 5 | import argparse 6 | import os 7 | import json 8 | 9 | from requests.compat import urljoin 10 | 11 | 12 | class BotHandler(object): 13 | """ 14 | BotHandler is a class which implements all back-end of the bot. 15 | It has tree main functions: 16 | 'get_updates' — checks for new messages 17 | 'send_message' – posts new message to user 18 | 'get_answer' — computes the most relevant on a user's question 19 | """ 20 | 21 | def __init__(self, token, dialogue_manager): 22 | self.token = token 23 | self.api_url = "https://api.telegram.org/bot{}/".format(token) 24 | self.dialogue_manager = dialogue_manager 25 | 26 | def get_updates(self, offset=None, timeout=30): 27 | params = {"timeout": timeout, "offset": offset} 28 | raw_resp = requests.get(urljoin(self.api_url, "getUpdates"), params) 29 | try: 30 | resp = raw_resp.json() 31 | except json.decoder.JSONDecodeError as e: 32 | print("Failed to parse response {}: {}.".format(raw_resp.content, e)) 33 | return [] 34 | 35 | if "result" not in resp: 36 | return [] 37 | return resp["result"] 38 | 39 | def send_message(self, chat_id, text): 40 | params = {"chat_id": chat_id, "text": text} 41 | return requests.post(urljoin(self.api_url, "sendMessage"), params) 42 | 43 | def get_answer(self, question): 44 | if question == '/start': 45 | return "Hi, I am your project bot. How can I help you today?" 46 | return self.dialogue_manager.generate_answer(question) 47 | 48 | 49 | def parse_args(): 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument('--token', type=str, default='') 52 | return parser.parse_args() 53 | 54 | 55 | def is_unicode(text): 56 | return len(text) == len(text.encode()) 57 | 58 | 59 | class SimpleDialogueManager(object): 60 | """ 61 | This is the simplest dialogue manager to test the telegram bot. 62 | Your task is to create a more advanced one in dialogue_manager.py." 63 | """ 64 | 65 | def generate_answer(self, question): 66 | return "Hello, world!" 67 | 68 | 69 | def main(): 70 | args = parse_args() 71 | token = args.token 72 | 73 | if not token: 74 | if not "TELEGRAM_TOKEN" in os.environ: 75 | print("Please, set bot token through --token or TELEGRAM_TOKEN env variable") 76 | return 77 | token = os.environ["TELEGRAM_TOKEN"] 78 | 79 | ################################################################# 80 | 81 | # Your task is to complete dialogue_manager.py and use your 82 | # advanced DialogueManager instead of SimpleDialogueManager. 83 | 84 | # This is the point where you plug it into the Telegram bot. 85 | # Do not forget to import all needed dependencies when you do so. 
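    # A hypothetical sketch of that step (assuming dialogue_manager.py and utils.py
    # are already completed and the pickled resources listed in RESOURCE_PATH exist):
    #
    #   from dialogue_manager import DialogueManager
    #   from utils import RESOURCE_PATH
    #   manager = DialogueManager(RESOURCE_PATH)
    #   bot = BotHandler(token, manager)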
86 | 87 | simple_manager = SimpleDialogueManager() 88 | bot = BotHandler(token, simple_manager) 89 | 90 | ############################################################### 91 | 92 | print("Ready to talk!") 93 | offset = 0 94 | while True: 95 | updates = bot.get_updates(offset=offset) 96 | for update in updates: 97 | print("An update received.") 98 | if "message" in update: 99 | chat_id = update["message"]["chat"]["id"] 100 | if "text" in update["message"]: 101 | text = update["message"]["text"] 102 | if is_unicode(text): 103 | print("Update content: {}".format(update)) 104 | bot.send_message(chat_id, bot.get_answer(update["message"]["text"])) 105 | else: 106 | bot.send_message(chat_id, "Hmm, you are sending some weird characters to me...") 107 | offset = max(offset, update['update_id'] + 1) 108 | time.sleep(1) 109 | 110 | if __name__ == "__main__": 111 | main() 112 | -------------------------------------------------------------------------------- /optional/telegram_bot/utils.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import pickle 3 | import re 4 | import numpy as np 5 | 6 | nltk.download('stopwords') 7 | from nltk.corpus import stopwords 8 | 9 | # Paths for all resources for the bot. 10 | RESOURCE_PATH = { 11 | 'INTENT_RECOGNIZER': 'intent_recognizer.pkl', 12 | 'TAG_CLASSIFIER': 'tag_classifier.pkl', 13 | 'TFIDF_VECTORIZER': 'tfidf_vectorizer.pkl', 14 | 'THREAD_EMBEDDINGS_FOLDER': 'thread_embeddings_by_tags', 15 | 'WORD_EMBEDDINGS': 'word_embeddings.tsv', 16 | } 17 | 18 | 19 | def text_prepare(text): 20 | """Performs tokenization and simple preprocessing.""" 21 | 22 | replace_by_space_re = re.compile('[/(){}\[\]\|@,;]') 23 | bad_symbols_re = re.compile('[^0-9a-z #+_]') 24 | stopwords_set = set(stopwords.words('english')) 25 | 26 | text = text.lower() 27 | text = replace_by_space_re.sub(' ', text) 28 | text = bad_symbols_re.sub('', text) 29 | text = ' '.join([x for x in text.split() if x and x not in stopwords_set]) 30 | 31 | return text.strip() 32 | 33 | 34 | def load_embeddings(embeddings_path): 35 | """Loads pre-trained word embeddings from tsv file. 36 | 37 | Args: 38 | embeddings_path - path to the embeddings file. 39 | 40 | Returns: 41 | embeddings - dict mapping words to vectors; 42 | embeddings_dim - dimension of the vectors. 43 | """ 44 | 45 | # Hint: you have already implemented a similar routine in the 3rd assignment. 46 | # Note that here you also need to know the dimension of the loaded embeddings. 47 | # When you load the embeddings, use numpy.float32 type as dtype 48 | 49 | ######################## 50 | #### YOUR CODE HERE #### 51 | ######################## 52 | 53 | # remove this when you're done 54 | raise NotImplementedError( 55 | "Open utils.py and fill with your code. In case of Google Colab, download" 56 | "(https://github.com/hse-aml/natural-language-processing/blob/master/project/utils.py), " 57 | "edit locally and upload using '> arrow on the left edge' -> Files -> UPLOAD") 58 | 59 | 60 | def question_to_vec(question, embeddings, dim): 61 | """Transforms a string to an embedding by averaging word embeddings.""" 62 | 63 | # Hint: you have already implemented exactly this function in the 3rd assignment. 64 | 65 | ######################## 66 | #### YOUR CODE HERE #### 67 | ######################## 68 | 69 | # remove this when you're done 70 | raise NotImplementedError( 71 | "Open utils.py and fill with your code. 
In case of Google Colab, download" 72 | "(https://github.com/hse-aml/natural-language-processing/blob/master/project/utils.py), " 73 | "edit locally and upload using '> arrow on the left edge' -> Files -> UPLOAD") 74 | 75 | 76 | def unpickle_file(filename): 77 | """Returns the result of unpickling the file content.""" 78 | with open(filename, 'rb') as f: 79 | return pickle.load(f) 80 | -------------------------------------------------------------------------------- /optional/telegram_bot/week5-project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Final project: StackOverflow assistant bot\n", 8 | "\n", 9 | "Congratulations on coming this far and solving the programming assignments! In this final project, we will combine everything we have learned about Natural Language Processing to construct a *dialogue chat bot*, which will be able to:\n", 10 | "* answer programming-related questions (using StackOverflow dataset);\n", 11 | "* chit-chat and simulate dialogue on all non programming-related questions.\n", 12 | "\n", 13 | "For a chit-chat mode we will use a pre-trained neural network engine available from [ChatterBot](https://github.com/gunthercox/ChatterBot).\n", 14 | "Those who aim at honor certificates for our course or are just curious, will train their own models for chit-chat.\n", 15 | "![](https://imgs.xkcd.com/comics/twitter_bot.png)\n", 16 | "©[xkcd](https://xkcd.com)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Data description\n", 24 | "\n", 25 | "To detect *intent* of users questions we will need two text collections:\n", 26 | "- `tagged_posts.tsv` — StackOverflow posts, tagged with one programming language (*positive samples*).\n", 27 | "- `dialogues.tsv` — dialogue phrases from movie subtitles (*negative samples*).\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "try:\n", 39 | " import google.colab\n", 40 | " IN_COLAB = True\n", 41 | "except:\n", 42 | " IN_COLAB = False\n", 43 | "\n", 44 | "if IN_COLAB:\n", 45 | " ! wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py -O setup_google_colab.py\n", 46 | " import setup_google_colab\n", 47 | " setup_google_colab.setup_project()\n", 48 | "\n", 49 | "import sys\n", 50 | "sys.path.append(\"..\")\n", 51 | "from common.download_utils import download_project_resources\n", 52 | "\n", 53 | "download_project_resources()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "For those questions, that have programming-related intent, we will proceed as follow predict programming language (only one tag per question allowed here) and rank candidates within the tag using embeddings.\n", 61 | "For the ranking part, you will need:\n", 62 | "- `word_embeddings.tsv` — word embeddings, that you trained with StarSpace in the 3rd assignment. It's not a problem if you didn't do it, because we can offer an alternative solution for you." 
63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "As a result of this notebook, you should obtain the following new objects that you will then use in the running bot:\n", 70 | "\n", 71 | "- `intent_recognizer.pkl` — intent recognition model;\n", 72 | "- `tag_classifier.pkl` — programming language classification model;\n", 73 | "- `tfidf_vectorizer.pkl` — vectorizer used during training;\n", 74 | "- `thread_embeddings_by_tags` — folder with thread embeddings, arranged by tags.\n", 75 | " " 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "Some functions will be reused by this notebook and the scripts, so we put them into the *utils.py* file. Don't forget to open it and fill in the gaps!" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "from utils import *" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "## Part I. Intent and language recognition" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "We want to write a bot that will not only **answer programming-related questions**, but will also be able to **maintain a dialogue**. We would also like to detect the *intent* of the user from the question (we could have had a 'Question answering mode' check-box in the bot, but it wouldn't be fun at all, would it?). So the first thing we need to do is to **distinguish programming-related questions from general ones**.\n", 108 | "\n", 109 | "It would also be good to predict which programming language a particular question refers to. By doing so, we will speed up the question search by a factor of the number of languages (10 here), and exercise our *text classification* skills a bit. :)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "import numpy as np\n", 121 | "import pandas as pd\n", 122 | "import pickle\n", 123 | "import re\n", 124 | "\n", 125 | "from sklearn.feature_extraction.text import TfidfVectorizer" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "### Data preparation" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "In the first assignment (Predict tags on StackOverflow with linear models), you have already learnt how to preprocess texts and do TF-IDF transformations. Reuse your code here. In addition, you will also need to [dump](https://docs.python.org/3/library/pickle.html#pickle.dump) the TF-IDF vectorizer with pickle to use it later in the running bot."
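The `tfidf_features` gap in the next cell could be filled roughly as below; the vectorizer parameters are illustrative assumptions, and only the fit-on-train, transform-both, pickle-the-vectorizer pattern is prescribed by the text:

```python
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_test, vectorizer_path):
    """Performs TF-IDF transformation and dumps the fitted vectorizer (sketch)."""
    # Illustrative hyperparameters; tune min_df/max_df/ngram_range for your data.
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9,
                                       ngram_range=(1, 2), token_pattern=r'(\S+)')
    X_train = tfidf_vectorizer.fit_transform(X_train)  # fit on the train part only
    X_test = tfidf_vectorizer.transform(X_test)
    with open(vectorizer_path, 'wb') as f:              # bytes mode, as the cell comment asks
        pickle.dump(tfidf_vectorizer, f)
    return X_train, X_test
```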
140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "def tfidf_features(X_train, X_test, vectorizer_path):\n", 151 | " \"\"\"Performs TF-IDF transformation and dumps the model.\"\"\"\n", 152 | " \n", 153 | " # Train a vectorizer on X_train data.\n", 154 | " # Transform X_train and X_test data.\n", 155 | " \n", 156 | " # Pickle the trained vectorizer to 'vectorizer_path'\n", 157 | " # Don't forget to open the file in writing bytes mode.\n", 158 | " \n", 159 | " ######################################\n", 160 | " ######### YOUR CODE HERE #############\n", 161 | " ######################################\n", 162 | " \n", 163 | " return X_train, X_test" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Now, load examples of two classes. Use a subsample of stackoverflow data to balance the classes. You will need the full data later." 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "sample_size = 200000\n", 182 | "\n", 183 | "dialogue_df = pd.read_csv('data/dialogues.tsv', sep='\\t').sample(sample_size, random_state=0)\n", 184 | "stackoverflow_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t').sample(sample_size, random_state=0)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "Check how the data look like:" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "dialogue_df.head()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "stackoverflow_df.head()" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "Apply *text_prepare* function to preprocess the data:" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": true 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "from utils import text_prepare" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "dialogue_df['text'] = ######### YOUR CODE HERE #############\n", 243 | "stackoverflow_df['title'] = ######### YOUR CODE HERE #############" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "### Intent recognition" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "We will do a binary classification on TF-IDF representations of texts. Labels will be either `dialogue` for general questions or `stackoverflow` for programming-related questions. 
First, prepare the data for this task:\n", 258 | "- concatenate `dialogue` and `stackoverflow` examples into one sample\n", 259 | "- split it into train and test in proportion 9:1, use *random_state=0* for reproducibility\n", 260 | "- transform it into TF-IDF features" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "collapsed": true 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "from sklearn.model_selection import train_test_split" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": { 278 | "collapsed": true 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "X = np.concatenate([dialogue_df['text'].values, stackoverflow_df['title'].values])\n", 283 | "y = ['dialogue'] * dialogue_df.shape[0] + ['stackoverflow'] * stackoverflow_df.shape[0]\n", 284 | "\n", 285 | "X_train, X_test, y_train, y_test = ######### YOUR CODE HERE ##########\n", 286 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))\n", 287 | "\n", 288 | "X_train_tfidf, X_test_tfidf = ######### YOUR CODE HERE ###########" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "Train the **intent recognizer** using LogisticRegression on the train set with the following parameters: *penalty='l2'*, *C=10*, *random_state=0*. Print out the accuracy on the test set to check whether everything looks good." 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": true 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "from sklearn.linear_model import LogisticRegression\n", 307 | "from sklearn.metrics import accuracy_score" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "collapsed": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "######################################\n", 319 | "######### YOUR CODE HERE #############\n", 320 | "######################################" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": { 327 | "collapsed": true 328 | }, 329 | "outputs": [], 330 | "source": [ 331 | "# Check test accuracy.\n", 332 | "y_test_pred = intent_recognizer.predict(X_test_tfidf)\n", 333 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 334 | "print('Test accuracy = {}'.format(test_accuracy))" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "Dump the classifier to use it in the running bot." 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": { 348 | "collapsed": true 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "pickle.dump(intent_recognizer, open(RESOURCE_PATH['INTENT_RECOGNIZER'], 'wb'))" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "### Programming language classification " 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "We will train one more classifier for the programming-related questions. It will predict exactly one tag (=programming language) and will be also based on Logistic Regression with TF-IDF features. \n", 367 | "\n", 368 | "First, let us prepare the data for this task." 
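For the intent-recognition cells above, one possible filling is sketched here; it relies on `X`, `y`, `tfidf_features` and `RESOURCE_PATH` defined earlier in the notebook and uses the split proportion and LogisticRegression parameters stated in the instructions:

```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# 9:1 train/test split with random_state=0, as suggested above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
X_train_tfidf, X_test_tfidf = tfidf_features(X_train, X_test,
                                             RESOURCE_PATH['TFIDF_VECTORIZER'])

# Parameters given in the task description.
intent_recognizer = LogisticRegression(penalty='l2', C=10, random_state=0)
intent_recognizer.fit(X_train_tfidf, y_train)
```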
369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": { 375 | "collapsed": true 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "X = stackoverflow_df['title'].values\n", 380 | "y = stackoverflow_df['tag'].values" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": { 387 | "collapsed": true 388 | }, 389 | "outputs": [], 390 | "source": [ 391 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n", 392 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "Let us reuse the TF-IDF vectorizer that we have already created above. It should not make a huge difference which data was used to train it." 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": { 406 | "collapsed": true 407 | }, 408 | "outputs": [], 409 | "source": [ 410 | "vectorizer = pickle.load(open(RESOURCE_PATH['TFIDF_VECTORIZER'], 'rb'))\n", 411 | "\n", 412 | "X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "Train the **tag classifier** using OneVsRestClassifier wrapper over LogisticRegression. Use the following parameters: *penalty='l2'*, *C=5*, *random_state=0*." 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "collapsed": true 427 | }, 428 | "outputs": [], 429 | "source": [ 430 | "from sklearn.multiclass import OneVsRestClassifier" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": { 437 | "collapsed": true 438 | }, 439 | "outputs": [], 440 | "source": [ 441 | "######################################\n", 442 | "######### YOUR CODE HERE #############\n", 443 | "######################################" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": { 450 | "collapsed": true 451 | }, 452 | "outputs": [], 453 | "source": [ 454 | "# Check test accuracy.\n", 455 | "y_test_pred = tag_classifier.predict(X_test_tfidf)\n", 456 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 457 | "print('Test accuracy = {}'.format(test_accuracy))" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": {}, 463 | "source": [ 464 | "Dump the classifier to use it in the running bot." 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": { 471 | "collapsed": true 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "pickle.dump(tag_classifier, open(RESOURCE_PATH['TAG_CLASSIFIER'], 'wb'))" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": {}, 481 | "source": [ 482 | "## Part II. Ranking questions with embeddings" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "To find a relevant answer (a thread from StackOverflow) on a question you will use vector representations to calculate similarity between the question and existing threads. We already had `question_to_vec` function from the assignment 3, which can create such a representation based on word vectors. \n", 490 | "\n", 491 | "However, it would be costly to compute such a representation for all possible answers in *online mode* of the bot (e.g. 
when the bot is running and answering questions from many users). This is the reason why you will create a *database* with pre-computed representations. These representations will be arranged by non-overlapping tags (programming languages), so that the search for an answer can be performed within a single tag each time. This will make our bot even more efficient and allow us to avoid keeping the whole database in RAM. " 492 | ] 493 | }, 494 | { 495 | "cell_type": "markdown", 496 | "metadata": {}, 497 | "source": [ 498 | "Load the StarSpace embeddings which were trained on Stack Overflow posts. These embeddings were trained in *supervised mode* for duplicate detection on the same corpus that is used in search. We can count on these representations to help us find closely related answers to a question. \n", 499 | "\n", 500 | "If for some reason you didn't train StarSpace embeddings in assignment 3, you can use [pre-trained word vectors](https://code.google.com/archive/p/word2vec/) from Google. All instructions about how to work with these vectors were provided in the same assignment. However, we highly recommend using the StarSpace embeddings, because they are better suited to this data. If you choose to use Google's embeddings, drop the words that do not appear in the StackOverflow data." 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": { 507 | "collapsed": true 508 | }, 509 | "outputs": [], 510 | "source": [ 511 | "starspace_embeddings, embeddings_dim = load_embeddings('data/word_embeddings.tsv')" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "Since we want to precompute representations for all possible answers, we need to load the whole posts dataset, unlike what we did for the intent classifier:" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": { 525 | "collapsed": true 526 | }, 527 | "outputs": [], 528 | "source": [ 529 | "posts_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t')" 530 | ] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "metadata": {}, 535 | "source": [ 536 | "Look at the distribution of posts over programming languages (tags) and find the most common ones. \n", 537 | "You might want to use the pandas [groupby](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html) and [count](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.count.html) methods:" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": { 544 | "collapsed": true 545 | }, 546 | "outputs": [], 547 | "source": [ 548 | "counts_by_tag = ######### YOUR CODE HERE #############" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "Now for each `tag` you need to create two data structures, which will serve as an online search index:\n", 556 | "* `tag_post_ids` — a list of post_ids with shape `(counts_by_tag[tag],)`. It will be needed to show the title and link to the thread;\n", 557 | "* `tag_vectors` — a matrix with shape `(counts_by_tag[tag], embeddings_dim)` where the embeddings for each answer are stored.\n", 558 | "\n", 559 | "Implement the code which calculates these structures and dumps them to files. It should take several minutes to run."
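The aggregation hinted at above and the `question_to_vec` helper from *utils.py*, both used in the next cell, might look like this sketch (assuming `posts_df` has `tag` and `post_id` columns as loaded above, and that out-of-vocabulary words are simply skipped):

```python
import numpy as np

# Number of posts per tag, following the groupby/count hint above.
counts_by_tag = posts_df.groupby('tag')['post_id'].count().to_dict()

def question_to_vec(question, embeddings, dim):
    """Averages the word vectors of a question; returns zeros if no word is known (sketch)."""
    word_vectors = [embeddings[word] for word in question.split() if word in embeddings]
    if not word_vectors:
        return np.zeros(dim, dtype=np.float32)
    return np.mean(word_vectors, axis=0)
```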
560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": { 566 | "collapsed": true 567 | }, 568 | "outputs": [], 569 | "source": [ 570 | "import os\n", 571 | "os.makedirs(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], exist_ok=True)\n", 572 | "\n", 573 | "for tag, count in counts_by_tag.items():\n", 574 | " tag_posts = posts_df[posts_df['tag'] == tag]\n", 575 | " \n", 576 | " tag_post_ids = ######### YOUR CODE HERE #############\n", 577 | " \n", 578 | " tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)\n", 579 | " for i, title in enumerate(tag_posts['title']):\n", 580 | " tag_vectors[i, :] = ######### YOUR CODE HERE #############\n", 581 | "\n", 582 | " # Dump post ids and vectors to a file.\n", 583 | " filename = os.path.join(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % tag))\n", 584 | " pickle.dump((tag_post_ids, tag_vectors), open(filename, 'wb'))" 585 | ] 586 | } 587 | ], 588 | "metadata": { 589 | "kernelspec": { 590 | "display_name": "Python 3", 591 | "language": "python", 592 | "name": "python3" 593 | }, 594 | "language_info": { 595 | "codemirror_mode": { 596 | "name": "ipython", 597 | "version": 3 598 | }, 599 | "file_extension": ".py", 600 | "mimetype": "text/x-python", 601 | "name": "python", 602 | "nbconvert_exporter": "python", 603 | "pygments_lexer": "ipython3", 604 | "version": "3.4.3" 605 | }, 606 | "latex_envs": { 607 | "bibliofile": "biblio.bib", 608 | "cite_by": "apalike", 609 | "current_citInitial": 1, 610 | "eqLabelWithNumbers": true, 611 | "eqNumInitial": 0 612 | } 613 | }, 614 | "nbformat": 4, 615 | "nbformat_minor": 2 616 | } 617 | -------------------------------------------------------------------------------- /setup_google_colab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | 5 | 6 | def download_github_code(path): 7 | filename = path.rsplit("/")[-1] 8 | os.system("wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/{} -O {}".format(path, filename)) 9 | 10 | 11 | def setup_common(): 12 | download_github_code("common/requirements_colab.txt") 13 | download_github_code("common/download_utils.py") 14 | download_github_code("common/tqdm_utils.py") 15 | download_github_code("common/__init__.py") 16 | os.system("mkdir common") 17 | os.system("mv download_utils.py tqdm_utils.py __init__.py common/") 18 | os.system("mv requirements_colab.txt common/") 19 | 20 | os.system("pip install -r common/requirements_colab.txt --force-reinstall") 21 | 22 | 23 | def setup_starspace(): 24 | if not os.path.exists("/usr/local/bin/starspace"): 25 | os.system("wget https://dl.bintray.com/boostorg/release/1.63.0/source/boost_1_63_0.zip") 26 | os.system("unzip boost_1_63_0.zip && mv boost_1_63_0 /usr/local/bin") 27 | os.system("git clone https://github.com/facebookresearch/Starspace.git") 28 | os.system("cd Starspace && make && cp -Rf starspace /usr/local/bin") 29 | 30 | 31 | def setup_week1(): 32 | setup_common() 33 | download_github_code("week1/grader.py") 34 | download_github_code("week1/metrics.py") 35 | 36 | 37 | def setup_week2(): 38 | setup_common() 39 | download_github_code("week2/evaluation.py") 40 | 41 | 42 | def setup_week3(): 43 | setup_common() 44 | download_github_code("week3/grader.py") 45 | download_github_code("week3/util.py") 46 | setup_starspace() 47 | 48 | 49 | def setup_week4(): 50 | setup_common() 51 | 52 | 53 | def setup_project(): 54 | setup_common() 55 
| download_github_code("week5/dialogue_manager.py") 56 | download_github_code("week5/utils.py") 57 | setup_starspace() 58 | 59 | 60 | def setup_honor(): 61 | setup_common() 62 | download_github_code("honor/datasets.py") 63 | download_github_code("honor/example.py") 64 | download_github_code("honor/download_cornell.sh") 65 | download_github_code("honor/download_opensubs.sh") 66 | -------------------------------------------------------------------------------- /week1/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | class Grader(object): 7 | def __init__(self): 8 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 9 | self.assignment_key = 'MSsYBMLgEeesWhJPHRLG5g' 10 | self.parts = OrderedDict([('f5nXa', 'TextPrepare'), 11 | ('hTrz8', 'WordsTagsCount'), 12 | ('0kUjR', 'BagOfWords'), 13 | ('tLJV1', 'MultilabelClassification')]) 14 | self.answers = {key: None for key in self.parts} 15 | 16 | @staticmethod 17 | def ravel_output(output): 18 | ''' 19 | If student accidentally submitted np.array with one 20 | element instead of number, this function will submit 21 | this number instead 22 | ''' 23 | if isinstance(output, np.ndarray) and output.size == 1: 24 | output = output.item(0) 25 | return output 26 | 27 | def submit(self, email, token): 28 | submission = { 29 | "assignmentKey": self.assignment_key, 30 | "submitterEmail": email, 31 | "secret": token, 32 | "parts": {} 33 | } 34 | for part, output in self.answers.items(): 35 | if output is not None: 36 | submission["parts"][part] = {"output": output} 37 | else: 38 | submission["parts"][part] = dict() 39 | request = requests.post(self.submission_page, data=json.dumps(submission)) 40 | response = request.json() 41 | if request.status_code == 201: 42 | print('Submitted to Coursera platform. 
See results on assignment page!') 43 | elif u'details' in response and u'learnerMessage' in response[u'details']: 44 | print(response[u'details'][u'learnerMessage']) 45 | else: 46 | print("Unknown response from Coursera: {}".format(request.status_code)) 47 | print(response) 48 | 49 | def status(self): 50 | print("You want to submit these parts:") 51 | for part_id, part_name in self.parts.items(): 52 | answer = self.answers[part_id] 53 | if answer is None: 54 | answer = '-'*10 55 | print("Task {}:\n {}".format(part_name, answer[:100] + '...')) 56 | 57 | def submit_part(self, part, output): 58 | self.answers[part] = output 59 | print("Current answer for task {} is:\n {}".format(self.parts[part], output[:100] + '...')) 60 | 61 | def submit_tag(self, tag, output): 62 | part_id = [k for k, v in self.parts.items() if v == tag] 63 | if len(part_id) != 1: 64 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 65 | part_id = part_id[0] 66 | self.submit_part(part_id, str(self.ravel_output(output))) 67 | -------------------------------------------------------------------------------- /week1/lemmatization_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tokenization" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "[nltk_data] Downloading package wordnet to /Users/anton/nltk_data...\n", 20 | "[nltk_data] Package wordnet is already up-to-date!\n" 21 | ] 22 | }, 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "True" 27 | ] 28 | }, 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "import nltk\n", 36 | "nltk.download('wordnet')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": { 43 | "ExecuteTime": { 44 | "end_time": "2017-11-05T18:16:27.608310Z", 45 | "start_time": "2017-11-05T18:16:26.423528Z" 46 | } 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "text = \"This is Andrew's text, isn't it?\"" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": { 57 | "ExecuteTime": { 58 | "end_time": "2017-11-05T18:16:27.633134Z", 59 | "start_time": "2017-11-05T18:16:27.610910Z" 60 | } 61 | }, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "['This', 'is', \"Andrew's\", 'text,', \"isn't\", 'it?']" 67 | ] 68 | }, 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "tokenizer = nltk.tokenize.WhitespaceTokenizer()\n", 76 | "tokenizer.tokenize(text)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": { 83 | "ExecuteTime": { 84 | "end_time": "2017-11-05T18:16:27.647746Z", 85 | "start_time": "2017-11-05T18:16:27.637909Z" 86 | } 87 | }, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | "['This', 'is', 'Andrew', \"'s\", 'text', ',', 'is', \"n't\", 'it', '?']" 93 | ] 94 | }, 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "tokenizer = nltk.tokenize.TreebankWordTokenizer()\n", 102 | "tokenizer.tokenize(text)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 5, 108 | "metadata": { 109 | "ExecuteTime": { 110 | 
"end_time": "2017-11-05T18:16:27.660827Z", 111 | "start_time": "2017-11-05T18:16:27.651961Z" 112 | } 113 | }, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "['This', 'is', 'Andrew', \"'\", 's', 'text', ',', 'isn', \"'\", 't', 'it', '?']" 119 | ] 120 | }, 121 | "execution_count": 5, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "tokenizer = nltk.tokenize.WordPunctTokenizer()\n", 128 | "tokenizer.tokenize(text)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "# Stemming (further in the video)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 6, 141 | "metadata": { 142 | "ExecuteTime": { 143 | "end_time": "2017-11-05T18:16:27.674332Z", 144 | "start_time": "2017-11-05T18:16:27.666509Z" 145 | } 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "\n", 150 | "text = \"feet wolves cats talked\"\n", 151 | "tokenizer = nltk.tokenize.TreebankWordTokenizer()\n", 152 | "tokens = tokenizer.tokenize(text)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 7, 158 | "metadata": { 159 | "ExecuteTime": { 160 | "end_time": "2017-11-05T18:16:27.693761Z", 161 | "start_time": "2017-11-05T18:16:27.677877Z" 162 | } 163 | }, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/plain": [ 168 | "u'feet wolv cat talk'" 169 | ] 170 | }, 171 | "execution_count": 7, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "stemmer = nltk.stem.PorterStemmer()\n", 178 | "\" \".join(stemmer.stem(token) for token in tokens)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 8, 184 | "metadata": { 185 | "ExecuteTime": { 186 | "end_time": "2017-11-05T18:16:30.840117Z", 187 | "start_time": "2017-11-05T18:16:27.698683Z" 188 | } 189 | }, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/plain": [ 194 | "u'foot wolf cat talked'" 195 | ] 196 | }, 197 | "execution_count": 8, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "stemmer = nltk.stem.WordNetLemmatizer()\n", 204 | "\" \".join(stemmer.lemmatize(token) for token in tokens)" 205 | ] 206 | } 207 | ], 208 | "metadata": { 209 | "kernelspec": { 210 | "display_name": "Python 2", 211 | "language": "python", 212 | "name": "python2" 213 | }, 214 | "language_info": { 215 | "codemirror_mode": { 216 | "name": "ipython", 217 | "version": 2 218 | }, 219 | "file_extension": ".py", 220 | "mimetype": "text/x-python", 221 | "name": "python", 222 | "nbconvert_exporter": "python", 223 | "pygments_lexer": "ipython2", 224 | "version": "2.7.15" 225 | } 226 | }, 227 | "nbformat": 4, 228 | "nbformat_minor": 2 229 | } 230 | -------------------------------------------------------------------------------- /week1/metrics.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.metrics import roc_curve, auc 4 | from scipy import interp 5 | from itertools import cycle 6 | 7 | def roc_auc(y_test, y_score, n_classes): 8 | """Plots ROC curve for micro and macro averaging.""" 9 | 10 | # Compute ROC curve and ROC area for each class 11 | fpr = {} 12 | tpr = {} 13 | roc_auc = {} 14 | for i in range(n_classes): 15 | fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) 16 | roc_auc[i] = auc(fpr[i], tpr[i]) 17 | 18 | # Compute micro-average ROC curve and ROC area 19 | fpr["micro"], 
tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) 20 | roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) 21 | 22 | # Compute macro-average ROC curve and ROC area 23 | all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)])) 24 | mean_tpr = np.zeros_like(all_fpr) 25 | for i in range(n_classes): 26 | mean_tpr += interp(all_fpr, fpr[i], tpr[i]) 27 | mean_tpr /= n_classes 28 | fpr["macro"] = all_fpr 29 | tpr["macro"] = mean_tpr 30 | roc_auc["macro"] = auc(fpr["macro"], tpr["macro"]) 31 | 32 | # Plot all ROC curves 33 | plt.figure() 34 | plt.plot(fpr["micro"], tpr["micro"], 35 | label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]), 36 | color='deeppink', linestyle=':', linewidth=4) 37 | 38 | plt.plot(fpr["macro"], tpr["macro"], 39 | label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]), 40 | color='navy', linestyle=':', linewidth=4) 41 | 42 | colors = cycle(['aqua', 'darkorange', 'cornflowerblue']) 43 | for i, color in zip(range(0,3), colors): 44 | plt.plot(fpr[i], tpr[i], color=color, lw=2, 45 | label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])) 46 | 47 | plt.plot([0, 1], [0, 1], 'k--', lw=2) 48 | plt.xlim([0.0, 1.0]) 49 | plt.ylim([0.0, 1.05]) 50 | plt.xlabel('False Positive Rate') 51 | plt.ylabel('True Positive Rate') 52 | plt.title('Some extension of ROC to multi-class') 53 | plt.legend(loc="lower right") 54 | plt.show() -------------------------------------------------------------------------------- /week1/tfidf_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tf-Idf example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": [ 18 | "
<div>\n", 19 | "[HTML rendering of the 5 × 4 TF-IDF DataFrame (columns 'good movie', 'like', 'movie', 'not'), identical to the text/plain output below]\n", 80 | "</div>
" 81 | ], 82 | "text/plain": [ 83 | " good movie like movie not\n", 84 | "0 0.707107 0.000000 0.707107 0.000000\n", 85 | "1 0.577350 0.000000 0.577350 0.577350\n", 86 | "2 0.000000 0.707107 0.000000 0.707107\n", 87 | "3 0.000000 1.000000 0.000000 0.000000\n", 88 | "4 0.000000 0.000000 0.000000 0.000000" 89 | ] 90 | }, 91 | "execution_count": 1, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 98 | "import pandas as pd\n", 99 | "texts = [\n", 100 | " \"good movie\", \"not a good movie\", \"did not like\", \n", 101 | " \"i like it\", \"good one\"\n", 102 | "]\n", 103 | "# using default tokenizer in TfidfVectorizer\n", 104 | "tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))\n", 105 | "features = tfidf.fit_transform(texts)\n", 106 | "pd.DataFrame(\n", 107 | " features.todense(),\n", 108 | " columns=tfidf.get_feature_names()\n", 109 | ")" 110 | ] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 2", 116 | "language": "python", 117 | "name": "python2" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 2 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython2", 129 | "version": "2.7.14" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 2 134 | } 135 | -------------------------------------------------------------------------------- /week1/week1-MultilabelClassification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Predict tags on StackOverflow with linear models" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this assignment you will learn how to predict tags for posts from [StackOverflow](https://stackoverflow.com). To solve this task you will use multilabel classification approach.\n", 15 | "\n", 16 | "### Libraries\n", 17 | "\n", 18 | "In this task you will need the following libraries:\n", 19 | "- [Numpy](http://www.numpy.org) — a package for scientific computing.\n", 20 | "- [Pandas](https://pandas.pydata.org) — a library providing high-performance, easy-to-use data structures and data analysis tools for the Python\n", 21 | "- [scikit-learn](http://scikit-learn.org/stable/index.html) — a tool for data mining and data analysis.\n", 22 | "- [NLTK](http://www.nltk.org) — a platform to work with natural language." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Data\n", 30 | "\n", 31 | "The following cell will download all data required for this assignment into the folder `week1/data`." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "try:\n", 41 | " import google.colab\n", 42 | " IN_COLAB = True\n", 43 | "except:\n", 44 | " IN_COLAB = False\n", 45 | "\n", 46 | "if IN_COLAB:\n", 47 | " ! 
wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py -O setup_google_colab.py\n", 48 | " import setup_google_colab\n", 49 | " setup_google_colab.setup_week1() \n", 50 | " \n", 51 | "import sys\n", 52 | "sys.path.append(\"..\")\n", 53 | "from common.download_utils import download_week1_resources\n", 54 | "\n", 55 | "download_week1_resources()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Grading\n", 63 | "We will create a grader instance below and use it to collect your answers. Note that these outputs will be stored locally inside grader and will be uploaded to platform only after running submitting function in the last part of this assignment. If you want to make partial submission, you can run that cell any time you want." 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "from grader import Grader" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "grader = Grader()" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Text preprocessing" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "For this and most of the following assignments you will need to use a list of stop words. It can be downloaded from *nltk*:" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "import nltk\n", 111 | "nltk.download('stopwords')\n", 112 | "from nltk.corpus import stopwords" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "In this task you will deal with a dataset of post titles from StackOverflow. You are provided a split to 3 sets: *train*, *validation* and *test*. All corpora (except for *test*) contain titles of the posts and corresponding tags (100 tags are available). The *test* set is provided for Coursera's grading and doesn't contain answers. 
Upload the corpora using *pandas* and look at the data:" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "from ast import literal_eval\n", 131 | "import pandas as pd\n", 132 | "import numpy as np" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "def read_data(filename):\n", 144 | " data = pd.read_csv(filename, sep='\\t')\n", 145 | " data['tags'] = data['tags'].apply(literal_eval)\n", 146 | " return data" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "collapsed": true 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "train = read_data('data/train.tsv')\n", 158 | "validation = read_data('data/validation.tsv')\n", 159 | "test = pd.read_csv('data/test.tsv', sep='\\t')" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "train.head()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "As you can see, *title* column contains titles of the posts and *tags* column contains the tags. It could be noticed that a number of tags for a post is not fixed and could be as many as necessary." 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "For a more comfortable usage, initialize *X_train*, *X_val*, *X_test*, *y_train*, *y_val*." 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "X_train, y_train = train['title'].values, train['tags'].values\n", 196 | "X_val, y_val = validation['title'].values, validation['tags'].values\n", 197 | "X_test = test['title'].values" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "One of the most known difficulties when working with natural data is that it's unstructured. For example, if you use it \"as is\" and extract tokens just by splitting the titles by whitespaces, you will see that there are many \"weird\" tokens like *3.5?*, *\"Flip*, etc. To prevent the problems, it's usually useful to prepare the data somehow. In this task you'll write a function, which will be also used in the other assignments. \n", 205 | "\n", 206 | "**Task 1 (TextPrepare).** Implement the function *text_prepare* following the instructions. After that, run the function *test_text_prepare* to test it on tiny cases and submit it to Coursera." 
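One possible `text_prepare`, consistent with the basic tests below, is a self-contained sketch that mirrors the constants defined in the cells that follow (lowercase, replace bracket and punctuation symbols with spaces, drop other bad symbols, drop stop words):

```python
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords', quiet=True)

REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """Lowercases, normalizes symbols and removes stop words (sketch)."""
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
```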
207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "collapsed": true 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "import re" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "collapsed": true 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "REPLACE_BY_SPACE_RE = re.compile('[/(){}\\[\\]\\|@,;]')\n", 229 | "BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')\n", 230 | "STOPWORDS = set(stopwords.words('english'))\n", 231 | "\n", 232 | "def text_prepare(text):\n", 233 | " \"\"\"\n", 234 | " text: a string\n", 235 | " \n", 236 | " return: modified initial string\n", 237 | " \"\"\"\n", 238 | " text = # lowercase text\n", 239 | " text = # replace REPLACE_BY_SPACE_RE symbols by space in text\n", 240 | " text = # delete symbols which are in BAD_SYMBOLS_RE from text\n", 241 | " text = # delete stopwords from text\n", 242 | " return text" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "def test_text_prepare():\n", 254 | " examples = [\"SQL Server - any equivalent of Excel's CHOOSE function?\",\n", 255 | " \"How to free c++ memory vector * arr?\"]\n", 256 | " answers = [\"sql server equivalent excels choose function\", \n", 257 | " \"free c++ memory vectorint arr\"]\n", 258 | " for ex, ans in zip(examples, answers):\n", 259 | " if text_prepare(ex) != ans:\n", 260 | " return \"Wrong answer for the case: '%s'\" % ex\n", 261 | " return 'Basic tests are passed.'" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": { 268 | "collapsed": true 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "print(test_text_prepare())" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "Run your implementation for questions from file *text_prepare_tests.tsv* to earn the points." 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "collapsed": true 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "prepared_questions = []\n", 291 | "for line in open('data/text_prepare_tests.tsv', encoding='utf-8'):\n", 292 | " line = text_prepare(line.strip())\n", 293 | " prepared_questions.append(line)\n", 294 | "text_prepare_results = '\\n'.join(prepared_questions)\n", 295 | "\n", 296 | "grader.submit_tag('TextPrepare', text_prepare_results)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "Now we can preprocess the titles using function *text_prepare* and making sure that the headers don't have bad symbols:" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "collapsed": true 311 | }, 312 | "outputs": [], 313 | "source": [ 314 | "X_train = [text_prepare(x) for x in X_train]\n", 315 | "X_val = [text_prepare(x) for x in X_val]\n", 316 | "X_test = [text_prepare(x) for x in X_test]" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "X_train[:3]" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "For each tag and for each word calculate how many times they occur in the train corpus. 
\n", 335 | "\n", 336 | "**Task 2 (WordsTagsCount).** Find 3 most popular tags and 3 most popular words in the train data and submit the results to earn the points." 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": { 343 | "collapsed": true 344 | }, 345 | "outputs": [], 346 | "source": [ 347 | "# Dictionary of all tags from train corpus with their counts.\n", 348 | "tags_counts = {}\n", 349 | "# Dictionary of all words from train corpus with their counts.\n", 350 | "words_counts = {}\n", 351 | "\n", 352 | "######################################\n", 353 | "######### YOUR CODE HERE #############\n", 354 | "######################################" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "We are assuming that *tags_counts* and *words_counts* are dictionaries like `{'some_word_or_tag': frequency}`. After applying the sorting procedure, results will be look like this: `[('most_popular_word_or_tag', frequency), ('less_popular_word_or_tag', frequency), ...]`. The grader gets the results in the following format (two comma-separated strings with line break):\n", 362 | "\n", 363 | " tag1,tag2,tag3\n", 364 | " word1,word2,word3\n", 365 | "\n", 366 | "Pay attention that in this assignment you should not submit frequencies or some additional information." 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "collapsed": true 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "most_common_tags = sorted(tags_counts.items(), key=lambda x: x[1], reverse=True)[:3]\n", 378 | "most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:3]\n", 379 | "\n", 380 | "grader.submit_tag('WordsTagsCount', '%s\\n%s' % (','.join(tag for tag, _ in most_common_tags), \n", 381 | " ','.join(word for word, _ in most_common_words)))" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "### Transforming text to a vector\n", 389 | "\n", 390 | "Machine Learning algorithms work with numeric data and we cannot use the provided text data \"as is\". There are many ways to transform text data to numeric vectors. In this task you will try to use two of them.\n", 391 | "\n", 392 | "#### Bag of words\n", 393 | "\n", 394 | "One of the well-known approaches is a *bag-of-words* representation. To create this transformation, follow the steps:\n", 395 | "1. Find *N* most popular words in train corpus and numerate them. Now we have a dictionary of the most popular words.\n", 396 | "2. For each title in the corpora create a zero vector with the dimension equals to *N*.\n", 397 | "3. For each text in the corpora iterate over words which are in the dictionary and increase by 1 the corresponding coordinate.\n", 398 | "\n", 399 | "Let's try to do it for a toy example. 
Imagine that we have *N* = 4 and the list of the most popular words is \n", 400 | "\n", 401 | " ['hi', 'you', 'me', 'are']\n", 402 | "\n", 403 | "Then we need to numerate them, for example, like this: \n", 404 | "\n", 405 | " {'hi': 0, 'you': 1, 'me': 2, 'are': 3}\n", 406 | "\n", 407 | "And we have the text, which we want to transform to the vector:\n", 408 | "\n", 409 | " 'hi how are you'\n", 410 | "\n", 411 | "For this text we create a corresponding zero vector \n", 412 | "\n", 413 | " [0, 0, 0, 0]\n", 414 | " \n", 415 | "And iterate over all words, and if the word is in the dictionary, we increase the value of the corresponding position in the vector:\n", 416 | "\n", 417 | " 'hi': [1, 0, 0, 0]\n", 418 | " 'how': [1, 0, 0, 0] # word 'how' is not in our dictionary\n", 419 | " 'are': [1, 0, 0, 1]\n", 420 | " 'you': [1, 1, 0, 1]\n", 421 | "\n", 422 | "The resulting vector will be \n", 423 | "\n", 424 | " [1, 1, 0, 1]\n", 425 | " \n", 426 | "Implement the described encoding in the function *my_bag_of_words* with the size of the dictionary equals to 5000. To find the most common words use train data. You can test your code using the function *test_my_bag_of_words*." 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": { 433 | "collapsed": true 434 | }, 435 | "outputs": [], 436 | "source": [ 437 | "DICT_SIZE = 5000\n", 438 | "WORDS_TO_INDEX = ####### YOUR CODE HERE #######\n", 439 | "INDEX_TO_WORDS = ####### YOUR CODE HERE #######\n", 440 | "ALL_WORDS = WORDS_TO_INDEX.keys()\n", 441 | "\n", 442 | "def my_bag_of_words(text, words_to_index, dict_size):\n", 443 | " \"\"\"\n", 444 | " text: a string\n", 445 | " dict_size: size of the dictionary\n", 446 | " \n", 447 | " return a vector which is a bag-of-words representation of 'text'\n", 448 | " \"\"\"\n", 449 | " result_vector = np.zeros(dict_size)\n", 450 | " ######################################\n", 451 | " ######### YOUR CODE HERE #############\n", 452 | " ######################################\n", 453 | " return result_vector" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": { 460 | "collapsed": true 461 | }, 462 | "outputs": [], 463 | "source": [ 464 | "def test_my_bag_of_words():\n", 465 | " words_to_index = {'hi': 0, 'you': 1, 'me': 2, 'are': 3}\n", 466 | " examples = ['hi how are you']\n", 467 | " answers = [[1, 1, 0, 1]]\n", 468 | " for ex, ans in zip(examples, answers):\n", 469 | " if (my_bag_of_words(ex, words_to_index, 4) != ans).any():\n", 470 | " return \"Wrong answer for the case: '%s'\" % ex\n", 471 | " return 'Basic tests are passed.'" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": { 478 | "collapsed": true 479 | }, 480 | "outputs": [], 481 | "source": [ 482 | "print(test_my_bag_of_words())" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "Now apply the implemented function to all samples (this might take up to a minute):" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": { 496 | "collapsed": true 497 | }, 498 | "outputs": [], 499 | "source": [ 500 | "from scipy import sparse as sp_sparse" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": { 507 | "collapsed": true 508 | }, 509 | "outputs": [], 510 | "source": [ 511 | "X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for 
text in X_train])\n", 512 | "X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])\n", 513 | "X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])\n", 514 | "print('X_train shape ', X_train_mybag.shape)\n", 515 | "print('X_val shape ', X_val_mybag.shape)\n", 516 | "print('X_test shape ', X_test_mybag.shape)" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "As you might notice, we transform the data to a sparse representation to store the useful information efficiently. There are many [types](https://docs.scipy.org/doc/scipy/reference/sparse.html) of such representations; however, sklearn algorithms can work only with the [csr](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix) matrix, so we will use this one." 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "**Task 3 (BagOfWords).** For the 11th row in *X_train_mybag* find how many non-zero elements it has. In this task the answer (variable *non_zero_elements_count*) should be an integer number, e.g. 20." 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": { 537 | "collapsed": true 538 | }, 539 | "outputs": [], 540 | "source": [ 541 | "row = X_train_mybag[10].toarray()[0]\n", 542 | "non_zero_elements_count = ####### YOUR CODE HERE #######\n", 543 | "\n", 544 | "grader.submit_tag('BagOfWords', str(non_zero_elements_count))" 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": {}, 550 | "source": [ 551 | "#### TF-IDF\n", 552 | "\n", 553 | "The second approach extends the bag-of-words framework by taking into account the total frequencies of words in the corpora. It helps to penalize too frequent words and provides a better feature space. \n", 554 | "\n", 555 | "Implement the function *tfidf_features* using the class [TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) from *scikit-learn*. Use the *train* corpus to train the vectorizer. Don't forget to take a look at the arguments that you can pass to it. We suggest that you filter out too rare words (occurring in fewer than 5 titles) and too frequent words (occurring in more than 90% of the titles). Also, use bigrams along with unigrams in your vocabulary. 
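A sketch of `tfidf_features` with the parameter choices recommended above; the `token_pattern` argument anticipates the adjustment discussed a few cells below, so that tokens such as c++ and c# are kept:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_val, X_test):
    """Returns TF-IDF features for train/val/test and the fitted vocabulary (sketch)."""
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9,      # rare/frequent word filters
                                       ngram_range=(1, 2),        # unigrams and bigrams
                                       token_pattern=r'(\S+)')    # keep tokens like 'c++', 'c#'
    X_train = tfidf_vectorizer.fit_transform(X_train)  # fit on the train corpus only
    X_val = tfidf_vectorizer.transform(X_val)
    X_test = tfidf_vectorizer.transform(X_test)
    return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_
```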
" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": { 562 | "collapsed": true 563 | }, 564 | "outputs": [], 565 | "source": [ 566 | "from sklearn.feature_extraction.text import TfidfVectorizer" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "metadata": { 573 | "collapsed": true 574 | }, 575 | "outputs": [], 576 | "source": [ 577 | "def tfidf_features(X_train, X_val, X_test):\n", 578 | " \"\"\"\n", 579 | " X_train, X_val, X_test — samples \n", 580 | " return TF-IDF vectorized representation of each sample and vocabulary\n", 581 | " \"\"\"\n", 582 | " # Create TF-IDF vectorizer with a proper parameters choice\n", 583 | " # Fit the vectorizer on the train set\n", 584 | " # Transform the train, test, and val sets and return the result\n", 585 | " \n", 586 | " \n", 587 | " tfidf_vectorizer = ####### YOUR CODE HERE #######\n", 588 | " \n", 589 | " ######################################\n", 590 | " ######### YOUR CODE HERE #############\n", 591 | " ######################################\n", 592 | " \n", 593 | " return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_" 594 | ] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": {}, 599 | "source": [ 600 | "Once you have done text preprocessing, always have a look at the results. Be very careful at this step, because the performance of future models will drastically depend on it. \n", 601 | "\n", 602 | "In this case, check whether you have c++ or c# in your vocabulary, as they are obviously important tokens in our tags prediction task:" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": { 609 | "collapsed": true 610 | }, 611 | "outputs": [], 612 | "source": [ 613 | "X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)\n", 614 | "tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": null, 620 | "metadata": { 621 | "collapsed": true 622 | }, 623 | "outputs": [], 624 | "source": [ 625 | "######### YOUR CODE HERE #############" 626 | ] 627 | }, 628 | { 629 | "cell_type": "markdown", 630 | "metadata": {}, 631 | "source": [ 632 | "If you can't find it, we need to understand how did it happen that we lost them? It happened during the built-in tokenization of TfidfVectorizer. Luckily, we can influence on this process. Get back to the function above and use '(\\S+)' regexp as a *token_pattern* in the constructor of the vectorizer. " 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": {}, 638 | "source": [ 639 | "Now, use this transormation for the data and check again." 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": { 646 | "collapsed": true 647 | }, 648 | "outputs": [], 649 | "source": [ 650 | "######### YOUR CODE HERE #############" 651 | ] 652 | }, 653 | { 654 | "cell_type": "markdown", 655 | "metadata": {}, 656 | "source": [ 657 | "### MultiLabel classifier\n", 658 | "\n", 659 | "As we have noticed before, in this task each example can have multiple tags. To deal with such kind of prediction, we need to transform labels in a binary form and the prediction will be a mask of 0s and 1s. For this purpose it is convenient to use [MultiLabelBinarizer](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html) from *sklearn*." 
660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "metadata": { 666 | "collapsed": true 667 | }, 668 | "outputs": [], 669 | "source": [ 670 | "from sklearn.preprocessing import MultiLabelBinarizer" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": null, 676 | "metadata": { 677 | "collapsed": true 678 | }, 679 | "outputs": [], 680 | "source": [ 681 | "mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))\n", 682 | "y_train = mlb.fit_transform(y_train)\n", 683 | "y_val = mlb.fit_transform(y_val)" 684 | ] 685 | }, 686 | { 687 | "cell_type": "markdown", 688 | "metadata": {}, 689 | "source": [ 690 | "Implement the function *train_classifier* for training a classifier. In this task we suggest to use One-vs-Rest approach, which is implemented in [OneVsRestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) class. In this approach *k* classifiers (= number of tags) are trained. As a basic classifier, use [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html). It is one of the simplest methods, but often it performs good enough in text classification tasks. It might take some time, because a number of classifiers to train is large." 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "metadata": { 697 | "collapsed": true 698 | }, 699 | "outputs": [], 700 | "source": [ 701 | "from sklearn.multiclass import OneVsRestClassifier\n", 702 | "from sklearn.linear_model import LogisticRegression, RidgeClassifier" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "metadata": { 709 | "collapsed": true 710 | }, 711 | "outputs": [], 712 | "source": [ 713 | "def train_classifier(X_train, y_train):\n", 714 | " \"\"\"\n", 715 | " X_train, y_train — training data\n", 716 | " \n", 717 | " return: trained classifier\n", 718 | " \"\"\"\n", 719 | " \n", 720 | " # Create and fit LogisticRegression wraped into OneVsRestClassifier.\n", 721 | "\n", 722 | " ######################################\n", 723 | " ######### YOUR CODE HERE #############\n", 724 | " ###################################### " 725 | ] 726 | }, 727 | { 728 | "cell_type": "markdown", 729 | "metadata": {}, 730 | "source": [ 731 | "Train the classifiers for different data transformations: *bag-of-words* and *tf-idf*.\n", 732 | "\n", 733 | "If you receive a convergence warning, please set parameter *max_iter* in LogisticRegression to a larger value (the default is 100)." 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": null, 739 | "metadata": { 740 | "collapsed": true 741 | }, 742 | "outputs": [], 743 | "source": [ 744 | "classifier_mybag = train_classifier(X_train_mybag, y_train)\n", 745 | "classifier_tfidf = train_classifier(X_train_tfidf, y_train)" 746 | ] 747 | }, 748 | { 749 | "cell_type": "markdown", 750 | "metadata": {}, 751 | "source": [ 752 | "Now you can create predictions for the data. You will need two types of predictions: labels and scores." 
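A minimal sketch of `train_classifier` as described above (One-vs-Rest over LogisticRegression); the regularization arguments are exposed so the experiments suggested later can reuse it, and `max_iter` is raised in line with the convergence note above:

```python
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

def train_classifier(X_train, y_train, penalty='l2', C=1.0):
    """Fits a One-vs-Rest LogisticRegression on multilabel targets (sketch)."""
    clf = OneVsRestClassifier(
        LogisticRegression(penalty=penalty, C=C, max_iter=1000, random_state=0))
    clf.fit(X_train, y_train)
    return clf
```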
753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": null, 758 | "metadata": { 759 | "collapsed": true 760 | }, 761 | "outputs": [], 762 | "source": [ 763 | "y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)\n", 764 | "y_val_predicted_scores_mybag = classifier_mybag.decision_function(X_val_mybag)\n", 765 | "\n", 766 | "y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)\n", 767 | "y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)" 768 | ] 769 | }, 770 | { 771 | "cell_type": "markdown", 772 | "metadata": {}, 773 | "source": [ 774 | "Now take a look at how classifier, which uses TF-IDF, works for a few examples:" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": null, 780 | "metadata": { 781 | "collapsed": true 782 | }, 783 | "outputs": [], 784 | "source": [ 785 | "y_val_pred_inversed = mlb.inverse_transform(y_val_predicted_labels_tfidf)\n", 786 | "y_val_inversed = mlb.inverse_transform(y_val)\n", 787 | "for i in range(3):\n", 788 | " print('Title:\\t{}\\nTrue labels:\\t{}\\nPredicted labels:\\t{}\\n\\n'.format(\n", 789 | " X_val[i],\n", 790 | " ','.join(y_val_inversed[i]),\n", 791 | " ','.join(y_val_pred_inversed[i])\n", 792 | " ))" 793 | ] 794 | }, 795 | { 796 | "cell_type": "markdown", 797 | "metadata": {}, 798 | "source": [ 799 | "Now, we would need to compare the results of different predictions, e.g. to see whether TF-IDF transformation helps or to try different regularization techniques in logistic regression. For all these experiments, we need to setup evaluation procedure. " 800 | ] 801 | }, 802 | { 803 | "cell_type": "markdown", 804 | "metadata": {}, 805 | "source": [ 806 | "### Evaluation\n", 807 | "\n", 808 | "To evaluate the results we will use several classification metrics:\n", 809 | " - [Accuracy](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)\n", 810 | " - [F1-score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)\n", 811 | " - [Area under ROC-curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html)\n", 812 | " - [Area under precision-recall curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html#sklearn.metrics.average_precision_score) \n", 813 | " \n", 814 | "Make sure you are familiar with all of them. How would you expect the things work for the multi-label scenario? Read about micro/macro/weighted averaging following the sklearn links provided above." 
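To make the averaging options concrete, here is a small sketch of the `average=` argument in *sklearn*; the same argument works for the other metrics listed above. Which metrics you report in *print_evaluation_scores* is up to you, this only illustrates the API.

```python
from sklearn.metrics import accuracy_score, f1_score

def print_scores_sketch(y_val, predicted):
    # Subset accuracy for the multi-label case.
    print('Accuracy:', accuracy_score(y_val, predicted))
    for average in ['macro', 'micro', 'weighted']:
        print('F1 ({}): {:.4f}'.format(
            average, f1_score(y_val, predicted, average=average)))
```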
815 | ] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": null, 820 | "metadata": { 821 | "collapsed": true 822 | }, 823 | "outputs": [], 824 | "source": [ 825 | "from sklearn.metrics import accuracy_score\n", 826 | "from sklearn.metrics import f1_score\n", 827 | "from sklearn.metrics import roc_auc_score \n", 828 | "from sklearn.metrics import average_precision_score\n", 829 | "from sklearn.metrics import recall_score" 830 | ] 831 | }, 832 | { 833 | "cell_type": "markdown", 834 | "metadata": {}, 835 | "source": [ 836 | "Implement the function *print_evaluation_scores* which calculates and prints to stdout:\n", 837 | " - *accuracy*\n", 838 | " - *F1-score macro/micro/weighted*\n", 839 | " - *Precision macro/micro/weighted*" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": null, 845 | "metadata": { 846 | "collapsed": true 847 | }, 848 | "outputs": [], 849 | "source": [ 850 | "def print_evaluation_scores(y_val, predicted):\n", 851 | " \n", 852 | " ######################################\n", 853 | " ######### YOUR CODE HERE #############\n", 854 | " ######################################" 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "metadata": { 861 | "collapsed": true 862 | }, 863 | "outputs": [], 864 | "source": [ 865 | "print('Bag-of-words')\n", 866 | "print_evaluation_scores(y_val, y_val_predicted_labels_mybag)\n", 867 | "print('Tfidf')\n", 868 | "print_evaluation_scores(y_val, y_val_predicted_labels_tfidf)" 869 | ] 870 | }, 871 | { 872 | "cell_type": "markdown", 873 | "metadata": {}, 874 | "source": [ 875 | "You might also want to plot some generalization of the [ROC curve](http://scikit-learn.org/stable/modules/model_evaluation.html#receiver-operating-characteristic-roc) for the case of multi-label classification. Provided function *roc_auc* can make it for you. The input parameters of this function are:\n", 876 | " - true labels\n", 877 | " - decision functions scores\n", 878 | " - number of classes" 879 | ] 880 | }, 881 | { 882 | "cell_type": "code", 883 | "execution_count": null, 884 | "metadata": { 885 | "collapsed": true 886 | }, 887 | "outputs": [], 888 | "source": [ 889 | "from metrics import roc_auc\n", 890 | "%matplotlib inline" 891 | ] 892 | }, 893 | { 894 | "cell_type": "code", 895 | "execution_count": null, 896 | "metadata": { 897 | "collapsed": true 898 | }, 899 | "outputs": [], 900 | "source": [ 901 | "n_classes = len(tags_counts)\n", 902 | "roc_auc(y_val, y_val_predicted_scores_mybag, n_classes)" 903 | ] 904 | }, 905 | { 906 | "cell_type": "code", 907 | "execution_count": null, 908 | "metadata": { 909 | "collapsed": true 910 | }, 911 | "outputs": [], 912 | "source": [ 913 | "n_classes = len(tags_counts)\n", 914 | "roc_auc(y_val, y_val_predicted_scores_tfidf, n_classes)" 915 | ] 916 | }, 917 | { 918 | "cell_type": "markdown", 919 | "metadata": {}, 920 | "source": [ 921 | "**Task 4 (MultilabelClassification).** Once we have the evaluation set up, we suggest that you experiment a bit with training your classifiers. We will use *F1-score weighted* as an evaluation metric. Our recommendation:\n", 922 | "- compare the quality of the bag-of-words and TF-IDF approaches and chose one of them.\n", 923 | "- for the chosen one, try *L1* and *L2*-regularization techniques in Logistic Regression with different coefficients (e.g. C equal to 0.1, 1, 10, 100).\n", 924 | "\n", 925 | "You also could try other improvements of the preprocessing / model, if you want. 
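One possible experiment loop for the regularization comparison is sketched below. It assumes the notebook variables `X_train_tfidf`, `X_val_tfidf`, `y_train`, and `y_val` from the previous cells; `liblinear` is chosen only because recent *sklearn* defaults to a solver that handles L2 but not L1.

```python
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

for penalty in ['l1', 'l2']:
    for C in [0.1, 1, 10, 100]:
        clf = OneVsRestClassifier(
            LogisticRegression(C=C, penalty=penalty, solver='liblinear'))
        clf.fit(X_train_tfidf, y_train)
        y_val_pred = clf.predict(X_val_tfidf)
        print('penalty={}, C={}: weighted F1 = {:.4f}'.format(
            penalty, C, f1_score(y_val, y_val_pred, average='weighted')))
```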
" 926 | ] 927 | }, 928 | { 929 | "cell_type": "code", 930 | "execution_count": null, 931 | "metadata": { 932 | "collapsed": true 933 | }, 934 | "outputs": [], 935 | "source": [ 936 | "######################################\n", 937 | "######### YOUR CODE HERE #############\n", 938 | "######################################" 939 | ] 940 | }, 941 | { 942 | "cell_type": "markdown", 943 | "metadata": {}, 944 | "source": [ 945 | "When you are happy with the quality, create predictions for *test* set, which you will submit to Coursera." 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": null, 951 | "metadata": { 952 | "collapsed": true 953 | }, 954 | "outputs": [], 955 | "source": [ 956 | "test_predictions = ######### YOUR CODE HERE #############\n", 957 | "test_pred_inversed = mlb.inverse_transform(test_predictions)\n", 958 | "\n", 959 | "test_predictions_for_submission = '\\n'.join('%i\\t%s' % (i, ','.join(row)) for i, row in enumerate(test_pred_inversed))\n", 960 | "grader.submit_tag('MultilabelClassification', test_predictions_for_submission)" 961 | ] 962 | }, 963 | { 964 | "cell_type": "markdown", 965 | "metadata": {}, 966 | "source": [ 967 | "### Analysis of the most important features" 968 | ] 969 | }, 970 | { 971 | "cell_type": "markdown", 972 | "metadata": {}, 973 | "source": [ 974 | "Finally, it is usually a good idea to look at the features (words or n-grams) that are used with the largest weigths in your logistic regression model." 975 | ] 976 | }, 977 | { 978 | "cell_type": "markdown", 979 | "metadata": {}, 980 | "source": [ 981 | "Implement the function *print_words_for_tag* to find them. Get back to sklearn documentation on [OneVsRestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) and [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) if needed." 982 | ] 983 | }, 984 | { 985 | "cell_type": "code", 986 | "execution_count": null, 987 | "metadata": { 988 | "collapsed": true 989 | }, 990 | "outputs": [], 991 | "source": [ 992 | "def print_words_for_tag(classifier, tag, tags_classes, index_to_words, all_words):\n", 993 | " \"\"\"\n", 994 | " classifier: trained classifier\n", 995 | " tag: particular tag\n", 996 | " tags_classes: a list of classes names from MultiLabelBinarizer\n", 997 | " index_to_words: index_to_words transformation\n", 998 | " all_words: all words in the dictionary\n", 999 | " \n", 1000 | " return nothing, just print top 5 positive and top 5 negative words for current tag\n", 1001 | " \"\"\"\n", 1002 | " print('Tag:\\t{}'.format(tag))\n", 1003 | " \n", 1004 | " # Extract an estimator from the classifier for the given tag.\n", 1005 | " # Extract feature coefficients from the estimator. 
\n", 1006 | " \n", 1007 | " ######################################\n", 1008 | " ######### YOUR CODE HERE #############\n", 1009 | " ######################################\n", 1010 | " \n", 1011 | " top_positive_words = # top-5 words sorted by the coefficiens.\n", 1012 | " top_negative_words = # bottom-5 words sorted by the coefficients.\n", 1013 | " print('Top positive words:\\t{}'.format(', '.join(top_positive_words)))\n", 1014 | " print('Top negative words:\\t{}\\n'.format(', '.join(top_negative_words)))" 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "code", 1019 | "execution_count": null, 1020 | "metadata": { 1021 | "collapsed": true 1022 | }, 1023 | "outputs": [], 1024 | "source": [ 1025 | "print_words_for_tag(classifier_tfidf, 'c', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)\n", 1026 | "print_words_for_tag(classifier_tfidf, 'c++', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)\n", 1027 | "print_words_for_tag(classifier_tfidf, 'linux', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "markdown", 1032 | "metadata": {}, 1033 | "source": [ 1034 | "### Authorization & Submission\n", 1035 | "To submit the assignmnent to Cousera platform, please, enter your e-mail and token into variables below. You can generate the token on this programming assignment page. Note: The token expires 30 minutes after generation.\n", 1036 | "\n", 1037 | "Please, submit your solutions for the assignments **only** to Coursera platform, **do not create a Pull request on Github**." 1038 | ] 1039 | }, 1040 | { 1041 | "cell_type": "code", 1042 | "execution_count": null, 1043 | "metadata": { 1044 | "collapsed": true 1045 | }, 1046 | "outputs": [], 1047 | "source": [ 1048 | "grader.status()" 1049 | ] 1050 | }, 1051 | { 1052 | "cell_type": "code", 1053 | "execution_count": null, 1054 | "metadata": { 1055 | "collapsed": true 1056 | }, 1057 | "outputs": [], 1058 | "source": [ 1059 | "STUDENT_EMAIL = # EMAIL \n", 1060 | "STUDENT_TOKEN = # TOKEN \n", 1061 | "grader.status()" 1062 | ] 1063 | }, 1064 | { 1065 | "cell_type": "markdown", 1066 | "metadata": {}, 1067 | "source": [ 1068 | "If you want to submit these answers, run cell below" 1069 | ] 1070 | }, 1071 | { 1072 | "cell_type": "code", 1073 | "execution_count": null, 1074 | "metadata": { 1075 | "collapsed": true 1076 | }, 1077 | "outputs": [], 1078 | "source": [ 1079 | "grader.submit(STUDENT_EMAIL, STUDENT_TOKEN)" 1080 | ] 1081 | } 1082 | ], 1083 | "metadata": { 1084 | "kernelspec": { 1085 | "display_name": "Python 3", 1086 | "language": "python", 1087 | "name": "python3" 1088 | }, 1089 | "language_info": { 1090 | "codemirror_mode": { 1091 | "name": "ipython", 1092 | "version": 3 1093 | }, 1094 | "file_extension": ".py", 1095 | "mimetype": "text/x-python", 1096 | "name": "python", 1097 | "nbconvert_exporter": "python", 1098 | "pygments_lexer": "ipython3", 1099 | "version": "3.4.3" 1100 | } 1101 | }, 1102 | "nbformat": 4, 1103 | "nbformat_minor": 2 1104 | } 1105 | -------------------------------------------------------------------------------- /week2/evaluation.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | def _update_chunk(candidate, prev, current_tag, current_chunk, current_pos, prediction=False): 4 | if candidate == 'B-' + current_tag: 5 | if len(current_chunk) > 0 and len(current_chunk[-1]) == 1: 6 | current_chunk[-1].append(current_pos - 1) 7 | current_chunk.append([current_pos]) 8 | elif candidate == 'I-' + current_tag: 9 | if 
prediction and (current_pos == 0 or current_pos > 0 and prev.split('-', 1)[-1] != current_tag): 10 | current_chunk.append([current_pos]) 11 | if not prediction and (current_pos == 0 or current_pos > 0 and prev == 'O'): 12 | current_chunk.append([current_pos]) 13 | elif current_pos > 0 and prev.split('-', 1)[-1] == current_tag: 14 | if len(current_chunk) > 0: 15 | current_chunk[-1].append(current_pos - 1) 16 | 17 | def _update_last_chunk(current_chunk, current_pos): 18 | if len(current_chunk) > 0 and len(current_chunk[-1]) == 1: 19 | current_chunk[-1].append(current_pos - 1) 20 | 21 | def _tag_precision_recall_f1(tp, fp, fn): 22 | precision, recall, f1 = 0, 0, 0 23 | if tp + fp > 0: 24 | precision = tp / (tp + fp) * 100 25 | if tp + fn > 0: 26 | recall = tp / (tp + fn) * 100 27 | if precision + recall > 0: 28 | f1 = 2 * precision * recall / (precision + recall) 29 | return precision, recall, f1 30 | 31 | def _aggregate_metrics(results, total_correct): 32 | total_true_entities = 0 33 | total_predicted_entities = 0 34 | total_precision = 0 35 | total_recall = 0 36 | total_f1 = 0 37 | for tag, tag_metrics in results.items(): 38 | n_pred = tag_metrics['n_predicted_entities'] 39 | n_true = tag_metrics['n_true_entities'] 40 | total_true_entities += n_true 41 | total_predicted_entities += n_pred 42 | total_precision += tag_metrics['precision'] * n_pred 43 | total_recall += tag_metrics['recall'] * n_true 44 | 45 | accuracy = 0 46 | if total_true_entities > 0: 47 | accuracy = total_correct / total_true_entities * 100 48 | else: 49 | print('CAUTION! Accuracy equals zero because there are no '\ 50 | 'correct entities. Check the correctness of your data.') 51 | if total_predicted_entities > 0: 52 | total_precision = total_precision / total_predicted_entities 53 | total_recall = total_recall / total_true_entities 54 | if total_precision + total_recall > 0: 55 | total_f1 = 2 * total_precision * total_recall / (total_precision + total_recall) 56 | return total_true_entities, total_predicted_entities, \ 57 | total_precision, total_recall, total_f1, accuracy 58 | 59 | def _print_info(n_tokens, total_true_entities, total_predicted_entities, total_correct): 60 | print('processed {len} tokens ' \ 61 | 'with {tot_true} phrases; ' \ 62 | 'found: {tot_pred} phrases; ' \ 63 | 'correct: {tot_cor}.\n'.format(len=n_tokens, 64 | tot_true=total_true_entities, 65 | tot_pred=total_predicted_entities, 66 | tot_cor=total_correct)) 67 | 68 | def _print_metrics(accuracy, total_precision, total_recall, total_f1): 69 | print('precision: {tot_prec:.2f}%; ' \ 70 | 'recall: {tot_recall:.2f}%; ' \ 71 | 'F1: {tot_f1:.2f}\n'.format(acc=accuracy, 72 | tot_prec=total_precision, 73 | tot_recall=total_recall, 74 | tot_f1=total_f1)) 75 | 76 | def _print_tag_metrics(tag, tag_results): 77 | print(('\t%12s' % tag) + ': precision: {tot_prec:6.2f}%; ' \ 78 | 'recall: {tot_recall:6.2f}%; ' \ 79 | 'F1: {tot_f1:6.2f}; ' \ 80 | 'predicted: {tot_predicted:4d}\n'.format(tot_prec=tag_results['precision'], 81 | tot_recall=tag_results['recall'], 82 | tot_f1=tag_results['f1'], 83 | tot_predicted=tag_results['n_predicted_entities'])) 84 | 85 | def precision_recall_f1(y_true, y_pred, print_results=True, short_report=False): 86 | # Find all tags 87 | tags = sorted(set(tag[2:] for tag in y_true + y_pred if tag != 'O')) 88 | 89 | results = OrderedDict((tag, OrderedDict()) for tag in tags) 90 | n_tokens = len(y_true) 91 | total_correct = 0 92 | 93 | # For eval_conll_try we find all chunks in the ground truth and prediction 94 | # For each chunk we store 
starting and ending indices 95 | for tag in tags: 96 | true_chunk = list() 97 | predicted_chunk = list() 98 | for position in range(n_tokens): 99 | _update_chunk(y_true[position], y_true[position - 1], tag, true_chunk, position) 100 | _update_chunk(y_pred[position], y_pred[position - 1], tag, predicted_chunk, position, True) 101 | 102 | _update_last_chunk(true_chunk, position) 103 | _update_last_chunk(predicted_chunk, position) 104 | 105 | # Then we find all correctly classified intervals 106 | # True positive results 107 | tp = sum(chunk in predicted_chunk for chunk in true_chunk) 108 | total_correct += tp 109 | 110 | # And then just calculate errors of the first and second kind 111 | # False negative 112 | fn = len(true_chunk) - tp 113 | # False positive 114 | fp = len(predicted_chunk) - tp 115 | precision, recall, f1 = _tag_precision_recall_f1(tp, fp, fn) 116 | 117 | results[tag]['precision'] = precision 118 | results[tag]['recall'] = recall 119 | results[tag]['f1'] = f1 120 | results[tag]['n_predicted_entities'] = len(predicted_chunk) 121 | results[tag]['n_true_entities'] = len(true_chunk) 122 | 123 | total_true_entities, total_predicted_entities, \ 124 | total_precision, total_recall, total_f1, accuracy = _aggregate_metrics(results, total_correct) 125 | 126 | if print_results: 127 | _print_info(n_tokens, total_true_entities, total_predicted_entities, total_correct) 128 | _print_metrics(accuracy, total_precision, total_recall, total_f1) 129 | 130 | if not short_report: 131 | for tag, tag_results in results.items(): 132 | _print_tag_metrics(tag, tag_results) 133 | return results 134 | -------------------------------------------------------------------------------- /week2/week2-NER.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Recognize named entities on Twitter with LSTMs\n", 10 | "\n", 11 | "In this assignment, you will use a recurrent neural network to solve Named Entity Recognition (NER) problem. NER is a common task in natural language processing systems. It serves for extraction such entities from the text as persons, organizations, locations, etc. In this task you will experiment to recognize named entities from Twitter.\n", 12 | "\n", 13 | "For example, we want to extract persons' and organizations' names from the text. Than for the input text:\n", 14 | "\n", 15 | " Ian Goodfellow works for Google Brain\n", 16 | "\n", 17 | "a NER model needs to provide the following sequence of tags:\n", 18 | "\n", 19 | " B-PER I-PER O O B-ORG I-ORG\n", 20 | "\n", 21 | "Where *B-* and *I-* prefixes stand for the beginning and inside of the entity, while *O* stands for out of tag or no tag. Markup with the prefix scheme is called *BIO markup*. This markup is introduced for distinguishing of consequent entities with similar types.\n", 22 | "\n", 23 | "A solution of the task will be based on neural networks, particularly, on Bi-Directional Long Short-Term Memory Networks (Bi-LSTMs).\n", 24 | "\n", 25 | "### Libraries\n", 26 | "\n", 27 | "For this task you will need the following libraries:\n", 28 | " - [Tensorflow](https://www.tensorflow.org) — an open-source software library for Machine Intelligence.\n", 29 | " \n", 30 | "In this assignment, we use Tensorflow 1.15.0. 
You can install it with pip:\n", 31 | "\n", 32 | " !pip install tensorflow==1.15.0\n", 33 | " \n", 34 | " - [Numpy](http://www.numpy.org) — a package for scientific computing.\n", 35 | " \n", 36 | "If you have never worked with Tensorflow, you would probably need to read some tutorials during your work on this assignment, e.g. [this one](https://www.tensorflow.org/tutorials/recurrent) could be a good starting point. " 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Data\n", 44 | "\n", 45 | "The following cell will download all data required for this assignment into the folder `week2/data`." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "try:\n", 57 | " import google.colab\n", 58 | " IN_COLAB = True\n", 59 | "except:\n", 60 | " IN_COLAB = False\n", 61 | "\n", 62 | "if IN_COLAB:\n", 63 | " ! wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py -O setup_google_colab.py\n", 64 | " import setup_google_colab\n", 65 | " setup_google_colab.setup_week2()\n", 66 | "\n", 67 | "import sys\n", 68 | "sys.path.append(\"..\")\n", 69 | "from common.download_utils import download_week2_resources\n", 70 | "\n", 71 | "download_week2_resources()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### Load the Twitter Named Entity Recognition corpus\n", 79 | "\n", 80 | "We will work with a corpus, which contains tweets with NE tags. Every line of a file contains a pair of a token (word/punctuation symbol) and a tag, separated by a whitespace. Different tweets are separated by an empty line.\n", 81 | "\n", 82 | "The function *read_data* reads a corpus from the *file_path* and returns two lists: one with tokens and one with the corresponding tags. You need to complete this function by adding a code, which will replace a user's nickname to `` token and any URL to `` token. You could think that a URL and a nickname are just strings which start with *http://* or *https://* in case of URLs and a *@* symbol for nicknames." 
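One straightforward way to do the replacement described above is a small helper like the sketch below. The special token strings are written here as `<URL>` and `<USR>`; these names are assumptions, so use whatever tokens the assignment expects.

```python
def normalize_token(token):
    """Map URLs and user mentions to special tokens (token names are assumptions)."""
    if token.lower().startswith(('http://', 'https://')):
        return '<URL>'
    if token.startswith('@'):
        return '<USR>'
    return token

assert normalize_token('@mary') == '<USR>'
assert normalize_token('https://t.co/xyz') == '<URL>'
assert normalize_token('hello') == 'hello'
```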
83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "def read_data(file_path):\n", 94 | " tokens = []\n", 95 | " tags = []\n", 96 | " \n", 97 | " tweet_tokens = []\n", 98 | " tweet_tags = []\n", 99 | " for line in open(file_path, encoding='utf-8'):\n", 100 | " line = line.strip()\n", 101 | " if not line:\n", 102 | " if tweet_tokens:\n", 103 | " tokens.append(tweet_tokens)\n", 104 | " tags.append(tweet_tags)\n", 105 | " tweet_tokens = []\n", 106 | " tweet_tags = []\n", 107 | " else:\n", 108 | " token, tag = line.split()\n", 109 | " # Replace all urls with token\n", 110 | " # Replace all users with token\n", 111 | "\n", 112 | " ######################################\n", 113 | " ######### YOUR CODE HERE #############\n", 114 | " ######################################\n", 115 | " \n", 116 | " tweet_tokens.append(token)\n", 117 | " tweet_tags.append(tag)\n", 118 | " \n", 119 | " return tokens, tags" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "And now we can load three separate parts of the dataset:\n", 127 | " - *train* data for training the model;\n", 128 | " - *validation* data for evaluation and hyperparameters tuning;\n", 129 | " - *test* data for final evaluation of the model." 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "train_tokens, train_tags = read_data('data/train.txt')\n", 141 | "validation_tokens, validation_tags = read_data('data/validation.txt')\n", 142 | "test_tokens, test_tags = read_data('data/test.txt')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "You should always understand what kind of data you deal with. For this purpose, you can print the data running the following cell:" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "for i in range(3):\n", 161 | " for token, tag in zip(train_tokens[i], train_tags[i]):\n", 162 | " print('%s\\t%s' % (token, tag))\n", 163 | " print()" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "### Prepare dictionaries\n", 171 | "\n", 172 | "To train a neural network, we will use two mappings: \n", 173 | "- {token}$\\to${token id}: address the row in embeddings matrix for the current token;\n", 174 | "- {tag}$\\to${tag id}: one-hot ground truth probability distribution vectors for computing the loss at the output of the network.\n", 175 | "\n", 176 | "Now you need to implement the function *build_dict* which will return {token or tag}$\\to${index} and vice versa. 
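A possible implementation of such a mapping is sketched below; it is only a reference for the structure, with special tokens added first so that the first one gets index 0.

```python
from collections import defaultdict

def build_dict_sketch(tokens_or_tags, special_tokens):
    tok2idx = defaultdict(lambda: 0)   # unseen items fall back to index 0
    idx2tok = []
    for token in special_tokens:
        tok2idx[token] = len(idx2tok)
        idx2tok.append(token)
    for sequence in tokens_or_tags:
        for token in sequence:
            if token not in tok2idx:
                tok2idx[token] = len(idx2tok)
                idx2tok.append(token)
    return tok2idx, idx2tok
```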
" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": { 183 | "collapsed": true 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "from collections import defaultdict" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "collapsed": true 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "def build_dict(tokens_or_tags, special_tokens):\n", 199 | " \"\"\"\n", 200 | " tokens_or_tags: a list of lists of tokens or tags\n", 201 | " special_tokens: some special tokens\n", 202 | " \"\"\"\n", 203 | " # Create a dictionary with default value 0\n", 204 | " tok2idx = defaultdict(lambda: 0)\n", 205 | " idx2tok = []\n", 206 | " \n", 207 | " # Create mappings from tokens (or tags) to indices and vice versa.\n", 208 | " # At first, add special tokens (or tags) to the dictionaries.\n", 209 | " # The first special token must have index 0.\n", 210 | " \n", 211 | " # Mapping tok2idx should contain each token or tag only once. \n", 212 | " # To do so, you should:\n", 213 | " # 1. extract unique tokens/tags from the tokens_or_tags variable, which is not\n", 214 | " # occur in special_tokens (because they could have non-empty intersection)\n", 215 | " # 2. index them (for example, you can add them into the list idx2tok\n", 216 | " # 3. for each token/tag save the index into tok2idx).\n", 217 | " \n", 218 | " ######################################\n", 219 | " ######### YOUR CODE HERE #############\n", 220 | " ######################################\n", 221 | " \n", 222 | " return tok2idx, idx2tok" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "After implementing the function *build_dict* you can make dictionaries for tokens and tags. Special tokens in our case will be:\n", 230 | " - `` token for out of vocabulary tokens;\n", 231 | " - `` token for padding sentence to the same length when we create batches of sentences." 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "special_tokens = ['', '']\n", 243 | "special_tags = ['O']\n", 244 | "\n", 245 | "# Create dictionaries \n", 246 | "token2idx, idx2token = build_dict(train_tokens + validation_tokens, special_tokens)\n", 247 | "tag2idx, idx2tag = build_dict(train_tags, special_tags)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "The next additional functions will help you to create the mapping between tokens and ids for a sentence. " 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "collapsed": true 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "def words2idxs(tokens_list):\n", 266 | " return [token2idx[word] for word in tokens_list]\n", 267 | "\n", 268 | "def tags2idxs(tags_list):\n", 269 | " return [tag2idx[tag] for tag in tags_list]\n", 270 | "\n", 271 | "def idxs2words(idxs):\n", 272 | " return [idx2token[idx] for idx in idxs]\n", 273 | "\n", 274 | "def idxs2tags(idxs):\n", 275 | " return [idx2tag[idx] for idx in idxs]" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "### Generate batches\n", 283 | "\n", 284 | "Neural Networks are usually trained with batches. It means that weight updates of the network are based on several sequences at every single time. 
The tricky part is that all sequences within a batch need to have the same length. So we will pad them with a special `` token. It is also a good practice to provide RNN with sequence lengths, so it can skip computations for padding parts. We provide the batching function *batches_generator* readily available for you to save time. " 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": { 291 | "collapsed": true 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "def batches_generator(batch_size, tokens, tags,\n", 296 | " shuffle=True, allow_smaller_last_batch=True):\n", 297 | " \"\"\"Generates padded batches of tokens and tags.\"\"\"\n", 298 | " \n", 299 | " n_samples = len(tokens)\n", 300 | " if shuffle:\n", 301 | " order = np.random.permutation(n_samples)\n", 302 | " else:\n", 303 | " order = np.arange(n_samples)\n", 304 | "\n", 305 | " n_batches = n_samples // batch_size\n", 306 | " if allow_smaller_last_batch and n_samples % batch_size:\n", 307 | " n_batches += 1\n", 308 | "\n", 309 | " for k in range(n_batches):\n", 310 | " batch_start = k * batch_size\n", 311 | " batch_end = min((k + 1) * batch_size, n_samples)\n", 312 | " current_batch_size = batch_end - batch_start\n", 313 | " x_list = []\n", 314 | " y_list = []\n", 315 | " max_len_token = 0\n", 316 | " for idx in order[batch_start: batch_end]:\n", 317 | " x_list.append(words2idxs(tokens[idx]))\n", 318 | " y_list.append(tags2idxs(tags[idx]))\n", 319 | " max_len_token = max(max_len_token, len(tags[idx]))\n", 320 | " \n", 321 | " # Fill in the data into numpy nd-arrays filled with padding indices.\n", 322 | " x = np.ones([current_batch_size, max_len_token], dtype=np.int32) * token2idx['']\n", 323 | " y = np.ones([current_batch_size, max_len_token], dtype=np.int32) * tag2idx['O']\n", 324 | " lengths = np.zeros(current_batch_size, dtype=np.int32)\n", 325 | " for n in range(current_batch_size):\n", 326 | " utt_len = len(x_list[n])\n", 327 | " x[n, :utt_len] = x_list[n]\n", 328 | " lengths[n] = utt_len\n", 329 | " y[n, :utt_len] = y_list[n]\n", 330 | " yield x, y, lengths" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "## Build a recurrent neural network\n", 338 | "\n", 339 | "This is the most important part of the assignment. Here we will specify the network architecture based on TensorFlow building blocks. It's fun and easy as a lego constructor! We will create an LSTM network which will produce probability distribution over tags for each token in a sentence. To take into account both right and left contexts of the token, we will use Bi-Directional LSTM (Bi-LSTM). Dense layer will be used on top to perform tag classification. " 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "import tensorflow as tf\n", 351 | "import numpy as np" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": { 358 | "collapsed": true 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "class BiLSTMModel():\n", 363 | " pass" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": { 369 | "collapsed": true 370 | }, 371 | "source": [ 372 | "First, we need to create [placeholders](https://www.tensorflow.org/api_docs/python/tf/compat/v1/placeholder) to specify what data we are going to feed into the network during the execution time. 
For this task we will need the following placeholders:\n", 373 | " - *input_batch* — sequences of words (the shape equals to [batch_size, sequence_len]);\n", 374 | " - *ground_truth_tags* — sequences of tags (the shape equals to [batch_size, sequence_len]);\n", 375 | " - *lengths* — lengths of not padded sequences (the shape equals to [batch_size]);\n", 376 | " - *dropout_ph* — dropout keep probability; this placeholder has a predefined value 1;\n", 377 | " - *learning_rate_ph* — learning rate; we need this placeholder because we want to change the value during training.\n", 378 | "\n", 379 | "It could be noticed that we use *None* in the shapes in the declaration, which means that data of any size can be feeded. \n", 380 | "\n", 381 | "You need to complete the function *declare_placeholders*." 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": { 388 | "collapsed": true 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "def declare_placeholders(self):\n", 393 | " \"\"\"Specifies placeholders for the model.\"\"\"\n", 394 | "\n", 395 | " # Placeholders for input and ground truth output.\n", 396 | " self.input_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch') \n", 397 | " self.ground_truth_tags = ######### YOUR CODE HERE #############\n", 398 | " \n", 399 | " # Placeholder for lengths of the sequences.\n", 400 | " self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths') \n", 401 | " \n", 402 | " # Placeholder for a dropout keep probability. If we don't feed\n", 403 | " # a value for this placeholder, it will be equal to 1.0.\n", 404 | " self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])\n", 405 | " \n", 406 | " # Placeholder for a learning rate (tf.float32).\n", 407 | " self.learning_rate_ph = ######### YOUR CODE HERE #############" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": { 414 | "collapsed": true 415 | }, 416 | "outputs": [], 417 | "source": [ 418 | "BiLSTMModel.__declare_placeholders = classmethod(declare_placeholders)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": { 424 | "collapsed": true 425 | }, 426 | "source": [ 427 | "Now, let us specify the layers of the neural network. First, we need to perform some preparatory steps: \n", 428 | " \n", 429 | "- Create embeddings matrix with [tf.Variable](https://www.tensorflow.org/api_docs/python/tf/Variable). Specify its name (*embeddings_matrix*), type (*tf.float32*), and initialize with random values.\n", 430 | "- Create forward and backward LSTM cells. TensorFlow provides a number of RNN cells ready for you. We suggest that you use *LSTMCell*, but you can also experiment with other types, e.g. GRU cells. [This](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) blogpost could be interesting if you want to learn more about the differences.\n", 431 | "- Wrap your cells with [DropoutWrapper](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/DropoutWrapper). Dropout is an important regularization technique for neural networks. 
Specify all keep probabilities using the dropout placeholder that we created before.\n", 432 | " \n", 433 | "After that, you can build the computation graph that transforms an input_batch:\n", 434 | "\n", 435 | "- [Look up](https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup) embeddings for an *input_batch* in the prepared *embedding_matrix*.\n", 436 | "- Pass the embeddings through [Bidirectional Dynamic RNN](https://www.tensorflow.org/api_docs/python/tf/nn/bidirectional_dynamic_rnn) with the specified forward and backward cells. Use the lengths placeholder here to avoid computations for padding tokens inside the RNN.\n", 437 | "- Create a dense layer on top. Its output will be used directly in loss function. \n", 438 | " \n", 439 | "Fill in the code below. In case you need to debug something, the easiest way is to check that tensor shapes of each step match the expected ones. \n", 440 | " " 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": { 447 | "collapsed": true 448 | }, 449 | "outputs": [], 450 | "source": [ 451 | "def build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):\n", 452 | " \"\"\"Specifies bi-LSTM architecture and computes logits for inputs.\"\"\"\n", 453 | " \n", 454 | " # Create embedding variable (tf.Variable) with dtype tf.float32\n", 455 | " initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)\n", 456 | " embedding_matrix_variable = ######### YOUR CODE HERE #############\n", 457 | " \n", 458 | " # Create RNN cells (for example, tf.nn.rnn_cell.BasicLSTMCell) with n_hidden_rnn number of units \n", 459 | " # and dropout (tf.nn.rnn_cell.DropoutWrapper), initializing all *_keep_prob with dropout placeholder.\n", 460 | " forward_cell = ######### YOUR CODE HERE #############\n", 461 | " backward_cell = ######### YOUR CODE HERE #############\n", 462 | "\n", 463 | " # Look up embeddings for self.input_batch (tf.nn.embedding_lookup).\n", 464 | " # Shape: [batch_size, sequence_len, embedding_dim].\n", 465 | " embeddings = ######### YOUR CODE HERE #############\n", 466 | " \n", 467 | " # Pass them through Bidirectional Dynamic RNN (tf.nn.bidirectional_dynamic_rnn).\n", 468 | " # Shape: [batch_size, sequence_len, 2 * n_hidden_rnn]. \n", 469 | " # Also don't forget to initialize sequence_length as self.lengths and dtype as tf.float32.\n", 470 | " (rnn_output_fw, rnn_output_bw), _ = ######### YOUR CODE HERE #############\n", 471 | " rnn_output = tf.concat([rnn_output_fw, rnn_output_bw], axis=2)\n", 472 | "\n", 473 | " # Dense layer on top.\n", 474 | " # Shape: [batch_size, sequence_len, n_tags]. \n", 475 | " self.logits = tf.layers.dense(rnn_output, n_tags, activation=None)" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "metadata": { 482 | "collapsed": true 483 | }, 484 | "outputs": [], 485 | "source": [ 486 | "BiLSTMModel.__build_layers = classmethod(build_layers)" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "To compute the actual predictions of the neural network, you need to apply [softmax](https://www.tensorflow.org/api_docs/python/tf/nn/softmax) to the last layer and find the most probable tags with [argmax](https://www.tensorflow.org/api_docs/python/tf/argmax)." 
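To see why *axis=-1* matters, here is a tiny self-contained example with toy logits for one sentence of two tokens and three tags (TF 1.x API, as used in this notebook):

```python
import tensorflow as tf

logits = tf.constant([[[2.0, 0.5, 0.1],
                       [0.2, 3.0, 0.3]]], dtype=tf.float32)
probs = tf.nn.softmax(logits)        # shape: [batch, seq_len, n_tags]
preds = tf.argmax(probs, axis=-1)    # most probable tag id for each token

with tf.Session() as sess:
    print(sess.run(preds))           # prints [[0 1]]
```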
494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": { 500 | "collapsed": true 501 | }, 502 | "outputs": [], 503 | "source": [ 504 | "def compute_predictions(self):\n", 505 | " \"\"\"Transforms logits to probabilities and finds the most probable tags.\"\"\"\n", 506 | " \n", 507 | " # Create softmax (tf.nn.softmax) function\n", 508 | " softmax_output = ######### YOUR CODE HERE #############\n", 509 | " \n", 510 | " # Use argmax (tf.argmax) to get the most probable tags\n", 511 | " # Don't forget to set axis=-1\n", 512 | " # otherwise argmax will be calculated in a wrong way\n", 513 | " self.predictions = ######### YOUR CODE HERE #############" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": { 520 | "collapsed": true 521 | }, 522 | "outputs": [], 523 | "source": [ 524 | "BiLSTMModel.__compute_predictions = classmethod(compute_predictions)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": { 530 | "collapsed": true 531 | }, 532 | "source": [ 533 | "During training we do not need predictions of the network, but we need a loss function. We will use [cross-entropy loss](http://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html#cross-entropy), efficiently implemented in TF as \n", 534 | "[cross entropy with logits](https://www.tensorflow.org/api_docs/python/tf/nn/softmax_cross_entropy_with_logits_v2). Note that it should be applied to logits of the model (not to softmax probabilities!). Also note, that we do not want to take into account loss terms coming from `` tokens. So we need to mask them out, before computing [mean](https://www.tensorflow.org/api_docs/python/tf/reduce_mean)." 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": { 541 | "collapsed": true 542 | }, 543 | "outputs": [], 544 | "source": [ 545 | "def compute_loss(self, n_tags, PAD_index):\n", 546 | " \"\"\"Computes masked cross-entopy loss with logits.\"\"\"\n", 547 | " \n", 548 | " # Create cross entropy function function (tf.nn.softmax_cross_entropy_with_logits_v2)\n", 549 | " ground_truth_tags_one_hot = tf.one_hot(self.ground_truth_tags, n_tags)\n", 550 | " loss_tensor = ######### YOUR CODE HERE #############\n", 551 | " \n", 552 | " mask = tf.cast(tf.not_equal(self.input_batch, PAD_index), tf.float32)\n", 553 | " # Create loss function which doesn't operate with tokens (tf.reduce_mean)\n", 554 | " # Be careful that the argument of tf.reduce_mean should be\n", 555 | " # multiplication of mask and loss_tensor.\n", 556 | " self.loss = ######### YOUR CODE HERE #############" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": { 563 | "collapsed": true 564 | }, 565 | "outputs": [], 566 | "source": [ 567 | "BiLSTMModel.__compute_loss = classmethod(compute_loss)" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": {}, 573 | "source": [ 574 | "The last thing to specify is how we want to optimize the loss. \n", 575 | "We suggest that you use [Adam](https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer) optimizer with a learning rate from the corresponding placeholder. \n", 576 | "You will also need to apply clipping to eliminate exploding gradients. It can be easily done with [clip_by_norm](https://www.tensorflow.org/api_docs/python/tf/clip_by_norm) function. 
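For orientation, the two blanks in *perform_optimization* below could be filled roughly as follows. This is a fragment meant to live inside that method (it relies on `self`, `clip_norm`, and the `compute_gradients` call already present in the skeleton), not standalone code.

```python
# Adam optimizer driven by the learning-rate placeholder:
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)

# After self.grads_and_vars = self.optimizer.compute_gradients(self.loss),
# clip only the gradient part of each (gradient, variable) pair:
self.grads_and_vars = [(tf.clip_by_norm(grad, clip_norm), var)
                       for grad, var in self.grads_and_vars]
```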
" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": { 583 | "collapsed": true 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "def perform_optimization(self):\n", 588 | " \"\"\"Specifies the optimizer and train_op for the model.\"\"\"\n", 589 | " \n", 590 | " # Create an optimizer (tf.train.AdamOptimizer)\n", 591 | " self.optimizer = ######### YOUR CODE HERE #############\n", 592 | " self.grads_and_vars = self.optimizer.compute_gradients(self.loss)\n", 593 | " \n", 594 | " # Gradient clipping (tf.clip_by_norm) for self.grads_and_vars\n", 595 | " # Pay attention that you need to apply this operation only for gradients \n", 596 | " # because self.grads_and_vars also contains variables.\n", 597 | " # list comprehension might be useful in this case.\n", 598 | " clip_norm = tf.cast(1.0, tf.float32)\n", 599 | " self.grads_and_vars = ######### YOUR CODE HERE #############\n", 600 | " \n", 601 | " self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "metadata": { 608 | "collapsed": true 609 | }, 610 | "outputs": [], 611 | "source": [ 612 | "BiLSTMModel.__perform_optimization = classmethod(perform_optimization)" 613 | ] 614 | }, 615 | { 616 | "cell_type": "markdown", 617 | "metadata": { 618 | "collapsed": true 619 | }, 620 | "source": [ 621 | "Congratulations! You have specified all the parts of your network. You may have noticed, that we didn't deal with any real data yet, so what you have written is just recipes on how the network should function.\n", 622 | "Now we will put them to the constructor of our Bi-LSTM class to use it in the next section. " 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": { 629 | "collapsed": true 630 | }, 631 | "outputs": [], 632 | "source": [ 633 | "def init_model(self, vocabulary_size, n_tags, embedding_dim, n_hidden_rnn, PAD_index):\n", 634 | " self.__declare_placeholders()\n", 635 | " self.__build_layers(vocabulary_size, embedding_dim, n_hidden_rnn, n_tags)\n", 636 | " self.__compute_predictions()\n", 637 | " self.__compute_loss(n_tags, PAD_index)\n", 638 | " self.__perform_optimization()" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": { 645 | "collapsed": true 646 | }, 647 | "outputs": [], 648 | "source": [ 649 | "BiLSTMModel.__init__ = classmethod(init_model)" 650 | ] 651 | }, 652 | { 653 | "cell_type": "markdown", 654 | "metadata": {}, 655 | "source": [ 656 | "## Train the network and predict tags" 657 | ] 658 | }, 659 | { 660 | "cell_type": "markdown", 661 | "metadata": { 662 | "collapsed": true 663 | }, 664 | "source": [ 665 | "[Session.run](https://www.tensorflow.org/api_docs/python/tf/Session#run) is a point which initiates computations in the graph that we have defined. To train the network, we need to compute *self.train_op*, which was declared in *perform_optimization*. To predict tags, we just need to compute *self.predictions*. Anyway, we need to feed actual data through the placeholders that we defined before. 
" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": null, 671 | "metadata": { 672 | "collapsed": true 673 | }, 674 | "outputs": [], 675 | "source": [ 676 | "def train_on_batch(self, session, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability):\n", 677 | " feed_dict = {self.input_batch: x_batch,\n", 678 | " self.ground_truth_tags: y_batch,\n", 679 | " self.learning_rate_ph: learning_rate,\n", 680 | " self.dropout_ph: dropout_keep_probability,\n", 681 | " self.lengths: lengths}\n", 682 | " \n", 683 | " session.run(self.train_op, feed_dict=feed_dict)" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": { 690 | "collapsed": true 691 | }, 692 | "outputs": [], 693 | "source": [ 694 | "BiLSTMModel.train_on_batch = classmethod(train_on_batch)" 695 | ] 696 | }, 697 | { 698 | "cell_type": "markdown", 699 | "metadata": {}, 700 | "source": [ 701 | "Implement the function *predict_for_batch* by initializing *feed_dict* with input *x_batch* and *lengths* and running the *session* for *self.predictions*." 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": { 708 | "collapsed": true 709 | }, 710 | "outputs": [], 711 | "source": [ 712 | "def predict_for_batch(self, session, x_batch, lengths):\n", 713 | " ######################################\n", 714 | " ######### YOUR CODE HERE #############\n", 715 | " ######################################\n", 716 | " \n", 717 | " return predictions" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": null, 723 | "metadata": { 724 | "collapsed": true 725 | }, 726 | "outputs": [], 727 | "source": [ 728 | "BiLSTMModel.predict_for_batch = classmethod(predict_for_batch)" 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "metadata": {}, 734 | "source": [ 735 | "We finished with necessary methods of our BiLSTMModel model and almost ready to start experimenting.\n", 736 | "\n", 737 | "### Evaluation \n", 738 | "To simplify the evaluation process we provide two functions for you:\n", 739 | " - *predict_tags*: uses a model to get predictions and transforms indices to tokens and tags;\n", 740 | " - *eval_conll*: calculates precision, recall and F1 for the results." 
741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": null, 746 | "metadata": { 747 | "collapsed": true 748 | }, 749 | "outputs": [], 750 | "source": [ 751 | "from evaluation import precision_recall_f1" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "metadata": { 758 | "collapsed": true 759 | }, 760 | "outputs": [], 761 | "source": [ 762 | "def predict_tags(model, session, token_idxs_batch, lengths):\n", 763 | " \"\"\"Performs predictions and transforms indices to tokens and tags.\"\"\"\n", 764 | " \n", 765 | " tag_idxs_batch = model.predict_for_batch(session, token_idxs_batch, lengths)\n", 766 | " \n", 767 | " tags_batch, tokens_batch = [], []\n", 768 | " for tag_idxs, token_idxs in zip(tag_idxs_batch, token_idxs_batch):\n", 769 | " tags, tokens = [], []\n", 770 | " for tag_idx, token_idx in zip(tag_idxs, token_idxs):\n", 771 | " tags.append(idx2tag[tag_idx])\n", 772 | " tokens.append(idx2token[token_idx])\n", 773 | " tags_batch.append(tags)\n", 774 | " tokens_batch.append(tokens)\n", 775 | " return tags_batch, tokens_batch\n", 776 | " \n", 777 | " \n", 778 | "def eval_conll(model, session, tokens, tags, short_report=True):\n", 779 | " \"\"\"Computes NER quality measures using CONLL shared task script.\"\"\"\n", 780 | " \n", 781 | " y_true, y_pred = [], []\n", 782 | " for x_batch, y_batch, lengths in batches_generator(1, tokens, tags):\n", 783 | " tags_batch, tokens_batch = predict_tags(model, session, x_batch, lengths)\n", 784 | " if len(x_batch[0]) != len(tags_batch[0]):\n", 785 | " raise Exception(\"Incorrect length of prediction for the input, \"\n", 786 | " \"expected length: %i, got: %i\" % (len(x_batch[0]), len(tags_batch[0])))\n", 787 | " predicted_tags = []\n", 788 | " ground_truth_tags = []\n", 789 | " for gt_tag_idx, pred_tag, token in zip(y_batch[0], tags_batch[0], tokens_batch[0]): \n", 790 | " if token != '':\n", 791 | " ground_truth_tags.append(idx2tag[gt_tag_idx])\n", 792 | " predicted_tags.append(pred_tag)\n", 793 | "\n", 794 | " # We extend every prediction and ground truth sequence with 'O' tag\n", 795 | " # to indicate a possible end of entity.\n", 796 | " y_true.extend(ground_truth_tags + ['O'])\n", 797 | " y_pred.extend(predicted_tags + ['O'])\n", 798 | " \n", 799 | " results = precision_recall_f1(y_true, y_pred, print_results=True, short_report=short_report)\n", 800 | " return results" 801 | ] 802 | }, 803 | { 804 | "cell_type": "markdown", 805 | "metadata": {}, 806 | "source": [ 807 | "## Run your experiment" 808 | ] 809 | }, 810 | { 811 | "cell_type": "markdown", 812 | "metadata": {}, 813 | "source": [ 814 | "Create *BiLSTMModel* model with the following parameters:\n", 815 | " - *vocabulary_size* — number of tokens;\n", 816 | " - *n_tags* — number of tags;\n", 817 | " - *embedding_dim* — dimension of embeddings, recommended value: 200;\n", 818 | " - *n_hidden_rnn* — size of hidden layers for RNN, recommended value: 200;\n", 819 | " - *PAD_index* — an index of the padding token (``).\n", 820 | "\n", 821 | "Set hyperparameters. You might want to start with the following recommended values:\n", 822 | "- *batch_size*: 32;\n", 823 | "- 4 epochs;\n", 824 | "- starting value of *learning_rate*: 0.005\n", 825 | "- *learning_rate_decay*: a square root of 2;\n", 826 | "- *dropout_keep_probability*: try several values: 0.1, 0.5, 0.9.\n", 827 | "\n", 828 | "However, feel free to conduct more experiments to tune hyperparameters and earn extra points for the assignment." 
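One possible instantiation with the recommended values is sketched below. It assumes the `token2idx` and `tag2idx` dictionaries built earlier; the padding token is written here as `'<PAD>'`, so substitute whatever name you actually put into `special_tokens`. The dropout keep probability of 0.5 is just one of the suggested values to try.

```python
import numpy as np

model = BiLSTMModel(vocabulary_size=len(token2idx),
                    n_tags=len(tag2idx),
                    embedding_dim=200,
                    n_hidden_rnn=200,
                    PAD_index=token2idx['<PAD>'])

batch_size = 32
n_epochs = 4
learning_rate = 0.005
learning_rate_decay = np.sqrt(2)
dropout_keep_probability = 0.5
```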
829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": null, 834 | "metadata": { 835 | "collapsed": true 836 | }, 837 | "outputs": [], 838 | "source": [ 839 | "tf.reset_default_graph()\n", 840 | "\n", 841 | "model = ######### YOUR CODE HERE #############\n", 842 | "\n", 843 | "batch_size = ######### YOUR CODE HERE #############\n", 844 | "n_epochs = ######### YOUR CODE HERE #############\n", 845 | "learning_rate = ######### YOUR CODE HERE #############\n", 846 | "learning_rate_decay = ######### YOUR CODE HERE #############\n", 847 | "dropout_keep_probability = ######### YOUR CODE HERE #############" 848 | ] 849 | }, 850 | { 851 | "cell_type": "markdown", 852 | "metadata": {}, 853 | "source": [ 854 | "If you got an error *\"Tensor conversion requested dtype float64 for Tensor with dtype float32\"* in this point, check if there are variables without dtype initialised. Set the value of dtype equals to *tf.float32* for such variables." 855 | ] 856 | }, 857 | { 858 | "cell_type": "markdown", 859 | "metadata": {}, 860 | "source": [ 861 | "Finally, we are ready to run the training!" 862 | ] 863 | }, 864 | { 865 | "cell_type": "code", 866 | "execution_count": null, 867 | "metadata": { 868 | "collapsed": true 869 | }, 870 | "outputs": [], 871 | "source": [ 872 | "sess = tf.Session()\n", 873 | "sess.run(tf.global_variables_initializer())\n", 874 | "\n", 875 | "print('Start training... \\n')\n", 876 | "for epoch in range(n_epochs):\n", 877 | " # For each epoch evaluate the model on train and validation data\n", 878 | " print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(n_epochs) + '-' * 20)\n", 879 | " print('Train data evaluation:')\n", 880 | " eval_conll(model, sess, train_tokens, train_tags, short_report=True)\n", 881 | " print('Validation data evaluation:')\n", 882 | " eval_conll(model, sess, validation_tokens, validation_tags, short_report=True)\n", 883 | " \n", 884 | " # Train the model\n", 885 | " for x_batch, y_batch, lengths in batches_generator(batch_size, train_tokens, train_tags):\n", 886 | " model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability)\n", 887 | " \n", 888 | " # Decaying the learning rate\n", 889 | " learning_rate = learning_rate / learning_rate_decay\n", 890 | " \n", 891 | "print('...training finished.')" 892 | ] 893 | }, 894 | { 895 | "cell_type": "markdown", 896 | "metadata": {}, 897 | "source": [ 898 | "Now let us see full quality reports for the final model on train, validation, and test sets. 
To give you a hint whether you have implemented everything correctly, you might expect F-score about 40% on the validation set.\n", 899 | "\n", 900 | "**The output of the cell below (as well as the output of all the other cells) should be present in the notebook for peer2peer review!**" 901 | ] 902 | }, 903 | { 904 | "cell_type": "code", 905 | "execution_count": null, 906 | "metadata": { 907 | "collapsed": true 908 | }, 909 | "outputs": [], 910 | "source": [ 911 | "print('-' * 20 + ' Train set quality: ' + '-' * 20)\n", 912 | "train_results = eval_conll(model, sess, train_tokens, train_tags, short_report=False)\n", 913 | "\n", 914 | "print('-' * 20 + ' Validation set quality: ' + '-' * 20)\n", 915 | "validation_results = ######### YOUR CODE HERE #############\n", 916 | "\n", 917 | "print('-' * 20 + ' Test set quality: ' + '-' * 20)\n", 918 | "test_results = ######### YOUR CODE HERE #############" 919 | ] 920 | }, 921 | { 922 | "cell_type": "markdown", 923 | "metadata": {}, 924 | "source": [ 925 | "### Conclusions\n", 926 | "\n", 927 | "Could we say that our model is state of the art and the results are acceptable for the task? Definately, we can say so. Nowadays, Bi-LSTM is one of the state of the art approaches for solving NER problem and it outperforms other classical methods. Despite the fact that we used small training corpora (in comparison with usual sizes of corpora in Deep Learning), our results are quite good. In addition, in this task there are many possible named entities and for some of them we have only several dozens of trainig examples, which is definately small. However, the implemented model outperforms classical CRFs for this task. Even better results could be obtained by some combinations of several types of methods, e.g. see [this](https://arxiv.org/abs/1603.01354) paper if you are interested." 
928 | ] 929 | } 930 | ], 931 | "metadata": { 932 | "kernelspec": { 933 | "display_name": "Python 3", 934 | "language": "python", 935 | "name": "python3" 936 | }, 937 | "language_info": { 938 | "codemirror_mode": { 939 | "name": "ipython", 940 | "version": 3 941 | }, 942 | "file_extension": ".py", 943 | "mimetype": "text/x-python", 944 | "name": "python", 945 | "nbconvert_exporter": "python", 946 | "pygments_lexer": "ipython3", 947 | "version": "3.4.3" 948 | } 949 | }, 950 | "nbformat": 4, 951 | "nbformat_minor": 1 952 | } 953 | -------------------------------------------------------------------------------- /week3/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | class Grader(object): 7 | def __init__(self): 8 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 9 | self.assignment_key = '7DdYfMQFEeevjw7-W7Fr0A' 10 | self.parts = OrderedDict([('98mDT', 'Question2Vec'), 11 | ('nc7RP', 'HitsCount'), 12 | ('bNp90', 'DCGScore'), 13 | ('3gRlQ', 'W2VTokenizedRanks'), 14 | ('mX6wS', 'StarSpaceRanks')]) 15 | self.answers = {key: None for key in self.parts} 16 | 17 | @staticmethod 18 | def ravel_output(output): 19 | ''' 20 | If student accidentally submitted np.array with one 21 | element instead of number, this function will submit 22 | this number instead 23 | ''' 24 | if isinstance(output, np.ndarray) and output.size == 1: 25 | output = output.item(0) 26 | return output 27 | 28 | def submit(self, email, token): 29 | submission = { 30 | "assignmentKey": self.assignment_key, 31 | "submitterEmail": email, 32 | "secret": token, 33 | "parts": {} 34 | } 35 | for part, output in self.answers.items(): 36 | if output is not None: 37 | submission["parts"][part] = {"output": output} 38 | else: 39 | submission["parts"][part] = dict() 40 | request = requests.post(self.submission_page, data=json.dumps(submission)) 41 | response = request.json() 42 | if request.status_code == 201: 43 | print('Submitted to Coursera platform. 
See results on assignment page!') 44 | elif u'details' in response and u'learnerMessage' in response[u'details']: 45 | print(response[u'details'][u'learnerMessage']) 46 | else: 47 | print("Unknown response from Coursera: {}".format(request.status_code)) 48 | print(response) 49 | 50 | def status(self): 51 | print("You want to submit these parts:") 52 | for part_id, part_name in self.parts.items(): 53 | answer = self.answers[part_id] 54 | if answer is None: 55 | answer = '-'*10 56 | print("Task {}: {}".format(part_name, answer[:100] + '...')) 57 | 58 | def submit_part(self, part, output): 59 | self.answers[part] = output 60 | print("Current answer for task {} is: {}".format(self.parts[part], output[:100] + '...')) 61 | 62 | def submit_tag(self, tag, output): 63 | part_id = [k for k, v in self.parts.items() if v == tag] 64 | if len(part_id) != 1: 65 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 66 | part_id = part_id[0] 67 | self.submit_part(part_id, str(self.ravel_output(output))) 68 | -------------------------------------------------------------------------------- /week3/util.py: -------------------------------------------------------------------------------- 1 | import re 2 | from nltk.corpus import stopwords 3 | 4 | REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]') 5 | GOOD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]') 6 | STOPWORDS = set(stopwords.words('english')) 7 | def text_prepare(text): 8 | text = text.lower() 9 | text = REPLACE_BY_SPACE_RE.sub(' ', text) 10 | text = GOOD_SYMBOLS_RE.sub('', text) 11 | text = ' '.join([x for x in text.split() if x and x not in STOPWORDS]) 12 | return text.strip() 13 | 14 | def array_to_string(arr): 15 | return '\n'.join(str(num) for num in arr) 16 | 17 | def matrix_to_string(matrix): 18 | return '\n'.join('\t'.join(str(num) for num in line) for line in matrix) -------------------------------------------------------------------------------- /week4/encoder-decoder-pic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hse-aml/natural-language-processing/5b06a1ac8918af5720117b1ebdc8c55de13bae59/week4/encoder-decoder-pic.png -------------------------------------------------------------------------------- /week5/dialogue_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sklearn.metrics.pairwise import pairwise_distances_argmin 3 | 4 | from chatterbot import ChatBot 5 | from chatterbot.trainers import ChatterBotCorpusTrainer 6 | from utils import * 7 | 8 | 9 | class ThreadRanker(object): 10 | def __init__(self, paths): 11 | self.word_embeddings, self.embeddings_dim = load_embeddings(paths['WORD_EMBEDDINGS']) 12 | self.thread_embeddings_folder = paths['THREAD_EMBEDDINGS_FOLDER'] 13 | 14 | def __load_embeddings_by_tag(self, tag_name): 15 | embeddings_path = os.path.join(self.thread_embeddings_folder, tag_name + ".pkl") 16 | thread_ids, thread_embeddings = unpickle_file(embeddings_path) 17 | return thread_ids, thread_embeddings 18 | 19 | def get_best_thread(self, question, tag_name): 20 | """ Returns id of the most similar thread for the question. 21 | The search is performed across the threads with a given tag. 22 | """ 23 | thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name) 24 | 25 | # HINT: you have already implemented a similar routine in the 3rd assignment. 
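        # A possible sketch, not the official solution: with question_to_vec from
        # utils and pairwise_distances_argmin imported above, the two gaps below
        # could be filled roughly like this (the cosine metric is an assumption,
        # any reasonable distance works):
        #   question_vec = question_to_vec(question, self.word_embeddings, self.embeddings_dim).reshape(1, -1)
        #   best_thread = pairwise_distances_argmin(question_vec, thread_embeddings, metric='cosine')[0]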
26 | 27 | question_vec = #### YOUR CODE HERE #### 28 | best_thread = #### YOUR CODE HERE #### 29 | 30 | return thread_ids[best_thread] 31 | 32 | 33 | class DialogueManager(object): 34 | def __init__(self, paths): 35 | print("Loading resources...") 36 | 37 | # Intent recognition: 38 | self.intent_recognizer = unpickle_file(paths['INTENT_RECOGNIZER']) 39 | self.tfidf_vectorizer = unpickle_file(paths['TFIDF_VECTORIZER']) 40 | 41 | self.ANSWER_TEMPLATE = 'I think its about %s\nThis thread might help you: https://stackoverflow.com/questions/%s' 42 | 43 | # Goal-oriented part: 44 | self.tag_classifier = unpickle_file(paths['TAG_CLASSIFIER']) 45 | self.thread_ranker = ThreadRanker(paths) 46 | self.__init_chitchat_bot() 47 | 48 | def __init_chitchat_bot(self): 49 | """Initializes self.chitchat_bot with some conversational model.""" 50 | 51 | # Hint: you might want to create and train chatterbot.ChatBot here. 52 | # Create an instance of the ChatBot class. 53 | # Set a trainer set_trainer(ChatterBotCorpusTrainer) for the ChatBot. 54 | # Train the ChatBot with "chatterbot.corpus.english" param. 55 | # Note that we use chatterbot==0.7.6 in this project. 56 | # You are welcome to experiment with other versions but they might have slightly different API. 57 | 58 | ######################## 59 | #### YOUR CODE HERE #### 60 | ######################## 61 | 62 | # remove this when you're done 63 | raise NotImplementedError( 64 | "Open dialogue_manager.py and fill with your code. In case of Google Colab, download" 65 | "(https://github.com/hse-aml/natural-language-processing/blob/master/project/dialogue_manager.py), " 66 | "edit locally and upload using '> arrow on the left edge' -> Files -> UPLOAD") 67 | 68 | def generate_answer(self, question): 69 | """Combines stackoverflow and chitchat parts using intent recognition.""" 70 | 71 | # Recognize intent of the question using `intent_recognizer`. 72 | # Don't forget to prepare question and calculate features for the question. 73 | 74 | prepared_question = #### YOUR CODE HERE #### 75 | features = #### YOUR CODE HERE #### 76 | intent = #### YOUR CODE HERE #### 77 | 78 | # Chit-chat part: 79 | if intent == 'dialogue': 80 | # Pass question to chitchat_bot to generate a response. 81 | response = #### YOUR CODE HERE #### 82 | return response 83 | 84 | # Goal-oriented part: 85 | else: 86 | # Pass features to tag_classifier to get predictions. 87 | tag = #### YOUR CODE HERE #### 88 | 89 | # Pass prepared_question to thread_ranker to get predictions. 90 | thread_id = #### YOUR CODE HERE #### 91 | 92 | return self.ANSWER_TEMPLATE % (tag, thread_id) 93 | -------------------------------------------------------------------------------- /week5/utils.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import pickle 3 | import re 4 | import numpy as np 5 | 6 | nltk.download('stopwords') 7 | from nltk.corpus import stopwords 8 | 9 | # Paths for all resources for the bot. 
10 | RESOURCE_PATH = { 11 | 'INTENT_RECOGNIZER': 'intent_recognizer.pkl', 12 | 'TAG_CLASSIFIER': 'tag_classifier.pkl', 13 | 'TFIDF_VECTORIZER': 'tfidf_vectorizer.pkl', 14 | 'THREAD_EMBEDDINGS_FOLDER': 'thread_embeddings_by_tags', 15 | 'WORD_EMBEDDINGS': 'data/word_embeddings.tsv', 16 | } 17 | 18 | 19 | def text_prepare(text): 20 | """Performs tokenization and simple preprocessing.""" 21 | replace_by_space_re = re.compile('[/(){}\[\]\|@,;]') 22 | good_symbols_re = re.compile('[^0-9a-z #+_]') 23 | stopwords_set = set(stopwords.words('english')) 24 | 25 | text = text.lower() 26 | text = replace_by_space_re.sub(' ', text) 27 | text = good_symbols_re.sub('', text) 28 | text = ' '.join([x for x in text.split() if x and x not in stopwords_set]) 29 | 30 | return text.strip() 31 | 32 | 33 | def load_embeddings(embeddings_path): 34 | """Loads pre-trained word embeddings from tsv file. 35 | Args: 36 | embeddings_path - path to the embeddings file. 37 | Returns: 38 | embeddings - dict mapping words to vectors; 39 | embeddings_dim - dimension of the vectors. 40 | """ 41 | 42 | # Hint: you have already implemented a similar routine in the 3rd assignment. 43 | # Note that here you also need to know the dimension of the loaded embeddings. 44 | # When you load the embeddings, use numpy.float32 type as dtype 45 | 46 | ######################## 47 | #### YOUR CODE HERE #### 48 | ######################## 49 | 50 | # remove this when you're done 51 | raise NotImplementedError( 52 | "Open utils.py and fill with your code. In case of Google Colab, download" 53 | "(https://github.com/hse-aml/natural-language-processing/blob/master/project/utils.py), " 54 | "edit locally and upload using '> arrow on the left edge' -> Files -> UPLOAD") 55 | 56 | 57 | def question_to_vec(question, embeddings, dim): 58 | """Transforms a string to an embedding by averaging word embeddings.""" 59 | 60 | # Hint: you have already implemented exactly this function in the 3rd assignment. 61 | 62 | ######################## 63 | #### YOUR CODE HERE #### 64 | ######################## 65 | 66 | # remove this when you're done 67 | raise NotImplementedError( 68 | "Open utils.py and fill with your code. In case of Google Colab, download" 69 | "(https://github.com/hse-aml/natural-language-processing/blob/master/project/utils.py), " 70 | "edit locally and upload using '> arrow on the left edge' -> Files -> UPLOAD") 71 | 72 | 73 | def unpickle_file(filename): 74 | """Returns the result of unpickling the file content.""" 75 | with open(filename, 'rb') as f: 76 | return pickle.load(f) 77 | -------------------------------------------------------------------------------- /week5/week5-project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Final project: StackOverflow assistant bot\n", 8 | "\n", 9 | "Congratulations on coming this far and solving the programming assignments! 
In this final project, we will combine everything we have learned about Natural Language Processing to construct a *dialogue chat bot*, which will be able to:\n", 10 | "* answer programming-related questions (using a StackOverflow dataset);\n", 11 | "* chit-chat and simulate dialogue on all non-programming-related questions.\n", 12 | "\n", 13 | "For the chit-chat mode we will use a pre-trained neural network engine available from [ChatterBot](https://github.com/gunthercox/ChatterBot).\n", 14 | "Those who aim at an honor certificate for our course, or are just curious, will train their own models for chit-chat.\n", 15 | "![](https://imgs.xkcd.com/comics/twitter_bot.png)\n", 16 | "©[xkcd](https://xkcd.com)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Data description\n", 24 | "\n", 25 | "To detect the *intent* of users' questions we will need two text collections:\n", 26 | "- `tagged_posts.tsv` — StackOverflow posts, tagged with one programming language (*positive samples*).\n", 27 | "- `dialogues.tsv` — dialogue phrases from movie subtitles (*negative samples*).\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "try:\n", 37 | " import google.colab\n", 38 | " IN_COLAB = True\n", 39 | "except:\n", 40 | " IN_COLAB = False\n", 41 | "\n", 42 | "if IN_COLAB:\n", 43 | " ! wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py -O setup_google_colab.py\n", 44 | " import setup_google_colab\n", 45 | " setup_google_colab.setup_project()\n", 46 | "\n", 47 | "import sys\n", 48 | "sys.path.append(\"..\")\n", 49 | "from common.download_utils import download_project_resources\n", 50 | "\n", 51 | "download_project_resources()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "For those questions that have a programming-related intent, we will proceed as follows: predict the programming language (only one tag per question is allowed here) and rank candidates within that tag using embeddings.\n", 59 | "For the ranking part, you will need:\n", 60 | "- `word_embeddings.tsv` — word embeddings that you trained with StarSpace in the 3rd assignment. It's not a problem if you didn't do it, because we can offer you an alternative solution." 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "As a result of this notebook, you should obtain the following new objects that you will then use in the running bot:\n", 68 | "\n", 69 | "- `intent_recognizer.pkl` — intent recognition model;\n", 70 | "- `tag_classifier.pkl` — programming language classification model;\n", 71 | "- `tfidf_vectorizer.pkl` — vectorizer used during training;\n", 72 | "- `thread_embeddings_by_tags` — folder with thread embeddings, arranged by tags.\n", 73 | " " 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "Some functions will be reused by this notebook and the scripts, so we put them into the *utils.py* file. Don't forget to open it and fill in the gaps!" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "from utils import *" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## Part I. 
Intent and language recognition" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "We want to write a bot which will not only **answer programming-related questions**, but will also be able to **maintain a dialogue**. We would also like to detect the *intent* of the user from the question (we could have had a 'Question answering mode' check-box in the bot, but it wouldn't be fun at all, would it?). So the first thing we need to do is to **distinguish programming-related questions from general ones**.\n", 104 | "\n", 105 | "It would also be good to predict which programming language a particular question refers to. By doing so, we will speed up the question search by a factor of the number of languages (10 here), and exercise our *text classification* skills a bit. :)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "import numpy as np\n", 115 | "import pandas as pd\n", 116 | "import pickle\n", 117 | "import re\n", 118 | "\n", 119 | "from sklearn.feature_extraction.text import TfidfVectorizer" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "### Data preparation" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "In the first assignment (Predict tags on StackOverflow with linear models), you have already learnt how to preprocess texts and do TF-IDF transformations. Reuse your code here. In addition, you will also need to [dump](https://docs.python.org/3/library/pickle.html#pickle.dump) the TF-IDF vectorizer with pickle to use it later in the running bot." 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "def tfidf_features(X_train, X_test, vectorizer_path):\n", 143 | " \"\"\"Performs TF-IDF transformation and dumps the model.\"\"\"\n", 144 | " \n", 145 | " # Train a vectorizer on X_train data.\n", 146 | " # Transform X_train and X_test data.\n", 147 | " \n", 148 | " # Pickle the trained vectorizer to 'vectorizer_path'.\n", 149 | " # Don't forget to open the file in binary write mode.\n", 150 | " \n", 151 | " ######################################\n", 152 | " ######### YOUR CODE HERE #############\n", 153 | " ######################################\n", 154 | " \n", 155 | " return X_train, X_test" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "Now, load examples of the two classes. Use a subsample of the StackOverflow data to balance the classes. You will need the full data later."
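One possible way to complete the `tfidf_features` stub above is sketched below. This is only an illustrative sketch, not the required solution; the `TfidfVectorizer` hyperparameters shown here are assumptions that you may well want to tune differently.

```python
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_test, vectorizer_path):
    """Performs TF-IDF transformation and dumps the model."""
    # Example hyperparameters; adjust them to your own experiments.
    vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2), token_pattern=r'(\S+)')
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    # Pickle the trained vectorizer; note the binary write mode.
    with open(vectorizer_path, 'wb') as f:
        pickle.dump(vectorizer, f)
    return X_train, X_test
```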
163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "sample_size = 200000\n", 172 | "\n", 173 | "dialogue_df = pd.read_csv('data/dialogues.tsv', sep='\\t').sample(sample_size, random_state=0)\n", 174 | "stackoverflow_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t').sample(sample_size, random_state=0)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "Check how the data look like:" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "dialogue_df.head()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "stackoverflow_df.head()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "Apply *text_prepare* function to preprocess the data.\n", 207 | "\n", 208 | "If you filled in the file, but NotImplementedError is still displayed, please refer to [this thread](https://github.com/hse-aml/natural-language-processing/issues/27)." 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "from utils import text_prepare" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "dialogue_df['text'] = ######### YOUR CODE HERE #############\n", 227 | "stackoverflow_df['title'] = ######### YOUR CODE HERE #############" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "### Intent recognition" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "We will do a binary classification on TF-IDF representations of texts. Labels will be either `dialogue` for general questions or `stackoverflow` for programming-related questions. First, prepare the data for this task:\n", 242 | "- concatenate `dialogue` and `stackoverflow` examples into one sample\n", 243 | "- split it into train and test in proportion 9:1, use *random_state=0* for reproducibility\n", 244 | "- transform it into TF-IDF features" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "from sklearn.model_selection import train_test_split" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "X = np.concatenate([dialogue_df['text'].values, stackoverflow_df['title'].values])\n", 263 | "y = ['dialogue'] * dialogue_df.shape[0] + ['stackoverflow'] * stackoverflow_df.shape[0]\n", 264 | "\n", 265 | "X_train, X_test, y_train, y_test = ######### YOUR CODE HERE ##########\n", 266 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))\n", 267 | "\n", 268 | "X_train_tfidf, X_test_tfidf = ######### YOUR CODE HERE ###########" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "Train the **intent recognizer** using LogisticRegression on the train set with the following parameters: *penalty='l2'*, *C=10*, *random_state=0*. Print out the accuracy on the test set to check whether everything looks good." 
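A minimal sketch of this training step is shown below; `intent_recognizer` is the variable name expected by the following cells, and the hyperparameters are the ones stated above.

```python
from sklearn.linear_model import LogisticRegression

# Fit the intent recognizer on the TF-IDF features of the training set.
intent_recognizer = LogisticRegression(penalty='l2', C=10, random_state=0)
intent_recognizer.fit(X_train_tfidf, y_train)
```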
276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "from sklearn.linear_model import LogisticRegression\n", 285 | "from sklearn.metrics import accuracy_score" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "######################################\n", 295 | "######### YOUR CODE HERE #############\n", 296 | "######################################" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "# Check test accuracy.\n", 306 | "y_test_pred = intent_recognizer.predict(X_test_tfidf)\n", 307 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 308 | "print('Test accuracy = {}'.format(test_accuracy))" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "Dump the classifier to use it in the running bot." 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "pickle.dump(intent_recognizer, open(RESOURCE_PATH['INTENT_RECOGNIZER'], 'wb'))" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "### Programming language classification " 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "We will train one more classifier for the programming-related questions. It will predict exactly one tag (=programming language) and will be also based on Logistic Regression with TF-IDF features. \n", 339 | "\n", 340 | "First, let us prepare the data for this task." 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "X = stackoverflow_df['title'].values\n", 350 | "y = stackoverflow_df['tag'].values" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n", 360 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "Let us reuse the TF-IDF vectorizer that we have already created above. It should not make a huge difference which data was used to train it." 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "vectorizer = pickle.load(open(RESOURCE_PATH['TFIDF_VECTORIZER'], 'rb'))\n", 377 | "\n", 378 | "X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "Train the **tag classifier** using OneVsRestClassifier wrapper over LogisticRegression. Use the following parameters: *penalty='l2'*, *C=5*, *random_state=0*." 
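One straightforward way to realize this step is sketched below; again, `tag_classifier` is the name the following cells rely on, and the hyperparameters are the ones stated above.

```python
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Wrap a logistic regression in a one-vs-rest scheme to predict a single tag.
tag_classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=5, random_state=0))
tag_classifier.fit(X_train_tfidf, y_train)
```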
386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "from sklearn.multiclass import OneVsRestClassifier" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "######################################\n", 404 | "######### YOUR CODE HERE #############\n", 405 | "######################################" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "# Check test accuracy.\n", 415 | "y_test_pred = tag_classifier.predict(X_test_tfidf)\n", 416 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 417 | "print('Test accuracy = {}'.format(test_accuracy))" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "Dump the classifier to use it in the running bot." 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "pickle.dump(tag_classifier, open(RESOURCE_PATH['TAG_CLASSIFIER'], 'wb'))" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "## Part II. Ranking questions with embeddings" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "To find a relevant answer (a thread from StackOverflow) to a question, you will use vector representations to calculate the similarity between the question and existing threads. We already have the `question_to_vec` function from assignment 3, which can create such a representation based on word vectors. \n", 448 | "\n", 449 | "However, it would be costly to compute such a representation for all possible answers in the *online mode* of the bot (e.g. when the bot is running and answering questions from many users). This is the reason why you will create a *database* with pre-computed representations. These representations will be arranged by non-overlapping tags (programming languages), so that the search for an answer is performed within a single tag each time. This will make our bot even more efficient and allow us to avoid keeping the whole database in RAM. " 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "Load the StarSpace embeddings which were trained on StackOverflow posts. These embeddings were trained in *supervised mode* for duplicate detection on the same corpus that is used in the search. We can therefore count on these representations to help us find closely related answers to a question. \n", 457 | "\n", 458 | "If for some reason you didn't train StarSpace embeddings in assignment 3, you can use [pre-trained word vectors](https://code.google.com/archive/p/word2vec/) from Google. All instructions on how to work with these vectors were provided in the same assignment. However, we highly recommend using the StarSpace embeddings, because they are more appropriate for this task. If you choose to use Google's embeddings, delete the words that do not appear in the StackOverflow data."
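The `load_embeddings` gap in *utils.py* could be filled roughly as sketched below. This assumes the TSV file stores one word per line followed by its vector components, which is how StarSpace writes its output; adapt it if your file differs.

```python
import numpy as np

def load_embeddings(embeddings_path):
    """Loads pre-trained word embeddings from a tsv file."""
    embeddings = {}
    with open(embeddings_path) as f:
        for line in f:
            parts = line.strip().split('\t')
            # The first column is the word, the rest are the vector components.
            embeddings[parts[0]] = np.array(parts[1:], dtype=np.float32)
    embeddings_dim = len(next(iter(embeddings.values())))
    return embeddings, embeddings_dim
```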
459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "starspace_embeddings, embeddings_dim = load_embeddings('data/word_embeddings.tsv')" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "Since we want to precompute representations for all possible answers, we need to load the whole posts dataset, unlike we did for the intent classifier:" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": {}, 481 | "outputs": [], 482 | "source": [ 483 | "posts_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t')" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "Look at the distribution of posts for programming languages (tags) and find the most common ones. \n", 491 | "You might want to use pandas [groupby](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html) and [count](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.count.html) methods:" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "counts_by_tag = ######### YOUR CODE HERE #############" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "Now for each `tag` you need to create two data structures, which will serve as online search index:\n", 508 | "* `tag_post_ids` — a list of post_ids with shape `(counts_by_tag[tag],)`. It will be needed to show the title and link to the thread;\n", 509 | "* `tag_vectors` — a matrix with shape `(counts_by_tag[tag], embeddings_dim)` where embeddings for each answer are stored.\n", 510 | "\n", 511 | "Implement the code which will calculate the mentioned structures and dump it to files. It should take several minutes to compute it." 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "import os\n", 521 | "os.makedirs(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], exist_ok=True)\n", 522 | "\n", 523 | "for tag, count in counts_by_tag.items():\n", 524 | " tag_posts = posts_df[posts_df['tag'] == tag]\n", 525 | " \n", 526 | " tag_post_ids = ######### YOUR CODE HERE #############\n", 527 | " \n", 528 | " tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)\n", 529 | " for i, title in enumerate(tag_posts['title']):\n", 530 | " tag_vectors[i, :] = ######### YOUR CODE HERE ############# \n", 531 | "\n", 532 | " # Dump post ids and vectors to a file.\n", 533 | " filename = os.path.join(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % tag))\n", 534 | " pickle.dump((tag_post_ids, tag_vectors), open(filename, 'wb'))" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": {}, 540 | "source": [ 541 | "## Part III. Putting all together" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "Now let's combine everything that we have done and enable the bot to maintain a dialogue. We will teach the bot to sequentially determine the intent and, depending on the intent, select the best answer. As soon as we do this, we will have the opportunity to chat with the bot and check how well it answers questions." 
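Once the gaps in *dialogue_manager.py* and *utils.py* are filled in, wiring everything together can look roughly like the sketch below (`RESOURCE_PATH` comes from *utils.py*; the sample questions are arbitrary).

```python
from dialogue_manager import DialogueManager

# Build the manager from the resource paths defined in utils.py
# and ask it questions of different intent.
dialogue_manager = DialogueManager(RESOURCE_PATH)
for question in ["How are you doing?", "How to write a loop in python?"]:
    print('Q: %s\nA: %s\n' % (question, dialogue_manager.generate_answer(question)))
```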
549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "Implement Dialogue Manager that will generate the best answer. In order to do this, you should open *dialogue_manager.py* and fill in the gaps." 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [ 564 | "from dialogue_manager import DialogueManager" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "dialogue_manager = ######### YOUR CODE HERE #############" 574 | ] 575 | }, 576 | { 577 | "cell_type": "markdown", 578 | "metadata": {}, 579 | "source": [ 580 | "Now we are ready to test our chat bot! Let's chat with the bot and ask it some questions. Check that the answers are reasonable." 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": {}, 587 | "outputs": [], 588 | "source": [ 589 | "questions = [\n", 590 | " \"Hey\", \n", 591 | " \"How are you doing?\", \n", 592 | " \"What's your hobby?\", \n", 593 | " \"How to write a loop in python?\",\n", 594 | " \"How to delete rows in pandas?\",\n", 595 | " \"python3 re\",\n", 596 | " \"What is the difference between c and c++\",\n", 597 | " \"Multithreading in Java\",\n", 598 | " \"Catch exceptions C++\",\n", 599 | " \"What is AI?\",\n", 600 | "]\n", 601 | "\n", 602 | "for question in questions:\n", 603 | " answer = ######### YOUR CODE HERE #############\n", 604 | " print('Q: %s\\nA: %s \\n' % (question, answer))" 605 | ] 606 | } 607 | ], 608 | "metadata": { 609 | "kernelspec": { 610 | "display_name": "Python 3", 611 | "language": "python", 612 | "name": "python3" 613 | }, 614 | "language_info": { 615 | "codemirror_mode": { 616 | "name": "ipython", 617 | "version": 3 618 | }, 619 | "file_extension": ".py", 620 | "mimetype": "text/x-python", 621 | "name": "python", 622 | "nbconvert_exporter": "python", 623 | "pygments_lexer": "ipython3", 624 | "version": "3.5.2" 625 | }, 626 | "latex_envs": { 627 | "bibliofile": "biblio.bib", 628 | "cite_by": "apalike", 629 | "current_citInitial": 1, 630 | "eqLabelWithNumbers": true, 631 | "eqNumInitial": 0 632 | } 633 | }, 634 | "nbformat": 4, 635 | "nbformat_minor": 2 636 | } 637 | --------------------------------------------------------------------------------