├── .gitignore ├── AWS-tutorial.md ├── Docker-tutorial.md ├── common ├── README.md └── download_utils.py ├── docker ├── Dockerfile ├── requirements.txt └── welcome_message.txt ├── honor ├── README.md ├── datasets.py ├── download_cornell.sh ├── download_opensubs.sh └── example.py ├── project ├── .gitignore ├── dialogue_manager.py ├── main_bot.py ├── utils.py ├── week5-project-Soln.ipynb └── week5-project.ipynb ├── week1 ├── grader.py ├── metrics.py ├── week1-MultilabelClassification-NewSolution.ipynb ├── week1-MultilabelClassification-Solution.ipynb └── week1-MultilabelClassification.ipynb ├── week2 ├── evaluation.py ├── week2-NER-MySolution.ipynb ├── week2-NER.ipynb ├── week2-NER_peerreview1.ipynb ├── week2-NER_peerreview3.ipynb ├── week2-NER_peerreview4.ipynb └── week2-NER_v1_1_peerreview2.ipynb ├── week3 ├── .gitignore ├── grader.py ├── util.py ├── week3-Embeddings-Solution.ipynb └── week3-Embeddings.ipynb └── week4 ├── encoder-decoder-pic.png ├── week4-seq2seq-Soln.ipynb ├── week4-seq2seq.ipynb ├── week4-seq2seq_eval1.ipynb ├── week4-seq2seq_eval2.ipynb └── week4-seq2seq_eval3.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # Data for assignments 104 | data/ 105 | -------------------------------------------------------------------------------- /AWS-tutorial.md: -------------------------------------------------------------------------------- 1 | # Tutorial for setting up an AWS Virtual Machine 2 | 3 | This tutorial will teach you how to set up an AWS Virtual Machine for the final project of our course. 4 | 5 | ### 1. Register with AWS and launch an EC2 instance 6 | 7 | First, you need to perform several preparatory steps (if you have already done this before, you can skip them): 8 | - [Sign up for AWS](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#sign-up-for-aws). 
You will need to specify your credit card details, but for our project we will use Free Tier instances only, so you should not be charged. 9 | - [Create a key pair for authentication](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#create-a-key-pair). If you use Windows, you will also need to install [PuTTY](https://www.chiark.greenend.org.uk/~sgtatham/putty/) to use SSH. 10 | - [Create security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#create-a-base-security-group). You must add rules to a security group to allow you to connect to your future instance from your IP address using SSH. You might want to allow SSH access from all IPv4 addresses (set to 0.0.0.0/0), because your IP might change. 11 | 12 | Next, you are ready to create your first EC2 instance: 13 | - [Launch a free tier instance](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance). For Amazon Machine Image (AMI) choose **Ubuntu Server 16.04 LTS**. 14 | - [Connect to your instance](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-connect-to-instance-linux) using SSH. 15 | - Later on you can [start and stop](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Stop_Start.html) your instance when needed, and [terminate](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-clean-up-your-instance) it in the end. 16 | 17 | ### 2. Set up dependencies and run your project 18 | 19 | - Install Docker container for Ubuntu with course dependencies. Follow our Docker instructions. 20 | 21 | - To be able to access IPython notebooks running on AWS, you might want to SSH with port tunneling: 22 | ```sh 23 | ssh -L 8080:localhost:8080 -i path/to/private_key ubuntu@ec2-XX-XXX-X-XX.us-east-2.compute.amazonaws.com 24 | ``` 25 | Then you will be able to see the notebooks on *localhost:8080* from your browser on the local machine. 26 | 27 | - Bring code and data to AWS instance, e.g. 28 | ```sh 29 | scp -i path/to/your_key.pem path/to/local_file ubuntu@ec2-XX-XXX-X-XX.us-east-2.compute.amazonaws.com:path/to/remote_file 30 | ``` 31 | You might want to install [WinSCP](https://winscp.net/eng/docs/lang:ru) for data transfer if you are using Windows. 32 | 33 | - It is also a good practice to use [tmux](https://medium.com/@peterxjang/a-minimalist-guide-to-tmux-13675fb160fa) to keep your remote session running even if you disconnect from the machine, e.g. by closing your laptop. 34 | 35 | -------------------------------------------------------------------------------- /Docker-tutorial.md: -------------------------------------------------------------------------------- 1 | # Docker container with course dependencies 2 | 3 | This file describes how to use a Docker container with Jupyter notebook and 4 | all dependencies required for the course. 5 | 6 | The image is located at https://hub.docker.com/r/akashin/coursera-aml-nlp/. 7 | 8 | ## Install Stable Docker Community Edition (CE) 9 | 10 | - For Mac: 11 | https://docs.docker.com/docker-for-mac/install/ 12 | 13 | - For Ubuntu: 14 | https://docs.docker.com/engine/installation/linux/docker-ce/ubuntu/ (see also other Linux distributives in the menu). 
15 | 
16 | - For Windows (64bit Windows 10 Pro, Enterprise and Education): 
17 | https://docs.docker.com/docker-for-windows/install/ 
18 | 
19 | - For Windows (older versions): 
20 | https://docs.docker.com/toolbox/toolbox_install_windows/ 
21 | 
22 | 
23 | 
24 | ## Get container image 
25 | 
26 | To get the latest version of the container image, run: 
27 | ```sh 
28 | docker pull akashin/coursera-aml-nlp 
29 | ``` 
30 | It contains the Ubuntu 16.04 Linux distribution and all dependencies that you need for our course. The downloaded image takes approximately 2.3GB. 
31 | 
32 | **Note:** If you are getting an error "Got permission denied while trying to connect to the Docker daemon socket...", you need to add the current user to the docker group: 
33 | ```sh 
34 | sudo usermod -a -G docker $USER 
35 | sudo service docker restart 
36 | ``` 
37 | Then you need to log out and log in to the system again (disconnect and reconnect to your AWS instance if you are setting up Docker on it). 
38 | 
39 | 
40 | ## Run container for the first time 
41 | 
42 | Now you can start a new container from this image with: 
43 | ```sh 
44 | docker run -it -p 8080:8080 --name coursera-aml-nlp akashin/coursera-aml-nlp 
45 | ``` 
46 | This will start the Ubuntu instance and give you access to its command line. You can type `run_notebook` to launch the IPython notebook server. 
47 | 
48 | You may find it useful to mount a directory from your local machine within the container using the `-v` option. 
49 | 
50 | For Linux and OSX, the following command should work: 
51 | ```sh 
52 | docker run -it -p 8080:8080 --name coursera-aml-nlp -v $PWD:/root/coursera akashin/coursera-aml-nlp 
53 | ``` 
54 | This uses the shell variable `$PWD` to mount the current directory to the folder `/root/coursera` in the container. Alternatively, you can mount an arbitrary directory by replacing `$PWD` with a custom path. 
55 | 
56 | For Windows, there are some extra [steps](https://rominirani.com/docker-on-windows-mounting-host-directories-d96f3f056a2c) involved, and the launch command looks like 
57 | ```sh 
58 | docker run -it -p 8080:8080 --name coursera-aml-nlp --user root -v /c/Users/$YOUR_USERNAME:/root/coursera akashin/coursera-aml-nlp 
59 | ``` 
60 | where `/c/Users/$YOUR_USERNAME` is the path to your user's home folder. 
61 | 
62 | If you're using Docker Toolbox on Windows, the command given above might not work because of the additional VirtualBox layer involved. Instead, we recommend following the guidance in http://blog.shahinrostami.com/2017/11/docker-toolbox-windows-7-shared-volumes/. 
63 | 
64 | ## Stop and resume container 
65 | 
66 | To stop the container use: 
67 | ```sh 
68 | docker stop coursera-aml-nlp 
69 | ``` 
70 | All the changes that were made within the container will be saved. 
71 | 
72 | To resume the stopped container use: 
73 | ```sh 
74 | docker start -i coursera-aml-nlp 
75 | ``` 
76 | ## Other operations on the container 
77 | 
78 | There are many other operations that you can perform on the container. To show all of them: 
79 | ```sh 
80 | docker container --help 
81 | ``` 
82 | Some particularly useful ones are **showing a list of containers** and **removing a container**. 
83 | 
84 | To show currently running and stopped containers with their status: 
85 | ```sh 
86 | docker ps -a 
87 | ``` 
88 | 
89 | To connect to a Bash shell in the already running container named `coursera-aml-nlp`, run: 
90 | ``` 
91 | docker exec -it coursera-aml-nlp bash 
92 | ``` 
93 | This will drop you into the standard Linux Bash shell that supports common commands like `ls`, `wget` or `python3`.
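Another operation that often comes in handy is copying individual files between your machine and the running container with `docker cp` (the file names below are only placeholders):
```sh
# Copy a local file into the container's home directory.
docker cp path/to/local_file coursera-aml-nlp:/root/
# Copy a file from the container back to the current local directory.
docker cp coursera-aml-nlp:/root/some_result.txt .
```
This is a lightweight alternative to the `-v` mount described above when you only need to move one or two files.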
94 | 
95 | To remove the container and all data associated with it: 
96 | ```sh 
97 | docker rm coursera-aml-nlp 
98 | ``` 
99 | Note that this will remove all the internal data of the container (e.g. installed packages), but all the data written inside your local mounted folder (`-v` option) will not be affected. 
100 | 
101 | ## Install more packages 
102 | 
103 | You can install more packages in the container if needed: 
104 | ```sh 
105 | docker exec coursera-aml-nlp pip3 install PACKAGE_NAME 
106 | ``` 
107 | 
108 | ## Change RAM limits of the container 
109 | 
110 | Your container might have memory limits that are different from the actual limits of your physical machine, which might lead to a crash of your code due to memory shortage. 
111 | 
112 | - If you're running Windows or OSX, the default limit is 2GB, but you can change it by following these tutorials: 
113 |   - For Windows: https://docs.docker.com/docker-for-windows/#advanced 
114 |   - For Mac OSX: https://docs.docker.com/docker-for-mac/#advanced 
115 | 
116 | - If you're running Linux, you're all set as the memory limits are the same as the physical memory of your machine. 
117 | 
118 | 
119 | ## Further reading 
120 | 
121 | If you are interested in learning more about Docker, check out these articles: 
122 | - Using Jupyter notebook from Docker: https://www.dataquest.io/blog/docker-data-science/ 
123 | - General introduction to Docker: https://docker-curriculum.com/ 
124 | 
125 | ## Troubleshooting 
126 | 
127 | ### Verify your Docker installation by running "Hello World" application 
128 | - Run `docker pull hello-world`. You should see a message that ends with 
129 | “Status: Downloaded newer image for hello-world:latest”. 
130 | - Run `docker run hello-world`. You should see a message that starts with 
131 | “Hello from Docker! 
132 | This message shows that your installation appears to be working correctly.” 
133 | 
134 | If you see any errors, follow the relevant troubleshooting steps. 
135 | 
136 | ### “Unauthorized: authentication required” when trying to pull Docker image 
137 | Run `docker logout` and try pulling again. If this doesn't help, make sure the system date is set correctly and try again. If this doesn't help, reinstall Docker and try again. 
138 | 
139 | ### Can't open Jupyter notebook in the browser 
140 | If you try to open "http://localhost:8080" or "http://127.0.0.1:8080" in your browser while the `run_notebook` command is running and you can't access your notebooks, here is some advice: 
141 | - If you're using Docker Toolbox on Windows, try accessing "http://192.168.99.100:8080" instead. If this doesn't work, follow the instructions [on official Docker docs](https://docs.docker.com/docker-for-windows/troubleshoot/#limitations-of-windows-containers-for-localhost-and-published-ports) and on [Stackoverflow](https://stackoverflow.com/questions/42866013/docker-toolbox-localhost-not-working). 
142 | - Make sure that you're running the container with the `-p` flag as described [here](#run-container-for-the-first-time) and that the output of `docker ps` contains a line like this: 
143 | ``` 
144 | CONTAINER ID        IMAGE                      COMMAND       CREATED                  STATUS         PORTS      NAMES 
145 | e5b7bcd85a1b        akashin/coursera-aml-nlp   "/bin/bash"   Less than a second ago   Up 2 seconds   8080/tcp   peaceful_lamarr 
146 | ``` 
147 | If the part about `PORTS` differs, remove the current container following the [instructions](#other-operations-on-the-container) and start it again. 
148 | - Make sure that browser proxy settings don't interfere with accessing local web sites.
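- You can also double-check which ports are actually published with `docker port coursera-aml-nlp`; if the `-p 8080:8080` option took effect, it should print something like `8080/tcp -> 0.0.0.0:8080`.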
149 | 150 | ### How do I load data into Docker container? 151 | To access the data in the container, we recommend to use `-v` flag described [here](#run-container-for-the-first-time) to mount a local directory from your computer into the container filesystem. For more details read [Docker documentation](https://docs.docker.com/storage/volumes/). 152 | 153 | Alternatively, you can download data using Jupyter "Upload" button or `wget` command in the [Bash shell](#other-operations-on-the-container) of the container. 154 | 155 | ### Can't run `run_notebook` or `starspace` command 156 | Make sure that you're executing it in the context of the Docker container as described [here](#run-container-for-the-first-time). 157 | 158 | ### "Name is already in use by container" when trying to run the container 159 | This means that the container with this name is already created. You can connect to this container or remove it by following [instructions](#other-operations-on-the-container). 160 | 161 | ### StarSpace/Jupyter notebook crashes in Docker 162 | This usually happens due to low default 2GB memory limit on Windows and OSX. Follow this [instructions](#change-ram-limits-of-the-container) to fix this. 163 | 164 | ## Reporting the issue to the Coursera forum 165 | Before reporting the issue to the Coursera forum, please, make sure that you've checked the [troubleshooting](#troubleshooting) steps. Only if they don't help, post all relevant error messages, throubleshooting results, and the following information to your post: 166 | 167 | - Your operating system (e.g. Windows 7, Ubuntu Linux, OSX 10.13.3) 168 | - Your docker version (e.g. Docker Toolbox, Docker for Windows, output of `docker --version`) 169 | - Output of `docker ps -a`, `docker info`, `docker version -f "{{ .Server.Os }}"` (share thorough https://gist.github.com/ or https://pastebin.com/) 170 | - Output of `wget http://localhost:8080` (or `wget http://192.168.99.100:8080` for Docker Toolbox), executed from within Docker container and outside of it 171 | 172 | ## Credits 173 | 174 | The template for this dockerfile was taken from https://github.com/ZEMUSHKA/coursera-aml-docker 175 | -------------------------------------------------------------------------------- /common/README.md: -------------------------------------------------------------------------------- 1 | # Common utils 2 | 3 | This folder stores collection of functions that are common for different assignments 4 | 5 | - `download_utils.py`: Functions for downloading data for the assignments. 6 | -------------------------------------------------------------------------------- /common/download_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import shutil 5 | 6 | try: 7 | import tqdm 8 | # Address problem in tqdm library. 
For details see: https://github.com/tqdm/tqdm/issues/481 9 | tqdm.monitor_interval = 0 10 | except ImportError: 11 | tqdm = None 12 | 13 | import requests 14 | 15 | REPOSITORY_PATH="https://github.com/hse-aml/natural-language-processing" 16 | 17 | 18 | def download_file(url, file_path): 19 | r = requests.get(url, stream=True) 20 | total_size = int(r.headers.get('content-length')) 21 | try: 22 | with open(file_path, 'wb', buffering=16*1024*1024) as f: 23 | if tqdm: 24 | bar = tqdm.tqdm_notebook(total=total_size, unit='B', unit_scale=True) 25 | bar.set_description(os.path.split(file_path)[-1]) 26 | 27 | for chunk in r.iter_content(32 * 1024): 28 | f.write(chunk) 29 | if tqdm: 30 | bar.update(len(chunk)) 31 | 32 | if tqdm: 33 | bar.close() 34 | else: 35 | print("File {!r} successfully downloaded".format(file_path)) 36 | except Exception: 37 | print("Download failed") 38 | finally: 39 | if os.path.getsize(file_path) != total_size: 40 | os.remove(file_path) 41 | print("Removed incomplete download") 42 | 43 | 44 | def download_from_github(version, fn, target_dir, force=False): 45 | url = REPOSITORY_PATH + "/releases/download/{0}/{1}".format(version, fn) 46 | file_path = os.path.join(target_dir, fn) 47 | if os.path.exists(file_path) and not force: 48 | print("File {} is already downloaded.".format(file_path)) 49 | return 50 | download_file(url, file_path) 51 | 52 | 53 | def sequential_downloader(version, fns, target_dir, force=False): 54 | os.makedirs(target_dir, exist_ok=True) 55 | for fn in fns: 56 | download_from_github(version, fn, target_dir, force=force) 57 | 58 | 59 | def link_all_files_from_dir(src_dir, dst_dir): 60 | os.makedirs(dst_dir, exist_ok=True) 61 | for fn in os.listdir(src_dir): 62 | src_file = os.path.join(src_dir, fn) 63 | dst_file = os.path.join(dst_dir, fn) 64 | if os.name == "nt": 65 | shutil.copyfile(src_file, dst_file) 66 | else: 67 | if not os.path.exists(dst_file): 68 | os.symlink(os.path.abspath(src_file), dst_file) 69 | 70 | 71 | def link_resources(): 72 | link_all_files_from_dir("../readonly/dataset/", ".") 73 | 74 | 75 | def download_week1_resources(force=False): 76 | sequential_downloader( 77 | "week1", 78 | [ 79 | "train.tsv", 80 | "validation.tsv", 81 | "test.tsv", 82 | "text_prepare_tests.tsv", 83 | ], 84 | "data", 85 | force=force 86 | ) 87 | 88 | 89 | def download_week2_resources(force=False): 90 | sequential_downloader( 91 | "week2", 92 | [ 93 | "train.txt", 94 | "validation.txt", 95 | "test.txt", 96 | ], 97 | "data", 98 | force=force 99 | ) 100 | 101 | 102 | def download_week3_resources(force=False): 103 | sequential_downloader( 104 | "week3", 105 | [ 106 | "train.tsv", 107 | "validation.tsv", 108 | "test.tsv", 109 | "test_embeddings.tsv", 110 | ], 111 | "data", 112 | force=force 113 | ) 114 | 115 | 116 | def download_project_resources(force=False): 117 | sequential_downloader( 118 | "project", 119 | [ 120 | "dialogues.tsv", 121 | "tagged_posts.tsv", 122 | ], 123 | "data", 124 | force=force 125 | ) 126 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | LABEL maintainer="Andrei Kashin " 3 | 4 | RUN apt-get update && apt-get install -yq \ 5 | python3 python3-pip htop nano git wget \ 6 | libglib2.0-0 autoconf automake \ 7 | libtool build-essential unzip \ 8 | libarchive-dev vim 9 | 10 | # Install Starspace. 
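# StarSpace is compiled from source below; at the time this image was put together,
# its makefile looked for the Boost 1.63 headers under /usr/local/bin/boost_1_63_0
# by default, which is why the Boost archive is unpacked and moved into /usr/local/bin first.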
11 | RUN wget https://dl.bintray.com/boostorg/release/1.63.0/source/boost_1_63_0.zip && \ 12 | unzip boost_1_63_0.zip && \ 13 | mv boost_1_63_0 /usr/local/bin 14 | 15 | RUN git clone https://github.com/facebookresearch/Starspace.git && \ 16 | cd Starspace && \ 17 | make && \ 18 | cp -Rf starspace /usr/local/bin 19 | 20 | # Install Python dependencies. 21 | ADD requirements.txt / 22 | RUN pip3 install --upgrade pip 23 | RUN pip3 install -r requirements.txt 24 | 25 | # Install Jupyter. 26 | RUN jupyter nbextension enable --py --sys-prefix widgetsnbextension 27 | RUN jupyter contrib nbextension install 28 | RUN jupyter nbextension enable codefolding/main 29 | RUN echo "c.NotebookApp.ip = '*'" >> /root/.jupyter/jupyter_notebook_config.py 30 | RUN echo "c.NotebookApp.port = 8080" >> /root/.jupyter/jupyter_notebook_config.py 31 | RUN echo "c.NotebookApp.token = ''" >> /root/.jupyter/jupyter_notebook_config.py 32 | RUN echo "jupyter notebook --no-browser --allow-root" >> /usr/local/bin/run_notebook && chmod +x /usr/local/bin/run_notebook 33 | 34 | # Welcome message. 35 | ADD welcome_message.txt / 36 | RUN echo '[ ! -z "$TERM" -a -r /etc/motd ] && cat /etc/motd' \ 37 | >> /etc/bash.bashrc \ 38 | ; cat welcome_message.txt > /etc/motd 39 | 40 | WORKDIR /root 41 | EXPOSE 8080 42 | -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | backports.weakref==1.0.post1 2 | bleach==1.5.0 3 | certifi==2017.11.5 4 | chardet==3.0.4 5 | ChatterBot==0.7.6 6 | decorator==4.1.2 7 | entrypoints==0.2.3 8 | enum34==1.1.6 9 | funcsigs==1.0.2 10 | gensim==3.1.0 11 | html5lib==0.9999999 12 | idna==2.6 13 | ipykernel==4.6.1 14 | ipython==6.2.1 15 | ipython-genutils==0.2.0 16 | ipywidgets==7.0.5 17 | jedi==0.11.0 18 | Jinja2==2.10 19 | jsonschema==2.6.0 20 | jupyter==1.0.0 21 | jupyter-client==5.1.0 22 | jupyter-console==5.2.0 23 | jupyter-contrib-core==0.3.3 24 | jupyter-contrib-nbextensions==0.3.3 25 | jupyter-core==4.4.0 26 | jupyter-highlight-selected-word==0.1.0 27 | jupyter-latex-envs==1.3.8.4 28 | jupyter-nbextensions-configurator==0.2.8 29 | libarchive==0.4.4 30 | Markdown==2.6.9 31 | MarkupSafe==1.0 32 | matplotlib==2.1.0 33 | mistune==0.8.1 34 | mock==2.0.0 35 | nbconvert==5.3.1 36 | nbformat==4.4.0 37 | nltk==3.2.5 38 | notebook==5.2.1 39 | numpy==1.13.3 40 | pandas==0.21.0 41 | pandocfilters==1.4.2 42 | parso==0.1.0 43 | pbr==3.1.1 44 | pexpect==4.3.0 45 | pickleshare==0.7.4 46 | prompt-toolkit==1.0.15 47 | protobuf==3.5.0.post1 48 | ptyprocess==0.5.2 49 | Pygments==2.2.0 50 | python-dateutil==2.6.1 51 | pyzmq==16.0.3 52 | qtconsole==4.3.1 53 | regex==2017.11.9 54 | requests==2.18.4 55 | scikit-learn==0.19.1 56 | scipy==1.0.0 57 | simplegeneric==0.8.1 58 | six==1.11.0 59 | tensorflow==1.4.0 60 | tensorflow-tensorboard==0.4.0rc3 61 | terminado==0.7 62 | testpath==0.3.1 63 | tornado==4.5.2 64 | tqdm==4.19.4 65 | traitlets==4.3.2 66 | urllib3==1.22 67 | wcwidth==0.1.7 68 | Werkzeug==0.12.2 69 | widgetsnbextension==3.0.8 70 | -------------------------------------------------------------------------------- /docker/welcome_message.txt: -------------------------------------------------------------------------------- 1 | 2 | =================================================================== 3 | Welcome to the Docker container for the Coursera NLP course. 4 | 5 | This container contains dependencies that you might need 6 | to complete course assignments. 
7 | 8 | You can also install any additional system dependencies with 9 | > apt-get install PACKAGE_NAME 10 | 11 | And Python dependencies with 12 | > pip3 install PACKAGE_NAME 13 | 14 | To run Jupyter Notebook in the container just type 15 | > run_notebook 16 | =================================================================== 17 | 18 | -------------------------------------------------------------------------------- /honor/README.md: -------------------------------------------------------------------------------- 1 | # Utils to download and read data for chat-bot training 2 | 3 | This folder contains scripts for downloading, reading and preprocessing data for chat-bot training: 4 | - `download_cornell.sh` - downloads Cornell movie dialogues dataset (small size) 5 | - `download_opensubs.sh` - downloads Opensubs movie subtitles dataset (large size) 6 | - `datasets.py` - module to be imported in your scripts, that exports functions for reading a dataset 7 | - `example.py` - example of reading the dataset 8 | -------------------------------------------------------------------------------- /honor/datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 Conchylicultor. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import ast 17 | import os 18 | import random 19 | import re 20 | from time import time 21 | 22 | import nltk 23 | from tqdm import tqdm 24 | 25 | """ 26 | Load the cornell movie dialog corpus. 27 | 28 | Available from here: 29 | http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html 30 | 31 | """ 32 | 33 | class CornellData: 34 | """ 35 | 36 | """ 37 | 38 | def __init__(self, dirName): 39 | """ 40 | Args: 41 | dirName (string): directory where to load the corpus 42 | """ 43 | self.lines = {} 44 | self.conversations = [] 45 | 46 | MOVIE_LINES_FIELDS = ["lineID","characterID","movieID","character","text"] 47 | MOVIE_CONVERSATIONS_FIELDS = ["character1ID","character2ID","movieID","utteranceIDs"] 48 | 49 | self.lines = self.loadLines(os.path.join(dirName, "movie_lines.txt"), MOVIE_LINES_FIELDS) 50 | self.conversations = self.loadConversations(os.path.join(dirName, "movie_conversations.txt"), MOVIE_CONVERSATIONS_FIELDS) 51 | 52 | # TODO: Cleaner program (merge copy-paste) !! 53 | 54 | def loadLines(self, fileName, fields): 55 | """ 56 | Args: 57 | fileName (str): file to load 58 | field (set): fields to extract 59 | Return: 60 | dict>: the extracted fields for each line 61 | """ 62 | lines = {} 63 | 64 | with open(fileName, 'r', encoding='iso-8859-1') as f: # TODO: Solve Iso encoding pb ! 
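            # The Cornell corpus ships as ISO-8859-1 (Latin-1) text, which is why the file
            # is opened with that encoding; reading it as UTF-8 would raise UnicodeDecodeError
            # on some characters.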
65 | for line in f: 66 | values = line.split(" +++$+++ ") 67 | 68 | # Extract fields 69 | lineObj = {} 70 | for i, field in enumerate(fields): 71 | lineObj[field] = values[i] 72 | 73 | lines[lineObj['lineID']] = lineObj 74 | 75 | return lines 76 | 77 | def loadConversations(self, fileName, fields): 78 | """ 79 | Args: 80 | fileName (str): file to load 81 | field (set): fields to extract 82 | Return: 83 | list>: the extracted fields for each line 84 | """ 85 | conversations = [] 86 | 87 | with open(fileName, 'r', encoding='iso-8859-1') as f: # TODO: Solve Iso encoding pb ! 88 | for line in f: 89 | values = line.split(" +++$+++ ") 90 | 91 | # Extract fields 92 | convObj = {} 93 | for i, field in enumerate(fields): 94 | convObj[field] = values[i] 95 | 96 | # Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]") 97 | lineIds = ast.literal_eval(convObj["utteranceIDs"]) 98 | 99 | # Reassemble lines 100 | convObj["lines"] = [] 101 | for lineId in lineIds: 102 | convObj["lines"].append(self.lines[lineId]) 103 | 104 | conversations.append(convObj) 105 | 106 | return conversations 107 | 108 | def getConversations(self): 109 | return self.conversations 110 | 111 | 112 | # Based on code from https://github.com/AlJohri/OpenSubtitles 113 | # by Al Johri 114 | 115 | import xml.etree.ElementTree as ET 116 | import datetime 117 | import os 118 | import sys 119 | import json 120 | import re 121 | import pprint 122 | 123 | from gzip import GzipFile 124 | 125 | """ 126 | Load the opensubtitles dialog corpus. 127 | """ 128 | 129 | class OpensubsData: 130 | """ 131 | """ 132 | 133 | def __init__(self, dirName): 134 | """ 135 | Args: 136 | dirName (string): directory where to load the corpus 137 | """ 138 | 139 | # Hack this to filter on subset of Opensubtitles 140 | # dirName = "%s/en/Action" % dirName 141 | 142 | print("Loading OpenSubtitles conversations in %s." % dirName) 143 | self.conversations = [] 144 | self.tag_re = re.compile(r'(|<[^>]*>)') 145 | self.conversations = self.loadConversations(dirName) 146 | 147 | def loadConversations(self, dirName): 148 | """ 149 | Args: 150 | dirName (str): folder to load 151 | Return: 152 | array(question, answer): the extracted QA pairs 153 | """ 154 | conversations = [] 155 | dirList = self.filesInDir(dirName) 156 | for filepath in tqdm(dirList, "OpenSubtitles data files"): 157 | if filepath.endswith('gz'): 158 | try: 159 | doc = self.getXML(filepath) 160 | conversations.extend(self.genList(doc)) 161 | except ValueError: 162 | tqdm.write("Skipping file %s with errors." 
% filepath) 163 | except: 164 | print("Unexpected error:", sys.exc_info()[0]) 165 | raise 166 | return conversations 167 | 168 | def getConversations(self): 169 | return self.conversations 170 | 171 | def genList(self, tree): 172 | root = tree.getroot() 173 | 174 | timeFormat = '%H:%M:%S' 175 | maxDelta = datetime.timedelta(seconds=1) 176 | 177 | startTime = datetime.datetime.min 178 | strbuf = '' 179 | sentList = [] 180 | 181 | for child in root: 182 | for elem in child: 183 | if elem.tag == 'time': 184 | elemID = elem.attrib['id'] 185 | elemVal = elem.attrib['value'][:-4] 186 | if elemID[-1] == 'S': 187 | startTime = datetime.datetime.strptime(elemVal, timeFormat) 188 | else: 189 | sentList.append((strbuf.strip(), startTime, datetime.datetime.strptime(elemVal, timeFormat))) 190 | strbuf = '' 191 | else: 192 | try: 193 | strbuf = strbuf + " " + elem.text 194 | except: 195 | pass 196 | 197 | conversations = [] 198 | for idx in range(0, len(sentList) - 1): 199 | cur = sentList[idx] 200 | nxt = sentList[idx + 1] 201 | if nxt[1] - cur[2] <= maxDelta and cur and nxt: 202 | tmp = {} 203 | tmp["lines"] = [] 204 | tmp["lines"].append(self.getLine(cur[0])) 205 | tmp["lines"].append(self.getLine(nxt[0])) 206 | if self.filter(tmp): 207 | conversations.append(tmp) 208 | 209 | return conversations 210 | 211 | def getLine(self, sentence): 212 | line = {} 213 | line["text"] = self.tag_re.sub('', sentence).replace('\\\'','\'').strip().lower() 214 | return line 215 | 216 | def filter(self, lines): 217 | # Use the followint to customize filtering of QA pairs 218 | # 219 | # startwords = ("what", "how", "when", "why", "where", "do", "did", "is", "are", "can", "could", "would", "will") 220 | # question = lines["lines"][0]["text"] 221 | # if not question.endswith('?'): 222 | # return False 223 | # if not question.split(' ')[0] in startwords: 224 | # return False 225 | # 226 | return True 227 | 228 | def getXML(self, filepath): 229 | fext = os.path.splitext(filepath)[1] 230 | if fext == '.gz': 231 | tmp = GzipFile(filename=filepath) 232 | return ET.parse(tmp) 233 | else: 234 | return ET.parse(filepath) 235 | 236 | def filesInDir(self, dirname): 237 | result = [] 238 | for dirpath, dirs, files in os.walk(dirname): 239 | for filename in files: 240 | fname = os.path.join(dirpath, filename) 241 | result.append(fname) 242 | return result 243 | 244 | 245 | def extractText(line, fast_preprocessing=True): 246 | if fast_preprocessing: 247 | GOOD_SYMBOLS_RE = re.compile('[^0-9a-z ]') 248 | REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;#+_]') 249 | REPLACE_SEVERAL_SPACES = re.compile('\s+') 250 | 251 | line = line.lower() 252 | line = REPLACE_BY_SPACE_RE.sub(' ', line) 253 | line = GOOD_SYMBOLS_RE.sub('', line) 254 | line = REPLACE_SEVERAL_SPACES.sub(' ', line) 255 | return line.strip() 256 | else: 257 | return nltk.word_tokenize(line) 258 | 259 | 260 | def splitConversations(conversations, max_len=20, fast_preprocessing=True): 261 | data = [] 262 | for i, conversation in enumerate(tqdm(conversations)): 263 | lines = conversation['lines'] 264 | for i in range(len(lines) - 1): 265 | request = extractText(lines[i]['text']) 266 | reply = extractText(lines[i + 1]['text']) 267 | if 0 < len(request) <= max_len and 0 < len(reply) <= max_len: 268 | data += [(request, reply)] 269 | return data 270 | 271 | 272 | def readCornellData(path, max_len=20, fast_preprocessing=True): 273 | dataset = CornellData(path) 274 | conversations = dataset.getConversations() 275 | return splitConversations(conversations, max_len=max_len, 
fast_preprocessing=fast_preprocessing) 276 | 277 | 278 | def readOpensubsData(path, max_len=20, fast_preprocessing=True): 279 | dataset = OpensubsData(path) 280 | conversations = dataset.getConversations() 281 | return splitConversations(conversations, max_len=max_len, fast_preprocessing=fast_preprocessing) 282 | -------------------------------------------------------------------------------- /honor/download_cornell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p data/cornell 4 | cd data/cornell 5 | wget https://github.com/Conchylicultor/DeepQA/raw/master/data/cornell/movie_conversations.txt 6 | wget https://github.com/Conchylicultor/DeepQA/raw/master/data/cornell/movie_lines.txt 7 | -------------------------------------------------------------------------------- /honor/download_opensubs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p data/opensubs 4 | cd data/opensubs 5 | wget -O en.tar.gz http://opus.lingfil.uu.se/download.php?f=OpenSubtitles/en.tar.gz 6 | tar -xf en.tar.gz 7 | rm en.tar.gz 8 | -------------------------------------------------------------------------------- /honor/example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import datasets 4 | import argparse 5 | import os 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("dataset", choices=["cornell", "opensubs"], help="Name of the dataset.") 10 | parser.add_argument("--max_len", type=int, default=10, help="Max length of sentences to consider.") 11 | args = parser.parse_args() 12 | 13 | dataset_path = os.path.join("data", args.dataset) 14 | if args.dataset == "cornell": 15 | data = datasets.readCornellData(dataset_path, max_len=args.max_len) 16 | elif args.dataset == "opensubs": 17 | data = datasets.readOpensubsData(dataset_path, max_len=args.max_len) 18 | else: 19 | raise ValueError("Unrecognized dataset: {!r}".format(args.dataset)) 20 | 21 | print("Size of dataset: {}".format(len(data))) 22 | print("First 10 training pairs:") 23 | for item in data[:10]: 24 | print(item) 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /project/.gitignore: -------------------------------------------------------------------------------- 1 | GoogleNews-vectors-negative300.* 2 | starspace_embedding 3 | starspace_embedding.* 4 | word_embedd*.* 5 | *.pkl 6 | thread_embeddings_by_tags/ 7 | eval* 8 | db.sql* 9 | -------------------------------------------------------------------------------- /project/dialogue_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sklearn.metrics.pairwise import pairwise_distances_argmin 3 | 4 | from chatterbot import ChatBot 5 | from utils import * 6 | 7 | 8 | class ThreadRanker(object): 9 | def __init__(self, paths): 10 | self.word_embeddings, self.embeddings_dim = load_embeddings(paths['WORD_EMBEDDINGS']) 11 | self.thread_embeddings_folder = paths['THREAD_EMBEDDINGS_FOLDER'] 12 | 13 | def __load_embeddings_by_tag(self, tag_name): 14 | embeddings_path = os.path.join(self.thread_embeddings_folder, tag_name + ".pkl") 15 | thread_ids, thread_embeddings = unpickle_file(embeddings_path) 16 | return thread_ids, thread_embeddings 17 | 18 | def get_best_thread(self, question, tag_name): 19 | """ Returns id of the most similar thread for 
the question. 20 | The search is performed across the threads with a given tag. 21 | """ 22 | thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name) 23 | 24 | # HINT: you have already implemented a similar routine in the 3rd assignment. 25 | 26 | #question_vec = #### YOUR CODE HERE #### 27 | #best_thread = #### YOUR CODE HERE #### 28 | question_vec = question_to_vec(question, self.word_embeddings, self.embeddings_dim).reshape(1,-1) 29 | best_thread = pairwise_distances_argmin(question_vec,thread_embeddings) 30 | 31 | return thread_ids[best_thread] 32 | 33 | 34 | class DialogueManager(object): 35 | def __init__(self, paths): 36 | print("Loading resources...") 37 | 38 | # Intent recognition: 39 | self.intent_recognizer = unpickle_file(paths['INTENT_RECOGNIZER']) 40 | self.tfidf_vectorizer = unpickle_file(paths['TFIDF_VECTORIZER']) 41 | 42 | self.ANSWER_TEMPLATE = 'I think its about %s\nThis thread might help you: https://stackoverflow.com/questions/%s' 43 | 44 | # Goal-oriented part: 45 | self.tag_classifier = unpickle_file(paths['TAG_CLASSIFIER']) 46 | self.thread_ranker = ThreadRanker(paths) 47 | 48 | #init chatbot 49 | self.create_chitchat_bot() 50 | 51 | def create_chitchat_bot(self): 52 | """Initializes self.chitchat_bot with some conversational model.""" 53 | 54 | # Hint: you might want to create and train chatterbot.ChatBot here. 55 | # It could be done by creating ChatBot with the *trainer* parameter equals 56 | # "chatterbot.trainers.ChatterBotCorpusTrainer" 57 | # and then calling *train* function with "chatterbot.corpus.english" param 58 | 59 | ######################## 60 | #### YOUR CODE HERE #### 61 | ######################## 62 | self.chitchat_bot = ChatBot('Nim Obvious', trainer='chatterbot.trainers.ChatterBotCorpusTrainer') 63 | 64 | # Train based on the english corpus 65 | self.chitchat_bot.train("chatterbot.corpus.english") 66 | 67 | 68 | def generate_answer(self, question): 69 | """Combines stackoverflow and chitchat parts using intent recognition.""" 70 | 71 | # Recognize intent of the question using `intent_recognizer`. 72 | # Don't forget to prepare question and calculate features for the question. 73 | 74 | #prepared_question = #### YOUR CODE HERE #### 75 | #features = #### YOUR CODE HERE #### 76 | #intent = #### YOUR CODE HERE #### 77 | 78 | prepared_question = text_prepare(question) 79 | features = self.tfidf_vectorizer.transform([prepared_question]) 80 | intent = self.intent_recognizer.predict(features) 81 | 82 | 83 | 84 | # Chit-chat part: 85 | if intent == 'dialogue': 86 | # Pass question to chitchat_bot to generate a response. 87 | #response = #### YOUR CODE HERE #### 88 | response = self.chitchat_bot.get_response(question) 89 | return response 90 | 91 | # Goal-oriented part: 92 | else: 93 | # Pass features to tag_classifier to get predictions. 94 | #tag = #### YOUR CODE HERE #### 95 | tag = self.tag_classifier.predict( features)[0] 96 | #print(tag) 97 | 98 | # Pass prepared_question to thread_ranker to get predictions. 
99 | #thread_id = #### YOUR CODE HERE #### 100 | thread_id = self.thread_ranker.get_best_thread(question, tag)[0] 101 | 102 | return self.ANSWER_TEMPLATE % (tag, thread_id) 103 | 104 | -------------------------------------------------------------------------------- /project/main_bot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import requests 4 | import time 5 | import argparse 6 | import os 7 | import json 8 | 9 | from requests.compat import urljoin 10 | from utils import * 11 | from dialogue_manager import * 12 | 13 | 14 | class BotHandler(object): 15 | """ 16 | BotHandler is a class which implements all back-end of the bot. 17 | It has tree main functions: 18 | 'get_updates' — checks for new messages 19 | 'send_message' – posts new message to user 20 | 'get_answer' — computes the most relevant on a user's question 21 | """ 22 | 23 | def __init__(self, token, dialogue_manager): 24 | self.token = token 25 | self.api_url = "https://api.telegram.org/bot{}/".format(token) 26 | self.dialogue_manager = dialogue_manager 27 | 28 | def get_updates(self, offset=None, timeout=30): 29 | params = {"timeout": timeout, "offset": offset} 30 | raw_resp = requests.get(urljoin(self.api_url, "getUpdates"), params) 31 | try: 32 | resp = raw_resp.json() 33 | except json.decoder.JSONDecodeError as e: 34 | print("Failed to parse response {}: {}.".format(raw_resp.content, e)) 35 | return [] 36 | 37 | if "result" not in resp: 38 | return [] 39 | return resp["result"] 40 | 41 | def send_message(self, chat_id, text): 42 | params = {"chat_id": chat_id, "text": text} 43 | return requests.post(urljoin(self.api_url, "sendMessage"), params) 44 | 45 | def get_answer(self, question): 46 | if question == '/start': 47 | return "Hi, I am your project bot. How can I help you today?" 48 | return self.dialogue_manager.generate_answer(question) 49 | 50 | 51 | def parse_args(): 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument('--token', type=str, default='') 54 | return parser.parse_args() 55 | 56 | 57 | def is_unicode(text): 58 | return len(text) == len(text.encode()) 59 | 60 | 61 | class SimpleDialogueManager(object): 62 | """ 63 | This is the simplest dialogue manager to test the telegram bot. 64 | Your task is to create a more advanced one in dialogue_manager.py." 65 | """ 66 | 67 | def generate_answer(self, question): 68 | return "Hello, world!" 69 | 70 | 71 | def main(): 72 | args = parse_args() 73 | token = args.token 74 | 75 | if not token: 76 | if not "TELEGRAM_TOKEN" in os.environ: 77 | print("Please, set bot token through --token or TELEGRAM_TOKEN env variable") 78 | return 79 | token = os.environ["TELEGRAM_TOKEN"] 80 | 81 | ################################################################# 82 | 83 | # Your task is to complete dialogue_manager.py and use your 84 | # advanced DialogueManager instead of SimpleDialogueManager. 85 | 86 | # This is the point where you plug it into the Telegram bot. 87 | # Do not forget to import all needed dependencies when you do so. 
88 | 89 | # simple_manager = SimpleDialogueManager() 90 | # bot = BotHandler(token, simple_manager) 91 | 92 | dialog_manager = DialogueManager(RESOURCE_PATH) 93 | bot = BotHandler(token, dialog_manager) 94 | 95 | ############################################################### 96 | 97 | print("Ready to talk!") 98 | offset = 0 99 | while True: 100 | updates = bot.get_updates(offset=offset) 101 | for update in updates: 102 | print("An update received.") 103 | if "message" in update: 104 | chat_id = update["message"]["chat"]["id"] 105 | if "text" in update["message"]: 106 | text = update["message"]["text"] 107 | if is_unicode(text): 108 | print("Update content: {}".format(update)) 109 | bot.send_message(chat_id, bot.get_answer(update["message"]["text"])) 110 | else: 111 | bot.send_message(chat_id, "Hmm, you are sending some weird characters to me...") 112 | offset = max(offset, update['update_id'] + 1) 113 | time.sleep(1) 114 | 115 | if __name__ == "__main__": 116 | main() 117 | -------------------------------------------------------------------------------- /project/utils.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import pickle 3 | import re 4 | import numpy as np 5 | 6 | nltk.download('stopwords') 7 | from nltk.corpus import stopwords 8 | 9 | # Paths for all resources for the bot. 10 | RESOURCE_PATH = { 11 | 'INTENT_RECOGNIZER': 'intent_recognizer.pkl', 12 | 'TAG_CLASSIFIER': 'tag_classifier.pkl', 13 | 'TFIDF_VECTORIZER': 'tfidf_vectorizer.pkl', 14 | 'THREAD_EMBEDDINGS_FOLDER': 'thread_embeddings_by_tags', 15 | 'WORD_EMBEDDINGS': 'word_embeddings.tsv', 16 | } 17 | 18 | 19 | def text_prepare(text): 20 | """Performs tokenization and simple preprocessing.""" 21 | 22 | replace_by_space_re = re.compile('[/(){}\[\]\|@,;]') 23 | bad_symbols_re = re.compile('[^0-9a-z #+_]') 24 | stopwords_set = set(stopwords.words('english')) 25 | 26 | text = text.lower() 27 | text = replace_by_space_re.sub(' ', text) 28 | text = bad_symbols_re.sub('', text) 29 | text = ' '.join([x for x in text.split() if x and x not in stopwords_set]) 30 | 31 | return text.strip() 32 | 33 | 34 | def load_embeddings(embeddings_path): 35 | """Loads pre-trained word embeddings from tsv file. 36 | 37 | Args: 38 | embeddings_path - path to the embeddings file. 39 | 40 | Returns: 41 | embeddings - dict mapping words to vectors; 42 | embeddings_dim - dimension of the vectors. 43 | """ 44 | 45 | # Hint: you have already implemented a similar routine in the 3rd assignment. 46 | # Note that here you also need to know the dimension of the loaded embeddings. 47 | # When you load the embeddings, use numpy.float32 type as dtype 48 | 49 | ######################## 50 | #### YOUR CODE HERE #### 51 | ######################## 52 | 53 | embeddings = dict() 54 | for line in open(embeddings_path, encoding='utf-8'): 55 | row = line.strip().split('\t') 56 | embeddings[row[0]] = np.array(row[1:], dtype=np.float32) 57 | embeddings_dim = embeddings[list(embeddings)[0]].shape[0] 58 | 59 | return embeddings, embeddings_dim 60 | 61 | 62 | 63 | def question_to_vec(question, embeddings, dim): 64 | """Transforms a string to an embedding by averaging word embeddings.""" 65 | 66 | # Hint: you have already implemented exactly this function in the 3rd assignment. 
67 | 68 | ######################## 69 | #### YOUR CODE HERE #### 70 | ######################## 71 | result = np.zeros(dim) 72 | cnt = 0 73 | words = question.split() 74 | for word in words: 75 | if word in embeddings: 76 | result += np.array(embeddings[word]) 77 | cnt += 1 78 | if cnt != 0: 79 | result /= cnt 80 | return result 81 | 82 | 83 | def unpickle_file(filename): 84 | """Returns the result of unpickling the file content.""" 85 | with open(filename, 'rb') as f: 86 | return pickle.load(f) 87 | -------------------------------------------------------------------------------- /project/week5-project-Soln.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Final project: StackOverflow assistant bot\n", 8 | "\n", 9 | "Congratulations on coming this far and solving the programming assignments! In this final project, we will combine everything we have learned about Natural Language Processing to construct a *dialogue chat bot*, which will be able to:\n", 10 | "* answer programming-related questions (using StackOverflow dataset);\n", 11 | "* chit-chat and simulate dialogue on all non programming-related questions.\n", 12 | "\n", 13 | "For a chit-chat mode we will use a pre-trained neural network engine available from [ChatterBot](https://github.com/gunthercox/ChatterBot).\n", 14 | "Those who aim at honor certificates for our course or are just curious, will train their own models for chit-chat.\n", 15 | "![](https://imgs.xkcd.com/comics/twitter_bot.png)\n", 16 | "©[xkcd](https://xkcd.com)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Data description\n", 24 | "\n", 25 | "To detect *intent* of users questions we will need two text collections:\n", 26 | "- `tagged_posts.tsv` — StackOverflow posts, tagged with one programming language (*positive samples*).\n", 27 | "- `dialogues.tsv` — dialogue phrases from movie subtitles (*negative samples*).\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "application/vnd.jupyter.widget-view+json": { 38 | "model_id": "d0172568852c4a4e8822d8e48aedc512", 39 | "version_major": 2, 40 | "version_minor": 0 41 | }, 42 | "text/html": [ 43 | "

Failed to display Jupyter Widget of type HBox.

\n", 44 | "

\n", 45 | " If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n", 46 | " that the widgets JavaScript is still loading. If this message persists, it\n", 47 | " likely means that the widgets JavaScript library is either not installed or\n", 48 | " not enabled. See the Jupyter\n", 49 | " Widgets Documentation for setup instructions.\n", 50 | "

\n", 51 | "

\n", 52 | " If you're reading this message in another frontend (for example, a static\n", 53 | " rendering on GitHub or NBViewer),\n", 54 | " it may mean that your frontend doesn't currently support widgets.\n", 55 | "

\n" 56 | ], 57 | "text/plain": [ 58 | "HBox(children=(IntProgress(value=0, max=18012894), HTML(value='')))" 59 | ] 60 | }, 61 | "metadata": {}, 62 | "output_type": "display_data" 63 | }, 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "\n" 69 | ] 70 | }, 71 | { 72 | "data": { 73 | "application/vnd.jupyter.widget-view+json": { 74 | "model_id": "8142ba6c86f24a8e8902fd105acc1f9b", 75 | "version_major": 2, 76 | "version_minor": 0 77 | }, 78 | "text/html": [ 79 | "

Failed to display Jupyter Widget of type HBox.

\n", 80 | "

\n", 81 | " If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n", 82 | " that the widgets JavaScript is still loading. If this message persists, it\n", 83 | " likely means that the widgets JavaScript library is either not installed or\n", 84 | " not enabled. See the Jupyter\n", 85 | " Widgets Documentation for setup instructions.\n", 86 | "

\n", 87 | "

\n", 88 | " If you're reading this message in another frontend (for example, a static\n", 89 | " rendering on GitHub or NBViewer),\n", 90 | " it may mean that your frontend doesn't currently support widgets.\n", 91 | "

\n" 92 | ], 93 | "text/plain": [ 94 | "HBox(children=(IntProgress(value=0, max=145677870), HTML(value='')))" 95 | ] 96 | }, 97 | "metadata": {}, 98 | "output_type": "display_data" 99 | }, 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "import sys\n", 110 | "sys.path.append(\"..\")\n", 111 | "from common.download_utils import download_project_resources\n", 112 | "\n", 113 | "download_project_resources()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "For those questions, that have programming-related intent, we will proceed as follow predict programming language (only one tag per question allowed here) and rank candidates within the tag using embeddings.\n", 121 | "For the ranking part, you will need:\n", 122 | "- `word_embeddings.tsv` — word embeddings, that you trained with StarSpace in the 3rd assignment. It's not a problem if you didn't do it, because we can offer an alternative solution for you." 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "As a result of this notebook, you should obtain the following new objects that you will then use in the running bot:\n", 130 | "\n", 131 | "- `intent_recognizer.pkl` — intent recognition model;\n", 132 | "- `tag_classifier.pkl` — programming language classification model;\n", 133 | "- `tfidf_vectorizer.pkl` — vectorizer used during training;\n", 134 | "- `thread_embeddings_by_tags` — folder with thread embeddings, arranged by tags.\n", 135 | " " 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "Some functions will be reused by this notebook and the scripts, so we put them into *utils.py* file. Don't forget to open it and fill in the gaps!" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 37, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "%load_ext autoreload\n", 152 | "%autoreload 2\n", 153 | "from utils import *" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "## Part I. Intent and language recognition" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "We want to write a bot, which will not only **answer programming-related questions**, but also will be able to **maintain a dialogue**. We would also like to detect the *intent* of the user from the question (we could have had a 'Question answering mode' check-box in the bot, but it wouldn't fun at all, would it?). So the first thing we need to do is to **distinguish programming-related questions from general ones**.\n", 168 | "\n", 169 | "It would also be good to predict which programming language a particular question referees to. By doing so, we will speed up question search by a factor of the number of languages (10 here), and exercise our *text classification* skill a bit. 
:)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 3, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "import numpy as np\n", 179 | "import pandas as pd\n", 180 | "import pickle\n", 181 | "import re\n", 182 | "\n", 183 | "from sklearn.feature_extraction.text import TfidfVectorizer" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "### Data preparation" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "In the first assignment (Predict tags on StackOverflow with linear models), you have already learnt how to preprocess texts and do TF-IDF tranformations. Reuse your code here. In addition, you will also need to [dump](https://docs.python.org/3/library/pickle.html#pickle.dump) the TF-IDF vectorizer with pickle to use it later in the running bot." 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 18, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "def tfidf_features(X_train, X_test, vectorizer_path):\n", 207 | " \"\"\"Performs TF-IDF transformation and dumps the model.\"\"\"\n", 208 | " \n", 209 | " # Train a vectorizer on X_train data.\n", 210 | " # Transform X_train and X_test data.\n", 211 | " \n", 212 | " # Pickle the trained vectorizer to 'vectorizer_path'\n", 213 | " # Don't forget to open the file in writing bytes mode.\n", 214 | " \n", 215 | " ######################################\n", 216 | " ######### YOUR CODE HERE #############\n", 217 | " ######################################\n", 218 | " tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5, token_pattern='(\\S+)')\n", 219 | " X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n", 220 | " X_test_tfidf = tfidf_vectorizer.transform(X_test)\n", 221 | " with open(vectorizer_path, 'wb') as f:\n", 222 | " pickle.dump(tfidf_vectorizer, f)\n", 223 | " \n", 224 | " \n", 225 | " #return X_train, X_test\n", 226 | " return X_train_tfidf, X_test_tfidf" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "Now, load examples of two classes. Use a subsample of stackoverflow data to balance the classes. You will need the full data later." 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 11, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "sample_size = 200000\n", 243 | "\n", 244 | "dialogue_df = pd.read_csv('data/dialogues.tsv', sep='\\t').sample(sample_size, random_state=0)\n", 245 | "stackoverflow_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t').sample(sample_size, random_state=0)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "Check how the data look like:" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 12, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/html": [ 263 | "
\n", 264 | "\n", 277 | "\n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | "
texttag
82925Donna, you are a muffin.dialogue
48774He was here last night till about two o'clock....dialogue
55394All right, then make an appointment with her s...dialogue
90806Hey, what is this-an interview? We're supposed...dialogue
107758Yeah. He's just a friend of mine I was trying ...dialogue
\n", 313 | "
" 314 | ], 315 | "text/plain": [ 316 | " text tag\n", 317 | "82925 Donna, you are a muffin. dialogue\n", 318 | "48774 He was here last night till about two o'clock.... dialogue\n", 319 | "55394 All right, then make an appointment with her s... dialogue\n", 320 | "90806 Hey, what is this-an interview? We're supposed... dialogue\n", 321 | "107758 Yeah. He's just a friend of mine I was trying ... dialogue" 322 | ] 323 | }, 324 | "execution_count": 12, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "dialogue_df.head()" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 13, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/html": [ 341 | "
\n", 342 | "\n", 355 | "\n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | "
post_idtitletag
216898343837842Efficient Algorithm to compose valid expressio...python
108409515747223Why does this basic thread program fail with C...c\\c++
104902015189594Link to scroll to top not workingjavascript
2004663273927Is it possible to implement ping on windows ph...c#
120024917684551GLSL normal mapping issuec\\c++
\n", 397 | "
" 398 | ], 399 | "text/plain": [ 400 | " post_id title \\\n", 401 | "2168983 43837842 Efficient Algorithm to compose valid expressio... \n", 402 | "1084095 15747223 Why does this basic thread program fail with C... \n", 403 | "1049020 15189594 Link to scroll to top not working \n", 404 | "200466 3273927 Is it possible to implement ping on windows ph... \n", 405 | "1200249 17684551 GLSL normal mapping issue \n", 406 | "\n", 407 | " tag \n", 408 | "2168983 python \n", 409 | "1084095 c\\c++ \n", 410 | "1049020 javascript \n", 411 | "200466 c# \n", 412 | "1200249 c\\c++ " 413 | ] 414 | }, 415 | "execution_count": 13, 416 | "metadata": {}, 417 | "output_type": "execute_result" 418 | } 419 | ], 420 | "source": [ 421 | "stackoverflow_df.head()" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "Apply *text_prepare* function to preprocess the data:" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 14, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "from utils import text_prepare" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 15, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "dialogue_df['text'] = dialogue_df['text'].apply(lambda x: text_prepare(x)) ######### YOUR CODE HERE #############\n", 447 | "stackoverflow_df['title'] = stackoverflow_df['title'].apply(lambda x: text_prepare(x)) ######### YOUR CODE HERE #############" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "### Intent recognition" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "We will do a binary classification on TF-IDF representations of texts. Labels will be either `dialogue` for general questions or `stackoverflow` for programming-related questions. 
First, prepare the data for this task:\n", 462 | "- concatenate `dialogue` and `stackoverflow` examples into one sample\n", 463 | "- split it into train and test in proportion 9:1, use *random_state=0* for reproducibility\n", 464 | "- transform it into TF-IDF features" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 16, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "from sklearn.model_selection import train_test_split" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 19, 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "name": "stderr", 483 | "output_type": "stream", 484 | "text": [ 485 | "/usr/local/lib/python3.5/dist-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", 486 | " FutureWarning)\n" 487 | ] 488 | }, 489 | { 490 | "name": "stdout", 491 | "output_type": "stream", 492 | "text": [ 493 | "Train size = 360000, test size = 40000\n" 494 | ] 495 | } 496 | ], 497 | "source": [ 498 | "X = np.concatenate([dialogue_df['text'].values, stackoverflow_df['title'].values])\n", 499 | "y = ['dialogue'] * dialogue_df.shape[0] + ['stackoverflow'] * stackoverflow_df.shape[0]\n", 500 | "\n", 501 | "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=0) ######### YOUR CODE HERE ##########\n", 502 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))\n", 503 | "\n", 504 | "X_train_tfidf, X_test_tfidf = tfidf_features(X_train, X_test, RESOURCE_PATH['TFIDF_VECTORIZER']) ######### YOUR CODE HERE ###########" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": {}, 510 | "source": [ 511 | "Train the **intent recognizer** using LogisticRegression on the train set with the following parameters: *penalty='l2'*, *C=10*, *random_state=0*. Print out the accuracy on the test set to check whether everything looks good." 
512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 20, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "from sklearn.linear_model import LogisticRegression\n", 521 | "from sklearn.metrics import accuracy_score" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": 24, 527 | "metadata": {}, 528 | "outputs": [ 529 | { 530 | "data": { 531 | "text/plain": [ 532 | "LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,\n", 533 | " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", 534 | " penalty='l2', random_state=0, solver='liblinear', tol=0.0001,\n", 535 | " verbose=0, warm_start=False)" 536 | ] 537 | }, 538 | "execution_count": 24, 539 | "metadata": {}, 540 | "output_type": "execute_result" 541 | } 542 | ], 543 | "source": [ 544 | "######################################\n", 545 | "######### YOUR CODE HERE #############\n", 546 | "######################################\n", 547 | "intent_recognizer = LogisticRegression(penalty='l2', C=10, random_state=0)\n", 548 | "intent_recognizer.fit(X_train_tfidf, y_train)" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": 25, 554 | "metadata": {}, 555 | "outputs": [ 556 | { 557 | "name": "stdout", 558 | "output_type": "stream", 559 | "text": [ 560 | "Test accuracy = 0.991575\n" 561 | ] 562 | } 563 | ], 564 | "source": [ 565 | "# Check test accuracy.\n", 566 | "y_test_pred = intent_recognizer.predict(X_test_tfidf)\n", 567 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 568 | "print('Test accuracy = {}'.format(test_accuracy))" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": {}, 574 | "source": [ 575 | "Dump the classifier to use it in the running bot." 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 26, 581 | "metadata": {}, 582 | "outputs": [], 583 | "source": [ 584 | "pickle.dump(intent_recognizer, open(RESOURCE_PATH['INTENT_RECOGNIZER'], 'wb'))" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": {}, 590 | "source": [ 591 | "### Programming language classification " 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "We will train one more classifier for the programming-related questions. It will predict exactly one tag (=programming language) and will be also based on Logistic Regression with TF-IDF features. \n", 599 | "\n", 600 | "First, let us prepare the data for this task." 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 27, 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "X = stackoverflow_df['title'].values\n", 610 | "y = stackoverflow_df['tag'].values" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 28, 616 | "metadata": {}, 617 | "outputs": [ 618 | { 619 | "name": "stdout", 620 | "output_type": "stream", 621 | "text": [ 622 | "Train size = 160000, test size = 40000\n" 623 | ] 624 | } 625 | ], 626 | "source": [ 627 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n", 628 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))" 629 | ] 630 | }, 631 | { 632 | "cell_type": "markdown", 633 | "metadata": {}, 634 | "source": [ 635 | "Let us reuse the TF-IDF vectorizer that we have already created above. It should not make a huge difference which data was used to train it." 
636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 29, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "vectorizer = pickle.load(open(RESOURCE_PATH['TFIDF_VECTORIZER'], 'rb'))\n", 645 | "\n", 646 | "X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "Train the **tag classifier** using OneVsRestClassifier wrapper over LogisticRegression. Use the following parameters: *penalty='l2'*, *C=5*, *random_state=0*." 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": 30, 659 | "metadata": {}, 660 | "outputs": [], 661 | "source": [ 662 | "from sklearn.multiclass import OneVsRestClassifier" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 32, 668 | "metadata": {}, 669 | "outputs": [ 670 | { 671 | "data": { 672 | "text/plain": [ 673 | "OneVsRestClassifier(estimator=LogisticRegression(C=5, class_weight=None, dual=False, fit_intercept=True,\n", 674 | " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", 675 | " penalty='l2', random_state=0, solver='liblinear', tol=0.0001,\n", 676 | " verbose=0, warm_start=False),\n", 677 | " n_jobs=1)" 678 | ] 679 | }, 680 | "execution_count": 32, 681 | "metadata": {}, 682 | "output_type": "execute_result" 683 | } 684 | ], 685 | "source": [ 686 | "######################################\n", 687 | "######### YOUR CODE HERE #############\n", 688 | "######################################\n", 689 | "tag_classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=5, random_state=0))\n", 690 | "tag_classifier.fit(X_train_tfidf, y_train)" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 33, 696 | "metadata": {}, 697 | "outputs": [ 698 | { 699 | "name": "stdout", 700 | "output_type": "stream", 701 | "text": [ 702 | "Test accuracy = 0.800725\n" 703 | ] 704 | } 705 | ], 706 | "source": [ 707 | "# Check test accuracy.\n", 708 | "y_test_pred = tag_classifier.predict(X_test_tfidf)\n", 709 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 710 | "print('Test accuracy = {}'.format(test_accuracy))" 711 | ] 712 | }, 713 | { 714 | "cell_type": "markdown", 715 | "metadata": {}, 716 | "source": [ 717 | "Dump the classifier to use it in the running bot." 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": 34, 723 | "metadata": {}, 724 | "outputs": [], 725 | "source": [ 726 | "pickle.dump(tag_classifier, open(RESOURCE_PATH['TAG_CLASSIFIER'], 'wb'))" 727 | ] 728 | }, 729 | { 730 | "cell_type": "markdown", 731 | "metadata": {}, 732 | "source": [ 733 | "## Part II. Ranking questions with embeddings" 734 | ] 735 | }, 736 | { 737 | "cell_type": "markdown", 738 | "metadata": {}, 739 | "source": [ 740 | "To find a relevant answer (a thread from StackOverflow) on a question you will use vector representations to calculate similarity between the question and existing threads. We already had `question_to_vec` function from the assignment 3, which can create such a representation based on word vectors. \n", 741 | "\n", 742 | "However, it would be costly to compute such a representation for all possible answers in *online mode* of the bot (e.g. when bot is running and answering questions from many users). This is the reason why you will create a *database* with pre-computed representations. 
These representations will be arranged by non-overlapping tags (programming languages), so that the search for an answer can be performed within a single tag at a time. This will make our bot even more efficient and let us avoid keeping the whole database in RAM. " 743 | ] 744 | }, 745 | { 746 | "cell_type": "markdown", 747 | "metadata": {}, 748 | "source": [ 749 | "Load the StarSpace embeddings which were trained on Stack Overflow posts. These embeddings were trained in *supervised mode* for duplicate detection on the same corpus that is used in search. We can therefore expect these representations to help us find closely related answers for a question. \n", 750 | "\n", 751 | "If for some reason you didn't train StarSpace embeddings in assignment 3, you can use [pre-trained word vectors](https://code.google.com/archive/p/word2vec/) from Google. All instructions on how to work with these vectors were provided in the same assignment. However, we highly recommend using the StarSpace embeddings, because they are better suited to this task. If you choose to use Google's embeddings, drop the words that do not occur in the StackOverflow data." 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 39, 757 | "metadata": {}, 758 | "outputs": [ 759 | { 760 | "name": "stdout", 761 | "output_type": "stream", 762 | "text": [ 763 | "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", 764 | "[nltk_data] Package stopwords is already up-to-date!\n" 765 | ] 766 | } 767 | ], 768 | "source": [ 769 | "starspace_embeddings, embeddings_dim = load_embeddings('data/word_embeddings.tsv')" 770 | ] 771 | }, 772 | { 773 | "cell_type": "markdown", 774 | "metadata": {}, 775 | "source": [ 776 | "Since we want to precompute representations for all possible answers, we need to load the whole posts dataset, unlike what we did for the intent classifier:" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 41, 782 | "metadata": {}, 783 | "outputs": [], 784 | "source": [ 785 | "posts_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t')" 786 | ] 787 | }, 788 | { 789 | "cell_type": "markdown", 790 | "metadata": {}, 791 | "source": [ 792 | "Look at the distribution of posts over programming languages (tags) and find the most common ones. \n", 793 | "You might want to use pandas [groupby](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html) and [count](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.count.html) methods:" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": 42, 799 | "metadata": {}, 800 | "outputs": [ 801 | { 802 | "data": { 803 | "text/html": [ 804 | 
\n", 805 | "\n", 818 | "\n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | "
post_idtitletag
09Calculate age in C#c#
116Filling a DataSet or DataTable from a LINQ que...c#
239Reliable timer in a console applicationc#
342Best way to allow plugins for a PHP applicationphp
459How do I get a distinct, ordered list of names...c#
\n", 860 | "
" 861 | ], 862 | "text/plain": [ 863 | " post_id title tag\n", 864 | "0 9 Calculate age in C# c#\n", 865 | "1 16 Filling a DataSet or DataTable from a LINQ que... c#\n", 866 | "2 39 Reliable timer in a console application c#\n", 867 | "3 42 Best way to allow plugins for a PHP application php\n", 868 | "4 59 How do I get a distinct, ordered list of names... c#" 869 | ] 870 | }, 871 | "execution_count": 42, 872 | "metadata": {}, 873 | "output_type": "execute_result" 874 | } 875 | ], 876 | "source": [ 877 | "posts_df.head()" 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": 48, 883 | "metadata": {}, 884 | "outputs": [], 885 | "source": [ 886 | "counts_by_tag = posts_df.groupby(['tag'])['tag'].count() ######### YOUR CODE HERE #############" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": 53, 892 | "metadata": {}, 893 | "outputs": [ 894 | { 895 | "data": { 896 | "text/plain": [ 897 | "[('c#', 394451),\n", 898 | " ('c\\\\c++', 281300),\n", 899 | " ('java', 383456),\n", 900 | " ('javascript', 375867),\n", 901 | " ('php', 321752),\n", 902 | " ('python', 208607),\n", 903 | " ('r', 36359),\n", 904 | " ('ruby', 99930),\n", 905 | " ('swift', 34809),\n", 906 | " ('vb', 35044)]" 907 | ] 908 | }, 909 | "execution_count": 53, 910 | "metadata": {}, 911 | "output_type": "execute_result" 912 | } 913 | ], 914 | "source": [ 915 | "list(counts_by_tag.items())" 916 | ] 917 | }, 918 | { 919 | "cell_type": "markdown", 920 | "metadata": {}, 921 | "source": [ 922 | "Now for each `tag` you need to create two data structures, which will serve as online search index:\n", 923 | "* `tag_post_ids` — a list of post_ids with shape `(counts_by_tag[tag],)`. It will be needed to show the title and link to the thread;\n", 924 | "* `tag_vectors` — a matrix with shape `(counts_by_tag[tag], embeddings_dim)` where embeddings for each answer are stored.\n", 925 | "\n", 926 | "Implement the code which will calculate the mentioned structures and dump it to files. It should take several minutes to compute it." 
927 | ] 928 | }, 929 | { 930 | "cell_type": "code", 931 | "execution_count": 54, 932 | "metadata": {}, 933 | "outputs": [], 934 | "source": [ 935 | "import os\n", 936 | "os.makedirs(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], exist_ok=True)\n", 937 | "\n", 938 | "for tag, count in counts_by_tag.items():\n", 939 | " tag_posts = posts_df[posts_df['tag'] == tag]\n", 940 | " \n", 941 | " tag_post_ids = tag_posts['post_id'].values ######### YOUR CODE HERE #############\n", 942 | " \n", 943 | " tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)\n", 944 | " for i, title in enumerate(tag_posts['title']):\n", 945 | " tag_vectors[i, :] = question_to_vec(title, starspace_embeddings, embeddings_dim) ######### YOUR CODE HERE #############\n", 946 | "\n", 947 | " # Dump post ids and vectors to a file.\n", 948 | " filename = os.path.join(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % tag))\n", 949 | " pickle.dump((tag_post_ids, tag_vectors), open(filename, 'wb'))" 950 | ] 951 | } 952 | ], 953 | "metadata": { 954 | "kernelspec": { 955 | "display_name": "Python 3", 956 | "language": "python", 957 | "name": "python3" 958 | }, 959 | "language_info": { 960 | "codemirror_mode": { 961 | "name": "ipython", 962 | "version": 3 963 | }, 964 | "file_extension": ".py", 965 | "mimetype": "text/x-python", 966 | "name": "python", 967 | "nbconvert_exporter": "python", 968 | "pygments_lexer": "ipython3", 969 | "version": "3.5.2" 970 | }, 971 | "latex_envs": { 972 | "bibliofile": "biblio.bib", 973 | "cite_by": "apalike", 974 | "current_citInitial": 1, 975 | "eqLabelWithNumbers": true, 976 | "eqNumInitial": 0 977 | } 978 | }, 979 | "nbformat": 4, 980 | "nbformat_minor": 2 981 | } 982 | -------------------------------------------------------------------------------- /project/week5-project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Final project: StackOverflow assistant bot\n", 8 | "\n", 9 | "Congratulations on coming this far and solving the programming assignments! 
In this final project, we will combine everything we have learned about Natural Language Processing to construct a *dialogue chat bot*, which will be able to:\n", 10 | "* answer programming-related questions (using StackOverflow dataset);\n", 11 | "* chit-chat and simulate dialogue on all non programming-related questions.\n", 12 | "\n", 13 | "For a chit-chat mode we will use a pre-trained neural network engine available from [ChatterBot](https://github.com/gunthercox/ChatterBot).\n", 14 | "Those who aim at honor certificates for our course or are just curious, will train their own models for chit-chat.\n", 15 | "![](https://imgs.xkcd.com/comics/twitter_bot.png)\n", 16 | "©[xkcd](https://xkcd.com)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Data description\n", 24 | "\n", 25 | "To detect *intent* of users questions we will need two text collections:\n", 26 | "- `tagged_posts.tsv` — StackOverflow posts, tagged with one programming language (*positive samples*).\n", 27 | "- `dialogues.tsv` — dialogue phrases from movie subtitles (*negative samples*).\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "import sys\n", 39 | "sys.path.append(\"..\")\n", 40 | "from common.download_utils import download_project_resources\n", 41 | "\n", 42 | "download_project_resources()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "For those questions, that have programming-related intent, we will proceed as follow predict programming language (only one tag per question allowed here) and rank candidates within the tag using embeddings.\n", 50 | "For the ranking part, you will need:\n", 51 | "- `word_embeddings.tsv` — word embeddings, that you trained with StarSpace in the 3rd assignment. It's not a problem if you didn't do it, because we can offer an alternative solution for you." 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "As a result of this notebook, you should obtain the following new objects that you will then use in the running bot:\n", 59 | "\n", 60 | "- `intent_recognizer.pkl` — intent recognition model;\n", 61 | "- `tag_classifier.pkl` — programming language classification model;\n", 62 | "- `tfidf_vectorizer.pkl` — vectorizer used during training;\n", 63 | "- `thread_embeddings_by_tags` — folder with thread embeddings, arranged by tags.\n", 64 | " " 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Some functions will be reused by this notebook and the scripts, so we put them into *utils.py* file. Don't forget to open it and fill in the gaps!" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "from utils import *" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Part I. Intent and language recognition" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "We want to write a bot, which will not only **answer programming-related questions**, but also will be able to **maintain a dialogue**. We would also like to detect the *intent* of the user from the question (we could have had a 'Question answering mode' check-box in the bot, but it wouldn't fun at all, would it?). 
So the first thing we need to do is to **distinguish programming-related questions from general ones**.\n", 97 | "\n", 98 | "It would also be good to predict which programming language a particular question referees to. By doing so, we will speed up question search by a factor of the number of languages (10 here), and exercise our *text classification* skill a bit. :)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "import numpy as np\n", 110 | "import pandas as pd\n", 111 | "import pickle\n", 112 | "import re\n", 113 | "\n", 114 | "from sklearn.feature_extraction.text import TfidfVectorizer" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "### Data preparation" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "In the first assignment (Predict tags on StackOverflow with linear models), you have already learnt how to preprocess texts and do TF-IDF tranformations. Reuse your code here. In addition, you will also need to [dump](https://docs.python.org/3/library/pickle.html#pickle.dump) the TF-IDF vectorizer with pickle to use it later in the running bot." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "def tfidf_features(X_train, X_test, vectorizer_path):\n", 140 | " \"\"\"Performs TF-IDF transformation and dumps the model.\"\"\"\n", 141 | " \n", 142 | " # Train a vectorizer on X_train data.\n", 143 | " # Transform X_train and X_test data.\n", 144 | " \n", 145 | " # Pickle the trained vectorizer to 'vectorizer_path'\n", 146 | " # Don't forget to open the file in writing bytes mode.\n", 147 | " \n", 148 | " ######################################\n", 149 | " ######### YOUR CODE HERE #############\n", 150 | " ######################################\n", 151 | " \n", 152 | " return X_train, X_test" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "Now, load examples of two classes. Use a subsample of stackoverflow data to balance the classes. You will need the full data later." 
160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "sample_size = 200000\n", 171 | "\n", 172 | "dialogue_df = pd.read_csv('data/dialogues.tsv', sep='\\t').sample(sample_size, random_state=0)\n", 173 | "stackoverflow_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t').sample(sample_size, random_state=0)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "Check how the data look like:" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": { 187 | "collapsed": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "dialogue_df.head()" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "stackoverflow_df.head()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "Apply *text_prepare* function to preprocess the data:" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "from utils import text_prepare" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": true 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "dialogue_df['text'] = ######### YOUR CODE HERE #############\n", 232 | "stackoverflow_df['title'] = ######### YOUR CODE HERE #############" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Intent recognition" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "We will do a binary classification on TF-IDF representations of texts. Labels will be either `dialogue` for general questions or `stackoverflow` for programming-related questions. First, prepare the data for this task:\n", 247 | "- concatenate `dialogue` and `stackoverflow` examples into one sample\n", 248 | "- split it into train and test in proportion 9:1, use *random_state=0* for reproducibility\n", 249 | "- transform it into TF-IDF features" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "collapsed": true 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "from sklearn.model_selection import train_test_split" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "collapsed": true 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "X = np.concatenate([dialogue_df['text'].values, stackoverflow_df['title'].values])\n", 272 | "y = ['dialogue'] * dialogue_df.shape[0] + ['stackoverflow'] * stackoverflow_df.shape[0]\n", 273 | "\n", 274 | "X_train, X_test, y_train, y_test = ######### YOUR CODE HERE ##########\n", 275 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))\n", 276 | "\n", 277 | "X_train_tfidf, X_test_tfidf = ######### YOUR CODE HERE ###########" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "Train the **intent recognizer** using LogisticRegression on the train set with the following parameters: *penalty='l2'*, *C=10*, *random_state=0*. Print out the accuracy on the test set to check whether everything looks good." 
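If you are unsure how to complete the training cell below, a minimal sketch that uses exactly the parameters stated above looks like this (it assumes `X_train_tfidf` and `y_train` produced in the previous step):

```python
# Minimal sketch with the stated parameters; assumes X_train_tfidf and y_train exist.
from sklearn.linear_model import LogisticRegression

intent_recognizer = LogisticRegression(penalty='l2', C=10, random_state=0)
intent_recognizer.fit(X_train_tfidf, y_train)
```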
285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": { 291 | "collapsed": true 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "from sklearn.linear_model import LogisticRegression\n", 296 | "from sklearn.metrics import accuracy_score" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": { 303 | "collapsed": true 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "######################################\n", 308 | "######### YOUR CODE HERE #############\n", 309 | "######################################" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "collapsed": true 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "# Check test accuracy.\n", 321 | "y_test_pred = intent_recognizer.predict(X_test_tfidf)\n", 322 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 323 | "print('Test accuracy = {}'.format(test_accuracy))" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "Dump the classifier to use it in the running bot." 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": { 337 | "collapsed": true 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "pickle.dump(intent_recognizer, open(RESOURCE_PATH['INTENT_RECOGNIZER'], 'wb'))" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "### Programming language classification " 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "We will train one more classifier for the programming-related questions. It will predict exactly one tag (=programming language) and will be also based on Logistic Regression with TF-IDF features. \n", 356 | "\n", 357 | "First, let us prepare the data for this task." 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "X = stackoverflow_df['title'].values\n", 369 | "y = stackoverflow_df['tag'].values" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "collapsed": true 377 | }, 378 | "outputs": [], 379 | "source": [ 380 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n", 381 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "Let us reuse the TF-IDF vectorizer that we have already created above. It should not make a huge difference which data was used to train it." 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": { 395 | "collapsed": true 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "vectorizer = pickle.load(open(RESOURCE_PATH['TFIDF_VECTORIZER'], 'rb'))\n", 400 | "\n", 401 | "X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "Train the **tag classifier** using OneVsRestClassifier wrapper over LogisticRegression. Use the following parameters: *penalty='l2'*, *C=5*, *random_state=0*." 
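As with the intent recognizer, here is a minimal sketch for the training cell below with the stated parameters (it assumes the TF-IDF features `X_train_tfidf` and labels `y_train` from the cells above):

```python
# Minimal sketch with the stated parameters; assumes X_train_tfidf and y_train exist.
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

tag_classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=5, random_state=0))
tag_classifier.fit(X_train_tfidf, y_train)
```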
409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": { 415 | "collapsed": true 416 | }, 417 | "outputs": [], 418 | "source": [ 419 | "from sklearn.multiclass import OneVsRestClassifier" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "collapsed": true 427 | }, 428 | "outputs": [], 429 | "source": [ 430 | "######################################\n", 431 | "######### YOUR CODE HERE #############\n", 432 | "######################################" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": { 439 | "collapsed": true 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "# Check test accuracy.\n", 444 | "y_test_pred = tag_classifier.predict(X_test_tfidf)\n", 445 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 446 | "print('Test accuracy = {}'.format(test_accuracy))" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "Dump the classifier to use it in the running bot." 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": { 460 | "collapsed": true 461 | }, 462 | "outputs": [], 463 | "source": [ 464 | "pickle.dump(tag_classifier, open(RESOURCE_PATH['TAG_CLASSIFIER'], 'wb'))" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "## Part II. Ranking questions with embeddings" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "To find a relevant answer (a thread from StackOverflow) on a question you will use vector representations to calculate similarity between the question and existing threads. We already had `question_to_vec` function from the assignment 3, which can create such a representation based on word vectors. \n", 479 | "\n", 480 | "However, it would be costly to compute such a representation for all possible answers in *online mode* of the bot (e.g. when bot is running and answering questions from many users). This is the reason why you will create a *database* with pre-computed representations. These representations will be arranged by non-overlaping tags (programming languages), so that the search of the answer can be performed only within one tag each time. This will make our bot even more efficient and allow not to store all the database in RAM. " 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "Load StarSpace embeddings which were trained on Stack Overflow posts. These embeddings were trained in *supervised mode* for duplicates detection on the same corpus that is used in search. We can account on that these representations will allow us to find closely related answers for a question. \n", 488 | "\n", 489 | "If for some reasons you didn't train StarSpace embeddings in the assignment 3, you can use [pre-trained word vectors](https://code.google.com/archive/p/word2vec/) from Google. All instructions about how to work with these vectors were provided in the same assignment. However, we highly recommend to use StartSpace's embeddings, because it contains more appropriate embeddings. If you chose to use Google's embeddings, delete the words, which is not in Stackoverflow data." 
490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": { 496 | "collapsed": true 497 | }, 498 | "outputs": [], 499 | "source": [ 500 | "starspace_embeddings, embeddings_dim = load_embeddings('data/word_embeddings.tsv')" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "Since we want to precompute representations for all possible answers, we need to load the whole posts dataset, unlike we did for the intent classifier:" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": { 514 | "collapsed": true 515 | }, 516 | "outputs": [], 517 | "source": [ 518 | "posts_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t')" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "Look at the distribution of posts for programming languages (tags) and find the most common ones. \n", 526 | "You might want to use pandas [groupby](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html) and [count](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.count.html) methods:" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": { 533 | "collapsed": true 534 | }, 535 | "outputs": [], 536 | "source": [ 537 | "counts_by_tag = ######### YOUR CODE HERE #############" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": {}, 543 | "source": [ 544 | "Now for each `tag` you need to create two data structures, which will serve as online search index:\n", 545 | "* `tag_post_ids` — a list of post_ids with shape `(counts_by_tag[tag],)`. It will be needed to show the title and link to the thread;\n", 546 | "* `tag_vectors` — a matrix with shape `(counts_by_tag[tag], embeddings_dim)` where embeddings for each answer are stored.\n", 547 | "\n", 548 | "Implement the code which will calculate the mentioned structures and dump it to files. It should take several minutes to compute it." 
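For reference, one way to fill the remaining gaps — the `counts_by_tag` cell above and the two blanks inside the dumping loop below — is sketched here; it mirrors the solution notebook in this project and assumes `question_to_vec` from `utils.py` together with the StarSpace embeddings loaded earlier:

```python
# Count how many posts fall under each tag (fills the cell above).
counts_by_tag = posts_df.groupby(['tag'])['tag'].count()

# Inside the loop of the next cell, the two blanks can be completed as:
#   tag_post_ids = tag_posts['post_id'].values
#   tag_vectors[i, :] = question_to_vec(title, starspace_embeddings, embeddings_dim)
```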
549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": { 555 | "collapsed": true 556 | }, 557 | "outputs": [], 558 | "source": [ 559 | "import os\n", 560 | "os.makedirs(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], exist_ok=True)\n", 561 | "\n", 562 | "for tag, count in counts_by_tag.items():\n", 563 | " tag_posts = posts_df[posts_df['tag'] == tag]\n", 564 | " \n", 565 | " tag_post_ids = ######### YOUR CODE HERE #############\n", 566 | " \n", 567 | " tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)\n", 568 | " for i, title in enumerate(tag_posts['title']):\n", 569 | " tag_vectors[i, :] = ######### YOUR CODE HERE #############\n", 570 | "\n", 571 | " # Dump post ids and vectors to a file.\n", 572 | " filename = os.path.join(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % tag))\n", 573 | " pickle.dump((tag_post_ids, tag_vectors), open(filename, 'wb'))" 574 | ] 575 | } 576 | ], 577 | "metadata": { 578 | "kernelspec": { 579 | "display_name": "Python 3", 580 | "language": "python", 581 | "name": "python3" 582 | }, 583 | "language_info": { 584 | "codemirror_mode": { 585 | "name": "ipython", 586 | "version": 3 587 | }, 588 | "file_extension": ".py", 589 | "mimetype": "text/x-python", 590 | "name": "python", 591 | "nbconvert_exporter": "python", 592 | "pygments_lexer": "ipython3", 593 | "version": "3.4.3" 594 | }, 595 | "latex_envs": { 596 | "bibliofile": "biblio.bib", 597 | "cite_by": "apalike", 598 | "current_citInitial": 1, 599 | "eqLabelWithNumbers": true, 600 | "eqNumInitial": 0 601 | } 602 | }, 603 | "nbformat": 4, 604 | "nbformat_minor": 2 605 | } 606 | -------------------------------------------------------------------------------- /week1/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | class Grader(object): 7 | def __init__(self): 8 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 9 | self.assignment_key = 'MSsYBMLgEeesWhJPHRLG5g' 10 | self.parts = OrderedDict([('f5nXa', 'TextPrepare'), 11 | ('hTrz8', 'WordsTagsCount'), 12 | ('0kUjR', 'BagOfWords'), 13 | ('tLJV1', 'MultilabelClassification')]) 14 | self.answers = {key: None for key in self.parts} 15 | 16 | @staticmethod 17 | def ravel_output(output): 18 | ''' 19 | If student accidentally submitted np.array with one 20 | element instead of number, this function will submit 21 | this number instead 22 | ''' 23 | if isinstance(output, np.ndarray) and output.size == 1: 24 | output = output.item(0) 25 | return output 26 | 27 | def submit(self, email, token): 28 | submission = { 29 | "assignmentKey": self.assignment_key, 30 | "submitterEmail": email, 31 | "secret": token, 32 | "parts": {} 33 | } 34 | for part, output in self.answers.items(): 35 | if output is not None: 36 | submission["parts"][part] = {"output": output} 37 | else: 38 | submission["parts"][part] = dict() 39 | request = requests.post(self.submission_page, data=json.dumps(submission)) 40 | response = request.json() 41 | if request.status_code == 201: 42 | print('Submitted to Coursera platform. 
See results on assignment page!') 43 | elif u'details' in response and u'learnerMessage' in response[u'details']: 44 | print(response[u'details'][u'learnerMessage']) 45 | else: 46 | print("Unknown response from Coursera: {}".format(request.status_code)) 47 | print(response) 48 | 49 | def status(self): 50 | print("You want to submit these parts:") 51 | for part_id, part_name in self.parts.items(): 52 | answer = self.answers[part_id] 53 | if answer is None: 54 | answer = '-'*10 55 | print("Task {}:\n {}".format(part_name, answer[:100] + '...')) 56 | 57 | def submit_part(self, part, output): 58 | self.answers[part] = output 59 | print("Current answer for task {} is:\n {}".format(self.parts[part], output[:100] + '...')) 60 | 61 | def submit_tag(self, tag, output): 62 | part_id = [k for k, v in self.parts.items() if v == tag] 63 | if len(part_id) != 1: 64 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 65 | part_id = part_id[0] 66 | self.submit_part(part_id, str(self.ravel_output(output))) 67 | -------------------------------------------------------------------------------- /week1/metrics.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.metrics import roc_curve, auc 4 | from scipy import interp 5 | from itertools import cycle 6 | 7 | def roc_auc(y_test, y_score, n_classes): 8 | """Plots ROC curve for micro and macro averaging.""" 9 | 10 | # Compute ROC curve and ROC area for each class 11 | fpr = {} 12 | tpr = {} 13 | roc_auc = {} 14 | for i in range(n_classes): 15 | fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) 16 | roc_auc[i] = auc(fpr[i], tpr[i]) 17 | 18 | # Compute micro-average ROC curve and ROC area 19 | fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) 20 | roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) 21 | 22 | # Compute macro-average ROC curve and ROC area 23 | all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)])) 24 | mean_tpr = np.zeros_like(all_fpr) 25 | for i in range(n_classes): 26 | mean_tpr += interp(all_fpr, fpr[i], tpr[i]) 27 | mean_tpr /= n_classes 28 | fpr["macro"] = all_fpr 29 | tpr["macro"] = mean_tpr 30 | roc_auc["macro"] = auc(fpr["macro"], tpr["macro"]) 31 | 32 | # Plot all ROC curves 33 | plt.figure() 34 | plt.plot(fpr["micro"], tpr["micro"], 35 | label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]), 36 | color='deeppink', linestyle=':', linewidth=4) 37 | 38 | plt.plot(fpr["macro"], tpr["macro"], 39 | label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]), 40 | color='navy', linestyle=':', linewidth=4) 41 | 42 | colors = cycle(['aqua', 'darkorange', 'cornflowerblue']) 43 | for i, color in zip(range(0,3), colors): 44 | plt.plot(fpr[i], tpr[i], color=color, lw=2, 45 | label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])) 46 | 47 | plt.plot([0, 1], [0, 1], 'k--', lw=2) 48 | plt.xlim([0.0, 1.0]) 49 | plt.ylim([0.0, 1.05]) 50 | plt.xlabel('False Positive Rate') 51 | plt.ylabel('True Positive Rate') 52 | plt.title('Some extension of ROC to multi-class') 53 | plt.legend(loc="lower right") 54 | plt.show() -------------------------------------------------------------------------------- /week1/week1-MultilabelClassification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | 
"# Predict tags on StackOverflow with linear models" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this assignment you will learn how to predict tags for posts from [StackOverflow](https://stackoverflow.com). To solve this task you will use multilabel classification approach.\n", 15 | "\n", 16 | "### Libraries\n", 17 | "\n", 18 | "In this task you will need the following libraries:\n", 19 | "- [Numpy](http://www.numpy.org) — a package for scientific computing.\n", 20 | "- [Pandas](https://pandas.pydata.org) — a library providing high-performance, easy-to-use data structures and data analysis tools for the Python\n", 21 | "- [scikit-learn](http://scikit-learn.org/stable/index.html) — a tool for data mining and data analysis.\n", 22 | "- [NLTK](http://www.nltk.org) — a platform to work with natural language." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Data\n", 30 | "\n", 31 | "The following cell will download all data required for this assignment into the folder `week1/data`." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import sys\n", 43 | "sys.path.append(\"..\")\n", 44 | "from common.download_utils import download_week1_resources\n", 45 | "\n", 46 | "download_week1_resources()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "### Grading\n", 54 | "We will create a grader instance below and use it to collect your answers. Note that these outputs will be stored locally inside grader and will be uploaded to platform only after running submitting function in the last part of this assignment. If you want to make partial submission, you can run that cell any time you want." 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "from grader import Grader" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "grader = Grader()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "### Text preprocessing" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "For this and most of the following assignments you will need to use a list of stop words. It can be downloaded from *nltk*:" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": true 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "import nltk\n", 102 | "nltk.download('stopwords')\n", 103 | "from nltk.corpus import stopwords" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "In this task you will deal with a dataset of post titles from StackOverflow. You are provided a split to 3 sets: *train*, *validation* and *test*. All corpora (except for *test*) contain titles of the posts and corresponding tags (100 tags are available). The *test* set is provided for Coursera's grading and doesn't contain answers. 
Upload the corpora using *pandas* and look at the data:" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "collapsed": true 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "from ast import literal_eval\n", 122 | "import pandas as pd\n", 123 | "import numpy as np" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "def read_data(filename):\n", 135 | " data = pd.read_csv(filename, sep='\\t')\n", 136 | " data['tags'] = data['tags'].apply(literal_eval)\n", 137 | " return data" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "train = read_data('data/train.tsv')\n", 149 | "validation = read_data('data/validation.tsv')\n", 150 | "test = pd.read_csv('data/test.tsv', sep='\\t')" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "collapsed": true 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "train.head()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "As you can see, *title* column contains titles of the posts and *tags* column contains the tags. It could be noticed that a number of tags for a post is not fixed and could be as many as necessary." 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "For a more comfortable usage, initialize *X_train*, *X_val*, *X_test*, *y_train*, *y_val*." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "X_train, y_train = train['title'].values, train['tags'].values\n", 187 | "X_val, y_val = validation['title'].values, validation['tags'].values\n", 188 | "X_test = test['title'].values" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "One of the most known difficulties when working with natural data is that it's unstructured. For example, if you use it \"as is\" and extract tokens just by splitting the titles by whitespaces, you will see that there are many \"weird\" tokens like *3.5?*, *\"Flip*, etc. To prevent the problems, it's usually useful to prepare the data somehow. In this task you'll write a function, which will be also used in the other assignments. \n", 196 | "\n", 197 | "**Task 1 (TextPrepare).** Implement the function *text_prepare* following the instructions. After that, run the function *test_test_prepare* to test it on tiny cases and submit it to Coursera." 
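If you need a hint for *text_prepare*, the sketch below simply follows the step-by-step comments of the template cell that comes next (lowercase, replace `REPLACE_BY_SPACE_RE` matches with spaces, strip `BAD_SYMBOLS_RE` matches, drop stopwords). It is one possible implementation, not the only valid one, and it reuses the constants defined in that template:

```python
import re

from nltk.corpus import stopwords

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """One possible implementation, following the hints in the template below."""
    text = text.lower()                         # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)   # replace REPLACE_BY_SPACE_RE symbols by spaces
    text = BAD_SYMBOLS_RE.sub('', text)         # delete symbols which are in BAD_SYMBOLS_RE
    text = ' '.join(w for w in text.split() if w not in STOPWORDS)  # delete stopwords
    return text
```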
198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "collapsed": true 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "import re" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "REPLACE_BY_SPACE_RE = re.compile('[/(){}\\[\\]\\|@,;]')\n", 220 | "BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')\n", 221 | "STOPWORDS = set(stopwords.words('english'))\n", 222 | "\n", 223 | "def text_prepare(text):\n", 224 | " \"\"\"\n", 225 | " text: a string\n", 226 | " \n", 227 | " return: modified initial string\n", 228 | " \"\"\"\n", 229 | " text = # lowercase text\n", 230 | " text = # replace REPLACE_BY_SPACE_RE symbols by space in text\n", 231 | " text = # delete symbols which are in BAD_SYMBOLS_RE from text\n", 232 | " text = # delete stopwords from text\n", 233 | " return text" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "def test_text_prepare():\n", 245 | " examples = [\"SQL Server - any equivalent of Excel's CHOOSE function?\",\n", 246 | " \"How to free c++ memory vector * arr?\"]\n", 247 | " answers = [\"sql server equivalent excels choose function\", \n", 248 | " \"free c++ memory vectorint arr\"]\n", 249 | " for ex, ans in zip(examples, answers):\n", 250 | " if text_prepare(ex) != ans:\n", 251 | " return \"Wrong answer for the case: '%s'\" % ex\n", 252 | " return 'Basic tests are passed.'" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "print(test_text_prepare())" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "Run your implementation for questions from file *text_prepare_tests.tsv* to earn the points." 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "collapsed": true 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "prepared_questions = []\n", 282 | "for line in open('data/text_prepare_tests.tsv', encoding='utf-8'):\n", 283 | " line = text_prepare(line.strip())\n", 284 | " prepared_questions.append(line)\n", 285 | "text_prepare_results = '\\n'.join(prepared_questions)\n", 286 | "\n", 287 | "grader.submit_tag('TextPrepare', text_prepare_results)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "Now we can preprocess the titles using function *text_prepare* and making sure that the headers don't have bad symbols:" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "collapsed": true 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "X_train = [text_prepare(x) for x in X_train]\n", 306 | "X_val = [text_prepare(x) for x in X_val]\n", 307 | "X_test = [text_prepare(x) for x in X_test]" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "collapsed": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "X_train[:3]" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "For each tag and for each word calculate how many times they occur in the train corpus. 
\n", 326 | "\n", 327 | "**Task 2 (WordsTagsCount).** Find 3 most popular tags and 3 most popular words in the train data and submit the results to earn the points." 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": { 334 | "collapsed": true 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "# Dictionary of all tags from train corpus with their counts.\n", 339 | "tags_counts = {}\n", 340 | "# Dictionary of all words from train corpus with their counts.\n", 341 | "words_counts = {}\n", 342 | "\n", 343 | "######################################\n", 344 | "######### YOUR CODE HERE #############\n", 345 | "######################################" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "We are assuming that *tags_counts* and *words_counts* are dictionaries like `{'some_word_or_tag': frequency}`. After applying the sorting procedure, results will be look like this: `[('most_popular_word_or_tag', frequency), ('less_popular_word_or_tag', frequency), ...]`. The grader gets the results in the following format (two comma-separated strings with line break):\n", 353 | "\n", 354 | " tag1,tag2,tag3\n", 355 | " word1,word2,word3\n", 356 | "\n", 357 | "Pay attention that in this assignment you should not submit frequencies or some additional information." 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "most_common_tags = sorted(tags_counts.items(), key=lambda x: x[1], reverse=True)[:3]\n", 369 | "most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:3]\n", 370 | "\n", 371 | "grader.submit_tag('WordsTagsCount', '%s\\n%s' % (','.join(tag for tag, _ in most_common_tags), \n", 372 | " ','.join(word for word, _ in most_common_words)))" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "### Transforming text to a vector\n", 380 | "\n", 381 | "Machine Learning algorithms work with numeric data and we cannot use the provided text data \"as is\". There are many ways to transform text data to numeric vectors. In this task you will try to use two of them.\n", 382 | "\n", 383 | "#### Bag of words\n", 384 | "\n", 385 | "One of the well-known approaches is a *bag-of-words* representation. To create this transformation, follow the steps:\n", 386 | "1. Find *N* most popular words in train corpus and numerate them. Now we have a dictionary of the most popular words.\n", 387 | "2. For each title in the corpora create a zero vector with the dimension equals to *N*.\n", 388 | "3. For each text in the corpora iterate over words which are in the dictionary and increase by 1 the corresponding coordinate.\n", 389 | "\n", 390 | "Let's try to do it for a toy example. 
Imagine that we have *N* = 4 and the list of the most popular words is \n", 391 | "\n", 392 | " ['hi', 'you', 'me', 'are']\n", 393 | "\n", 394 | "Then we need to numerate them, for example, like this: \n", 395 | "\n", 396 | " {'hi': 0, 'you': 1, 'me': 2, 'are': 3}\n", 397 | "\n", 398 | "And we have the text, which we want to transform to the vector:\n", 399 | "\n", 400 | " 'hi how are you'\n", 401 | "\n", 402 | "For this text we create a corresponding zero vector \n", 403 | "\n", 404 | " [0, 0, 0, 0]\n", 405 | " \n", 406 | "And iterate over all words, and if the word is in the dictionary, we increase the value of the corresponding position in the vector:\n", 407 | "\n", 408 | " 'hi': [1, 0, 0, 0]\n", 409 | " 'how': [1, 0, 0, 0] # word 'how' is not in our dictionary\n", 410 | " 'are': [1, 0, 0, 1]\n", 411 | " 'you': [1, 1, 0, 1]\n", 412 | "\n", 413 | "The resulting vector will be \n", 414 | "\n", 415 | " [1, 1, 0, 1]\n", 416 | " \n", 417 | "Implement the described encoding in the function *my_bag_of_words* with the size of the dictionary equals to 5000. To find the most common words use train data. You can test your code using the function *test_my_bag_of_words*." 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": { 424 | "collapsed": true 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "DICT_SIZE = 5000\n", 429 | "WORDS_TO_INDEX = ####### YOUR CODE HERE #######\n", 430 | "INDEX_TO_WORDS = ####### YOUR CODE HERE #######\n", 431 | "ALL_WORDS = WORDS_TO_INDEX.keys()\n", 432 | "\n", 433 | "def my_bag_of_words(text, words_to_index, dict_size):\n", 434 | " \"\"\"\n", 435 | " text: a string\n", 436 | " dict_size: size of the dictionary\n", 437 | " \n", 438 | " return a vector which is a bag-of-words representation of 'text'\n", 439 | " \"\"\"\n", 440 | " result_vector = np.zeros(dict_size)\n", 441 | " ######################################\n", 442 | " ######### YOUR CODE HERE #############\n", 443 | " ######################################\n", 444 | " return result_vector" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": { 451 | "collapsed": true 452 | }, 453 | "outputs": [], 454 | "source": [ 455 | "def test_my_bag_of_words():\n", 456 | " words_to_index = {'hi': 0, 'you': 1, 'me': 2, 'are': 3}\n", 457 | " examples = ['hi how are you']\n", 458 | " answers = [[1, 1, 0, 1]]\n", 459 | " for ex, ans in zip(examples, answers):\n", 460 | " if (my_bag_of_words(ex, words_to_index, 4) != ans).any():\n", 461 | " return \"Wrong answer for the case: '%s'\" % ex\n", 462 | " return 'Basic tests are passed.'" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": { 469 | "collapsed": true 470 | }, 471 | "outputs": [], 472 | "source": [ 473 | "print(test_my_bag_of_words())" 474 | ] 475 | }, 476 | { 477 | "cell_type": "markdown", 478 | "metadata": {}, 479 | "source": [ 480 | "Now apply the implemented function to all samples (this might take up to a minute):" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": { 487 | "collapsed": true 488 | }, 489 | "outputs": [], 490 | "source": [ 491 | "from scipy import sparse as sp_sparse" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": { 498 | "collapsed": true 499 | }, 500 | "outputs": [], 501 | "source": [ 502 | "X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for 
text in X_train])\n", 503 | "X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])\n", 504 | "X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])\n", 505 | "print('X_train shape ', X_train_mybag.shape)\n", 506 | "print('X_val shape ', X_val_mybag.shape)\n", 507 | "print('X_test shape ', X_test_mybag.shape)" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "As you might notice, we transform the data to sparse representation, to store the useful information efficiently. There are many [types](https://docs.scipy.org/doc/scipy/reference/sparse.html) of such representations, however sklearn algorithms can work only with [csr](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix) matrix, so we will use this one." 515 | ] 516 | }, 517 | { 518 | "cell_type": "markdown", 519 | "metadata": {}, 520 | "source": [ 521 | "**Task 3 (BagOfWords).** For the 11th row in *X_train_mybag* find how many non-zero elements it has. In this task the answer (variable *non_zero_elements_count*) should be a number, e.g. 20." 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": { 528 | "collapsed": true 529 | }, 530 | "outputs": [], 531 | "source": [ 532 | "row = X_train_mybag[10].toarray()[0]\n", 533 | "non_zero_elements_count = ####### YOUR CODE HERE #######\n", 534 | "\n", 535 | "grader.submit_tag('BagOfWords', str(non_zero_elements_count))" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "#### TF-IDF\n", 543 | "\n", 544 | "The second approach extends the bag-of-words framework by taking into account total frequencies of words in the corpora. It helps to penalize too frequent words and provide better features space. \n", 545 | "\n", 546 | "Implement function *tfidf_features* using class [TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) from *scikit-learn*. Use *train* corpus to train a vectorizer. Don't forget to take a look into the arguments that you can pass to it. We suggest that you filter out too rare words (occur less than in 5 titles) and too frequent words (occur more than in 90% of the titles). Also, use bigrams along with unigrams in your vocabulary. 
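Before moving on to TF-IDF, a possible completion of the bag-of-words part above (the dictionary of the 5000 most frequent training words and the *my_bag_of_words* function), plus a one-line way to answer Task 3.

```python
import numpy as np

DICT_SIZE = 5000

# Index the DICT_SIZE most frequent words of the train corpus (words_counts from Task 2).
most_common = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:DICT_SIZE]
WORDS_TO_INDEX = {word: i for i, (word, _) in enumerate(most_common)}
INDEX_TO_WORDS = {i: word for word, i in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()

def my_bag_of_words(text, words_to_index, dict_size):
    """Bag-of-words vector: position i counts how often the i-th dictionary word occurs."""
    result_vector = np.zeros(dict_size)
    for word in text.split():
        if word in words_to_index:
            result_vector[words_to_index[word]] += 1
    return result_vector

# Task 3: count the non-zero elements of the 11th row of the sparse matrix.
row = X_train_mybag[10].toarray()[0]
non_zero_elements_count = int(np.count_nonzero(row))   # or simply X_train_mybag[10].nnz
```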
" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": { 553 | "collapsed": true 554 | }, 555 | "outputs": [], 556 | "source": [ 557 | "from sklearn.feature_extraction.text import TfidfVectorizer" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": { 564 | "collapsed": true 565 | }, 566 | "outputs": [], 567 | "source": [ 568 | "def tfidf_features(X_train, X_val, X_test):\n", 569 | " \"\"\"\n", 570 | " X_train, X_val, X_test — samples \n", 571 | " return TF-IDF vectorized representation of each sample and vocabulary\n", 572 | " \"\"\"\n", 573 | " # Create TF-IDF vectorizer with a proper parameters choice\n", 574 | " # Fit the vectorizer on the train set\n", 575 | " # Transform the train, test, and val sets and return the result\n", 576 | " \n", 577 | " \n", 578 | " tfidf_vectorizer = ####### YOUR CODE HERE #######\n", 579 | " \n", 580 | " ######################################\n", 581 | " ######### YOUR CODE HERE #############\n", 582 | " ######################################\n", 583 | " \n", 584 | " return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": {}, 590 | "source": [ 591 | "Once you have done text preprocessing, always have a look at the results. Be very careful at this step, because the performance of future models will drastically depend on it. \n", 592 | "\n", 593 | "In this case, check whether you have c++ or c# in your vocabulary, as they are obviously important tokens in our tags prediction task:" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": null, 599 | "metadata": { 600 | "collapsed": true 601 | }, 602 | "outputs": [], 603 | "source": [ 604 | "X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)\n", 605 | "tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": null, 611 | "metadata": { 612 | "collapsed": true 613 | }, 614 | "outputs": [], 615 | "source": [ 616 | "######### YOUR CODE HERE #############" 617 | ] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": {}, 622 | "source": [ 623 | "If you can't find it, we need to understand how did it happen that we lost them? It happened during the built-in tokenization of TfidfVectorizer. Luckily, we can influence on this process. Get back to the function above and use '(\\S+)' regexp as a *token_pattern* in the constructor of the vectorizer. " 624 | ] 625 | }, 626 | { 627 | "cell_type": "markdown", 628 | "metadata": {}, 629 | "source": [ 630 | "Now, use this transormation for the data and check again." 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": null, 636 | "metadata": { 637 | "collapsed": true 638 | }, 639 | "outputs": [], 640 | "source": [ 641 | "######### YOUR CODE HERE #############" 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": {}, 647 | "source": [ 648 | "### MultiLabel classifier\n", 649 | "\n", 650 | "As we have noticed before, in this task each example can have multiple tags. To deal with such kind of prediction, we need to transform labels in a binary form and the prediction will be a mask of 0s and 1s. For this purpose it is convenient to use [MultiLabelBinarizer](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html) from *sklearn*." 
651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": { 657 | "collapsed": true 658 | }, 659 | "outputs": [], 660 | "source": [ 661 | "from sklearn.preprocessing import MultiLabelBinarizer" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": null, 667 | "metadata": { 668 | "collapsed": true 669 | }, 670 | "outputs": [], 671 | "source": [ 672 | "mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))\n", 673 | "y_train = mlb.fit_transform(y_train)\n", 674 | "y_val = mlb.fit_transform(y_val)" 675 | ] 676 | }, 677 | { 678 | "cell_type": "markdown", 679 | "metadata": {}, 680 | "source": [ 681 | "Implement the function *train_classifier* for training a classifier. In this task we suggest to use One-vs-Rest approach, which is implemented in [OneVsRestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) class. In this approach *k* classifiers (= number of tags) are trained. As a basic classifier, use [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html). It is one of the simplest methods, but often it performs good enough in text classification tasks. It might take some time, because a number of classifiers to train is large." 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "metadata": { 688 | "collapsed": true 689 | }, 690 | "outputs": [], 691 | "source": [ 692 | "from sklearn.multiclass import OneVsRestClassifier\n", 693 | "from sklearn.linear_model import LogisticRegression, RidgeClassifier" 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": null, 699 | "metadata": { 700 | "collapsed": true 701 | }, 702 | "outputs": [], 703 | "source": [ 704 | "def train_classifier(X_train, y_train):\n", 705 | " \"\"\"\n", 706 | " X_train, y_train — training data\n", 707 | " \n", 708 | " return: trained classifier\n", 709 | " \"\"\"\n", 710 | " \n", 711 | " # Create and fit LogisticRegression wraped into OneVsRestClassifier.\n", 712 | "\n", 713 | " ######################################\n", 714 | " ######### YOUR CODE HERE #############\n", 715 | " ###################################### " 716 | ] 717 | }, 718 | { 719 | "cell_type": "markdown", 720 | "metadata": {}, 721 | "source": [ 722 | "Train the classifiers for different data transformations: *bag-of-words* and *tf-idf*." 723 | ] 724 | }, 725 | { 726 | "cell_type": "code", 727 | "execution_count": null, 728 | "metadata": { 729 | "collapsed": true 730 | }, 731 | "outputs": [], 732 | "source": [ 733 | "classifier_mybag = train_classifier(X_train_mybag, y_train)\n", 734 | "classifier_tfidf = train_classifier(X_train_tfidf, y_train)" 735 | ] 736 | }, 737 | { 738 | "cell_type": "markdown", 739 | "metadata": {}, 740 | "source": [ 741 | "Now you can create predictions for the data. You will need two types of predictions: labels and scores." 
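A minimal completion of the *train_classifier* skeleton above: it wraps a plain LogisticRegression into a One-vs-Rest scheme, one binary classifier per tag. For Task 4 later on, the penalty type and the coefficient C could be exposed as extra arguments.

```python
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

def train_classifier(X_train, y_train):
    """Fit one LogisticRegression per tag via the One-vs-Rest approach."""
    clf = OneVsRestClassifier(LogisticRegression())
    clf.fit(X_train, y_train)
    return clf
```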
742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": null, 747 | "metadata": { 748 | "collapsed": true 749 | }, 750 | "outputs": [], 751 | "source": [ 752 | "y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)\n", 753 | "y_val_predicted_scores_mybag = classifier_mybag.decision_function(X_val_mybag)\n", 754 | "\n", 755 | "y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)\n", 756 | "y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)" 757 | ] 758 | }, 759 | { 760 | "cell_type": "markdown", 761 | "metadata": {}, 762 | "source": [ 763 | "Now take a look at how classifier, which uses TF-IDF, works for a few examples:" 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": null, 769 | "metadata": { 770 | "collapsed": true 771 | }, 772 | "outputs": [], 773 | "source": [ 774 | "y_val_pred_inversed = mlb.inverse_transform(y_val_predicted_labels_tfidf)\n", 775 | "y_val_inversed = mlb.inverse_transform(y_val)\n", 776 | "for i in range(3):\n", 777 | " print('Title:\\t{}\\nTrue labels:\\t{}\\nPredicted labels:\\t{}\\n\\n'.format(\n", 778 | " X_val[i],\n", 779 | " ','.join(y_val_inversed[i]),\n", 780 | " ','.join(y_val_pred_inversed[i])\n", 781 | " ))" 782 | ] 783 | }, 784 | { 785 | "cell_type": "markdown", 786 | "metadata": {}, 787 | "source": [ 788 | "Now, we would need to compare the results of different predictions, e.g. to see whether TF-IDF transformation helps or to try different regularization techniques in logistic regression. For all these experiments, we need to setup evaluation procedure. " 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "metadata": {}, 794 | "source": [ 795 | "### Evaluation\n", 796 | "\n", 797 | "To evaluate the results we will use several classification metrics:\n", 798 | " - [Accuracy](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)\n", 799 | " - [F1-score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)\n", 800 | " - [Area under ROC-curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html)\n", 801 | " - [Area under precision-recall curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html#sklearn.metrics.average_precision_score) \n", 802 | " \n", 803 | "Make sure you are familiar with all of them. How would you expect the things work for the multi-label scenario? Read about micro/macro/weighted averaging following the sklearn links provided above." 
804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": null, 809 | "metadata": { 810 | "collapsed": true 811 | }, 812 | "outputs": [], 813 | "source": [ 814 | "from sklearn.metrics import accuracy_score\n", 815 | "from sklearn.metrics import f1_score\n", 816 | "from sklearn.metrics import roc_auc_score \n", 817 | "from sklearn.metrics import average_precision_score\n", 818 | "from sklearn.metrics import recall_score" 819 | ] 820 | }, 821 | { 822 | "cell_type": "markdown", 823 | "metadata": {}, 824 | "source": [ 825 | "Implement the function *print_evaluation_scores* which calculates and prints to stdout:\n", 826 | " - *accuracy*\n", 827 | " - *F1-score macro/micro/weighted*\n", 828 | " - *Precision macro/micro/weighted*" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": null, 834 | "metadata": { 835 | "collapsed": true 836 | }, 837 | "outputs": [], 838 | "source": [ 839 | "def print_evaluation_scores(y_val, predicted):\n", 840 | " \n", 841 | " ######################################\n", 842 | " ######### YOUR CODE HERE #############\n", 843 | " ######################################" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": null, 849 | "metadata": { 850 | "collapsed": true 851 | }, 852 | "outputs": [], 853 | "source": [ 854 | "print('Bag-of-words')\n", 855 | "print_evaluation_scores(y_val, y_val_predicted_labels_mybag)\n", 856 | "print('Tfidf')\n", 857 | "print_evaluation_scores(y_val, y_val_predicted_labels_tfidf)" 858 | ] 859 | }, 860 | { 861 | "cell_type": "markdown", 862 | "metadata": {}, 863 | "source": [ 864 | "You might also want to plot some generalization of the [ROC curve](http://scikit-learn.org/stable/modules/model_evaluation.html#receiver-operating-characteristic-roc) for the case of multi-label classification. Provided function *roc_auc* can make it for you. The input parameters of this function are:\n", 865 | " - true labels\n", 866 | " - decision functions scores\n", 867 | " - number of classes" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": null, 873 | "metadata": { 874 | "collapsed": true 875 | }, 876 | "outputs": [], 877 | "source": [ 878 | "from metrics import roc_auc\n", 879 | "%matplotlib inline" 880 | ] 881 | }, 882 | { 883 | "cell_type": "code", 884 | "execution_count": null, 885 | "metadata": { 886 | "collapsed": true 887 | }, 888 | "outputs": [], 889 | "source": [ 890 | "n_classes = len(tags_counts)\n", 891 | "roc_auc(y_val, y_val_predicted_scores_mybag, n_classes)" 892 | ] 893 | }, 894 | { 895 | "cell_type": "code", 896 | "execution_count": null, 897 | "metadata": { 898 | "collapsed": true 899 | }, 900 | "outputs": [], 901 | "source": [ 902 | "n_classes = len(tags_counts)\n", 903 | "roc_auc(y_val, y_val_predicted_scores_tfidf, n_classes)" 904 | ] 905 | }, 906 | { 907 | "cell_type": "markdown", 908 | "metadata": {}, 909 | "source": [ 910 | "**Task 4 (MultilabelClassification).** Once we have the evaluation set up, we suggest that you experiment a bit with training your classifiers. We will use *F1-score weighted* as an evaluation metric. Our recommendation:\n", 911 | "- compare the quality of the bag-of-words and TF-IDF approaches and chose one of them.\n", 912 | "- for the chosen one, try *L1* and *L2*-regularization techniques in Logistic Regression with different coefficients (e.g. C equal to 0.1, 1, 10, 100).\n", 913 | "\n", 914 | "You also could try other improvements of the preprocessing / model, if you want. 
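A possible *print_evaluation_scores* implementation, interpreting the requested precision scores as *average_precision_score* (which is among the imports above); accuracy in the multi-label case is the exact-match ratio over whole label sets.

```python
from sklearn.metrics import accuracy_score, f1_score, average_precision_score

def print_evaluation_scores(y_val, predicted):
    print('Accuracy:', accuracy_score(y_val, predicted))
    for average in ['macro', 'micro', 'weighted']:
        print('F1 ({}):'.format(average),
              f1_score(y_val, predicted, average=average))
        print('Average precision ({}):'.format(average),
              average_precision_score(y_val, predicted, average=average))
```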
" 915 | ] 916 | }, 917 | { 918 | "cell_type": "code", 919 | "execution_count": null, 920 | "metadata": { 921 | "collapsed": true 922 | }, 923 | "outputs": [], 924 | "source": [ 925 | "######################################\n", 926 | "######### YOUR CODE HERE #############\n", 927 | "######################################" 928 | ] 929 | }, 930 | { 931 | "cell_type": "markdown", 932 | "metadata": {}, 933 | "source": [ 934 | "When you are happy with the quality, create predictions for *test* set, which you will submit to Coursera." 935 | ] 936 | }, 937 | { 938 | "cell_type": "code", 939 | "execution_count": null, 940 | "metadata": { 941 | "collapsed": true 942 | }, 943 | "outputs": [], 944 | "source": [ 945 | "test_predictions = ######### YOUR CODE HERE #############\n", 946 | "test_pred_inversed = mlb.inverse_transform(test_predictions)\n", 947 | "\n", 948 | "test_predictions_for_submission = '\\n'.join('%i\\t%s' % (i, ','.join(row)) for i, row in enumerate(test_pred_inversed))\n", 949 | "grader.submit_tag('MultilabelClassification', test_predictions_for_submission)" 950 | ] 951 | }, 952 | { 953 | "cell_type": "markdown", 954 | "metadata": {}, 955 | "source": [ 956 | "### Analysis of the most important features" 957 | ] 958 | }, 959 | { 960 | "cell_type": "markdown", 961 | "metadata": {}, 962 | "source": [ 963 | "Finally, it is usually a good idea to look at the features (words or n-grams) that are used with the largest weigths in your logistic regression model." 964 | ] 965 | }, 966 | { 967 | "cell_type": "markdown", 968 | "metadata": {}, 969 | "source": [ 970 | "Implement the function *print_words_for_tag* to find them. Get back to sklearn documentation on [OneVsRestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) and [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) if needed." 971 | ] 972 | }, 973 | { 974 | "cell_type": "code", 975 | "execution_count": null, 976 | "metadata": { 977 | "collapsed": true 978 | }, 979 | "outputs": [], 980 | "source": [ 981 | "def print_words_for_tag(classifier, tag, tags_classes, index_to_words, all_words):\n", 982 | " \"\"\"\n", 983 | " classifier: trained classifier\n", 984 | " tag: particular tag\n", 985 | " tags_classes: a list of classes names from MultiLabelBinarizer\n", 986 | " index_to_words: index_to_words transformation\n", 987 | " all_words: all words in the dictionary\n", 988 | " \n", 989 | " return nothing, just print top 5 positive and top 5 negative words for current tag\n", 990 | " \"\"\"\n", 991 | " print('Tag:\\t{}'.format(tag))\n", 992 | " \n", 993 | " # Extract an estimator from the classifier for the given tag.\n", 994 | " # Extract feature coefficients from the estimator. 
\n", 995 | " \n", 996 | " ######################################\n", 997 | " ######### YOUR CODE HERE #############\n", 998 | " ######################################\n", 999 | " \n", 1000 | " top_positive_words = # top-5 words sorted by the coefficiens.\n", 1001 | " top_negative_words = # bottom-5 words sorted by the coefficients.\n", 1002 | " print('Top positive words:\\t{}'.format(', '.join(top_positive_words)))\n", 1003 | " print('Top negative words:\\t{}\\n'.format(', '.join(top_negative_words)))" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "code", 1008 | "execution_count": null, 1009 | "metadata": { 1010 | "collapsed": true 1011 | }, 1012 | "outputs": [], 1013 | "source": [ 1014 | "print_words_for_tag(classifier_tfidf, 'c', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)\n", 1015 | "print_words_for_tag(classifier_tfidf, 'c++', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)\n", 1016 | "print_words_for_tag(classifier_tfidf, 'linux', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)" 1017 | ] 1018 | }, 1019 | { 1020 | "cell_type": "markdown", 1021 | "metadata": {}, 1022 | "source": [ 1023 | "### Authorization & Submission\n", 1024 | "To submit assignment parts to Cousera platform, please, enter your e-mail and token into variables below. You can generate token on this programming assignment page. Note: Token expires 30 minutes after generation." 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": null, 1030 | "metadata": { 1031 | "collapsed": true 1032 | }, 1033 | "outputs": [], 1034 | "source": [ 1035 | "grader.status()" 1036 | ] 1037 | }, 1038 | { 1039 | "cell_type": "code", 1040 | "execution_count": null, 1041 | "metadata": { 1042 | "collapsed": true 1043 | }, 1044 | "outputs": [], 1045 | "source": [ 1046 | "STUDENT_EMAIL = # EMAIL \n", 1047 | "STUDENT_TOKEN = # TOKEN \n", 1048 | "grader.status()" 1049 | ] 1050 | }, 1051 | { 1052 | "cell_type": "markdown", 1053 | "metadata": {}, 1054 | "source": [ 1055 | "If you want to submit these answers, run cell below" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "code", 1060 | "execution_count": null, 1061 | "metadata": { 1062 | "collapsed": true 1063 | }, 1064 | "outputs": [], 1065 | "source": [ 1066 | "grader.submit(STUDENT_EMAIL, STUDENT_TOKEN)" 1067 | ] 1068 | } 1069 | ], 1070 | "metadata": { 1071 | "kernelspec": { 1072 | "display_name": "Python 3", 1073 | "language": "python", 1074 | "name": "python3" 1075 | }, 1076 | "language_info": { 1077 | "codemirror_mode": { 1078 | "name": "ipython", 1079 | "version": 3 1080 | }, 1081 | "file_extension": ".py", 1082 | "mimetype": "text/x-python", 1083 | "name": "python", 1084 | "nbconvert_exporter": "python", 1085 | "pygments_lexer": "ipython3", 1086 | "version": "3.5.2" 1087 | } 1088 | }, 1089 | "nbformat": 4, 1090 | "nbformat_minor": 2 1091 | } 1092 | -------------------------------------------------------------------------------- /week2/evaluation.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | def _update_chunk(candidate, prev, current_tag, current_chunk, current_pos, prediction=False): 4 | if candidate == 'B-' + current_tag: 5 | if len(current_chunk) > 0 and len(current_chunk[-1]) == 1: 6 | current_chunk[-1].append(current_pos - 1) 7 | current_chunk.append([current_pos]) 8 | elif candidate == 'I-' + current_tag: 9 | if prediction and (current_pos == 0 or current_pos > 0 and prev.split('-', 1)[-1] != current_tag): 10 | current_chunk.append([current_pos]) 11 | if not prediction and 
(current_pos == 0 or current_pos > 0 and prev == 'O'): 12 | current_chunk.append([current_pos]) 13 | elif current_pos > 0 and prev.split('-', 1)[-1] == current_tag: 14 | if len(current_chunk) > 0: 15 | current_chunk[-1].append(current_pos - 1) 16 | 17 | def _update_last_chunk(current_chunk, current_pos): 18 | if len(current_chunk) > 0 and len(current_chunk[-1]) == 1: 19 | current_chunk[-1].append(current_pos - 1) 20 | 21 | def _tag_precision_recall_f1(tp, fp, fn): 22 | precision, recall, f1 = 0, 0, 0 23 | if tp + fp > 0: 24 | precision = tp / (tp + fp) * 100 25 | if tp + fn > 0: 26 | recall = tp / (tp + fn) * 100 27 | if precision + recall > 0: 28 | f1 = 2 * precision * recall / (precision + recall) 29 | return precision, recall, f1 30 | 31 | def _aggregate_metrics(results, total_correct): 32 | total_true_entities = 0 33 | total_predicted_entities = 0 34 | total_precision = 0 35 | total_recall = 0 36 | total_f1 = 0 37 | for tag, tag_metrics in results.items(): 38 | n_pred = tag_metrics['n_predicted_entities'] 39 | n_true = tag_metrics['n_true_entities'] 40 | total_true_entities += n_true 41 | total_predicted_entities += n_pred 42 | total_precision += tag_metrics['precision'] * n_pred 43 | total_recall += tag_metrics['recall'] * n_true 44 | 45 | accuracy = total_correct / total_true_entities * 100 46 | if total_predicted_entities > 0: 47 | total_precision = total_precision / total_predicted_entities 48 | total_recall = total_recall / total_true_entities 49 | if total_precision + total_recall > 0: 50 | total_f1 = 2 * total_precision * total_recall / (total_precision + total_recall) 51 | return total_true_entities, total_predicted_entities, \ 52 | total_precision, total_recall, total_f1, accuracy 53 | 54 | def _print_info(n_tokens, total_true_entities, total_predicted_entities, total_correct): 55 | print('processed {len} tokens ' \ 56 | 'with {tot_true} phrases; ' \ 57 | 'found: {tot_pred} phrases; ' \ 58 | 'correct: {tot_cor}.\n'.format(len=n_tokens, 59 | tot_true=total_true_entities, 60 | tot_pred=total_predicted_entities, 61 | tot_cor=total_correct)) 62 | 63 | def _print_metrics(accuracy, total_precision, total_recall, total_f1): 64 | print('precision: {tot_prec:.2f}%; ' \ 65 | 'recall: {tot_recall:.2f}%; ' \ 66 | 'F1: {tot_f1:.2f}\n'.format(acc=accuracy, 67 | tot_prec=total_precision, 68 | tot_recall=total_recall, 69 | tot_f1=total_f1)) 70 | 71 | def _print_tag_metrics(tag, tag_results): 72 | print(('\t%12s' % tag) + ': precision: {tot_prec:6.2f}%; ' \ 73 | 'recall: {tot_recall:6.2f}%; ' \ 74 | 'F1: {tot_f1:6.2f}; ' \ 75 | 'predicted: {tot_predicted:4d}\n'.format(tot_prec=tag_results['precision'], 76 | tot_recall=tag_results['recall'], 77 | tot_f1=tag_results['f1'], 78 | tot_predicted=tag_results['n_predicted_entities'])) 79 | 80 | def precision_recall_f1(y_true, y_pred, print_results=True, short_report=False): 81 | # Find all tags 82 | tags = sorted(set(tag[2:] for tag in y_true + y_pred if tag != 'O')) 83 | 84 | results = OrderedDict((tag, OrderedDict()) for tag in tags) 85 | n_tokens = len(y_true) 86 | total_correct = 0 87 | 88 | # For eval_conll_try we find all chunks in the ground truth and prediction 89 | # For each chunk we store starting and ending indices 90 | for tag in tags: 91 | true_chunk = list() 92 | predicted_chunk = list() 93 | for position in range(n_tokens): 94 | _update_chunk(y_true[position], y_true[position - 1], tag, true_chunk, position) 95 | _update_chunk(y_pred[position], y_pred[position - 1], tag, predicted_chunk, position, True) 96 | 97 | 
_update_last_chunk(true_chunk, position) 98 | _update_last_chunk(predicted_chunk, position) 99 | 100 | # Then we find all correctly classified intervals 101 | # True positive results 102 | tp = sum(chunk in predicted_chunk for chunk in true_chunk) 103 | total_correct += tp 104 | 105 | # And then just calculate errors of the first and second kind 106 | # False negative 107 | fn = len(true_chunk) - tp 108 | # False positive 109 | fp = len(predicted_chunk) - tp 110 | precision, recall, f1 = _tag_precision_recall_f1(tp, fp, fn) 111 | 112 | results[tag]['precision'] = precision 113 | results[tag]['recall'] = recall 114 | results[tag]['f1'] = f1 115 | results[tag]['n_predicted_entities'] = len(predicted_chunk) 116 | results[tag]['n_true_entities'] = len(true_chunk) 117 | 118 | total_true_entities, total_predicted_entities, \ 119 | total_precision, total_recall, total_f1, accuracy = _aggregate_metrics(results, total_correct) 120 | 121 | if print_results: 122 | _print_info(n_tokens, total_true_entities, total_predicted_entities, total_correct) 123 | _print_metrics(accuracy, total_precision, total_recall, total_f1) 124 | 125 | if not short_report: 126 | for tag, tag_results in results.items(): 127 | _print_tag_metrics(tag, tag_results) 128 | return results 129 | -------------------------------------------------------------------------------- /week2/week2-NER.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Recognize named entities on Twitter with LSTMs\n", 10 | "\n", 11 | "In this assignment, you will use a recurrent neural network to solve Named Entity Recognition (NER) problem. NER is a common task in natural language processing systems. It serves for extraction such entities from the text as persons, organizations, locations, etc. In this task you will experiment to recognize named entities from Twitter.\n", 12 | "\n", 13 | "For example, we want to extract persons' and organizations' names from the text. Than for the input text:\n", 14 | "\n", 15 | " Ian Goodfellow works for Google Brain\n", 16 | "\n", 17 | "a NER model needs to provide the following sequence of tags:\n", 18 | "\n", 19 | " B-PER I-PER O O B-ORG I-ORG\n", 20 | "\n", 21 | "Where *B-* and *I-* prefixes stand for the beginning and inside of the entity, while *O* stands for out of tag or no tag. Markup with the prefix scheme is called *BIO markup*. This markup is introduced for distinguishing of consequent entities with similar types.\n", 22 | "\n", 23 | "A solution of the task will be based on neural networks, particularly, on Bi-Directional Long Short-Term Memory Networks (Bi-LSTMs).\n", 24 | "\n", 25 | "### Libraries\n", 26 | "\n", 27 | "For this task you will need the following libraries:\n", 28 | " - [Tensorflow](https://www.tensorflow.org) — an open-source software library for Machine Intelligence.\n", 29 | " - [Numpy](http://www.numpy.org) — a package for scientific computing.\n", 30 | " \n", 31 | "If you have never worked with Tensorflow, you would probably need to read some tutorials during your work on this assignment, e.g. [this one](https://www.tensorflow.org/tutorials/recurrent) could be a good starting point. " 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "### Data\n", 39 | "\n", 40 | "The following cell will download all data required for this assignment into the folder `week2/data`." 
41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "import sys\n", 52 | "sys.path.append(\"..\")\n", 53 | "from common.download_utils import download_week2_resources\n", 54 | "\n", 55 | "download_week2_resources()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Load the Twitter Named Entity Recognition corpus\n", 63 | "\n", 64 | "We will work with a corpus, which contains twits with NE tags. Every line of a file contains a pair of a token (word/punctuation symbol) and a tag, separated by a whitespace. Different tweets are separated by an empty line.\n", 65 | "\n", 66 | "The function *read_data* reads a corpus from the *file_path* and returns two lists: one with tokens and one with the corresponding tags. You need to complete this function by adding a code, which will replace a user's nickname to `` token and any URL to `` token. You could think that a URL and a nickname are just strings which start with *http://* or *https://* in case of URLs and a *@* symbol for nicknames." 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "def read_data(file_path):\n", 78 | " tokens = []\n", 79 | " tags = []\n", 80 | " \n", 81 | " tweet_tokens = []\n", 82 | " tweet_tags = []\n", 83 | " for line in open(file_path, encoding='utf-8'):\n", 84 | " line = line.strip()\n", 85 | " if not line:\n", 86 | " if tweet_tokens:\n", 87 | " tokens.append(tweet_tokens)\n", 88 | " tags.append(tweet_tags)\n", 89 | " tweet_tokens = []\n", 90 | " tweet_tags = []\n", 91 | " else:\n", 92 | " token, tag = line.split()\n", 93 | " # Replace all urls with token\n", 94 | " # Replace all users with token\n", 95 | "\n", 96 | " ######################################\n", 97 | " ######### YOUR CODE HERE #############\n", 98 | " ######################################\n", 99 | " \n", 100 | " tweet_tokens.append(token)\n", 101 | " tweet_tags.append(tag)\n", 102 | " \n", 103 | " return tokens, tags" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "And now we can load three separate parts of the dataset:\n", 111 | " - *train* data for training the model;\n", 112 | " - *validation* data for evaluation and hyperparameters tuning;\n", 113 | " - *test* data for final evaluation of the model." 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "train_tokens, train_tags = read_data('data/train.txt')\n", 125 | "validation_tokens, validation_tags = read_data('data/validation.txt')\n", 126 | "test_tokens, test_tags = read_data('data/test.txt')" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "You should always understand what kind of data you deal with. 
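For the URL and nickname replacement in *read_data* above, a small helper along these lines would do. The literal placeholder strings were lost in this copy of the notebook, so '<URL>' and '<USR>' below are assumed names; use whatever tokens the assignment expects.

```python
def normalize_token(token, url_token='<URL>', usr_token='<USR>'):
    """Map URLs and @nicknames to placeholder tokens (placeholder names are assumptions)."""
    if token.startswith(('http://', 'https://')):
        return url_token
    if token.startswith('@'):
        return usr_token
    return token

# Inside the else-branch of read_data:
#     token = normalize_token(token)
```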
For this purpose, you can print the data running the following cell:" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "for i in range(3):\n", 145 | " for token, tag in zip(train_tokens[i], train_tags[i]):\n", 146 | " print('%s\\t%s' % (token, tag))\n", 147 | " print()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "### Prepare dictionaries\n", 155 | "\n", 156 | "To train a neural network, we will use two mappings: \n", 157 | "- {token}$\\to${token id}: address the row in embeddings matrix for the current token;\n", 158 | "- {tag}$\\to${tag id}: one-hot ground truth probability distribution vectors for computing the loss at the output of the network.\n", 159 | "\n", 160 | "Now you need to implement the function *build_dict* which will return {token or tag}$\\to${index} and vice versa. " 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "collapsed": true 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "from collections import defaultdict" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "collapsed": true 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "def build_dict(tokens_or_tags, special_tokens):\n", 183 | " \"\"\"\n", 184 | " tokens_or_tags: a list of lists of tokens or tags\n", 185 | " special_tokens: some special tokens\n", 186 | " \"\"\"\n", 187 | " # Create a dictionary with default value 0\n", 188 | " tok2idx = defaultdict(lambda: 0)\n", 189 | " idx2tok = []\n", 190 | " \n", 191 | " # Create mappings from tokens (or tags) to indices and vice versa.\n", 192 | " # At first, add special tokens (or tags) to the dictionaries.\n", 193 | " # The first special token must have index 0.\n", 194 | " \n", 195 | " # Mapping tok2idx should contain each token or tag only once. \n", 196 | " # To do so, you should:\n", 197 | " # 1. extract unique tokens/tags from the tokens_or_tags variable, which is not\n", 198 | " # occure in special_tokens (because they could have non-empty intersection)\n", 199 | " # 2. index them (for example, you can add them into the list idx2tok\n", 200 | " # 3. for each token/tag save the index into tok2idx).\n", 201 | " \n", 202 | " ######################################\n", 203 | " ######### YOUR CODE HERE #############\n", 204 | " ######################################\n", 205 | " \n", 206 | " return tok2idx, idx2tok" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "After implementing the function *build_dict* you can make dictionaries for tokens and tags. Special tokens in our case will be:\n", 214 | " - `` token for out of vocabulary tokens;\n", 215 | " - `` token for padding sentence to the same length when we create batches of sentences." 
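A possible *build_dict* implementation: special tokens are added first (so the very first one gets index 0, which is also the default value of the defaultdict), and every remaining token or tag is indexed only once.

```python
from collections import defaultdict

def build_dict(tokens_or_tags, special_tokens):
    """
    tokens_or_tags: a list of lists of tokens or tags
    special_tokens: some special tokens
    """
    tok2idx = defaultdict(lambda: 0)
    idx2tok = []

    # Special tokens first; the first one gets index 0.
    for token in special_tokens:
        if token not in tok2idx:
            tok2idx[token] = len(idx2tok)
            idx2tok.append(token)

    # Then every token/tag from the corpus, each added only once.
    for token_list in tokens_or_tags:
        for token in token_list:
            if token not in tok2idx:
                tok2idx[token] = len(idx2tok)
                idx2tok.append(token)

    return tok2idx, idx2tok
```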
216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "special_tokens = ['', '']\n", 227 | "special_tags = ['O']\n", 228 | "\n", 229 | "# Create dictionaries \n", 230 | "token2idx, idx2token = build_dict(train_tokens + validation_tokens, special_tokens)\n", 231 | "tag2idx, idx2tag = build_dict(train_tags, special_tags)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "The next additional functions will help you to create the mapping between tokens and ids for a sentence. " 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "collapsed": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "def words2idxs(tokens_list):\n", 250 | " return [token2idx[word] for word in tokens_list]\n", 251 | "\n", 252 | "def tags2idxs(tags_list):\n", 253 | " return [tag2idx[tag] for tag in tags_list]\n", 254 | "\n", 255 | "def idxs2words(idxs):\n", 256 | " return [idx2token[idx] for idx in idxs]\n", 257 | "\n", 258 | "def idxs2tags(idxs):\n", 259 | " return [idx2tag[idx] for idx in idxs]" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "### Generate batches\n", 267 | "\n", 268 | "Neural Networks are usually trained with batches. It means that weight updates of the network are based on several sequences at every single time. The tricky part is that all sequences within a batch need to have the same length. So we will pad them with a special `` token. It is also a good practice to provide RNN with sequence lengths, so it can skip computations for padding parts. We provide the batching function *batches_generator* readily available for you to save time. 
" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": true 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "def batches_generator(batch_size, tokens, tags,\n", 280 | " shuffle=True, allow_smaller_last_batch=True):\n", 281 | " \"\"\"Generates padded batches of tokens and tags.\"\"\"\n", 282 | " \n", 283 | " n_samples = len(tokens)\n", 284 | " if shuffle:\n", 285 | " order = np.random.permutation(n_samples)\n", 286 | " else:\n", 287 | " order = np.arange(n_samples)\n", 288 | "\n", 289 | " n_batches = n_samples // batch_size\n", 290 | " if allow_smaller_last_batch and n_samples % batch_size:\n", 291 | " n_batches += 1\n", 292 | "\n", 293 | " for k in range(n_batches):\n", 294 | " batch_start = k * batch_size\n", 295 | " batch_end = min((k + 1) * batch_size, n_samples)\n", 296 | " current_batch_size = batch_end - batch_start\n", 297 | " x_list = []\n", 298 | " y_list = []\n", 299 | " max_len_token = 0\n", 300 | " for idx in order[batch_start: batch_end]:\n", 301 | " x_list.append(words2idxs(tokens[idx]))\n", 302 | " y_list.append(tags2idxs(tags[idx]))\n", 303 | " max_len_token = max(max_len_token, len(tags[idx]))\n", 304 | " \n", 305 | " # Fill in the data into numpy nd-arrays filled with padding indices.\n", 306 | " x = np.ones([current_batch_size, max_len_token], dtype=np.int32) * token2idx['']\n", 307 | " y = np.ones([current_batch_size, max_len_token], dtype=np.int32) * tag2idx['O']\n", 308 | " lengths = np.zeros(current_batch_size, dtype=np.int32)\n", 309 | " for n in range(current_batch_size):\n", 310 | " utt_len = len(x_list[n])\n", 311 | " x[n, :utt_len] = x_list[n]\n", 312 | " lengths[n] = utt_len\n", 313 | " y[n, :utt_len] = y_list[n]\n", 314 | " yield x, y, lengths" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "## Build a recurrent neural network\n", 322 | "\n", 323 | "This is the most important part of the assignment. Here we will specify the network architecture based on TensorFlow building blocks. It's fun and easy as a lego constructor! We will create an LSTM network which will produce probability distribution over tags for each token in a sentence. To take into account both right and left contexts of the token, we will use Bi-Directional LSTM (Bi-LSTM). Dense layer will be used on top to perform tag classification. " 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "collapsed": true 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "import tensorflow as tf\n", 335 | "import numpy as np" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": { 342 | "collapsed": true 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "class BiLSTMModel():\n", 347 | " pass" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": { 353 | "collapsed": true 354 | }, 355 | "source": [ 356 | "First, we need to create [placeholders](https://www.tensorflow.org/versions/master/api_docs/python/tf/placeholder) to specify what data we are going to feed into the network during the execution time. 
For this task we will need the following placeholders:\n", 357 | " - *input_batch* — sequences of words (the shape equals to [batch_size, sequence_len]);\n", 358 | " - *ground_truth_tags* — sequences of tags (the shape equals to [batch_size, sequence_len]);\n", 359 | " - *lengths* — lengths of not padded sequences (the shape equals to [batch_size]);\n", 360 | " - *dropout_ph* — dropout keep probability; this placeholder has a predefined value 1;\n", 361 | " - *learning_rate_ph* — learning rate; we need this placeholder because we want to change the value during training.\n", 362 | "\n", 363 | "It could be noticed that we use *None* in the shapes in the declaration, which means that data of any size can be feeded. \n", 364 | "\n", 365 | "You need to complete the function *declare_placeholders*." 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": { 372 | "collapsed": true 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "def declare_placeholders(self):\n", 377 | " \"\"\"Specifies placeholders for the model.\"\"\"\n", 378 | "\n", 379 | " # Placeholders for input and ground truth output.\n", 380 | " self.input_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch') \n", 381 | " self.ground_truth_tags = ######### YOUR CODE HERE #############\n", 382 | " \n", 383 | " # Placeholder for lengths of the sequences.\n", 384 | " self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths') \n", 385 | " \n", 386 | " # Placeholder for a dropout keep probability. If we don't feed\n", 387 | " # a value for this placeholder, it will be equal to 1.0.\n", 388 | " self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])\n", 389 | " \n", 390 | " # Placeholder for a learning rate (tf.float32).\n", 391 | " self.learning_rate_ph = ######### YOUR CODE HERE #############" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": { 398 | "collapsed": true 399 | }, 400 | "outputs": [], 401 | "source": [ 402 | "BiLSTMModel.__declare_placeholders = classmethod(declare_placeholders)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "collapsed": true 409 | }, 410 | "source": [ 411 | "Now, let us specify the layers of the neural network. First, we need to perform some preparatory steps: \n", 412 | " \n", 413 | "- Create embeddings matrix with [tf.Variable](https://www.tensorflow.org/api_docs/python/tf/Variable). Specify its name (*embeddings_matrix*), type (*tf.float32*), and initialize with random values.\n", 414 | "- Create forward and backward LSTM cells. TensorFlow provides a number of [RNN cells](https://www.tensorflow.org/api_guides/python/contrib.rnn#Core_RNN_Cells_for_use_with_TensorFlow_s_core_RNN_methods) ready for you. We suggest that you use *BasicLSTMCell*, but you can also experiment with other types, e.g. GRU cells. [This](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) blogpost could be interesting if you want to learn more about the differences.\n", 415 | "- Wrap your cells with [DropoutWrapper](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/DropoutWrapper). Dropout is an important regularization technique for neural networks. 
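Before the layer construction below, here is one possible completion of *declare_placeholders* (TF 1.x API, as in the rest of the notebook): the ground truth tags share the [batch_size, sequence_len] shape of the input batch, and the learning rate is a scalar float placeholder.

```python
import tensorflow as tf

def declare_placeholders(self):
    """Specifies placeholders for the model."""
    self.input_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch')
    # Same shape as the input batch: one tag id per token.
    self.ground_truth_tags = tf.placeholder(dtype=tf.int32, shape=[None, None],
                                            name='ground_truth_tags')
    self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths')
    self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])
    # Scalar learning rate, fed anew on every training step.
    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name='learning_rate_ph')
```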
Specify all keep probabilities using the dropout placeholder that we created before.\n", 416 | " \n", 417 | "After that, you can build the computation graph that transforms an input_batch:\n", 418 | "\n", 419 | "- [Look up](https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup) embeddings for an *input_batch* in the prepared *embedding_matrix*.\n", 420 | "- Pass the embeddings through [Bidirectional Dynamic RNN](https://www.tensorflow.org/api_docs/python/tf/nn/bidirectional_dynamic_rnn) with the specified forward and backward cells. Use the lengths placeholder here to avoid computations for padding tokens inside the RNN.\n", 421 | "- Create a dense layer on top. Its output will be used directly in loss function. \n", 422 | " \n", 423 | "Fill in the code below. In case you need to debug something, the easiest way is to check that tensor shapes of each step match the expected ones. \n", 424 | " " 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": { 431 | "collapsed": true 432 | }, 433 | "outputs": [], 434 | "source": [ 435 | "def build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):\n", 436 | " \"\"\"Specifies bi-LSTM architecture and computes logits for inputs.\"\"\"\n", 437 | " \n", 438 | " # Create embedding variable (tf.Variable) with dtype tf.float32\n", 439 | " initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)\n", 440 | " embedding_matrix_variable = ######### YOUR CODE HERE #############\n", 441 | " \n", 442 | " # Create RNN cells (for example, tf.nn.rnn_cell.BasicLSTMCell) with n_hidden_rnn number of units \n", 443 | " # and dropout (tf.nn.rnn_cell.DropoutWrapper), initializing all *_keep_prob with dropout placeholder.\n", 444 | " forward_cell = ######### YOUR CODE HERE #############\n", 445 | " backward_cell = ######### YOUR CODE HERE #############\n", 446 | "\n", 447 | " # Look up embeddings for self.input_batch (tf.nn.embedding_lookup).\n", 448 | " # Shape: [batch_size, sequence_len, embedding_dim].\n", 449 | " embeddings = ######### YOUR CODE HERE #############\n", 450 | " \n", 451 | " # Pass them through Bidirectional Dynamic RNN (tf.nn.bidirectional_dynamic_rnn).\n", 452 | " # Shape: [batch_size, sequence_len, 2 * n_hidden_rnn]. \n", 453 | " # Also don't forget to initialize sequence_length as self.lengths and dtype as tf.float32.\n", 454 | " (rnn_output_fw, rnn_output_bw), _ = ######### YOUR CODE HERE #############\n", 455 | " rnn_output = tf.concat([rnn_output_fw, rnn_output_bw], axis=2)\n", 456 | "\n", 457 | " # Dense layer on top.\n", 458 | " # Shape: [batch_size, sequence_len, n_tags]. \n", 459 | " self.logits = tf.layers.dense(rnn_output, n_tags, activation=None)" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": { 466 | "collapsed": true 467 | }, 468 | "outputs": [], 469 | "source": [ 470 | "BiLSTMModel.__build_layers = classmethod(build_layers)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "To compute the actual predictions of the neural network, you need to apply [softmax](https://www.tensorflow.org/api_docs/python/tf/nn/softmax) to the last layer and find the most probable tags with [argmax](https://www.tensorflow.org/api_docs/python/tf/argmax)." 
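Returning to *build_layers* above, a sketch of how the missing pieces could look with the TF 1.x building blocks named in the instructions (BasicLSTMCell wrapped in DropoutWrapper, embedding lookup, bidirectional dynamic RNN).

```python
import numpy as np
import tensorflow as tf

def build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):
    """Specifies bi-LSTM architecture and computes logits for inputs."""
    initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)
    embedding_matrix_variable = tf.Variable(initial_embedding_matrix,
                                            name='embeddings_matrix', dtype=tf.float32)

    def lstm_cell():
        # LSTM cell with dropout on inputs, outputs and state, all driven by dropout_ph.
        return tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(n_hidden_rnn),
                                             input_keep_prob=self.dropout_ph,
                                             output_keep_prob=self.dropout_ph,
                                             state_keep_prob=self.dropout_ph)

    forward_cell = lstm_cell()
    backward_cell = lstm_cell()

    # Shape: [batch_size, sequence_len, embedding_dim].
    embeddings = tf.nn.embedding_lookup(embedding_matrix_variable, self.input_batch)

    # Shape: [batch_size, sequence_len, 2 * n_hidden_rnn]; lengths let TF skip padded positions.
    (rnn_output_fw, rnn_output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=forward_cell, cell_bw=backward_cell,
        inputs=embeddings, sequence_length=self.lengths, dtype=tf.float32)
    rnn_output = tf.concat([rnn_output_fw, rnn_output_bw], axis=2)

    # Shape: [batch_size, sequence_len, n_tags].
    self.logits = tf.layers.dense(rnn_output, n_tags, activation=None)
```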
478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "collapsed": true 485 | }, 486 | "outputs": [], 487 | "source": [ 488 | "def compute_predictions(self):\n", 489 | " \"\"\"Transforms logits to probabilities and finds the most probable tags.\"\"\"\n", 490 | " \n", 491 | " # Create softmax (tf.nn.softmax) function\n", 492 | " softmax_output = ######### YOUR CODE HERE #############\n", 493 | " \n", 494 | " # Use argmax (tf.argmax) to get the most probable tags\n", 495 | " # Don't forget to set axis=-1\n", 496 | " # otherwise argmax will be calculated in a wrong way\n", 497 | " self.predictions = ######### YOUR CODE HERE #############" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": { 504 | "collapsed": true 505 | }, 506 | "outputs": [], 507 | "source": [ 508 | "BiLSTMModel.__compute_predictions = classmethod(compute_predictions)" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": { 514 | "collapsed": true 515 | }, 516 | "source": [ 517 | "During training we do not need predictions of the network, but we need a loss function. We will use [cross-entropy loss](http://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html#cross-entropy), efficiently implemented in TF as \n", 518 | "[cross entropy with logits](https://www.tensorflow.org/api_docs/python/tf/nn/softmax_cross_entropy_with_logits). Note that it should be applied to logits of the model (not to softmax probabilities!). Also note, that we do not want to take into account loss terms coming from `` tokens. So we need to mask them out, before computing [mean](https://www.tensorflow.org/api_docs/python/tf/reduce_mean)." 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": { 525 | "collapsed": true 526 | }, 527 | "outputs": [], 528 | "source": [ 529 | "def compute_loss(self, n_tags, PAD_index):\n", 530 | " \"\"\"Computes masked cross-entopy loss with logits.\"\"\"\n", 531 | " \n", 532 | " # Create cross entropy function function (tf.nn.softmax_cross_entropy_with_logits)\n", 533 | " ground_truth_tags_one_hot = tf.one_hot(self.ground_truth_tags, n_tags)\n", 534 | " loss_tensor = ######### YOUR CODE HERE #############\n", 535 | " \n", 536 | " mask = tf.cast(tf.not_equal(self.input_batch, PAD_index), tf.float32)\n", 537 | " # Create loss function which doesn't operate with tokens (tf.reduce_mean)\n", 538 | " # Be careful that the argument of tf.reduce_mean should be\n", 539 | " # multiplication of mask and loss_tensor.\n", 540 | " self.loss = ######### YOUR CODE HERE #############" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": { 547 | "collapsed": true 548 | }, 549 | "outputs": [], 550 | "source": [ 551 | "BiLSTMModel.__compute_loss = classmethod(compute_loss)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "The last thing to specify is how we want to optimize the loss. \n", 559 | "We suggest that you use [Adam](https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer) optimizer with a learning rate from the corresponding placeholder. \n", 560 | "You will also need to apply [clipping](https://www.tensorflow.org/versions/r0.12/api_docs/python/train/gradient_clipping) to eliminate exploding gradients. It can be easily done with [clip_by_norm](https://www.tensorflow.org/api_docs/python/tf/clip_by_norm) function. 
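For reference, possible completions of *compute_predictions* and *compute_loss* above: softmax and argmax over the tag axis for the predictions, and cross-entropy on logits averaged only over non-padding positions for the loss.

```python
import tensorflow as tf

def compute_predictions(self):
    """Transforms logits to probabilities and finds the most probable tags."""
    softmax_output = tf.nn.softmax(self.logits)
    self.predictions = tf.argmax(softmax_output, axis=-1)

def compute_loss(self, n_tags, PAD_index):
    """Computes masked cross-entropy loss with logits."""
    ground_truth_tags_one_hot = tf.one_hot(self.ground_truth_tags, n_tags)
    loss_tensor = tf.nn.softmax_cross_entropy_with_logits(labels=ground_truth_tags_one_hot,
                                                          logits=self.logits)
    # Zero out the loss at padded positions before averaging.
    mask = tf.cast(tf.not_equal(self.input_batch, PAD_index), tf.float32)
    self.loss = tf.reduce_mean(mask * loss_tensor)
```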
" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": { 567 | "collapsed": true 568 | }, 569 | "outputs": [], 570 | "source": [ 571 | "def perform_optimization(self):\n", 572 | " \"\"\"Specifies the optimizer and train_op for the model.\"\"\"\n", 573 | " \n", 574 | " # Create an optimizer (tf.train.AdamOptimizer)\n", 575 | " self.optimizer = ######### YOUR CODE HERE #############\n", 576 | " self.grads_and_vars = self.optimizer.compute_gradients(self.loss)\n", 577 | " \n", 578 | " # Gradient clipping (tf.clip_by_norm) for self.grads_and_vars\n", 579 | " # Pay attention that you need to apply this operation only for gradients \n", 580 | " # because self.grads_and_vars contains also variables.\n", 581 | " # list comprehension might be useful in this case.\n", 582 | " clip_norm = tf.cast(1.0, tf.float32)\n", 583 | " self.grads_and_vars = ######### YOUR CODE HERE #############\n", 584 | " \n", 585 | " self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": null, 591 | "metadata": { 592 | "collapsed": true 593 | }, 594 | "outputs": [], 595 | "source": [ 596 | "BiLSTMModel.__perform_optimization = classmethod(perform_optimization)" 597 | ] 598 | }, 599 | { 600 | "cell_type": "markdown", 601 | "metadata": { 602 | "collapsed": true 603 | }, 604 | "source": [ 605 | "Congratulations! You have specified all the parts of your network. You may have noticed, that we didn't deal with any real data yet, so what you have written is just recipes on how the network should function.\n", 606 | "Now we will put them to the constructor of our Bi-LSTM class to use it in the next section. " 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "metadata": { 613 | "collapsed": true 614 | }, 615 | "outputs": [], 616 | "source": [ 617 | "def init_model(self, vocabulary_size, n_tags, embedding_dim, n_hidden_rnn, PAD_index):\n", 618 | " self.__declare_placeholders()\n", 619 | " self.__build_layers(vocabulary_size, embedding_dim, n_hidden_rnn, n_tags)\n", 620 | " self.__compute_predictions()\n", 621 | " self.__compute_loss(n_tags, PAD_index)\n", 622 | " self.__perform_optimization()" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": { 629 | "collapsed": true 630 | }, 631 | "outputs": [], 632 | "source": [ 633 | "BiLSTMModel.__init__ = classmethod(init_model)" 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 | "source": [ 640 | "## Train the network and predict tags" 641 | ] 642 | }, 643 | { 644 | "cell_type": "markdown", 645 | "metadata": { 646 | "collapsed": true 647 | }, 648 | "source": [ 649 | "[Session.run](https://www.tensorflow.org/api_docs/python/tf/Session#run) is a point which initiates computations in the graph that we have defined. To train the network, we need to compute *self.train_op*, which was declared in *perform_optimization*. To predict tags, we just need to compute *self.predictions*. Anyway, we need to feed actual data through the placeholders that we defined before. 
" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": null, 655 | "metadata": { 656 | "collapsed": true 657 | }, 658 | "outputs": [], 659 | "source": [ 660 | "def train_on_batch(self, session, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability):\n", 661 | " feed_dict = {self.input_batch: x_batch,\n", 662 | " self.ground_truth_tags: y_batch,\n", 663 | " self.learning_rate_ph: learning_rate,\n", 664 | " self.dropout_ph: dropout_keep_probability,\n", 665 | " self.lengths: lengths}\n", 666 | " \n", 667 | " session.run(self.train_op, feed_dict=feed_dict)" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": null, 673 | "metadata": { 674 | "collapsed": true 675 | }, 676 | "outputs": [], 677 | "source": [ 678 | "BiLSTMModel.train_on_batch = classmethod(train_on_batch)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "markdown", 683 | "metadata": {}, 684 | "source": [ 685 | "Implement the function *predict_for_batch* by initializing *feed_dict* with input *x_batch* and *lengths* and running the *session* for *self.predictions*." 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": null, 691 | "metadata": { 692 | "collapsed": true 693 | }, 694 | "outputs": [], 695 | "source": [ 696 | "def predict_for_batch(self, session, x_batch, lengths):\n", 697 | " ######################################\n", 698 | " ######### YOUR CODE HERE #############\n", 699 | " ######################################\n", 700 | " \n", 701 | " return predictions" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": { 708 | "collapsed": true 709 | }, 710 | "outputs": [], 711 | "source": [ 712 | "BiLSTMModel.predict_for_batch = classmethod(predict_for_batch)" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "We finished with necessary methods of our BiLSTMModel model and almost ready to start experimenting.\n", 720 | "\n", 721 | "### Evaluation \n", 722 | "To simplify the evaluation process we provide two functions for you:\n", 723 | " - *predict_tags*: uses a model to get predictions and transforms indices to tokens and tags;\n", 724 | " - *eval_conll*: calculates precision, recall and F1 for the results." 
725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": null, 730 | "metadata": { 731 | "collapsed": true 732 | }, 733 | "outputs": [], 734 | "source": [ 735 | "from evaluation import precision_recall_f1" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": { 742 | "collapsed": true 743 | }, 744 | "outputs": [], 745 | "source": [ 746 | "def predict_tags(model, session, token_idxs_batch, lengths):\n", 747 | " \"\"\"Performs predictions and transforms indices to tokens and tags.\"\"\"\n", 748 | " \n", 749 | " tag_idxs_batch = model.predict_for_batch(session, token_idxs_batch, lengths)\n", 750 | " \n", 751 | " tags_batch, tokens_batch = [], []\n", 752 | " for tag_idxs, token_idxs in zip(tag_idxs_batch, token_idxs_batch):\n", 753 | " tags, tokens = [], []\n", 754 | " for tag_idx, token_idx in zip(tag_idxs, token_idxs):\n", 755 | " tags.append(idx2tag[tag_idx])\n", 756 | " tokens.append(idx2token[token_idx])\n", 757 | " tags_batch.append(tags)\n", 758 | " tokens_batch.append(tokens)\n", 759 | " return tags_batch, tokens_batch\n", 760 | " \n", 761 | " \n", 762 | "def eval_conll(model, session, tokens, tags, short_report=True):\n", 763 | " \"\"\"Computes NER quality measures using the CONLL shared task script.\"\"\"\n", 764 | " \n", 765 | " y_true, y_pred = [], []\n", 766 | " for x_batch, y_batch, lengths in batches_generator(1, tokens, tags):\n", 767 | " tags_batch, tokens_batch = predict_tags(model, session, x_batch, lengths)\n", 768 | " if len(x_batch[0]) != len(tags_batch[0]):\n", 769 | " raise Exception(\"Incorrect length of prediction for the input, \"\n", 770 | " \"expected length: %i, got: %i\" % (len(x_batch[0]), len(tags_batch[0])))\n", 771 | " predicted_tags = []\n", 772 | " ground_truth_tags = []\n", 773 | " for gt_tag_idx, pred_tag, token in zip(y_batch[0], tags_batch[0], tokens_batch[0]): \n", 774 | " if token != '':\n", 775 | " ground_truth_tags.append(idx2tag[gt_tag_idx])\n", 776 | " predicted_tags.append(pred_tag)\n", 777 | "\n", 778 | " # We extend every prediction and ground truth sequence with the 'O' tag\n", 779 | " # to indicate a possible end of an entity.\n", 780 | " y_true.extend(ground_truth_tags + ['O'])\n", 781 | " y_pred.extend(predicted_tags + ['O'])\n", 782 | " \n", 783 | " results = precision_recall_f1(y_true, y_pred, print_results=True, short_report=short_report)\n", 784 | " return results" 785 | ] 786 | }, 787 | { 788 | "cell_type": "markdown", 789 | "metadata": {}, 790 | "source": [ 791 | "## Run your experiment" 792 | ] 793 | }, 794 | { 795 | "cell_type": "markdown", 796 | "metadata": {}, 797 | "source": [ 798 | "Create a *BiLSTMModel* model with the following parameters:\n", 799 | " - *vocabulary_size* — number of tokens;\n", 800 | " - *n_tags* — number of tags;\n", 801 | " - *embedding_dim* — dimension of embeddings, recommended value: 200;\n", 802 | " - *n_hidden_rnn* — size of hidden layers for RNN, recommended value: 200;\n", 803 | " - *PAD_index* — the index of the padding token (``).\n", 804 | "\n", 805 | "Set hyperparameters. You might want to start with the following recommended values:\n", 806 | "- *batch_size*: 32;\n", 807 | "- 4 epochs;\n", 808 | "- starting value of *learning_rate*: 0.005;\n", 809 | "- *learning_rate_decay*: the square root of 2;\n", 810 | "- *dropout_keep_probability*: try several values: 0.1, 0.5, 0.9.\n", 811 | "\n", 812 | "However, feel free to conduct more experiments to tune hyperparameters and earn extra points for the assignment." 
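As a starting point, the recommended settings above might translate into the experiment cell roughly as follows. This is a sketch only: *token2idx*, *tag2idx* and the `'<PAD>'` token name are assumed to match the dictionaries built earlier in the notebook, and you are encouraged to tune the hyperparameters further.

```python
tf.reset_default_graph()

# token2idx / tag2idx and the '<PAD>' token name are assumed from the
# data-preparation part of the notebook; adjust them to your own names.
model = BiLSTMModel(vocabulary_size=len(token2idx),
                    n_tags=len(tag2idx),
                    embedding_dim=200,
                    n_hidden_rnn=200,
                    PAD_index=token2idx['<PAD>'])

batch_size = 32
n_epochs = 4
learning_rate = 0.005
learning_rate_decay = 2 ** 0.5       # the square root of 2
dropout_keep_probability = 0.5       # also try 0.1 and 0.9
```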
813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": { 819 | "collapsed": true 820 | }, 821 | "outputs": [], 822 | "source": [ 823 | "tf.reset_default_graph()\n", 824 | "\n", 825 | "model = ######### YOUR CODE HERE #############\n", 826 | "\n", 827 | "batch_size = ######### YOUR CODE HERE #############\n", 828 | "n_epochs = ######### YOUR CODE HERE #############\n", 829 | "learning_rate = ######### YOUR CODE HERE #############\n", 830 | "learning_rate_decay = ######### YOUR CODE HERE #############\n", 831 | "dropout_keep_probability = ######### YOUR CODE HERE #############" 832 | ] 833 | }, 834 | { 835 | "cell_type": "markdown", 836 | "metadata": {}, 837 | "source": [ 838 | "If you get the error *\"Tensor conversion requested dtype float64 for Tensor with dtype float32\"* at this point, check whether any variables were initialised without an explicit dtype. Set dtype to *tf.float32* for such variables." 839 | ] 840 | }, 841 | { 842 | "cell_type": "markdown", 843 | "metadata": {}, 844 | "source": [ 845 | "Finally, we are ready to run the training!" 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "execution_count": null, 851 | "metadata": { 852 | "collapsed": true 853 | }, 854 | "outputs": [], 855 | "source": [ 856 | "sess = tf.Session()\n", 857 | "sess.run(tf.global_variables_initializer())\n", 858 | "\n", 859 | "print('Start training... \\n')\n", 860 | "for epoch in range(n_epochs):\n", 861 | " # For each epoch evaluate the model on train and validation data\n", 862 | " print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(n_epochs) + '-' * 20)\n", 863 | " print('Train data evaluation:')\n", 864 | " eval_conll(model, sess, train_tokens, train_tags, short_report=True)\n", 865 | " print('Validation data evaluation:')\n", 866 | " eval_conll(model, sess, validation_tokens, validation_tags, short_report=True)\n", 867 | " \n", 868 | " # Train the model\n", 869 | " for x_batch, y_batch, lengths in batches_generator(batch_size, train_tokens, train_tags):\n", 870 | " model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability)\n", 871 | " \n", 872 | " # Decay the learning rate\n", 873 | " learning_rate = learning_rate / learning_rate_decay\n", 874 | " \n", 875 | "print('...training finished.')" 876 | ] 877 | }, 878 | { 879 | "cell_type": "markdown", 880 | "metadata": {}, 881 | "source": [ 882 | "Now let us see the full quality reports for the final model on the train, validation, and test sets. 
To give you a hint about whether you have implemented everything correctly, you might expect an F-score of about 40% on the validation set.\n", 883 | "\n", 884 | "**The output of the cell below (as well as the output of all the other cells) should be present in the notebook for peer review!**" 885 | ] 886 | }, 887 | { 888 | "cell_type": "code", 889 | "execution_count": null, 890 | "metadata": { 891 | "collapsed": true 892 | }, 893 | "outputs": [], 894 | "source": [ 895 | "print('-' * 20 + ' Train set quality: ' + '-' * 20)\n", 896 | "train_results = eval_conll(model, sess, train_tokens, train_tags, short_report=False)\n", 897 | "\n", 898 | "print('-' * 20 + ' Validation set quality: ' + '-' * 20)\n", 899 | "validation_results = ######### YOUR CODE HERE #############\n", 900 | "\n", 901 | "print('-' * 20 + ' Test set quality: ' + '-' * 20)\n", 902 | "test_results = ######### YOUR CODE HERE #############" 903 | ] 904 | }, 905 | { 906 | "cell_type": "markdown", 907 | "metadata": {}, 908 | "source": [ 909 | "### Conclusions\n", 910 | "\n", 911 | "Could we say that our model is state of the art and the results are acceptable for the task? Definitely, we can say so. Nowadays, Bi-LSTM is one of the state-of-the-art approaches for solving the NER problem and it outperforms other classical methods. Despite the fact that we used small training corpora (in comparison with the usual sizes of corpora in Deep Learning), our results are quite good. In addition, in this task there are many possible named entities, and for some of them we have only several dozen training examples, which is definitely small. However, the implemented model outperforms classical CRFs for this task. Even better results could be obtained by combining several types of methods, e.g. see [this](https://arxiv.org/abs/1603.01354) paper if you are interested." 
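For completeness, the two evaluation placeholders in the quality-report cell above might be filled in along the following lines (a sketch; *test_tokens* and *test_tags* are assumed to be the names given to the test split when the data was loaded earlier in the notebook):

```python
print('-' * 20 + ' Validation set quality: ' + '-' * 20)
validation_results = eval_conll(model, sess, validation_tokens, validation_tags, short_report=False)

print('-' * 20 + ' Test set quality: ' + '-' * 20)
test_results = eval_conll(model, sess, test_tokens, test_tags, short_report=False)
```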
912 | ] 913 | } 914 | ], 915 | "metadata": { 916 | "kernelspec": { 917 | "display_name": "Python 3", 918 | "language": "python", 919 | "name": "python3" 920 | }, 921 | "language_info": { 922 | "codemirror_mode": { 923 | "name": "ipython", 924 | "version": 3 925 | }, 926 | "file_extension": ".py", 927 | "mimetype": "text/x-python", 928 | "name": "python", 929 | "nbconvert_exporter": "python", 930 | "pygments_lexer": "ipython3", 931 | "version": "3.4.3" 932 | } 933 | }, 934 | "nbformat": 4, 935 | "nbformat_minor": 1 936 | } 937 | -------------------------------------------------------------------------------- /week3/.gitignore: -------------------------------------------------------------------------------- 1 | GoogleNews-vectors-negative300.* 2 | starspace_embedding 3 | starspace_embedding.* 4 | -------------------------------------------------------------------------------- /week3/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | class Grader(object): 7 | def __init__(self): 8 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 9 | self.assignment_key = '7DdYfMQFEeevjw7-W7Fr0A' 10 | self.parts = OrderedDict([('98mDT', 'Question2Vec'), 11 | ('nc7RP', 'HitsCount'), 12 | ('bNp90', 'DCGScore'), 13 | ('3gRlQ', 'W2VTokenizedRanks'), 14 | ('mX6wS', 'StarSpaceRanks')]) 15 | self.answers = {key: None for key in self.parts} 16 | 17 | @staticmethod 18 | def ravel_output(output): 19 | ''' 20 | If student accidentally submitted np.array with one 21 | element instead of number, this function will submit 22 | this number instead 23 | ''' 24 | if isinstance(output, np.ndarray) and output.size == 1: 25 | output = output.item(0) 26 | return output 27 | 28 | def submit(self, email, token): 29 | submission = { 30 | "assignmentKey": self.assignment_key, 31 | "submitterEmail": email, 32 | "secret": token, 33 | "parts": {} 34 | } 35 | for part, output in self.answers.items(): 36 | if output is not None: 37 | submission["parts"][part] = {"output": output} 38 | else: 39 | submission["parts"][part] = dict() 40 | request = requests.post(self.submission_page, data=json.dumps(submission)) 41 | response = request.json() 42 | if request.status_code == 201: 43 | print('Submitted to Coursera platform. 
See results on assignment page!') 44 | elif u'details' in response and u'learnerMessage' in response[u'details']: 45 | print(response[u'details'][u'learnerMessage']) 46 | else: 47 | print("Unknown response from Coursera: {}".format(request.status_code)) 48 | print(response) 49 | 50 | def status(self): 51 | print("You want to submit these parts:") 52 | for part_id, part_name in self.parts.items(): 53 | answer = self.answers[part_id] 54 | if answer is None: 55 | answer = '-'*10 56 | print("Task {}: {}".format(part_name, answer[:100] + '...')) 57 | 58 | def submit_part(self, part, output): 59 | self.answers[part] = output 60 | print("Current answer for task {} is: {}".format(self.parts[part], output[:100] + '...')) 61 | 62 | def submit_tag(self, tag, output): 63 | part_id = [k for k, v in self.parts.items() if v == tag] 64 | if len(part_id) != 1: 65 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 66 | part_id = part_id[0] 67 | self.submit_part(part_id, str(self.ravel_output(output))) 68 | -------------------------------------------------------------------------------- /week3/util.py: -------------------------------------------------------------------------------- 1 | import re 2 | from nltk.corpus import stopwords 3 | 4 | REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]') 5 | GOOD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]') 6 | STOPWORDS = set(stopwords.words('english')) 7 | def text_prepare(text): 8 | text = text.lower() 9 | text = REPLACE_BY_SPACE_RE.sub(' ', text) 10 | text = GOOD_SYMBOLS_RE.sub('', text) 11 | text = ' '.join([x for x in text.split() if x and x not in STOPWORDS]) 12 | return text.strip() 13 | 14 | def array_to_string(arr): 15 | return '\n'.join(str(num) for num in arr) 16 | 17 | def matrix_to_string(matrix): 18 | return '\n'.join('\t'.join(str(num) for num in line) for line in matrix) -------------------------------------------------------------------------------- /week4/encoder-decoder-pic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nsanghi/HSE-NLP-Coursera/9df88e63eba6dbb38cabd87bd88fff25f4abcda6/week4/encoder-decoder-pic.png --------------------------------------------------------------------------------