├── .gitignore ├── AWS-tutorial.md ├── Docker-tutorial.md ├── common ├── README.md └── download_utils.py ├── docker ├── Dockerfile ├── requirements.txt └── welcome_message.txt ├── honor ├── README.md ├── datasets.py ├── download_cornell.sh ├── download_opensubs.sh └── example.py ├── project ├── .gitignore ├── dialogue_manager.py ├── main_bot.py ├── utils.py ├── week5-project-Soln.ipynb └── week5-project.ipynb ├── week1 ├── grader.py ├── metrics.py ├── week1-MultilabelClassification-NewSolution.ipynb ├── week1-MultilabelClassification-Solution.ipynb └── week1-MultilabelClassification.ipynb ├── week2 ├── evaluation.py ├── week2-NER-MySolution.ipynb ├── week2-NER.ipynb ├── week2-NER_peerreview1.ipynb ├── week2-NER_peerreview3.ipynb ├── week2-NER_peerreview4.ipynb └── week2-NER_v1_1_peerreview2.ipynb ├── week3 ├── .gitignore ├── grader.py ├── util.py ├── week3-Embeddings-Solution.ipynb └── week3-Embeddings.ipynb └── week4 ├── encoder-decoder-pic.png ├── week4-seq2seq-Soln.ipynb ├── week4-seq2seq.ipynb ├── week4-seq2seq_eval1.ipynb ├── week4-seq2seq_eval2.ipynb └── week4-seq2seq_eval3.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # Data for assignments 104 | data/ 105 | -------------------------------------------------------------------------------- /AWS-tutorial.md: -------------------------------------------------------------------------------- 1 | # Tutorial for setting up an AWS Virtual Machine 2 | 3 | This tutorial will teach you how to set up an AWS Virtual Machine for the final project of our course. 4 | 5 | ### 1. Register with AWS and launch an EC2 instance 6 | 7 | First, you need to perform several preparatory steps (if you have already done this before, you can skip them): 8 | - [Sign up for AWS](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#sign-up-for-aws). 
You will need to specify your credit card details, but for our project we will use Free Tier instances only, so you should not be charged. 9 | - [Create a key pair for authentication](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#create-a-key-pair). If you use Windows, you will also need to install [PuTTY](https://www.chiark.greenend.org.uk/~sgtatham/putty/) to use SSH. 10 | - [Create security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#create-a-base-security-group). You must add rules to a security group to allow you to connect to your future instance from your IP address using SSH. You might want to allow SSH access from all IPv4 addresses (set to 0.0.0.0/0), because your IP might change. 11 | 12 | Next, you are ready to create your first EC2 instance: 13 | - [Launch a free tier instance](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance). For Amazon Machine Image (AMI) choose **Ubuntu Server 16.04 LTS**. 14 | - [Connect to your instance](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-connect-to-instance-linux) using SSH. 15 | - Later on you can [start and stop](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Stop_Start.html) your instance when needed, and [terminate](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-clean-up-your-instance) it in the end. 16 | 17 | ### 2. Set up dependencies and run your project 18 | 19 | - Install Docker container for Ubuntu with course dependencies. Follow our Docker instructions. 20 | 21 | - To be able to access IPython notebooks running on AWS, you might want to SSH with port tunneling: 22 | ```sh 23 | ssh -L 8080:localhost:8080 -i path/to/private_key ubuntu@ec2-XX-XXX-X-XX.us-east-2.compute.amazonaws.com 24 | ``` 25 | Then you will be able to see the notebooks on *localhost:8080* from your browser on the local machine. 26 | 27 | - Bring code and data to AWS instance, e.g. 28 | ```sh 29 | scp -i path/to/your_key.pem path/to/local_file ubuntu@ec2-XX-XXX-X-XX.us-east-2.compute.amazonaws.com:path/to/remote_file 30 | ``` 31 | You might want to install [WinSCP](https://winscp.net/eng/docs/lang:ru) for data transfer if you are using Windows. 32 | 33 | - It is also a good practice to use [tmux](https://medium.com/@peterxjang/a-minimalist-guide-to-tmux-13675fb160fa) to keep your remote session running even if you disconnect from the machine, e.g. by closing your laptop. 34 | 35 | -------------------------------------------------------------------------------- /Docker-tutorial.md: -------------------------------------------------------------------------------- 1 | # Docker container with course dependencies 2 | 3 | This file describes how to use a Docker container with Jupyter notebook and 4 | all dependencies required for the course. 5 | 6 | The image is located at https://hub.docker.com/r/akashin/coursera-aml-nlp/. 7 | 8 | ## Install Stable Docker Community Edition (CE) 9 | 10 | - For Mac: 11 | https://docs.docker.com/docker-for-mac/install/ 12 | 13 | - For Ubuntu: 14 | https://docs.docker.com/engine/installation/linux/docker-ce/ubuntu/ (see also other Linux distributives in the menu). 
15 | 
16 | - For Windows (64bit Windows 10 Pro, Enterprise and Education): 
17 | https://docs.docker.com/docker-for-windows/install/ 
18 | 
19 | - For Windows (older versions): 
20 | https://docs.docker.com/toolbox/toolbox_install_windows/ 
21 | 
22 | 
23 | 
24 | ## Get container image 
25 | 
26 | To get the latest version of the container image, run: 
27 | ```sh 
28 | docker pull akashin/coursera-aml-nlp 
29 | ``` 
30 | It contains the Ubuntu 16.04 Linux distribution and all dependencies that you need for our course. The downloaded image takes approximately 2.3GB. 
31 | 
32 | **Note:** If you are getting an error "Got permission denied while trying to connect to the Docker daemon socket...", you need to add the current user to the docker group: 
33 | ```sh 
34 | sudo usermod -a -G docker $USER 
35 | sudo service docker restart 
36 | ``` 
37 | Then you need to log out and log in to the system again (disconnect and reconnect to your AWS instance if you are setting up Docker on it). 
38 | 
39 | 
40 | ## Run container for the first time 
41 | 
42 | Now you can start a new container from this image with: 
43 | ```sh 
44 | docker run -it -p 8080:8080 --name coursera-aml-nlp akashin/coursera-aml-nlp 
45 | ``` 
46 | This will start the Ubuntu instance and give you access to its command line. You can type `run_notebook` to launch the IPython notebook server. 
47 | 
48 | You may find it useful to mount a directory from your local machine within the container using the `-v` option. 
49 | 
50 | For Linux and OSX, the following command should work: 
51 | ```sh 
52 | docker run -it -p 8080:8080 --name coursera-aml-nlp -v $PWD:/root/coursera akashin/coursera-aml-nlp 
53 | ``` 
54 | This uses the shell variable `$PWD` to mount the current directory to the folder `/root/coursera` in the container. Alternatively, you can mount an arbitrary directory by replacing `$PWD` with a custom path. 
55 | 
56 | For Windows, there are some extra [steps](https://rominirani.com/docker-on-windows-mounting-host-directories-d96f3f056a2c) involved, and the launch command looks like 
57 | ```sh 
58 | docker run -it -p 8080:8080 --name coursera-aml-nlp --user root -v /c/Users/$YOUR_USERNAME:/root/coursera akashin/coursera-aml-nlp 
59 | ``` 
60 | where `/c/Users/$YOUR_USERNAME` is the path to your user's home folder. 
61 | 
62 | If you're using Docker Toolbox on Windows, the command given above might not work because of the additional VirtualBox layer involved. Instead, we recommend following the guidance in http://blog.shahinrostami.com/2017/11/docker-toolbox-windows-7-shared-volumes/. 
63 | 
64 | ## Stop and resume container 
65 | 
66 | To stop the container use: 
67 | ```sh 
68 | docker stop coursera-aml-nlp 
69 | ``` 
70 | All the changes that were made within the container will be saved. 
71 | 
72 | To resume the stopped container use: 
73 | ```sh 
74 | docker start -i coursera-aml-nlp 
75 | ``` 
76 | ## Other operations on the container 
77 | 
78 | There are many other operations that you can perform on the container. To show all of them: 
79 | ```sh 
80 | docker container --help 
81 | ``` 
82 | Some particularly useful ones are **showing a list of containers** and **removing a container**. 
83 | 
84 | To show currently running and stopped containers with their status: 
85 | ```sh 
86 | docker ps -a 
87 | ``` 
88 | 
89 | To connect to a Bash shell in the already running container named `coursera-aml-nlp`, run: 
90 | ``` 
91 | docker exec -it coursera-aml-nlp bash 
92 | ``` 
93 | This will drop you into the standard Linux Bash shell that supports common commands like `ls`, `wget` or `python3`.
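Another operation that often comes in handy is copying individual files between your machine and the running container with `docker cp` (the file names below are only placeholders):
```sh
# Copy a local file into the container's home directory.
docker cp path/to/local_file coursera-aml-nlp:/root/
# Copy a file from the container back to the current local directory.
docker cp coursera-aml-nlp:/root/some_result.txt .
```
This is a lightweight alternative to the `-v` mount described above when you only need to move one or two files.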
94 | 
95 | To remove the container and all data associated with it: 
96 | ```sh 
97 | docker rm coursera-aml-nlp 
98 | ``` 
99 | Note that this will remove all the internal data of the container (e.g. installed packages), but all the data written inside your local mounted folder (`-v` option) will not be affected. 
100 | 
101 | ## Install more packages 
102 | 
103 | You can install more packages in the container if needed: 
104 | ```sh 
105 | docker exec coursera-aml-nlp pip3 install PACKAGE_NAME 
106 | ``` 
107 | 
108 | ## Change RAM limits of the container 
109 | 
110 | Your container might have memory limits that are different from the actual limits of your physical machine, which might lead to a crash of your code due to memory shortage. 
111 | 
112 | - If you're running Windows or OSX, the default limit is 2GB, but you can change it by following these tutorials: 
113 |   - For Windows: https://docs.docker.com/docker-for-windows/#advanced 
114 |   - For Mac OSX: https://docs.docker.com/docker-for-mac/#advanced 
115 | 
116 | - If you're running Linux, you're all set as the memory limits are the same as the physical memory of your machine. 
117 | 
118 | 
119 | ## Further reading 
120 | 
121 | If you are interested in learning more about Docker, check out these articles: 
122 | - Using Jupyter notebook from Docker: https://www.dataquest.io/blog/docker-data-science/ 
123 | - General introduction to Docker: https://docker-curriculum.com/ 
124 | 
125 | ## Troubleshooting 
126 | 
127 | ### Verify your Docker installation by running "Hello World" application 
128 | - Run `docker pull hello-world`. You should see a message that ends with 
129 | “Status: Downloaded newer image for hello-world:latest”. 
130 | - Run `docker run hello-world`. You should see a message that starts with 
131 | “Hello from Docker! 
132 | This message shows that your installation appears to be working correctly.” 
133 | 
134 | If you see any errors, follow the relevant troubleshooting steps. 
135 | 
136 | ### “Unauthorized: authentication required” when trying to pull Docker image 
137 | Run `docker logout` and try pulling again. If this doesn't help, make sure the system date is set correctly and try again. If this doesn't help, reinstall Docker and try again. 
138 | 
139 | ### Can't open Jupyter notebook in the browser 
140 | If you try to open "http://localhost:8080" or "http://127.0.0.1:8080" in your browser while the `run_notebook` command is running and you can't access your notebooks, here is some advice: 
141 | - If you're using Docker Toolbox on Windows, try accessing "http://192.168.99.100:8080" instead. If this doesn't work, follow the instructions [on official Docker docs](https://docs.docker.com/docker-for-windows/troubleshoot/#limitations-of-windows-containers-for-localhost-and-published-ports) and on [Stackoverflow](https://stackoverflow.com/questions/42866013/docker-toolbox-localhost-not-working). 
142 | - Make sure that you're running the container with the `-p` flag as described [here](#run-container-for-the-first-time) and that the output of `docker ps` contains a line like this: 
143 | ``` 
144 | CONTAINER ID        IMAGE                      COMMAND       CREATED                  STATUS         PORTS      NAMES 
145 | e5b7bcd85a1b        akashin/coursera-aml-nlp   "/bin/bash"   Less than a second ago   Up 2 seconds   8080/tcp   peaceful_lamarr 
146 | ``` 
147 | If the part about `PORTS` differs, remove the current container following the [instructions](#other-operations-on-the-container) and start it again. 
148 | - Make sure that browser proxy settings don't interfere with accessing local web sites.
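- You can also double-check which ports are actually published with `docker port coursera-aml-nlp`; if the `-p 8080:8080` option took effect, it should print something like `8080/tcp -> 0.0.0.0:8080`.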
149 | 150 | ### How do I load data into Docker container? 151 | To access the data in the container, we recommend to use `-v` flag described [here](#run-container-for-the-first-time) to mount a local directory from your computer into the container filesystem. For more details read [Docker documentation](https://docs.docker.com/storage/volumes/). 152 | 153 | Alternatively, you can download data using Jupyter "Upload" button or `wget` command in the [Bash shell](#other-operations-on-the-container) of the container. 154 | 155 | ### Can't run `run_notebook` or `starspace` command 156 | Make sure that you're executing it in the context of the Docker container as described [here](#run-container-for-the-first-time). 157 | 158 | ### "Name is already in use by container" when trying to run the container 159 | This means that the container with this name is already created. You can connect to this container or remove it by following [instructions](#other-operations-on-the-container). 160 | 161 | ### StarSpace/Jupyter notebook crashes in Docker 162 | This usually happens due to low default 2GB memory limit on Windows and OSX. Follow this [instructions](#change-ram-limits-of-the-container) to fix this. 163 | 164 | ## Reporting the issue to the Coursera forum 165 | Before reporting the issue to the Coursera forum, please, make sure that you've checked the [troubleshooting](#troubleshooting) steps. Only if they don't help, post all relevant error messages, throubleshooting results, and the following information to your post: 166 | 167 | - Your operating system (e.g. Windows 7, Ubuntu Linux, OSX 10.13.3) 168 | - Your docker version (e.g. Docker Toolbox, Docker for Windows, output of `docker --version`) 169 | - Output of `docker ps -a`, `docker info`, `docker version -f "{{ .Server.Os }}"` (share thorough https://gist.github.com/ or https://pastebin.com/) 170 | - Output of `wget http://localhost:8080` (or `wget http://192.168.99.100:8080` for Docker Toolbox), executed from within Docker container and outside of it 171 | 172 | ## Credits 173 | 174 | The template for this dockerfile was taken from https://github.com/ZEMUSHKA/coursera-aml-docker 175 | -------------------------------------------------------------------------------- /common/README.md: -------------------------------------------------------------------------------- 1 | # Common utils 2 | 3 | This folder stores collection of functions that are common for different assignments 4 | 5 | - `download_utils.py`: Functions for downloading data for the assignments. 6 | -------------------------------------------------------------------------------- /common/download_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import shutil 5 | 6 | try: 7 | import tqdm 8 | # Address problem in tqdm library. 
For details see: https://github.com/tqdm/tqdm/issues/481 9 | tqdm.monitor_interval = 0 10 | except ImportError: 11 | tqdm = None 12 | 13 | import requests 14 | 15 | REPOSITORY_PATH="https://github.com/hse-aml/natural-language-processing" 16 | 17 | 18 | def download_file(url, file_path): 19 | r = requests.get(url, stream=True) 20 | total_size = int(r.headers.get('content-length')) 21 | try: 22 | with open(file_path, 'wb', buffering=16*1024*1024) as f: 23 | if tqdm: 24 | bar = tqdm.tqdm_notebook(total=total_size, unit='B', unit_scale=True) 25 | bar.set_description(os.path.split(file_path)[-1]) 26 | 27 | for chunk in r.iter_content(32 * 1024): 28 | f.write(chunk) 29 | if tqdm: 30 | bar.update(len(chunk)) 31 | 32 | if tqdm: 33 | bar.close() 34 | else: 35 | print("File {!r} successfully downloaded".format(file_path)) 36 | except Exception: 37 | print("Download failed") 38 | finally: 39 | if os.path.getsize(file_path) != total_size: 40 | os.remove(file_path) 41 | print("Removed incomplete download") 42 | 43 | 44 | def download_from_github(version, fn, target_dir, force=False): 45 | url = REPOSITORY_PATH + "/releases/download/{0}/{1}".format(version, fn) 46 | file_path = os.path.join(target_dir, fn) 47 | if os.path.exists(file_path) and not force: 48 | print("File {} is already downloaded.".format(file_path)) 49 | return 50 | download_file(url, file_path) 51 | 52 | 53 | def sequential_downloader(version, fns, target_dir, force=False): 54 | os.makedirs(target_dir, exist_ok=True) 55 | for fn in fns: 56 | download_from_github(version, fn, target_dir, force=force) 57 | 58 | 59 | def link_all_files_from_dir(src_dir, dst_dir): 60 | os.makedirs(dst_dir, exist_ok=True) 61 | for fn in os.listdir(src_dir): 62 | src_file = os.path.join(src_dir, fn) 63 | dst_file = os.path.join(dst_dir, fn) 64 | if os.name == "nt": 65 | shutil.copyfile(src_file, dst_file) 66 | else: 67 | if not os.path.exists(dst_file): 68 | os.symlink(os.path.abspath(src_file), dst_file) 69 | 70 | 71 | def link_resources(): 72 | link_all_files_from_dir("../readonly/dataset/", ".") 73 | 74 | 75 | def download_week1_resources(force=False): 76 | sequential_downloader( 77 | "week1", 78 | [ 79 | "train.tsv", 80 | "validation.tsv", 81 | "test.tsv", 82 | "text_prepare_tests.tsv", 83 | ], 84 | "data", 85 | force=force 86 | ) 87 | 88 | 89 | def download_week2_resources(force=False): 90 | sequential_downloader( 91 | "week2", 92 | [ 93 | "train.txt", 94 | "validation.txt", 95 | "test.txt", 96 | ], 97 | "data", 98 | force=force 99 | ) 100 | 101 | 102 | def download_week3_resources(force=False): 103 | sequential_downloader( 104 | "week3", 105 | [ 106 | "train.tsv", 107 | "validation.tsv", 108 | "test.tsv", 109 | "test_embeddings.tsv", 110 | ], 111 | "data", 112 | force=force 113 | ) 114 | 115 | 116 | def download_project_resources(force=False): 117 | sequential_downloader( 118 | "project", 119 | [ 120 | "dialogues.tsv", 121 | "tagged_posts.tsv", 122 | ], 123 | "data", 124 | force=force 125 | ) 126 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | LABEL maintainer="Andrei Kashin " 3 | 4 | RUN apt-get update && apt-get install -yq \ 5 | python3 python3-pip htop nano git wget \ 6 | libglib2.0-0 autoconf automake \ 7 | libtool build-essential unzip \ 8 | libarchive-dev vim 9 | 10 | # Install Starspace. 
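# StarSpace is compiled from source below; at the time this image was put together,
# its makefile looked for the Boost 1.63 headers under /usr/local/bin/boost_1_63_0
# by default, which is why the Boost archive is unpacked and moved into /usr/local/bin first.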
11 | RUN wget https://dl.bintray.com/boostorg/release/1.63.0/source/boost_1_63_0.zip && \ 12 | unzip boost_1_63_0.zip && \ 13 | mv boost_1_63_0 /usr/local/bin 14 | 15 | RUN git clone https://github.com/facebookresearch/Starspace.git && \ 16 | cd Starspace && \ 17 | make && \ 18 | cp -Rf starspace /usr/local/bin 19 | 20 | # Install Python dependencies. 21 | ADD requirements.txt / 22 | RUN pip3 install --upgrade pip 23 | RUN pip3 install -r requirements.txt 24 | 25 | # Install Jupyter. 26 | RUN jupyter nbextension enable --py --sys-prefix widgetsnbextension 27 | RUN jupyter contrib nbextension install 28 | RUN jupyter nbextension enable codefolding/main 29 | RUN echo "c.NotebookApp.ip = '*'" >> /root/.jupyter/jupyter_notebook_config.py 30 | RUN echo "c.NotebookApp.port = 8080" >> /root/.jupyter/jupyter_notebook_config.py 31 | RUN echo "c.NotebookApp.token = ''" >> /root/.jupyter/jupyter_notebook_config.py 32 | RUN echo "jupyter notebook --no-browser --allow-root" >> /usr/local/bin/run_notebook && chmod +x /usr/local/bin/run_notebook 33 | 34 | # Welcome message. 35 | ADD welcome_message.txt / 36 | RUN echo '[ ! -z "$TERM" -a -r /etc/motd ] && cat /etc/motd' \ 37 | >> /etc/bash.bashrc \ 38 | ; cat welcome_message.txt > /etc/motd 39 | 40 | WORKDIR /root 41 | EXPOSE 8080 42 | -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | backports.weakref==1.0.post1 2 | bleach==1.5.0 3 | certifi==2017.11.5 4 | chardet==3.0.4 5 | ChatterBot==0.7.6 6 | decorator==4.1.2 7 | entrypoints==0.2.3 8 | enum34==1.1.6 9 | funcsigs==1.0.2 10 | gensim==3.1.0 11 | html5lib==0.9999999 12 | idna==2.6 13 | ipykernel==4.6.1 14 | ipython==6.2.1 15 | ipython-genutils==0.2.0 16 | ipywidgets==7.0.5 17 | jedi==0.11.0 18 | Jinja2==2.10 19 | jsonschema==2.6.0 20 | jupyter==1.0.0 21 | jupyter-client==5.1.0 22 | jupyter-console==5.2.0 23 | jupyter-contrib-core==0.3.3 24 | jupyter-contrib-nbextensions==0.3.3 25 | jupyter-core==4.4.0 26 | jupyter-highlight-selected-word==0.1.0 27 | jupyter-latex-envs==1.3.8.4 28 | jupyter-nbextensions-configurator==0.2.8 29 | libarchive==0.4.4 30 | Markdown==2.6.9 31 | MarkupSafe==1.0 32 | matplotlib==2.1.0 33 | mistune==0.8.1 34 | mock==2.0.0 35 | nbconvert==5.3.1 36 | nbformat==4.4.0 37 | nltk==3.2.5 38 | notebook==5.2.1 39 | numpy==1.13.3 40 | pandas==0.21.0 41 | pandocfilters==1.4.2 42 | parso==0.1.0 43 | pbr==3.1.1 44 | pexpect==4.3.0 45 | pickleshare==0.7.4 46 | prompt-toolkit==1.0.15 47 | protobuf==3.5.0.post1 48 | ptyprocess==0.5.2 49 | Pygments==2.2.0 50 | python-dateutil==2.6.1 51 | pyzmq==16.0.3 52 | qtconsole==4.3.1 53 | regex==2017.11.9 54 | requests==2.18.4 55 | scikit-learn==0.19.1 56 | scipy==1.0.0 57 | simplegeneric==0.8.1 58 | six==1.11.0 59 | tensorflow==1.4.0 60 | tensorflow-tensorboard==0.4.0rc3 61 | terminado==0.7 62 | testpath==0.3.1 63 | tornado==4.5.2 64 | tqdm==4.19.4 65 | traitlets==4.3.2 66 | urllib3==1.22 67 | wcwidth==0.1.7 68 | Werkzeug==0.12.2 69 | widgetsnbextension==3.0.8 70 | -------------------------------------------------------------------------------- /docker/welcome_message.txt: -------------------------------------------------------------------------------- 1 | 2 | =================================================================== 3 | Welcome to the Docker container for the Coursera NLP course. 4 | 5 | This container contains dependencies that you might need 6 | to complete course assignments. 
7 | 8 | You can also install any additional system dependencies with 9 | > apt-get install PACKAGE_NAME 10 | 11 | And Python dependencies with 12 | > pip3 install PACKAGE_NAME 13 | 14 | To run Jupyter Notebook in the container just type 15 | > run_notebook 16 | =================================================================== 17 | 18 | -------------------------------------------------------------------------------- /honor/README.md: -------------------------------------------------------------------------------- 1 | # Utils to download and read data for chat-bot training 2 | 3 | This folder contains scripts for downloading, reading and preprocessing data for chat-bot training: 4 | - `download_cornell.sh` - downloads Cornell movie dialogues dataset (small size) 5 | - `download_opensubs.sh` - downloads Opensubs movie subtitles dataset (large size) 6 | - `datasets.py` - module to be imported in your scripts, that exports functions for reading a dataset 7 | - `example.py` - example of reading the dataset 8 | -------------------------------------------------------------------------------- /honor/datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 Conchylicultor. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import ast 17 | import os 18 | import random 19 | import re 20 | from time import time 21 | 22 | import nltk 23 | from tqdm import tqdm 24 | 25 | """ 26 | Load the cornell movie dialog corpus. 27 | 28 | Available from here: 29 | http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html 30 | 31 | """ 32 | 33 | class CornellData: 34 | """ 35 | 36 | """ 37 | 38 | def __init__(self, dirName): 39 | """ 40 | Args: 41 | dirName (string): directory where to load the corpus 42 | """ 43 | self.lines = {} 44 | self.conversations = [] 45 | 46 | MOVIE_LINES_FIELDS = ["lineID","characterID","movieID","character","text"] 47 | MOVIE_CONVERSATIONS_FIELDS = ["character1ID","character2ID","movieID","utteranceIDs"] 48 | 49 | self.lines = self.loadLines(os.path.join(dirName, "movie_lines.txt"), MOVIE_LINES_FIELDS) 50 | self.conversations = self.loadConversations(os.path.join(dirName, "movie_conversations.txt"), MOVIE_CONVERSATIONS_FIELDS) 51 | 52 | # TODO: Cleaner program (merge copy-paste) !! 53 | 54 | def loadLines(self, fileName, fields): 55 | """ 56 | Args: 57 | fileName (str): file to load 58 | field (set): fields to extract 59 | Return: 60 | dict>: the extracted fields for each line 61 | """ 62 | lines = {} 63 | 64 | with open(fileName, 'r', encoding='iso-8859-1') as f: # TODO: Solve Iso encoding pb ! 
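            # The Cornell corpus ships as ISO-8859-1 (Latin-1) text, which is why the file
            # is opened with that encoding; reading it as UTF-8 would raise UnicodeDecodeError
            # on some characters.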
65 | for line in f: 66 | values = line.split(" +++$+++ ") 67 | 68 | # Extract fields 69 | lineObj = {} 70 | for i, field in enumerate(fields): 71 | lineObj[field] = values[i] 72 | 73 | lines[lineObj['lineID']] = lineObj 74 | 75 | return lines 76 | 77 | def loadConversations(self, fileName, fields): 78 | """ 79 | Args: 80 | fileName (str): file to load 81 | field (set): fields to extract 82 | Return: 83 | list>: the extracted fields for each line 84 | """ 85 | conversations = [] 86 | 87 | with open(fileName, 'r', encoding='iso-8859-1') as f: # TODO: Solve Iso encoding pb ! 88 | for line in f: 89 | values = line.split(" +++$+++ ") 90 | 91 | # Extract fields 92 | convObj = {} 93 | for i, field in enumerate(fields): 94 | convObj[field] = values[i] 95 | 96 | # Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]") 97 | lineIds = ast.literal_eval(convObj["utteranceIDs"]) 98 | 99 | # Reassemble lines 100 | convObj["lines"] = [] 101 | for lineId in lineIds: 102 | convObj["lines"].append(self.lines[lineId]) 103 | 104 | conversations.append(convObj) 105 | 106 | return conversations 107 | 108 | def getConversations(self): 109 | return self.conversations 110 | 111 | 112 | # Based on code from https://github.com/AlJohri/OpenSubtitles 113 | # by Al Johri 114 | 115 | import xml.etree.ElementTree as ET 116 | import datetime 117 | import os 118 | import sys 119 | import json 120 | import re 121 | import pprint 122 | 123 | from gzip import GzipFile 124 | 125 | """ 126 | Load the opensubtitles dialog corpus. 127 | """ 128 | 129 | class OpensubsData: 130 | """ 131 | """ 132 | 133 | def __init__(self, dirName): 134 | """ 135 | Args: 136 | dirName (string): directory where to load the corpus 137 | """ 138 | 139 | # Hack this to filter on subset of Opensubtitles 140 | # dirName = "%s/en/Action" % dirName 141 | 142 | print("Loading OpenSubtitles conversations in %s." % dirName) 143 | self.conversations = [] 144 | self.tag_re = re.compile(r'(|<[^>]*>)') 145 | self.conversations = self.loadConversations(dirName) 146 | 147 | def loadConversations(self, dirName): 148 | """ 149 | Args: 150 | dirName (str): folder to load 151 | Return: 152 | array(question, answer): the extracted QA pairs 153 | """ 154 | conversations = [] 155 | dirList = self.filesInDir(dirName) 156 | for filepath in tqdm(dirList, "OpenSubtitles data files"): 157 | if filepath.endswith('gz'): 158 | try: 159 | doc = self.getXML(filepath) 160 | conversations.extend(self.genList(doc)) 161 | except ValueError: 162 | tqdm.write("Skipping file %s with errors." 
% filepath) 163 | except: 164 | print("Unexpected error:", sys.exc_info()[0]) 165 | raise 166 | return conversations 167 | 168 | def getConversations(self): 169 | return self.conversations 170 | 171 | def genList(self, tree): 172 | root = tree.getroot() 173 | 174 | timeFormat = '%H:%M:%S' 175 | maxDelta = datetime.timedelta(seconds=1) 176 | 177 | startTime = datetime.datetime.min 178 | strbuf = '' 179 | sentList = [] 180 | 181 | for child in root: 182 | for elem in child: 183 | if elem.tag == 'time': 184 | elemID = elem.attrib['id'] 185 | elemVal = elem.attrib['value'][:-4] 186 | if elemID[-1] == 'S': 187 | startTime = datetime.datetime.strptime(elemVal, timeFormat) 188 | else: 189 | sentList.append((strbuf.strip(), startTime, datetime.datetime.strptime(elemVal, timeFormat))) 190 | strbuf = '' 191 | else: 192 | try: 193 | strbuf = strbuf + " " + elem.text 194 | except: 195 | pass 196 | 197 | conversations = [] 198 | for idx in range(0, len(sentList) - 1): 199 | cur = sentList[idx] 200 | nxt = sentList[idx + 1] 201 | if nxt[1] - cur[2] <= maxDelta and cur and nxt: 202 | tmp = {} 203 | tmp["lines"] = [] 204 | tmp["lines"].append(self.getLine(cur[0])) 205 | tmp["lines"].append(self.getLine(nxt[0])) 206 | if self.filter(tmp): 207 | conversations.append(tmp) 208 | 209 | return conversations 210 | 211 | def getLine(self, sentence): 212 | line = {} 213 | line["text"] = self.tag_re.sub('', sentence).replace('\\\'','\'').strip().lower() 214 | return line 215 | 216 | def filter(self, lines): 217 | # Use the followint to customize filtering of QA pairs 218 | # 219 | # startwords = ("what", "how", "when", "why", "where", "do", "did", "is", "are", "can", "could", "would", "will") 220 | # question = lines["lines"][0]["text"] 221 | # if not question.endswith('?'): 222 | # return False 223 | # if not question.split(' ')[0] in startwords: 224 | # return False 225 | # 226 | return True 227 | 228 | def getXML(self, filepath): 229 | fext = os.path.splitext(filepath)[1] 230 | if fext == '.gz': 231 | tmp = GzipFile(filename=filepath) 232 | return ET.parse(tmp) 233 | else: 234 | return ET.parse(filepath) 235 | 236 | def filesInDir(self, dirname): 237 | result = [] 238 | for dirpath, dirs, files in os.walk(dirname): 239 | for filename in files: 240 | fname = os.path.join(dirpath, filename) 241 | result.append(fname) 242 | return result 243 | 244 | 245 | def extractText(line, fast_preprocessing=True): 246 | if fast_preprocessing: 247 | GOOD_SYMBOLS_RE = re.compile('[^0-9a-z ]') 248 | REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;#+_]') 249 | REPLACE_SEVERAL_SPACES = re.compile('\s+') 250 | 251 | line = line.lower() 252 | line = REPLACE_BY_SPACE_RE.sub(' ', line) 253 | line = GOOD_SYMBOLS_RE.sub('', line) 254 | line = REPLACE_SEVERAL_SPACES.sub(' ', line) 255 | return line.strip() 256 | else: 257 | return nltk.word_tokenize(line) 258 | 259 | 260 | def splitConversations(conversations, max_len=20, fast_preprocessing=True): 261 | data = [] 262 | for i, conversation in enumerate(tqdm(conversations)): 263 | lines = conversation['lines'] 264 | for i in range(len(lines) - 1): 265 | request = extractText(lines[i]['text']) 266 | reply = extractText(lines[i + 1]['text']) 267 | if 0 < len(request) <= max_len and 0 < len(reply) <= max_len: 268 | data += [(request, reply)] 269 | return data 270 | 271 | 272 | def readCornellData(path, max_len=20, fast_preprocessing=True): 273 | dataset = CornellData(path) 274 | conversations = dataset.getConversations() 275 | return splitConversations(conversations, max_len=max_len, 
fast_preprocessing=fast_preprocessing) 276 | 277 | 278 | def readOpensubsData(path, max_len=20, fast_preprocessing=True): 279 | dataset = OpensubsData(path) 280 | conversations = dataset.getConversations() 281 | return splitConversations(conversations, max_len=max_len, fast_preprocessing=fast_preprocessing) 282 | -------------------------------------------------------------------------------- /honor/download_cornell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p data/cornell 4 | cd data/cornell 5 | wget https://github.com/Conchylicultor/DeepQA/raw/master/data/cornell/movie_conversations.txt 6 | wget https://github.com/Conchylicultor/DeepQA/raw/master/data/cornell/movie_lines.txt 7 | -------------------------------------------------------------------------------- /honor/download_opensubs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p data/opensubs 4 | cd data/opensubs 5 | wget -O en.tar.gz http://opus.lingfil.uu.se/download.php?f=OpenSubtitles/en.tar.gz 6 | tar -xf en.tar.gz 7 | rm en.tar.gz 8 | -------------------------------------------------------------------------------- /honor/example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import datasets 4 | import argparse 5 | import os 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("dataset", choices=["cornell", "opensubs"], help="Name of the dataset.") 10 | parser.add_argument("--max_len", type=int, default=10, help="Max length of sentences to consider.") 11 | args = parser.parse_args() 12 | 13 | dataset_path = os.path.join("data", args.dataset) 14 | if args.dataset == "cornell": 15 | data = datasets.readCornellData(dataset_path, max_len=args.max_len) 16 | elif args.dataset == "opensubs": 17 | data = datasets.readOpensubsData(dataset_path, max_len=args.max_len) 18 | else: 19 | raise ValueError("Unrecognized dataset: {!r}".format(args.dataset)) 20 | 21 | print("Size of dataset: {}".format(len(data))) 22 | print("First 10 training pairs:") 23 | for item in data[:10]: 24 | print(item) 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /project/.gitignore: -------------------------------------------------------------------------------- 1 | GoogleNews-vectors-negative300.* 2 | starspace_embedding 3 | starspace_embedding.* 4 | word_embedd*.* 5 | *.pkl 6 | thread_embeddings_by_tags/ 7 | eval* 8 | db.sql* 9 | -------------------------------------------------------------------------------- /project/dialogue_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sklearn.metrics.pairwise import pairwise_distances_argmin 3 | 4 | from chatterbot import ChatBot 5 | from utils import * 6 | 7 | 8 | class ThreadRanker(object): 9 | def __init__(self, paths): 10 | self.word_embeddings, self.embeddings_dim = load_embeddings(paths['WORD_EMBEDDINGS']) 11 | self.thread_embeddings_folder = paths['THREAD_EMBEDDINGS_FOLDER'] 12 | 13 | def __load_embeddings_by_tag(self, tag_name): 14 | embeddings_path = os.path.join(self.thread_embeddings_folder, tag_name + ".pkl") 15 | thread_ids, thread_embeddings = unpickle_file(embeddings_path) 16 | return thread_ids, thread_embeddings 17 | 18 | def get_best_thread(self, question, tag_name): 19 | """ Returns id of the most similar thread for 
the question. 20 | The search is performed across the threads with a given tag. 21 | """ 22 | thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name) 23 | 24 | # HINT: you have already implemented a similar routine in the 3rd assignment. 25 | 26 | #question_vec = #### YOUR CODE HERE #### 27 | #best_thread = #### YOUR CODE HERE #### 28 | question_vec = question_to_vec(question, self.word_embeddings, self.embeddings_dim).reshape(1,-1) 29 | best_thread = pairwise_distances_argmin(question_vec,thread_embeddings) 30 | 31 | return thread_ids[best_thread] 32 | 33 | 34 | class DialogueManager(object): 35 | def __init__(self, paths): 36 | print("Loading resources...") 37 | 38 | # Intent recognition: 39 | self.intent_recognizer = unpickle_file(paths['INTENT_RECOGNIZER']) 40 | self.tfidf_vectorizer = unpickle_file(paths['TFIDF_VECTORIZER']) 41 | 42 | self.ANSWER_TEMPLATE = 'I think its about %s\nThis thread might help you: https://stackoverflow.com/questions/%s' 43 | 44 | # Goal-oriented part: 45 | self.tag_classifier = unpickle_file(paths['TAG_CLASSIFIER']) 46 | self.thread_ranker = ThreadRanker(paths) 47 | 48 | #init chatbot 49 | self.create_chitchat_bot() 50 | 51 | def create_chitchat_bot(self): 52 | """Initializes self.chitchat_bot with some conversational model.""" 53 | 54 | # Hint: you might want to create and train chatterbot.ChatBot here. 55 | # It could be done by creating ChatBot with the *trainer* parameter equals 56 | # "chatterbot.trainers.ChatterBotCorpusTrainer" 57 | # and then calling *train* function with "chatterbot.corpus.english" param 58 | 59 | ######################## 60 | #### YOUR CODE HERE #### 61 | ######################## 62 | self.chitchat_bot = ChatBot('Nim Obvious', trainer='chatterbot.trainers.ChatterBotCorpusTrainer') 63 | 64 | # Train based on the english corpus 65 | self.chitchat_bot.train("chatterbot.corpus.english") 66 | 67 | 68 | def generate_answer(self, question): 69 | """Combines stackoverflow and chitchat parts using intent recognition.""" 70 | 71 | # Recognize intent of the question using `intent_recognizer`. 72 | # Don't forget to prepare question and calculate features for the question. 73 | 74 | #prepared_question = #### YOUR CODE HERE #### 75 | #features = #### YOUR CODE HERE #### 76 | #intent = #### YOUR CODE HERE #### 77 | 78 | prepared_question = text_prepare(question) 79 | features = self.tfidf_vectorizer.transform([prepared_question]) 80 | intent = self.intent_recognizer.predict(features) 81 | 82 | 83 | 84 | # Chit-chat part: 85 | if intent == 'dialogue': 86 | # Pass question to chitchat_bot to generate a response. 87 | #response = #### YOUR CODE HERE #### 88 | response = self.chitchat_bot.get_response(question) 89 | return response 90 | 91 | # Goal-oriented part: 92 | else: 93 | # Pass features to tag_classifier to get predictions. 94 | #tag = #### YOUR CODE HERE #### 95 | tag = self.tag_classifier.predict( features)[0] 96 | #print(tag) 97 | 98 | # Pass prepared_question to thread_ranker to get predictions. 
99 | #thread_id = #### YOUR CODE HERE #### 100 | thread_id = self.thread_ranker.get_best_thread(question, tag)[0] 101 | 102 | return self.ANSWER_TEMPLATE % (tag, thread_id) 103 | 104 | -------------------------------------------------------------------------------- /project/main_bot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import requests 4 | import time 5 | import argparse 6 | import os 7 | import json 8 | 9 | from requests.compat import urljoin 10 | from utils import * 11 | from dialogue_manager import * 12 | 13 | 14 | class BotHandler(object): 15 | """ 16 | BotHandler is a class which implements all back-end of the bot. 17 | It has tree main functions: 18 | 'get_updates' — checks for new messages 19 | 'send_message' – posts new message to user 20 | 'get_answer' — computes the most relevant on a user's question 21 | """ 22 | 23 | def __init__(self, token, dialogue_manager): 24 | self.token = token 25 | self.api_url = "https://api.telegram.org/bot{}/".format(token) 26 | self.dialogue_manager = dialogue_manager 27 | 28 | def get_updates(self, offset=None, timeout=30): 29 | params = {"timeout": timeout, "offset": offset} 30 | raw_resp = requests.get(urljoin(self.api_url, "getUpdates"), params) 31 | try: 32 | resp = raw_resp.json() 33 | except json.decoder.JSONDecodeError as e: 34 | print("Failed to parse response {}: {}.".format(raw_resp.content, e)) 35 | return [] 36 | 37 | if "result" not in resp: 38 | return [] 39 | return resp["result"] 40 | 41 | def send_message(self, chat_id, text): 42 | params = {"chat_id": chat_id, "text": text} 43 | return requests.post(urljoin(self.api_url, "sendMessage"), params) 44 | 45 | def get_answer(self, question): 46 | if question == '/start': 47 | return "Hi, I am your project bot. How can I help you today?" 48 | return self.dialogue_manager.generate_answer(question) 49 | 50 | 51 | def parse_args(): 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument('--token', type=str, default='') 54 | return parser.parse_args() 55 | 56 | 57 | def is_unicode(text): 58 | return len(text) == len(text.encode()) 59 | 60 | 61 | class SimpleDialogueManager(object): 62 | """ 63 | This is the simplest dialogue manager to test the telegram bot. 64 | Your task is to create a more advanced one in dialogue_manager.py." 65 | """ 66 | 67 | def generate_answer(self, question): 68 | return "Hello, world!" 69 | 70 | 71 | def main(): 72 | args = parse_args() 73 | token = args.token 74 | 75 | if not token: 76 | if not "TELEGRAM_TOKEN" in os.environ: 77 | print("Please, set bot token through --token or TELEGRAM_TOKEN env variable") 78 | return 79 | token = os.environ["TELEGRAM_TOKEN"] 80 | 81 | ################################################################# 82 | 83 | # Your task is to complete dialogue_manager.py and use your 84 | # advanced DialogueManager instead of SimpleDialogueManager. 85 | 86 | # This is the point where you plug it into the Telegram bot. 87 | # Do not forget to import all needed dependencies when you do so. 
88 | 89 | # simple_manager = SimpleDialogueManager() 90 | # bot = BotHandler(token, simple_manager) 91 | 92 | dialog_manager = DialogueManager(RESOURCE_PATH) 93 | bot = BotHandler(token, dialog_manager) 94 | 95 | ############################################################### 96 | 97 | print("Ready to talk!") 98 | offset = 0 99 | while True: 100 | updates = bot.get_updates(offset=offset) 101 | for update in updates: 102 | print("An update received.") 103 | if "message" in update: 104 | chat_id = update["message"]["chat"]["id"] 105 | if "text" in update["message"]: 106 | text = update["message"]["text"] 107 | if is_unicode(text): 108 | print("Update content: {}".format(update)) 109 | bot.send_message(chat_id, bot.get_answer(update["message"]["text"])) 110 | else: 111 | bot.send_message(chat_id, "Hmm, you are sending some weird characters to me...") 112 | offset = max(offset, update['update_id'] + 1) 113 | time.sleep(1) 114 | 115 | if __name__ == "__main__": 116 | main() 117 | -------------------------------------------------------------------------------- /project/utils.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import pickle 3 | import re 4 | import numpy as np 5 | 6 | nltk.download('stopwords') 7 | from nltk.corpus import stopwords 8 | 9 | # Paths for all resources for the bot. 10 | RESOURCE_PATH = { 11 | 'INTENT_RECOGNIZER': 'intent_recognizer.pkl', 12 | 'TAG_CLASSIFIER': 'tag_classifier.pkl', 13 | 'TFIDF_VECTORIZER': 'tfidf_vectorizer.pkl', 14 | 'THREAD_EMBEDDINGS_FOLDER': 'thread_embeddings_by_tags', 15 | 'WORD_EMBEDDINGS': 'word_embeddings.tsv', 16 | } 17 | 18 | 19 | def text_prepare(text): 20 | """Performs tokenization and simple preprocessing.""" 21 | 22 | replace_by_space_re = re.compile('[/(){}\[\]\|@,;]') 23 | bad_symbols_re = re.compile('[^0-9a-z #+_]') 24 | stopwords_set = set(stopwords.words('english')) 25 | 26 | text = text.lower() 27 | text = replace_by_space_re.sub(' ', text) 28 | text = bad_symbols_re.sub('', text) 29 | text = ' '.join([x for x in text.split() if x and x not in stopwords_set]) 30 | 31 | return text.strip() 32 | 33 | 34 | def load_embeddings(embeddings_path): 35 | """Loads pre-trained word embeddings from tsv file. 36 | 37 | Args: 38 | embeddings_path - path to the embeddings file. 39 | 40 | Returns: 41 | embeddings - dict mapping words to vectors; 42 | embeddings_dim - dimension of the vectors. 43 | """ 44 | 45 | # Hint: you have already implemented a similar routine in the 3rd assignment. 46 | # Note that here you also need to know the dimension of the loaded embeddings. 47 | # When you load the embeddings, use numpy.float32 type as dtype 48 | 49 | ######################## 50 | #### YOUR CODE HERE #### 51 | ######################## 52 | 53 | embeddings = dict() 54 | for line in open(embeddings_path, encoding='utf-8'): 55 | row = line.strip().split('\t') 56 | embeddings[row[0]] = np.array(row[1:], dtype=np.float32) 57 | embeddings_dim = embeddings[list(embeddings)[0]].shape[0] 58 | 59 | return embeddings, embeddings_dim 60 | 61 | 62 | 63 | def question_to_vec(question, embeddings, dim): 64 | """Transforms a string to an embedding by averaging word embeddings.""" 65 | 66 | # Hint: you have already implemented exactly this function in the 3rd assignment. 
67 | 68 | ######################## 69 | #### YOUR CODE HERE #### 70 | ######################## 71 | result = np.zeros(dim) 72 | cnt = 0 73 | words = question.split() 74 | for word in words: 75 | if word in embeddings: 76 | result += np.array(embeddings[word]) 77 | cnt += 1 78 | if cnt != 0: 79 | result /= cnt 80 | return result 81 | 82 | 83 | def unpickle_file(filename): 84 | """Returns the result of unpickling the file content.""" 85 | with open(filename, 'rb') as f: 86 | return pickle.load(f) 87 | -------------------------------------------------------------------------------- /project/week5-project-Soln.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Final project: StackOverflow assistant bot\n", 8 | "\n", 9 | "Congratulations on coming this far and solving the programming assignments! In this final project, we will combine everything we have learned about Natural Language Processing to construct a *dialogue chat bot*, which will be able to:\n", 10 | "* answer programming-related questions (using StackOverflow dataset);\n", 11 | "* chit-chat and simulate dialogue on all non programming-related questions.\n", 12 | "\n", 13 | "For a chit-chat mode we will use a pre-trained neural network engine available from [ChatterBot](https://github.com/gunthercox/ChatterBot).\n", 14 | "Those who aim at honor certificates for our course or are just curious, will train their own models for chit-chat.\n", 15 | "![](https://imgs.xkcd.com/comics/twitter_bot.png)\n", 16 | "©[xkcd](https://xkcd.com)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Data description\n", 24 | "\n", 25 | "To detect *intent* of users questions we will need two text collections:\n", 26 | "- `tagged_posts.tsv` — StackOverflow posts, tagged with one programming language (*positive samples*).\n", 27 | "- `dialogues.tsv` — dialogue phrases from movie subtitles (*negative samples*).\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "application/vnd.jupyter.widget-view+json": { 38 | "model_id": "d0172568852c4a4e8822d8e48aedc512", 39 | "version_major": 2, 40 | "version_minor": 0 41 | }, 42 | "text/html": [ 43 | "

Failed to display Jupyter Widget of type HBox.

\n", 44 | "

\n", 45 | " If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n", 46 | " that the widgets JavaScript is still loading. If this message persists, it\n", 47 | " likely means that the widgets JavaScript library is either not installed or\n", 48 | " not enabled. See the Jupyter\n", 49 | " Widgets Documentation for setup instructions.\n", 50 | "

\n", 51 | "

\n", 52 | " If you're reading this message in another frontend (for example, a static\n", 53 | " rendering on GitHub or NBViewer),\n", 54 | " it may mean that your frontend doesn't currently support widgets.\n", 55 | "

\n" 56 | ], 57 | "text/plain": [ 58 | "HBox(children=(IntProgress(value=0, max=18012894), HTML(value='')))" 59 | ] 60 | }, 61 | "metadata": {}, 62 | "output_type": "display_data" 63 | }, 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "\n" 69 | ] 70 | }, 71 | { 72 | "data": { 73 | "application/vnd.jupyter.widget-view+json": { 74 | "model_id": "8142ba6c86f24a8e8902fd105acc1f9b", 75 | "version_major": 2, 76 | "version_minor": 0 77 | }, 78 | "text/html": [ 79 | "

Failed to display Jupyter Widget of type HBox.

\n", 80 | "

\n", 81 | " If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n", 82 | " that the widgets JavaScript is still loading. If this message persists, it\n", 83 | " likely means that the widgets JavaScript library is either not installed or\n", 84 | " not enabled. See the Jupyter\n", 85 | " Widgets Documentation for setup instructions.\n", 86 | "

\n", 87 | "

\n", 88 | " If you're reading this message in another frontend (for example, a static\n", 89 | " rendering on GitHub or NBViewer),\n", 90 | " it may mean that your frontend doesn't currently support widgets.\n", 91 | "

\n" 92 | ], 93 | "text/plain": [ 94 | "HBox(children=(IntProgress(value=0, max=145677870), HTML(value='')))" 95 | ] 96 | }, 97 | "metadata": {}, 98 | "output_type": "display_data" 99 | }, 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "import sys\n", 110 | "sys.path.append(\"..\")\n", 111 | "from common.download_utils import download_project_resources\n", 112 | "\n", 113 | "download_project_resources()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "For those questions, that have programming-related intent, we will proceed as follow predict programming language (only one tag per question allowed here) and rank candidates within the tag using embeddings.\n", 121 | "For the ranking part, you will need:\n", 122 | "- `word_embeddings.tsv` — word embeddings, that you trained with StarSpace in the 3rd assignment. It's not a problem if you didn't do it, because we can offer an alternative solution for you." 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "As a result of this notebook, you should obtain the following new objects that you will then use in the running bot:\n", 130 | "\n", 131 | "- `intent_recognizer.pkl` — intent recognition model;\n", 132 | "- `tag_classifier.pkl` — programming language classification model;\n", 133 | "- `tfidf_vectorizer.pkl` — vectorizer used during training;\n", 134 | "- `thread_embeddings_by_tags` — folder with thread embeddings, arranged by tags.\n", 135 | " " 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "Some functions will be reused by this notebook and the scripts, so we put them into *utils.py* file. Don't forget to open it and fill in the gaps!" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 37, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "%load_ext autoreload\n", 152 | "%autoreload 2\n", 153 | "from utils import *" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "## Part I. Intent and language recognition" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "We want to write a bot, which will not only **answer programming-related questions**, but also will be able to **maintain a dialogue**. We would also like to detect the *intent* of the user from the question (we could have had a 'Question answering mode' check-box in the bot, but it wouldn't fun at all, would it?). So the first thing we need to do is to **distinguish programming-related questions from general ones**.\n", 168 | "\n", 169 | "It would also be good to predict which programming language a particular question referees to. By doing so, we will speed up question search by a factor of the number of languages (10 here), and exercise our *text classification* skill a bit. 
:)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 3, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "import numpy as np\n", 179 | "import pandas as pd\n", 180 | "import pickle\n", 181 | "import re\n", 182 | "\n", 183 | "from sklearn.feature_extraction.text import TfidfVectorizer" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "### Data preparation" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "In the first assignment (Predict tags on StackOverflow with linear models), you have already learnt how to preprocess texts and do TF-IDF tranformations. Reuse your code here. In addition, you will also need to [dump](https://docs.python.org/3/library/pickle.html#pickle.dump) the TF-IDF vectorizer with pickle to use it later in the running bot." 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 18, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "def tfidf_features(X_train, X_test, vectorizer_path):\n", 207 | " \"\"\"Performs TF-IDF transformation and dumps the model.\"\"\"\n", 208 | " \n", 209 | " # Train a vectorizer on X_train data.\n", 210 | " # Transform X_train and X_test data.\n", 211 | " \n", 212 | " # Pickle the trained vectorizer to 'vectorizer_path'\n", 213 | " # Don't forget to open the file in writing bytes mode.\n", 214 | " \n", 215 | " ######################################\n", 216 | " ######### YOUR CODE HERE #############\n", 217 | " ######################################\n", 218 | " tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5, token_pattern='(\\S+)')\n", 219 | " X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n", 220 | " X_test_tfidf = tfidf_vectorizer.transform(X_test)\n", 221 | " with open(vectorizer_path, 'wb') as f:\n", 222 | " pickle.dump(tfidf_vectorizer, f)\n", 223 | " \n", 224 | " \n", 225 | " #return X_train, X_test\n", 226 | " return X_train_tfidf, X_test_tfidf" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "Now, load examples of two classes. Use a subsample of stackoverflow data to balance the classes. You will need the full data later." 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 11, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "sample_size = 200000\n", 243 | "\n", 244 | "dialogue_df = pd.read_csv('data/dialogues.tsv', sep='\\t').sample(sample_size, random_state=0)\n", 245 | "stackoverflow_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t').sample(sample_size, random_state=0)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "Check how the data look like:" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 12, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/html": [ 263 | "
\n", 264 | "\n", 277 | "\n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | "
texttag
82925Donna, you are a muffin.dialogue
48774He was here last night till about two o'clock....dialogue
55394All right, then make an appointment with her s...dialogue
90806Hey, what is this-an interview? We're supposed...dialogue
107758Yeah. He's just a friend of mine I was trying ...dialogue
\n", 313 | "
" 314 | ], 315 | "text/plain": [ 316 | " text tag\n", 317 | "82925 Donna, you are a muffin. dialogue\n", 318 | "48774 He was here last night till about two o'clock.... dialogue\n", 319 | "55394 All right, then make an appointment with her s... dialogue\n", 320 | "90806 Hey, what is this-an interview? We're supposed... dialogue\n", 321 | "107758 Yeah. He's just a friend of mine I was trying ... dialogue" 322 | ] 323 | }, 324 | "execution_count": 12, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "dialogue_df.head()" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 13, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/html": [ 341 | "
\n", 342 | "\n", 355 | "\n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | "
post_idtitletag
216898343837842Efficient Algorithm to compose valid expressio...python
108409515747223Why does this basic thread program fail with C...c\\c++
104902015189594Link to scroll to top not workingjavascript
2004663273927Is it possible to implement ping on windows ph...c#
120024917684551GLSL normal mapping issuec\\c++
\n", 397 | "
" 398 | ], 399 | "text/plain": [ 400 | " post_id title \\\n", 401 | "2168983 43837842 Efficient Algorithm to compose valid expressio... \n", 402 | "1084095 15747223 Why does this basic thread program fail with C... \n", 403 | "1049020 15189594 Link to scroll to top not working \n", 404 | "200466 3273927 Is it possible to implement ping on windows ph... \n", 405 | "1200249 17684551 GLSL normal mapping issue \n", 406 | "\n", 407 | " tag \n", 408 | "2168983 python \n", 409 | "1084095 c\\c++ \n", 410 | "1049020 javascript \n", 411 | "200466 c# \n", 412 | "1200249 c\\c++ " 413 | ] 414 | }, 415 | "execution_count": 13, 416 | "metadata": {}, 417 | "output_type": "execute_result" 418 | } 419 | ], 420 | "source": [ 421 | "stackoverflow_df.head()" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "Apply *text_prepare* function to preprocess the data:" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 14, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "from utils import text_prepare" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 15, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "dialogue_df['text'] = dialogue_df['text'].apply(lambda x: text_prepare(x)) ######### YOUR CODE HERE #############\n", 447 | "stackoverflow_df['title'] = stackoverflow_df['title'].apply(lambda x: text_prepare(x)) ######### YOUR CODE HERE #############" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "### Intent recognition" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "We will do a binary classification on TF-IDF representations of texts. Labels will be either `dialogue` for general questions or `stackoverflow` for programming-related questions. 
First, prepare the data for this task:\n", 462 | "- concatenate `dialogue` and `stackoverflow` examples into one sample\n", 463 | "- split it into train and test in proportion 9:1, use *random_state=0* for reproducibility\n", 464 | "- transform it into TF-IDF features" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 16, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "from sklearn.model_selection import train_test_split" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 19, 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "name": "stderr", 483 | "output_type": "stream", 484 | "text": [ 485 | "/usr/local/lib/python3.5/dist-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", 486 | " FutureWarning)\n" 487 | ] 488 | }, 489 | { 490 | "name": "stdout", 491 | "output_type": "stream", 492 | "text": [ 493 | "Train size = 360000, test size = 40000\n" 494 | ] 495 | } 496 | ], 497 | "source": [ 498 | "X = np.concatenate([dialogue_df['text'].values, stackoverflow_df['title'].values])\n", 499 | "y = ['dialogue'] * dialogue_df.shape[0] + ['stackoverflow'] * stackoverflow_df.shape[0]\n", 500 | "\n", 501 | "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=0) ######### YOUR CODE HERE ##########\n", 502 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))\n", 503 | "\n", 504 | "X_train_tfidf, X_test_tfidf = tfidf_features(X_train, X_test, RESOURCE_PATH['TFIDF_VECTORIZER']) ######### YOUR CODE HERE ###########" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": {}, 510 | "source": [ 511 | "Train the **intent recognizer** using LogisticRegression on the train set with the following parameters: *penalty='l2'*, *C=10*, *random_state=0*. Print out the accuracy on the test set to check whether everything looks good." 
512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 20, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "from sklearn.linear_model import LogisticRegression\n", 521 | "from sklearn.metrics import accuracy_score" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": 24, 527 | "metadata": {}, 528 | "outputs": [ 529 | { 530 | "data": { 531 | "text/plain": [ 532 | "LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,\n", 533 | " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", 534 | " penalty='l2', random_state=0, solver='liblinear', tol=0.0001,\n", 535 | " verbose=0, warm_start=False)" 536 | ] 537 | }, 538 | "execution_count": 24, 539 | "metadata": {}, 540 | "output_type": "execute_result" 541 | } 542 | ], 543 | "source": [ 544 | "######################################\n", 545 | "######### YOUR CODE HERE #############\n", 546 | "######################################\n", 547 | "intent_recognizer = LogisticRegression(penalty='l2', C=10, random_state=0)\n", 548 | "intent_recognizer.fit(X_train_tfidf, y_train)" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": 25, 554 | "metadata": {}, 555 | "outputs": [ 556 | { 557 | "name": "stdout", 558 | "output_type": "stream", 559 | "text": [ 560 | "Test accuracy = 0.991575\n" 561 | ] 562 | } 563 | ], 564 | "source": [ 565 | "# Check test accuracy.\n", 566 | "y_test_pred = intent_recognizer.predict(X_test_tfidf)\n", 567 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 568 | "print('Test accuracy = {}'.format(test_accuracy))" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": {}, 574 | "source": [ 575 | "Dump the classifier to use it in the running bot." 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 26, 581 | "metadata": {}, 582 | "outputs": [], 583 | "source": [ 584 | "pickle.dump(intent_recognizer, open(RESOURCE_PATH['INTENT_RECOGNIZER'], 'wb'))" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": {}, 590 | "source": [ 591 | "### Programming language classification " 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "We will train one more classifier for the programming-related questions. It will predict exactly one tag (=programming language) and will be also based on Logistic Regression with TF-IDF features. \n", 599 | "\n", 600 | "First, let us prepare the data for this task." 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 27, 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "X = stackoverflow_df['title'].values\n", 610 | "y = stackoverflow_df['tag'].values" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 28, 616 | "metadata": {}, 617 | "outputs": [ 618 | { 619 | "name": "stdout", 620 | "output_type": "stream", 621 | "text": [ 622 | "Train size = 160000, test size = 40000\n" 623 | ] 624 | } 625 | ], 626 | "source": [ 627 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n", 628 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))" 629 | ] 630 | }, 631 | { 632 | "cell_type": "markdown", 633 | "metadata": {}, 634 | "source": [ 635 | "Let us reuse the TF-IDF vectorizer that we have already created above. It should not make a huge difference which data was used to train it." 
636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 29, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "vectorizer = pickle.load(open(RESOURCE_PATH['TFIDF_VECTORIZER'], 'rb'))\n", 645 | "\n", 646 | "X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "Train the **tag classifier** using OneVsRestClassifier wrapper over LogisticRegression. Use the following parameters: *penalty='l2'*, *C=5*, *random_state=0*." 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": 30, 659 | "metadata": {}, 660 | "outputs": [], 661 | "source": [ 662 | "from sklearn.multiclass import OneVsRestClassifier" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 32, 668 | "metadata": {}, 669 | "outputs": [ 670 | { 671 | "data": { 672 | "text/plain": [ 673 | "OneVsRestClassifier(estimator=LogisticRegression(C=5, class_weight=None, dual=False, fit_intercept=True,\n", 674 | " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", 675 | " penalty='l2', random_state=0, solver='liblinear', tol=0.0001,\n", 676 | " verbose=0, warm_start=False),\n", 677 | " n_jobs=1)" 678 | ] 679 | }, 680 | "execution_count": 32, 681 | "metadata": {}, 682 | "output_type": "execute_result" 683 | } 684 | ], 685 | "source": [ 686 | "######################################\n", 687 | "######### YOUR CODE HERE #############\n", 688 | "######################################\n", 689 | "tag_classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=5, random_state=0))\n", 690 | "tag_classifier.fit(X_train_tfidf, y_train)" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 33, 696 | "metadata": {}, 697 | "outputs": [ 698 | { 699 | "name": "stdout", 700 | "output_type": "stream", 701 | "text": [ 702 | "Test accuracy = 0.800725\n" 703 | ] 704 | } 705 | ], 706 | "source": [ 707 | "# Check test accuracy.\n", 708 | "y_test_pred = tag_classifier.predict(X_test_tfidf)\n", 709 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 710 | "print('Test accuracy = {}'.format(test_accuracy))" 711 | ] 712 | }, 713 | { 714 | "cell_type": "markdown", 715 | "metadata": {}, 716 | "source": [ 717 | "Dump the classifier to use it in the running bot." 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": 34, 723 | "metadata": {}, 724 | "outputs": [], 725 | "source": [ 726 | "pickle.dump(tag_classifier, open(RESOURCE_PATH['TAG_CLASSIFIER'], 'wb'))" 727 | ] 728 | }, 729 | { 730 | "cell_type": "markdown", 731 | "metadata": {}, 732 | "source": [ 733 | "## Part II. Ranking questions with embeddings" 734 | ] 735 | }, 736 | { 737 | "cell_type": "markdown", 738 | "metadata": {}, 739 | "source": [ 740 | "To find a relevant answer (a thread from StackOverflow) on a question you will use vector representations to calculate similarity between the question and existing threads. We already had `question_to_vec` function from the assignment 3, which can create such a representation based on word vectors. \n", 741 | "\n", 742 | "However, it would be costly to compute such a representation for all possible answers in *online mode* of the bot (e.g. when bot is running and answering questions from many users). This is the reason why you will create a *database* with pre-computed representations. 
These representations will be arranged by non-overlapping tags (programming languages), so that the search for an answer can be performed within a single tag at a time. This will make our bot even more efficient and let us avoid keeping the whole database in RAM. " 743 | ] 744 | }, 745 | { 746 | "cell_type": "markdown", 747 | "metadata": {}, 748 | "source": [ 749 | "Load the StarSpace embeddings which were trained on Stack Overflow posts. These embeddings were trained in *supervised mode* for duplicate detection on the same corpus that is used in search. We can therefore expect these representations to help us find closely related answers for a question. \n", 750 | "\n", 751 | "If for some reason you didn't train StarSpace embeddings in assignment 3, you can use [pre-trained word vectors](https://code.google.com/archive/p/word2vec/) from Google. All instructions on how to work with these vectors were provided in the same assignment. However, we highly recommend using the StarSpace embeddings, because they are better suited to this task. If you choose to use Google's embeddings, drop the words that do not occur in the StackOverflow data." 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 39, 757 | "metadata": {}, 758 | "outputs": [ 759 | { 760 | "name": "stdout", 761 | "output_type": "stream", 762 | "text": [ 763 | "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", 764 | "[nltk_data] Package stopwords is already up-to-date!\n" 765 | ] 766 | } 767 | ], 768 | "source": [ 769 | "starspace_embeddings, embeddings_dim = load_embeddings('data/word_embeddings.tsv')" 770 | ] 771 | }, 772 | { 773 | "cell_type": "markdown", 774 | "metadata": {}, 775 | "source": [ 776 | "Since we want to precompute representations for all possible answers, we need to load the whole posts dataset, unlike what we did for the intent classifier:" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 41, 782 | "metadata": {}, 783 | "outputs": [], 784 | "source": [ 785 | "posts_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t')" 786 | ] 787 | }, 788 | { 789 | "cell_type": "markdown", 790 | "metadata": {}, 791 | "source": [ 792 | "Look at the distribution of posts over programming languages (tags) and find the most common ones. \n", 793 | "You might want to use pandas [groupby](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html) and [count](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.count.html) methods:" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": 42, 799 | "metadata": {}, 800 | "outputs": [ 801 | { 802 | "data": { 803 | "text/html": [ 804 | 
\n", 805 | "\n", 818 | "\n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | "
post_idtitletag
09Calculate age in C#c#
116Filling a DataSet or DataTable from a LINQ que...c#
239Reliable timer in a console applicationc#
342Best way to allow plugins for a PHP applicationphp
459How do I get a distinct, ordered list of names...c#
\n", 860 | "
" 861 | ], 862 | "text/plain": [ 863 | " post_id title tag\n", 864 | "0 9 Calculate age in C# c#\n", 865 | "1 16 Filling a DataSet or DataTable from a LINQ que... c#\n", 866 | "2 39 Reliable timer in a console application c#\n", 867 | "3 42 Best way to allow plugins for a PHP application php\n", 868 | "4 59 How do I get a distinct, ordered list of names... c#" 869 | ] 870 | }, 871 | "execution_count": 42, 872 | "metadata": {}, 873 | "output_type": "execute_result" 874 | } 875 | ], 876 | "source": [ 877 | "posts_df.head()" 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": 48, 883 | "metadata": {}, 884 | "outputs": [], 885 | "source": [ 886 | "counts_by_tag = posts_df.groupby(['tag'])['tag'].count() ######### YOUR CODE HERE #############" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": 53, 892 | "metadata": {}, 893 | "outputs": [ 894 | { 895 | "data": { 896 | "text/plain": [ 897 | "[('c#', 394451),\n", 898 | " ('c\\\\c++', 281300),\n", 899 | " ('java', 383456),\n", 900 | " ('javascript', 375867),\n", 901 | " ('php', 321752),\n", 902 | " ('python', 208607),\n", 903 | " ('r', 36359),\n", 904 | " ('ruby', 99930),\n", 905 | " ('swift', 34809),\n", 906 | " ('vb', 35044)]" 907 | ] 908 | }, 909 | "execution_count": 53, 910 | "metadata": {}, 911 | "output_type": "execute_result" 912 | } 913 | ], 914 | "source": [ 915 | "list(counts_by_tag.items())" 916 | ] 917 | }, 918 | { 919 | "cell_type": "markdown", 920 | "metadata": {}, 921 | "source": [ 922 | "Now for each `tag` you need to create two data structures, which will serve as online search index:\n", 923 | "* `tag_post_ids` — a list of post_ids with shape `(counts_by_tag[tag],)`. It will be needed to show the title and link to the thread;\n", 924 | "* `tag_vectors` — a matrix with shape `(counts_by_tag[tag], embeddings_dim)` where embeddings for each answer are stored.\n", 925 | "\n", 926 | "Implement the code which will calculate the mentioned structures and dump it to files. It should take several minutes to compute it." 
927 | ] 928 | }, 929 | { 930 | "cell_type": "code", 931 | "execution_count": 54, 932 | "metadata": {}, 933 | "outputs": [], 934 | "source": [ 935 | "import os\n", 936 | "os.makedirs(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], exist_ok=True)\n", 937 | "\n", 938 | "for tag, count in counts_by_tag.items():\n", 939 | " tag_posts = posts_df[posts_df['tag'] == tag]\n", 940 | " \n", 941 | " tag_post_ids = tag_posts['post_id'].values ######### YOUR CODE HERE #############\n", 942 | " \n", 943 | " tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)\n", 944 | " for i, title in enumerate(tag_posts['title']):\n", 945 | " tag_vectors[i, :] = question_to_vec(title, starspace_embeddings, embeddings_dim) ######### YOUR CODE HERE #############\n", 946 | "\n", 947 | " # Dump post ids and vectors to a file.\n", 948 | " filename = os.path.join(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % tag))\n", 949 | " pickle.dump((tag_post_ids, tag_vectors), open(filename, 'wb'))" 950 | ] 951 | } 952 | ], 953 | "metadata": { 954 | "kernelspec": { 955 | "display_name": "Python 3", 956 | "language": "python", 957 | "name": "python3" 958 | }, 959 | "language_info": { 960 | "codemirror_mode": { 961 | "name": "ipython", 962 | "version": 3 963 | }, 964 | "file_extension": ".py", 965 | "mimetype": "text/x-python", 966 | "name": "python", 967 | "nbconvert_exporter": "python", 968 | "pygments_lexer": "ipython3", 969 | "version": "3.5.2" 970 | }, 971 | "latex_envs": { 972 | "bibliofile": "biblio.bib", 973 | "cite_by": "apalike", 974 | "current_citInitial": 1, 975 | "eqLabelWithNumbers": true, 976 | "eqNumInitial": 0 977 | } 978 | }, 979 | "nbformat": 4, 980 | "nbformat_minor": 2 981 | } 982 | -------------------------------------------------------------------------------- /project/week5-project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Final project: StackOverflow assistant bot\n", 8 | "\n", 9 | "Congratulations on coming this far and solving the programming assignments! 
In this final project, we will combine everything we have learned about Natural Language Processing to construct a *dialogue chat bot*, which will be able to:\n", 10 | "* answer programming-related questions (using StackOverflow dataset);\n", 11 | "* chit-chat and simulate dialogue on all non programming-related questions.\n", 12 | "\n", 13 | "For a chit-chat mode we will use a pre-trained neural network engine available from [ChatterBot](https://github.com/gunthercox/ChatterBot).\n", 14 | "Those who aim at honor certificates for our course or are just curious, will train their own models for chit-chat.\n", 15 | "![](https://imgs.xkcd.com/comics/twitter_bot.png)\n", 16 | "©[xkcd](https://xkcd.com)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Data description\n", 24 | "\n", 25 | "To detect *intent* of users questions we will need two text collections:\n", 26 | "- `tagged_posts.tsv` — StackOverflow posts, tagged with one programming language (*positive samples*).\n", 27 | "- `dialogues.tsv` — dialogue phrases from movie subtitles (*negative samples*).\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "import sys\n", 39 | "sys.path.append(\"..\")\n", 40 | "from common.download_utils import download_project_resources\n", 41 | "\n", 42 | "download_project_resources()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "For those questions, that have programming-related intent, we will proceed as follow predict programming language (only one tag per question allowed here) and rank candidates within the tag using embeddings.\n", 50 | "For the ranking part, you will need:\n", 51 | "- `word_embeddings.tsv` — word embeddings, that you trained with StarSpace in the 3rd assignment. It's not a problem if you didn't do it, because we can offer an alternative solution for you." 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "As a result of this notebook, you should obtain the following new objects that you will then use in the running bot:\n", 59 | "\n", 60 | "- `intent_recognizer.pkl` — intent recognition model;\n", 61 | "- `tag_classifier.pkl` — programming language classification model;\n", 62 | "- `tfidf_vectorizer.pkl` — vectorizer used during training;\n", 63 | "- `thread_embeddings_by_tags` — folder with thread embeddings, arranged by tags.\n", 64 | " " 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Some functions will be reused by this notebook and the scripts, so we put them into *utils.py* file. Don't forget to open it and fill in the gaps!" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "from utils import *" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Part I. Intent and language recognition" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "We want to write a bot, which will not only **answer programming-related questions**, but also will be able to **maintain a dialogue**. We would also like to detect the *intent* of the user from the question (we could have had a 'Question answering mode' check-box in the bot, but it wouldn't fun at all, would it?). 
So the first thing we need to do is to **distinguish programming-related questions from general ones**.\n", 97 | "\n", 98 | "It would also be good to predict which programming language a particular question referees to. By doing so, we will speed up question search by a factor of the number of languages (10 here), and exercise our *text classification* skill a bit. :)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "import numpy as np\n", 110 | "import pandas as pd\n", 111 | "import pickle\n", 112 | "import re\n", 113 | "\n", 114 | "from sklearn.feature_extraction.text import TfidfVectorizer" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "### Data preparation" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "In the first assignment (Predict tags on StackOverflow with linear models), you have already learnt how to preprocess texts and do TF-IDF tranformations. Reuse your code here. In addition, you will also need to [dump](https://docs.python.org/3/library/pickle.html#pickle.dump) the TF-IDF vectorizer with pickle to use it later in the running bot." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "def tfidf_features(X_train, X_test, vectorizer_path):\n", 140 | " \"\"\"Performs TF-IDF transformation and dumps the model.\"\"\"\n", 141 | " \n", 142 | " # Train a vectorizer on X_train data.\n", 143 | " # Transform X_train and X_test data.\n", 144 | " \n", 145 | " # Pickle the trained vectorizer to 'vectorizer_path'\n", 146 | " # Don't forget to open the file in writing bytes mode.\n", 147 | " \n", 148 | " ######################################\n", 149 | " ######### YOUR CODE HERE #############\n", 150 | " ######################################\n", 151 | " \n", 152 | " return X_train, X_test" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "Now, load examples of two classes. Use a subsample of stackoverflow data to balance the classes. You will need the full data later." 
160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "sample_size = 200000\n", 171 | "\n", 172 | "dialogue_df = pd.read_csv('data/dialogues.tsv', sep='\\t').sample(sample_size, random_state=0)\n", 173 | "stackoverflow_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t').sample(sample_size, random_state=0)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "Check how the data look like:" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": { 187 | "collapsed": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "dialogue_df.head()" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "stackoverflow_df.head()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "Apply *text_prepare* function to preprocess the data:" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "from utils import text_prepare" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": true 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "dialogue_df['text'] = ######### YOUR CODE HERE #############\n", 232 | "stackoverflow_df['title'] = ######### YOUR CODE HERE #############" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Intent recognition" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "We will do a binary classification on TF-IDF representations of texts. Labels will be either `dialogue` for general questions or `stackoverflow` for programming-related questions. First, prepare the data for this task:\n", 247 | "- concatenate `dialogue` and `stackoverflow` examples into one sample\n", 248 | "- split it into train and test in proportion 9:1, use *random_state=0* for reproducibility\n", 249 | "- transform it into TF-IDF features" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "collapsed": true 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "from sklearn.model_selection import train_test_split" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "collapsed": true 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "X = np.concatenate([dialogue_df['text'].values, stackoverflow_df['title'].values])\n", 272 | "y = ['dialogue'] * dialogue_df.shape[0] + ['stackoverflow'] * stackoverflow_df.shape[0]\n", 273 | "\n", 274 | "X_train, X_test, y_train, y_test = ######### YOUR CODE HERE ##########\n", 275 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))\n", 276 | "\n", 277 | "X_train_tfidf, X_test_tfidf = ######### YOUR CODE HERE ###########" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "Train the **intent recognizer** using LogisticRegression on the train set with the following parameters: *penalty='l2'*, *C=10*, *random_state=0*. Print out the accuracy on the test set to check whether everything looks good." 
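If you are unsure how to complete the training cell below, a minimal sketch that uses exactly the parameters stated above looks like this (it assumes `X_train_tfidf` and `y_train` produced in the previous step):

```python
# Minimal sketch with the stated parameters; assumes X_train_tfidf and y_train exist.
from sklearn.linear_model import LogisticRegression

intent_recognizer = LogisticRegression(penalty='l2', C=10, random_state=0)
intent_recognizer.fit(X_train_tfidf, y_train)
```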
285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": { 291 | "collapsed": true 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "from sklearn.linear_model import LogisticRegression\n", 296 | "from sklearn.metrics import accuracy_score" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": { 303 | "collapsed": true 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "######################################\n", 308 | "######### YOUR CODE HERE #############\n", 309 | "######################################" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "collapsed": true 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "# Check test accuracy.\n", 321 | "y_test_pred = intent_recognizer.predict(X_test_tfidf)\n", 322 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 323 | "print('Test accuracy = {}'.format(test_accuracy))" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "Dump the classifier to use it in the running bot." 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": { 337 | "collapsed": true 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "pickle.dump(intent_recognizer, open(RESOURCE_PATH['INTENT_RECOGNIZER'], 'wb'))" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "### Programming language classification " 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "We will train one more classifier for the programming-related questions. It will predict exactly one tag (=programming language) and will be also based on Logistic Regression with TF-IDF features. \n", 356 | "\n", 357 | "First, let us prepare the data for this task." 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "X = stackoverflow_df['title'].values\n", 369 | "y = stackoverflow_df['tag'].values" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "collapsed": true 377 | }, 378 | "outputs": [], 379 | "source": [ 380 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n", 381 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "Let us reuse the TF-IDF vectorizer that we have already created above. It should not make a huge difference which data was used to train it." 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": { 395 | "collapsed": true 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "vectorizer = pickle.load(open(RESOURCE_PATH['TFIDF_VECTORIZER'], 'rb'))\n", 400 | "\n", 401 | "X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "Train the **tag classifier** using OneVsRestClassifier wrapper over LogisticRegression. Use the following parameters: *penalty='l2'*, *C=5*, *random_state=0*." 
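As with the intent recognizer, here is a minimal sketch for the training cell below with the stated parameters (it assumes the TF-IDF features `X_train_tfidf` and labels `y_train` from the cells above):

```python
# Minimal sketch with the stated parameters; assumes X_train_tfidf and y_train exist.
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

tag_classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=5, random_state=0))
tag_classifier.fit(X_train_tfidf, y_train)
```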
409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": { 415 | "collapsed": true 416 | }, 417 | "outputs": [], 418 | "source": [ 419 | "from sklearn.multiclass import OneVsRestClassifier" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "collapsed": true 427 | }, 428 | "outputs": [], 429 | "source": [ 430 | "######################################\n", 431 | "######### YOUR CODE HERE #############\n", 432 | "######################################" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": { 439 | "collapsed": true 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "# Check test accuracy.\n", 444 | "y_test_pred = tag_classifier.predict(X_test_tfidf)\n", 445 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 446 | "print('Test accuracy = {}'.format(test_accuracy))" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "Dump the classifier to use it in the running bot." 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": { 460 | "collapsed": true 461 | }, 462 | "outputs": [], 463 | "source": [ 464 | "pickle.dump(tag_classifier, open(RESOURCE_PATH['TAG_CLASSIFIER'], 'wb'))" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "## Part II. Ranking questions with embeddings" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "To find a relevant answer (a thread from StackOverflow) on a question you will use vector representations to calculate similarity between the question and existing threads. We already had `question_to_vec` function from the assignment 3, which can create such a representation based on word vectors. \n", 479 | "\n", 480 | "However, it would be costly to compute such a representation for all possible answers in *online mode* of the bot (e.g. when bot is running and answering questions from many users). This is the reason why you will create a *database* with pre-computed representations. These representations will be arranged by non-overlaping tags (programming languages), so that the search of the answer can be performed only within one tag each time. This will make our bot even more efficient and allow not to store all the database in RAM. " 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "Load StarSpace embeddings which were trained on Stack Overflow posts. These embeddings were trained in *supervised mode* for duplicates detection on the same corpus that is used in search. We can account on that these representations will allow us to find closely related answers for a question. \n", 488 | "\n", 489 | "If for some reasons you didn't train StarSpace embeddings in the assignment 3, you can use [pre-trained word vectors](https://code.google.com/archive/p/word2vec/) from Google. All instructions about how to work with these vectors were provided in the same assignment. However, we highly recommend to use StartSpace's embeddings, because it contains more appropriate embeddings. If you chose to use Google's embeddings, delete the words, which is not in Stackoverflow data." 
490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": { 496 | "collapsed": true 497 | }, 498 | "outputs": [], 499 | "source": [ 500 | "starspace_embeddings, embeddings_dim = load_embeddings('data/word_embeddings.tsv')" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "Since we want to precompute representations for all possible answers, we need to load the whole posts dataset, unlike we did for the intent classifier:" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": { 514 | "collapsed": true 515 | }, 516 | "outputs": [], 517 | "source": [ 518 | "posts_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t')" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "Look at the distribution of posts for programming languages (tags) and find the most common ones. \n", 526 | "You might want to use pandas [groupby](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html) and [count](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.count.html) methods:" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": { 533 | "collapsed": true 534 | }, 535 | "outputs": [], 536 | "source": [ 537 | "counts_by_tag = ######### YOUR CODE HERE #############" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": {}, 543 | "source": [ 544 | "Now for each `tag` you need to create two data structures, which will serve as online search index:\n", 545 | "* `tag_post_ids` — a list of post_ids with shape `(counts_by_tag[tag],)`. It will be needed to show the title and link to the thread;\n", 546 | "* `tag_vectors` — a matrix with shape `(counts_by_tag[tag], embeddings_dim)` where embeddings for each answer are stored.\n", 547 | "\n", 548 | "Implement the code which will calculate the mentioned structures and dump it to files. It should take several minutes to compute it." 
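For reference, one way to fill the remaining gaps — the `counts_by_tag` cell above and the two blanks inside the dumping loop below — is sketched here; it mirrors the solution notebook in this project and assumes `question_to_vec` from `utils.py` together with the StarSpace embeddings loaded earlier:

```python
# Count how many posts fall under each tag (fills the cell above).
counts_by_tag = posts_df.groupby(['tag'])['tag'].count()

# Inside the loop of the next cell, the two blanks can be completed as:
#   tag_post_ids = tag_posts['post_id'].values
#   tag_vectors[i, :] = question_to_vec(title, starspace_embeddings, embeddings_dim)
```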
549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": { 555 | "collapsed": true 556 | }, 557 | "outputs": [], 558 | "source": [ 559 | "import os\n", 560 | "os.makedirs(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], exist_ok=True)\n", 561 | "\n", 562 | "for tag, count in counts_by_tag.items():\n", 563 | " tag_posts = posts_df[posts_df['tag'] == tag]\n", 564 | " \n", 565 | " tag_post_ids = ######### YOUR CODE HERE #############\n", 566 | " \n", 567 | " tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)\n", 568 | " for i, title in enumerate(tag_posts['title']):\n", 569 | " tag_vectors[i, :] = ######### YOUR CODE HERE #############\n", 570 | "\n", 571 | " # Dump post ids and vectors to a file.\n", 572 | " filename = os.path.join(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % tag))\n", 573 | " pickle.dump((tag_post_ids, tag_vectors), open(filename, 'wb'))" 574 | ] 575 | } 576 | ], 577 | "metadata": { 578 | "kernelspec": { 579 | "display_name": "Python 3", 580 | "language": "python", 581 | "name": "python3" 582 | }, 583 | "language_info": { 584 | "codemirror_mode": { 585 | "name": "ipython", 586 | "version": 3 587 | }, 588 | "file_extension": ".py", 589 | "mimetype": "text/x-python", 590 | "name": "python", 591 | "nbconvert_exporter": "python", 592 | "pygments_lexer": "ipython3", 593 | "version": "3.4.3" 594 | }, 595 | "latex_envs": { 596 | "bibliofile": "biblio.bib", 597 | "cite_by": "apalike", 598 | "current_citInitial": 1, 599 | "eqLabelWithNumbers": true, 600 | "eqNumInitial": 0 601 | } 602 | }, 603 | "nbformat": 4, 604 | "nbformat_minor": 2 605 | } 606 | -------------------------------------------------------------------------------- /week1/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | class Grader(object): 7 | def __init__(self): 8 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 9 | self.assignment_key = 'MSsYBMLgEeesWhJPHRLG5g' 10 | self.parts = OrderedDict([('f5nXa', 'TextPrepare'), 11 | ('hTrz8', 'WordsTagsCount'), 12 | ('0kUjR', 'BagOfWords'), 13 | ('tLJV1', 'MultilabelClassification')]) 14 | self.answers = {key: None for key in self.parts} 15 | 16 | @staticmethod 17 | def ravel_output(output): 18 | ''' 19 | If student accidentally submitted np.array with one 20 | element instead of number, this function will submit 21 | this number instead 22 | ''' 23 | if isinstance(output, np.ndarray) and output.size == 1: 24 | output = output.item(0) 25 | return output 26 | 27 | def submit(self, email, token): 28 | submission = { 29 | "assignmentKey": self.assignment_key, 30 | "submitterEmail": email, 31 | "secret": token, 32 | "parts": {} 33 | } 34 | for part, output in self.answers.items(): 35 | if output is not None: 36 | submission["parts"][part] = {"output": output} 37 | else: 38 | submission["parts"][part] = dict() 39 | request = requests.post(self.submission_page, data=json.dumps(submission)) 40 | response = request.json() 41 | if request.status_code == 201: 42 | print('Submitted to Coursera platform. 
See results on assignment page!') 43 | elif u'details' in response and u'learnerMessage' in response[u'details']: 44 | print(response[u'details'][u'learnerMessage']) 45 | else: 46 | print("Unknown response from Coursera: {}".format(request.status_code)) 47 | print(response) 48 | 49 | def status(self): 50 | print("You want to submit these parts:") 51 | for part_id, part_name in self.parts.items(): 52 | answer = self.answers[part_id] 53 | if answer is None: 54 | answer = '-'*10 55 | print("Task {}:\n {}".format(part_name, answer[:100] + '...')) 56 | 57 | def submit_part(self, part, output): 58 | self.answers[part] = output 59 | print("Current answer for task {} is:\n {}".format(self.parts[part], output[:100] + '...')) 60 | 61 | def submit_tag(self, tag, output): 62 | part_id = [k for k, v in self.parts.items() if v == tag] 63 | if len(part_id) != 1: 64 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 65 | part_id = part_id[0] 66 | self.submit_part(part_id, str(self.ravel_output(output))) 67 | -------------------------------------------------------------------------------- /week1/metrics.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.metrics import roc_curve, auc 4 | from scipy import interp 5 | from itertools import cycle 6 | 7 | def roc_auc(y_test, y_score, n_classes): 8 | """Plots ROC curve for micro and macro averaging.""" 9 | 10 | # Compute ROC curve and ROC area for each class 11 | fpr = {} 12 | tpr = {} 13 | roc_auc = {} 14 | for i in range(n_classes): 15 | fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) 16 | roc_auc[i] = auc(fpr[i], tpr[i]) 17 | 18 | # Compute micro-average ROC curve and ROC area 19 | fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) 20 | roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) 21 | 22 | # Compute macro-average ROC curve and ROC area 23 | all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)])) 24 | mean_tpr = np.zeros_like(all_fpr) 25 | for i in range(n_classes): 26 | mean_tpr += interp(all_fpr, fpr[i], tpr[i]) 27 | mean_tpr /= n_classes 28 | fpr["macro"] = all_fpr 29 | tpr["macro"] = mean_tpr 30 | roc_auc["macro"] = auc(fpr["macro"], tpr["macro"]) 31 | 32 | # Plot all ROC curves 33 | plt.figure() 34 | plt.plot(fpr["micro"], tpr["micro"], 35 | label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]), 36 | color='deeppink', linestyle=':', linewidth=4) 37 | 38 | plt.plot(fpr["macro"], tpr["macro"], 39 | label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]), 40 | color='navy', linestyle=':', linewidth=4) 41 | 42 | colors = cycle(['aqua', 'darkorange', 'cornflowerblue']) 43 | for i, color in zip(range(0,3), colors): 44 | plt.plot(fpr[i], tpr[i], color=color, lw=2, 45 | label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])) 46 | 47 | plt.plot([0, 1], [0, 1], 'k--', lw=2) 48 | plt.xlim([0.0, 1.0]) 49 | plt.ylim([0.0, 1.05]) 50 | plt.xlabel('False Positive Rate') 51 | plt.ylabel('True Positive Rate') 52 | plt.title('Some extension of ROC to multi-class') 53 | plt.legend(loc="lower right") 54 | plt.show() -------------------------------------------------------------------------------- /week1/week1-MultilabelClassification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | 
"# Predict tags on StackOverflow with linear models" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this assignment you will learn how to predict tags for posts from [StackOverflow](https://stackoverflow.com). To solve this task you will use multilabel classification approach.\n", 15 | "\n", 16 | "### Libraries\n", 17 | "\n", 18 | "In this task you will need the following libraries:\n", 19 | "- [Numpy](http://www.numpy.org) — a package for scientific computing.\n", 20 | "- [Pandas](https://pandas.pydata.org) — a library providing high-performance, easy-to-use data structures and data analysis tools for the Python\n", 21 | "- [scikit-learn](http://scikit-learn.org/stable/index.html) — a tool for data mining and data analysis.\n", 22 | "- [NLTK](http://www.nltk.org) — a platform to work with natural language." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Data\n", 30 | "\n", 31 | "The following cell will download all data required for this assignment into the folder `week1/data`." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import sys\n", 43 | "sys.path.append(\"..\")\n", 44 | "from common.download_utils import download_week1_resources\n", 45 | "\n", 46 | "download_week1_resources()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "### Grading\n", 54 | "We will create a grader instance below and use it to collect your answers. Note that these outputs will be stored locally inside grader and will be uploaded to platform only after running submitting function in the last part of this assignment. If you want to make partial submission, you can run that cell any time you want." 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "from grader import Grader" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "grader = Grader()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "### Text preprocessing" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "For this and most of the following assignments you will need to use a list of stop words. It can be downloaded from *nltk*:" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": true 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "import nltk\n", 102 | "nltk.download('stopwords')\n", 103 | "from nltk.corpus import stopwords" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "In this task you will deal with a dataset of post titles from StackOverflow. You are provided a split to 3 sets: *train*, *validation* and *test*. All corpora (except for *test*) contain titles of the posts and corresponding tags (100 tags are available). The *test* set is provided for Coursera's grading and doesn't contain answers. 
Upload the corpora using *pandas* and look at the data:" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "collapsed": true 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "from ast import literal_eval\n", 122 | "import pandas as pd\n", 123 | "import numpy as np" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "def read_data(filename):\n", 135 | " data = pd.read_csv(filename, sep='\\t')\n", 136 | " data['tags'] = data['tags'].apply(literal_eval)\n", 137 | " return data" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "train = read_data('data/train.tsv')\n", 149 | "validation = read_data('data/validation.tsv')\n", 150 | "test = pd.read_csv('data/test.tsv', sep='\\t')" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "collapsed": true 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "train.head()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "As you can see, *title* column contains titles of the posts and *tags* column contains the tags. It could be noticed that a number of tags for a post is not fixed and could be as many as necessary." 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "For a more comfortable usage, initialize *X_train*, *X_val*, *X_test*, *y_train*, *y_val*." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "X_train, y_train = train['title'].values, train['tags'].values\n", 187 | "X_val, y_val = validation['title'].values, validation['tags'].values\n", 188 | "X_test = test['title'].values" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "One of the most known difficulties when working with natural data is that it's unstructured. For example, if you use it \"as is\" and extract tokens just by splitting the titles by whitespaces, you will see that there are many \"weird\" tokens like *3.5?*, *\"Flip*, etc. To prevent the problems, it's usually useful to prepare the data somehow. In this task you'll write a function, which will be also used in the other assignments. \n", 196 | "\n", 197 | "**Task 1 (TextPrepare).** Implement the function *text_prepare* following the instructions. After that, run the function *test_test_prepare* to test it on tiny cases and submit it to Coursera." 
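If you need a hint for *text_prepare*, the sketch below simply follows the step-by-step comments of the template cell that comes next (lowercase, replace `REPLACE_BY_SPACE_RE` matches with spaces, strip `BAD_SYMBOLS_RE` matches, drop stopwords). It is one possible implementation, not the only valid one, and it reuses the constants defined in that template:

```python
import re

from nltk.corpus import stopwords

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """One possible implementation, following the hints in the template below."""
    text = text.lower()                         # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)   # replace REPLACE_BY_SPACE_RE symbols by spaces
    text = BAD_SYMBOLS_RE.sub('', text)         # delete symbols which are in BAD_SYMBOLS_RE
    text = ' '.join(w for w in text.split() if w not in STOPWORDS)  # delete stopwords
    return text
```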
198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "collapsed": true 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "import re" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "REPLACE_BY_SPACE_RE = re.compile('[/(){}\\[\\]\\|@,;]')\n", 220 | "BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')\n", 221 | "STOPWORDS = set(stopwords.words('english'))\n", 222 | "\n", 223 | "def text_prepare(text):\n", 224 | " \"\"\"\n", 225 | " text: a string\n", 226 | " \n", 227 | " return: modified initial string\n", 228 | " \"\"\"\n", 229 | " text = # lowercase text\n", 230 | " text = # replace REPLACE_BY_SPACE_RE symbols by space in text\n", 231 | " text = # delete symbols which are in BAD_SYMBOLS_RE from text\n", 232 | " text = # delete stopwords from text\n", 233 | " return text" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "def test_text_prepare():\n", 245 | " examples = [\"SQL Server - any equivalent of Excel's CHOOSE function?\",\n", 246 | " \"How to free c++ memory vector * arr?\"]\n", 247 | " answers = [\"sql server equivalent excels choose function\", \n", 248 | " \"free c++ memory vectorint arr\"]\n", 249 | " for ex, ans in zip(examples, answers):\n", 250 | " if text_prepare(ex) != ans:\n", 251 | " return \"Wrong answer for the case: '%s'\" % ex\n", 252 | " return 'Basic tests are passed.'" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "print(test_text_prepare())" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "Run your implementation for questions from file *text_prepare_tests.tsv* to earn the points." 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "collapsed": true 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "prepared_questions = []\n", 282 | "for line in open('data/text_prepare_tests.tsv', encoding='utf-8'):\n", 283 | " line = text_prepare(line.strip())\n", 284 | " prepared_questions.append(line)\n", 285 | "text_prepare_results = '\\n'.join(prepared_questions)\n", 286 | "\n", 287 | "grader.submit_tag('TextPrepare', text_prepare_results)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "Now we can preprocess the titles using function *text_prepare* and making sure that the headers don't have bad symbols:" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "collapsed": true 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "X_train = [text_prepare(x) for x in X_train]\n", 306 | "X_val = [text_prepare(x) for x in X_val]\n", 307 | "X_test = [text_prepare(x) for x in X_test]" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "collapsed": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "X_train[:3]" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "For each tag and for each word calculate how many times they occur in the train corpus. 
\n", 326 | "\n", 327 | "**Task 2 (WordsTagsCount).** Find 3 most popular tags and 3 most popular words in the train data and submit the results to earn the points." 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": { 334 | "collapsed": true 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "# Dictionary of all tags from train corpus with their counts.\n", 339 | "tags_counts = {}\n", 340 | "# Dictionary of all words from train corpus with their counts.\n", 341 | "words_counts = {}\n", 342 | "\n", 343 | "######################################\n", 344 | "######### YOUR CODE HERE #############\n", 345 | "######################################" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "We are assuming that *tags_counts* and *words_counts* are dictionaries like `{'some_word_or_tag': frequency}`. After applying the sorting procedure, results will be look like this: `[('most_popular_word_or_tag', frequency), ('less_popular_word_or_tag', frequency), ...]`. The grader gets the results in the following format (two comma-separated strings with line break):\n", 353 | "\n", 354 | " tag1,tag2,tag3\n", 355 | " word1,word2,word3\n", 356 | "\n", 357 | "Pay attention that in this assignment you should not submit frequencies or some additional information." 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "most_common_tags = sorted(tags_counts.items(), key=lambda x: x[1], reverse=True)[:3]\n", 369 | "most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:3]\n", 370 | "\n", 371 | "grader.submit_tag('WordsTagsCount', '%s\\n%s' % (','.join(tag for tag, _ in most_common_tags), \n", 372 | " ','.join(word for word, _ in most_common_words)))" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "### Transforming text to a vector\n", 380 | "\n", 381 | "Machine Learning algorithms work with numeric data and we cannot use the provided text data \"as is\". There are many ways to transform text data to numeric vectors. In this task you will try to use two of them.\n", 382 | "\n", 383 | "#### Bag of words\n", 384 | "\n", 385 | "One of the well-known approaches is a *bag-of-words* representation. To create this transformation, follow the steps:\n", 386 | "1. Find *N* most popular words in train corpus and numerate them. Now we have a dictionary of the most popular words.\n", 387 | "2. For each title in the corpora create a zero vector with the dimension equals to *N*.\n", 388 | "3. For each text in the corpora iterate over words which are in the dictionary and increase by 1 the corresponding coordinate.\n", 389 | "\n", 390 | "Let's try to do it for a toy example. 
Imagine that we have *N* = 4 and the list of the most popular words is \n", 391 | "\n", 392 | " ['hi', 'you', 'me', 'are']\n", 393 | "\n", 394 | "Then we need to numerate them, for example, like this: \n", 395 | "\n", 396 | " {'hi': 0, 'you': 1, 'me': 2, 'are': 3}\n", 397 | "\n", 398 | "And we have the text, which we want to transform to the vector:\n", 399 | "\n", 400 | " 'hi how are you'\n", 401 | "\n", 402 | "For this text we create a corresponding zero vector \n", 403 | "\n", 404 | " [0, 0, 0, 0]\n", 405 | " \n", 406 | "And iterate over all words, and if the word is in the dictionary, we increase the value of the corresponding position in the vector:\n", 407 | "\n", 408 | " 'hi': [1, 0, 0, 0]\n", 409 | " 'how': [1, 0, 0, 0] # word 'how' is not in our dictionary\n", 410 | " 'are': [1, 0, 0, 1]\n", 411 | " 'you': [1, 1, 0, 1]\n", 412 | "\n", 413 | "The resulting vector will be \n", 414 | "\n", 415 | " [1, 1, 0, 1]\n", 416 | " \n", 417 | "Implement the described encoding in the function *my_bag_of_words* with the size of the dictionary equals to 5000. To find the most common words use train data. You can test your code using the function *test_my_bag_of_words*." 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": { 424 | "collapsed": true 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "DICT_SIZE = 5000\n", 429 | "WORDS_TO_INDEX = ####### YOUR CODE HERE #######\n", 430 | "INDEX_TO_WORDS = ####### YOUR CODE HERE #######\n", 431 | "ALL_WORDS = WORDS_TO_INDEX.keys()\n", 432 | "\n", 433 | "def my_bag_of_words(text, words_to_index, dict_size):\n", 434 | " \"\"\"\n", 435 | " text: a string\n", 436 | " dict_size: size of the dictionary\n", 437 | " \n", 438 | " return a vector which is a bag-of-words representation of 'text'\n", 439 | " \"\"\"\n", 440 | " result_vector = np.zeros(dict_size)\n", 441 | " ######################################\n", 442 | " ######### YOUR CODE HERE #############\n", 443 | " ######################################\n", 444 | " return result_vector" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": { 451 | "collapsed": true 452 | }, 453 | "outputs": [], 454 | "source": [ 455 | "def test_my_bag_of_words():\n", 456 | " words_to_index = {'hi': 0, 'you': 1, 'me': 2, 'are': 3}\n", 457 | " examples = ['hi how are you']\n", 458 | " answers = [[1, 1, 0, 1]]\n", 459 | " for ex, ans in zip(examples, answers):\n", 460 | " if (my_bag_of_words(ex, words_to_index, 4) != ans).any():\n", 461 | " return \"Wrong answer for the case: '%s'\" % ex\n", 462 | " return 'Basic tests are passed.'" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": { 469 | "collapsed": true 470 | }, 471 | "outputs": [], 472 | "source": [ 473 | "print(test_my_bag_of_words())" 474 | ] 475 | }, 476 | { 477 | "cell_type": "markdown", 478 | "metadata": {}, 479 | "source": [ 480 | "Now apply the implemented function to all samples (this might take up to a minute):" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": { 487 | "collapsed": true 488 | }, 489 | "outputs": [], 490 | "source": [ 491 | "from scipy import sparse as sp_sparse" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": { 498 | "collapsed": true 499 | }, 500 | "outputs": [], 501 | "source": [ 502 | "X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for 
text in X_train])\n", 503 | "X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])\n", 504 | "X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])\n", 505 | "print('X_train shape ', X_train_mybag.shape)\n", 506 | "print('X_val shape ', X_val_mybag.shape)\n", 507 | "print('X_test shape ', X_test_mybag.shape)" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "As you might notice, we transform the data to sparse representation, to store the useful information efficiently. There are many [types](https://docs.scipy.org/doc/scipy/reference/sparse.html) of such representations, however sklearn algorithms can work only with [csr](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix) matrix, so we will use this one." 515 | ] 516 | }, 517 | { 518 | "cell_type": "markdown", 519 | "metadata": {}, 520 | "source": [ 521 | "**Task 3 (BagOfWords).** For the 11th row in *X_train_mybag* find how many non-zero elements it has. In this task the answer (variable *non_zero_elements_count*) should be a number, e.g. 20." 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": { 528 | "collapsed": true 529 | }, 530 | "outputs": [], 531 | "source": [ 532 | "row = X_train_mybag[10].toarray()[0]\n", 533 | "non_zero_elements_count = ####### YOUR CODE HERE #######\n", 534 | "\n", 535 | "grader.submit_tag('BagOfWords', str(non_zero_elements_count))" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "#### TF-IDF\n", 543 | "\n", 544 | "The second approach extends the bag-of-words framework by taking into account total frequencies of words in the corpora. It helps to penalize too frequent words and provide better features space. \n", 545 | "\n", 546 | "Implement function *tfidf_features* using class [TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) from *scikit-learn*. Use *train* corpus to train a vectorizer. Don't forget to take a look into the arguments that you can pass to it. We suggest that you filter out too rare words (occur less than in 5 titles) and too frequent words (occur more than in 90% of the titles). Also, use bigrams along with unigrams in your vocabulary. 
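Before moving on to TF-IDF, a possible completion of the bag-of-words part above (the dictionary of the 5000 most frequent training words and the *my_bag_of_words* function), plus a one-line way to answer Task 3.

```python
import numpy as np

DICT_SIZE = 5000

# Index the DICT_SIZE most frequent words of the train corpus (words_counts from Task 2).
most_common = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:DICT_SIZE]
WORDS_TO_INDEX = {word: i for i, (word, _) in enumerate(most_common)}
INDEX_TO_WORDS = {i: word for word, i in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()

def my_bag_of_words(text, words_to_index, dict_size):
    """Bag-of-words vector: position i counts how often the i-th dictionary word occurs."""
    result_vector = np.zeros(dict_size)
    for word in text.split():
        if word in words_to_index:
            result_vector[words_to_index[word]] += 1
    return result_vector

# Task 3: count the non-zero elements of the 11th row of the sparse matrix.
row = X_train_mybag[10].toarray()[0]
non_zero_elements_count = int(np.count_nonzero(row))   # or simply X_train_mybag[10].nnz
```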
" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": { 553 | "collapsed": true 554 | }, 555 | "outputs": [], 556 | "source": [ 557 | "from sklearn.feature_extraction.text import TfidfVectorizer" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": { 564 | "collapsed": true 565 | }, 566 | "outputs": [], 567 | "source": [ 568 | "def tfidf_features(X_train, X_val, X_test):\n", 569 | " \"\"\"\n", 570 | " X_train, X_val, X_test — samples \n", 571 | " return TF-IDF vectorized representation of each sample and vocabulary\n", 572 | " \"\"\"\n", 573 | " # Create TF-IDF vectorizer with a proper parameters choice\n", 574 | " # Fit the vectorizer on the train set\n", 575 | " # Transform the train, test, and val sets and return the result\n", 576 | " \n", 577 | " \n", 578 | " tfidf_vectorizer = ####### YOUR CODE HERE #######\n", 579 | " \n", 580 | " ######################################\n", 581 | " ######### YOUR CODE HERE #############\n", 582 | " ######################################\n", 583 | " \n", 584 | " return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": {}, 590 | "source": [ 591 | "Once you have done text preprocessing, always have a look at the results. Be very careful at this step, because the performance of future models will drastically depend on it. \n", 592 | "\n", 593 | "In this case, check whether you have c++ or c# in your vocabulary, as they are obviously important tokens in our tags prediction task:" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": null, 599 | "metadata": { 600 | "collapsed": true 601 | }, 602 | "outputs": [], 603 | "source": [ 604 | "X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)\n", 605 | "tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": null, 611 | "metadata": { 612 | "collapsed": true 613 | }, 614 | "outputs": [], 615 | "source": [ 616 | "######### YOUR CODE HERE #############" 617 | ] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": {}, 622 | "source": [ 623 | "If you can't find it, we need to understand how did it happen that we lost them? It happened during the built-in tokenization of TfidfVectorizer. Luckily, we can influence on this process. Get back to the function above and use '(\\S+)' regexp as a *token_pattern* in the constructor of the vectorizer. " 624 | ] 625 | }, 626 | { 627 | "cell_type": "markdown", 628 | "metadata": {}, 629 | "source": [ 630 | "Now, use this transormation for the data and check again." 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": null, 636 | "metadata": { 637 | "collapsed": true 638 | }, 639 | "outputs": [], 640 | "source": [ 641 | "######### YOUR CODE HERE #############" 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": {}, 647 | "source": [ 648 | "### MultiLabel classifier\n", 649 | "\n", 650 | "As we have noticed before, in this task each example can have multiple tags. To deal with such kind of prediction, we need to transform labels in a binary form and the prediction will be a mask of 0s and 1s. For this purpose it is convenient to use [MultiLabelBinarizer](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html) from *sklearn*." 
651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": { 657 | "collapsed": true 658 | }, 659 | "outputs": [], 660 | "source": [ 661 | "from sklearn.preprocessing import MultiLabelBinarizer" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": null, 667 | "metadata": { 668 | "collapsed": true 669 | }, 670 | "outputs": [], 671 | "source": [ 672 | "mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))\n", 673 | "y_train = mlb.fit_transform(y_train)\n", 674 | "y_val = mlb.fit_transform(y_val)" 675 | ] 676 | }, 677 | { 678 | "cell_type": "markdown", 679 | "metadata": {}, 680 | "source": [ 681 | "Implement the function *train_classifier* for training a classifier. In this task we suggest to use One-vs-Rest approach, which is implemented in [OneVsRestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) class. In this approach *k* classifiers (= number of tags) are trained. As a basic classifier, use [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html). It is one of the simplest methods, but often it performs good enough in text classification tasks. It might take some time, because a number of classifiers to train is large." 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "metadata": { 688 | "collapsed": true 689 | }, 690 | "outputs": [], 691 | "source": [ 692 | "from sklearn.multiclass import OneVsRestClassifier\n", 693 | "from sklearn.linear_model import LogisticRegression, RidgeClassifier" 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": null, 699 | "metadata": { 700 | "collapsed": true 701 | }, 702 | "outputs": [], 703 | "source": [ 704 | "def train_classifier(X_train, y_train):\n", 705 | " \"\"\"\n", 706 | " X_train, y_train — training data\n", 707 | " \n", 708 | " return: trained classifier\n", 709 | " \"\"\"\n", 710 | " \n", 711 | " # Create and fit LogisticRegression wraped into OneVsRestClassifier.\n", 712 | "\n", 713 | " ######################################\n", 714 | " ######### YOUR CODE HERE #############\n", 715 | " ###################################### " 716 | ] 717 | }, 718 | { 719 | "cell_type": "markdown", 720 | "metadata": {}, 721 | "source": [ 722 | "Train the classifiers for different data transformations: *bag-of-words* and *tf-idf*." 723 | ] 724 | }, 725 | { 726 | "cell_type": "code", 727 | "execution_count": null, 728 | "metadata": { 729 | "collapsed": true 730 | }, 731 | "outputs": [], 732 | "source": [ 733 | "classifier_mybag = train_classifier(X_train_mybag, y_train)\n", 734 | "classifier_tfidf = train_classifier(X_train_tfidf, y_train)" 735 | ] 736 | }, 737 | { 738 | "cell_type": "markdown", 739 | "metadata": {}, 740 | "source": [ 741 | "Now you can create predictions for the data. You will need two types of predictions: labels and scores." 
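A minimal completion of the *train_classifier* skeleton above: it wraps a plain LogisticRegression into a One-vs-Rest scheme, one binary classifier per tag. For Task 4 later on, the penalty type and the coefficient C could be exposed as extra arguments.

```python
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

def train_classifier(X_train, y_train):
    """Fit one LogisticRegression per tag via the One-vs-Rest approach."""
    clf = OneVsRestClassifier(LogisticRegression())
    clf.fit(X_train, y_train)
    return clf
```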
742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": null, 747 | "metadata": { 748 | "collapsed": true 749 | }, 750 | "outputs": [], 751 | "source": [ 752 | "y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)\n", 753 | "y_val_predicted_scores_mybag = classifier_mybag.decision_function(X_val_mybag)\n", 754 | "\n", 755 | "y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)\n", 756 | "y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)" 757 | ] 758 | }, 759 | { 760 | "cell_type": "markdown", 761 | "metadata": {}, 762 | "source": [ 763 | "Now take a look at how classifier, which uses TF-IDF, works for a few examples:" 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": null, 769 | "metadata": { 770 | "collapsed": true 771 | }, 772 | "outputs": [], 773 | "source": [ 774 | "y_val_pred_inversed = mlb.inverse_transform(y_val_predicted_labels_tfidf)\n", 775 | "y_val_inversed = mlb.inverse_transform(y_val)\n", 776 | "for i in range(3):\n", 777 | " print('Title:\\t{}\\nTrue labels:\\t{}\\nPredicted labels:\\t{}\\n\\n'.format(\n", 778 | " X_val[i],\n", 779 | " ','.join(y_val_inversed[i]),\n", 780 | " ','.join(y_val_pred_inversed[i])\n", 781 | " ))" 782 | ] 783 | }, 784 | { 785 | "cell_type": "markdown", 786 | "metadata": {}, 787 | "source": [ 788 | "Now, we would need to compare the results of different predictions, e.g. to see whether TF-IDF transformation helps or to try different regularization techniques in logistic regression. For all these experiments, we need to setup evaluation procedure. " 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "metadata": {}, 794 | "source": [ 795 | "### Evaluation\n", 796 | "\n", 797 | "To evaluate the results we will use several classification metrics:\n", 798 | " - [Accuracy](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)\n", 799 | " - [F1-score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)\n", 800 | " - [Area under ROC-curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html)\n", 801 | " - [Area under precision-recall curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html#sklearn.metrics.average_precision_score) \n", 802 | " \n", 803 | "Make sure you are familiar with all of them. How would you expect the things work for the multi-label scenario? Read about micro/macro/weighted averaging following the sklearn links provided above." 
804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": null, 809 | "metadata": { 810 | "collapsed": true 811 | }, 812 | "outputs": [], 813 | "source": [ 814 | "from sklearn.metrics import accuracy_score\n", 815 | "from sklearn.metrics import f1_score\n", 816 | "from sklearn.metrics import roc_auc_score \n", 817 | "from sklearn.metrics import average_precision_score\n", 818 | "from sklearn.metrics import recall_score" 819 | ] 820 | }, 821 | { 822 | "cell_type": "markdown", 823 | "metadata": {}, 824 | "source": [ 825 | "Implement the function *print_evaluation_scores* which calculates and prints to stdout:\n", 826 | " - *accuracy*\n", 827 | " - *F1-score macro/micro/weighted*\n", 828 | " - *Precision macro/micro/weighted*" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": null, 834 | "metadata": { 835 | "collapsed": true 836 | }, 837 | "outputs": [], 838 | "source": [ 839 | "def print_evaluation_scores(y_val, predicted):\n", 840 | " \n", 841 | " ######################################\n", 842 | " ######### YOUR CODE HERE #############\n", 843 | " ######################################" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": null, 849 | "metadata": { 850 | "collapsed": true 851 | }, 852 | "outputs": [], 853 | "source": [ 854 | "print('Bag-of-words')\n", 855 | "print_evaluation_scores(y_val, y_val_predicted_labels_mybag)\n", 856 | "print('Tfidf')\n", 857 | "print_evaluation_scores(y_val, y_val_predicted_labels_tfidf)" 858 | ] 859 | }, 860 | { 861 | "cell_type": "markdown", 862 | "metadata": {}, 863 | "source": [ 864 | "You might also want to plot some generalization of the [ROC curve](http://scikit-learn.org/stable/modules/model_evaluation.html#receiver-operating-characteristic-roc) for the case of multi-label classification. Provided function *roc_auc* can make it for you. The input parameters of this function are:\n", 865 | " - true labels\n", 866 | " - decision functions scores\n", 867 | " - number of classes" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": null, 873 | "metadata": { 874 | "collapsed": true 875 | }, 876 | "outputs": [], 877 | "source": [ 878 | "from metrics import roc_auc\n", 879 | "%matplotlib inline" 880 | ] 881 | }, 882 | { 883 | "cell_type": "code", 884 | "execution_count": null, 885 | "metadata": { 886 | "collapsed": true 887 | }, 888 | "outputs": [], 889 | "source": [ 890 | "n_classes = len(tags_counts)\n", 891 | "roc_auc(y_val, y_val_predicted_scores_mybag, n_classes)" 892 | ] 893 | }, 894 | { 895 | "cell_type": "code", 896 | "execution_count": null, 897 | "metadata": { 898 | "collapsed": true 899 | }, 900 | "outputs": [], 901 | "source": [ 902 | "n_classes = len(tags_counts)\n", 903 | "roc_auc(y_val, y_val_predicted_scores_tfidf, n_classes)" 904 | ] 905 | }, 906 | { 907 | "cell_type": "markdown", 908 | "metadata": {}, 909 | "source": [ 910 | "**Task 4 (MultilabelClassification).** Once we have the evaluation set up, we suggest that you experiment a bit with training your classifiers. We will use *F1-score weighted* as an evaluation metric. Our recommendation:\n", 911 | "- compare the quality of the bag-of-words and TF-IDF approaches and chose one of them.\n", 912 | "- for the chosen one, try *L1* and *L2*-regularization techniques in Logistic Regression with different coefficients (e.g. C equal to 0.1, 1, 10, 100).\n", 913 | "\n", 914 | "You also could try other improvements of the preprocessing / model, if you want. 
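A possible *print_evaluation_scores* implementation, interpreting the requested precision scores as *average_precision_score* (which is among the imports above); accuracy in the multi-label case is the exact-match ratio over whole label sets.

```python
from sklearn.metrics import accuracy_score, f1_score, average_precision_score

def print_evaluation_scores(y_val, predicted):
    print('Accuracy:', accuracy_score(y_val, predicted))
    for average in ['macro', 'micro', 'weighted']:
        print('F1 ({}):'.format(average),
              f1_score(y_val, predicted, average=average))
        print('Average precision ({}):'.format(average),
              average_precision_score(y_val, predicted, average=average))
```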
" 915 | ] 916 | }, 917 | { 918 | "cell_type": "code", 919 | "execution_count": null, 920 | "metadata": { 921 | "collapsed": true 922 | }, 923 | "outputs": [], 924 | "source": [ 925 | "######################################\n", 926 | "######### YOUR CODE HERE #############\n", 927 | "######################################" 928 | ] 929 | }, 930 | { 931 | "cell_type": "markdown", 932 | "metadata": {}, 933 | "source": [ 934 | "When you are happy with the quality, create predictions for *test* set, which you will submit to Coursera." 935 | ] 936 | }, 937 | { 938 | "cell_type": "code", 939 | "execution_count": null, 940 | "metadata": { 941 | "collapsed": true 942 | }, 943 | "outputs": [], 944 | "source": [ 945 | "test_predictions = ######### YOUR CODE HERE #############\n", 946 | "test_pred_inversed = mlb.inverse_transform(test_predictions)\n", 947 | "\n", 948 | "test_predictions_for_submission = '\\n'.join('%i\\t%s' % (i, ','.join(row)) for i, row in enumerate(test_pred_inversed))\n", 949 | "grader.submit_tag('MultilabelClassification', test_predictions_for_submission)" 950 | ] 951 | }, 952 | { 953 | "cell_type": "markdown", 954 | "metadata": {}, 955 | "source": [ 956 | "### Analysis of the most important features" 957 | ] 958 | }, 959 | { 960 | "cell_type": "markdown", 961 | "metadata": {}, 962 | "source": [ 963 | "Finally, it is usually a good idea to look at the features (words or n-grams) that are used with the largest weigths in your logistic regression model." 964 | ] 965 | }, 966 | { 967 | "cell_type": "markdown", 968 | "metadata": {}, 969 | "source": [ 970 | "Implement the function *print_words_for_tag* to find them. Get back to sklearn documentation on [OneVsRestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) and [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) if needed." 971 | ] 972 | }, 973 | { 974 | "cell_type": "code", 975 | "execution_count": null, 976 | "metadata": { 977 | "collapsed": true 978 | }, 979 | "outputs": [], 980 | "source": [ 981 | "def print_words_for_tag(classifier, tag, tags_classes, index_to_words, all_words):\n", 982 | " \"\"\"\n", 983 | " classifier: trained classifier\n", 984 | " tag: particular tag\n", 985 | " tags_classes: a list of classes names from MultiLabelBinarizer\n", 986 | " index_to_words: index_to_words transformation\n", 987 | " all_words: all words in the dictionary\n", 988 | " \n", 989 | " return nothing, just print top 5 positive and top 5 negative words for current tag\n", 990 | " \"\"\"\n", 991 | " print('Tag:\\t{}'.format(tag))\n", 992 | " \n", 993 | " # Extract an estimator from the classifier for the given tag.\n", 994 | " # Extract feature coefficients from the estimator. 
\n", 995 | " \n", 996 | " ######################################\n", 997 | " ######### YOUR CODE HERE #############\n", 998 | " ######################################\n", 999 | " \n", 1000 | " top_positive_words = # top-5 words sorted by the coefficiens.\n", 1001 | " top_negative_words = # bottom-5 words sorted by the coefficients.\n", 1002 | " print('Top positive words:\\t{}'.format(', '.join(top_positive_words)))\n", 1003 | " print('Top negative words:\\t{}\\n'.format(', '.join(top_negative_words)))" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "code", 1008 | "execution_count": null, 1009 | "metadata": { 1010 | "collapsed": true 1011 | }, 1012 | "outputs": [], 1013 | "source": [ 1014 | "print_words_for_tag(classifier_tfidf, 'c', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)\n", 1015 | "print_words_for_tag(classifier_tfidf, 'c++', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)\n", 1016 | "print_words_for_tag(classifier_tfidf, 'linux', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)" 1017 | ] 1018 | }, 1019 | { 1020 | "cell_type": "markdown", 1021 | "metadata": {}, 1022 | "source": [ 1023 | "### Authorization & Submission\n", 1024 | "To submit assignment parts to Cousera platform, please, enter your e-mail and token into variables below. You can generate token on this programming assignment page. Note: Token expires 30 minutes after generation." 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": null, 1030 | "metadata": { 1031 | "collapsed": true 1032 | }, 1033 | "outputs": [], 1034 | "source": [ 1035 | "grader.status()" 1036 | ] 1037 | }, 1038 | { 1039 | "cell_type": "code", 1040 | "execution_count": null, 1041 | "metadata": { 1042 | "collapsed": true 1043 | }, 1044 | "outputs": [], 1045 | "source": [ 1046 | "STUDENT_EMAIL = # EMAIL \n", 1047 | "STUDENT_TOKEN = # TOKEN \n", 1048 | "grader.status()" 1049 | ] 1050 | }, 1051 | { 1052 | "cell_type": "markdown", 1053 | "metadata": {}, 1054 | "source": [ 1055 | "If you want to submit these answers, run cell below" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "code", 1060 | "execution_count": null, 1061 | "metadata": { 1062 | "collapsed": true 1063 | }, 1064 | "outputs": [], 1065 | "source": [ 1066 | "grader.submit(STUDENT_EMAIL, STUDENT_TOKEN)" 1067 | ] 1068 | } 1069 | ], 1070 | "metadata": { 1071 | "kernelspec": { 1072 | "display_name": "Python 3", 1073 | "language": "python", 1074 | "name": "python3" 1075 | }, 1076 | "language_info": { 1077 | "codemirror_mode": { 1078 | "name": "ipython", 1079 | "version": 3 1080 | }, 1081 | "file_extension": ".py", 1082 | "mimetype": "text/x-python", 1083 | "name": "python", 1084 | "nbconvert_exporter": "python", 1085 | "pygments_lexer": "ipython3", 1086 | "version": "3.5.2" 1087 | } 1088 | }, 1089 | "nbformat": 4, 1090 | "nbformat_minor": 2 1091 | } 1092 | -------------------------------------------------------------------------------- /week2/evaluation.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | def _update_chunk(candidate, prev, current_tag, current_chunk, current_pos, prediction=False): 4 | if candidate == 'B-' + current_tag: 5 | if len(current_chunk) > 0 and len(current_chunk[-1]) == 1: 6 | current_chunk[-1].append(current_pos - 1) 7 | current_chunk.append([current_pos]) 8 | elif candidate == 'I-' + current_tag: 9 | if prediction and (current_pos == 0 or current_pos > 0 and prev.split('-', 1)[-1] != current_tag): 10 | current_chunk.append([current_pos]) 11 | if not prediction and 
(current_pos == 0 or current_pos > 0 and prev == 'O'): 12 | current_chunk.append([current_pos]) 13 | elif current_pos > 0 and prev.split('-', 1)[-1] == current_tag: 14 | if len(current_chunk) > 0: 15 | current_chunk[-1].append(current_pos - 1) 16 | 17 | def _update_last_chunk(current_chunk, current_pos): 18 | if len(current_chunk) > 0 and len(current_chunk[-1]) == 1: 19 | current_chunk[-1].append(current_pos - 1) 20 | 21 | def _tag_precision_recall_f1(tp, fp, fn): 22 | precision, recall, f1 = 0, 0, 0 23 | if tp + fp > 0: 24 | precision = tp / (tp + fp) * 100 25 | if tp + fn > 0: 26 | recall = tp / (tp + fn) * 100 27 | if precision + recall > 0: 28 | f1 = 2 * precision * recall / (precision + recall) 29 | return precision, recall, f1 30 | 31 | def _aggregate_metrics(results, total_correct): 32 | total_true_entities = 0 33 | total_predicted_entities = 0 34 | total_precision = 0 35 | total_recall = 0 36 | total_f1 = 0 37 | for tag, tag_metrics in results.items(): 38 | n_pred = tag_metrics['n_predicted_entities'] 39 | n_true = tag_metrics['n_true_entities'] 40 | total_true_entities += n_true 41 | total_predicted_entities += n_pred 42 | total_precision += tag_metrics['precision'] * n_pred 43 | total_recall += tag_metrics['recall'] * n_true 44 | 45 | accuracy = total_correct / total_true_entities * 100 46 | if total_predicted_entities > 0: 47 | total_precision = total_precision / total_predicted_entities 48 | total_recall = total_recall / total_true_entities 49 | if total_precision + total_recall > 0: 50 | total_f1 = 2 * total_precision * total_recall / (total_precision + total_recall) 51 | return total_true_entities, total_predicted_entities, \ 52 | total_precision, total_recall, total_f1, accuracy 53 | 54 | def _print_info(n_tokens, total_true_entities, total_predicted_entities, total_correct): 55 | print('processed {len} tokens ' \ 56 | 'with {tot_true} phrases; ' \ 57 | 'found: {tot_pred} phrases; ' \ 58 | 'correct: {tot_cor}.\n'.format(len=n_tokens, 59 | tot_true=total_true_entities, 60 | tot_pred=total_predicted_entities, 61 | tot_cor=total_correct)) 62 | 63 | def _print_metrics(accuracy, total_precision, total_recall, total_f1): 64 | print('precision: {tot_prec:.2f}%; ' \ 65 | 'recall: {tot_recall:.2f}%; ' \ 66 | 'F1: {tot_f1:.2f}\n'.format(acc=accuracy, 67 | tot_prec=total_precision, 68 | tot_recall=total_recall, 69 | tot_f1=total_f1)) 70 | 71 | def _print_tag_metrics(tag, tag_results): 72 | print(('\t%12s' % tag) + ': precision: {tot_prec:6.2f}%; ' \ 73 | 'recall: {tot_recall:6.2f}%; ' \ 74 | 'F1: {tot_f1:6.2f}; ' \ 75 | 'predicted: {tot_predicted:4d}\n'.format(tot_prec=tag_results['precision'], 76 | tot_recall=tag_results['recall'], 77 | tot_f1=tag_results['f1'], 78 | tot_predicted=tag_results['n_predicted_entities'])) 79 | 80 | def precision_recall_f1(y_true, y_pred, print_results=True, short_report=False): 81 | # Find all tags 82 | tags = sorted(set(tag[2:] for tag in y_true + y_pred if tag != 'O')) 83 | 84 | results = OrderedDict((tag, OrderedDict()) for tag in tags) 85 | n_tokens = len(y_true) 86 | total_correct = 0 87 | 88 | # For eval_conll_try we find all chunks in the ground truth and prediction 89 | # For each chunk we store starting and ending indices 90 | for tag in tags: 91 | true_chunk = list() 92 | predicted_chunk = list() 93 | for position in range(n_tokens): 94 | _update_chunk(y_true[position], y_true[position - 1], tag, true_chunk, position) 95 | _update_chunk(y_pred[position], y_pred[position - 1], tag, predicted_chunk, position, True) 96 | 97 | 
_update_last_chunk(true_chunk, position) 98 | _update_last_chunk(predicted_chunk, position) 99 | 100 | # Then we find all correctly classified intervals 101 | # True positive results 102 | tp = sum(chunk in predicted_chunk for chunk in true_chunk) 103 | total_correct += tp 104 | 105 | # And then just calculate errors of the first and second kind 106 | # False negative 107 | fn = len(true_chunk) - tp 108 | # False positive 109 | fp = len(predicted_chunk) - tp 110 | precision, recall, f1 = _tag_precision_recall_f1(tp, fp, fn) 111 | 112 | results[tag]['precision'] = precision 113 | results[tag]['recall'] = recall 114 | results[tag]['f1'] = f1 115 | results[tag]['n_predicted_entities'] = len(predicted_chunk) 116 | results[tag]['n_true_entities'] = len(true_chunk) 117 | 118 | total_true_entities, total_predicted_entities, \ 119 | total_precision, total_recall, total_f1, accuracy = _aggregate_metrics(results, total_correct) 120 | 121 | if print_results: 122 | _print_info(n_tokens, total_true_entities, total_predicted_entities, total_correct) 123 | _print_metrics(accuracy, total_precision, total_recall, total_f1) 124 | 125 | if not short_report: 126 | for tag, tag_results in results.items(): 127 | _print_tag_metrics(tag, tag_results) 128 | return results 129 | -------------------------------------------------------------------------------- /week2/week2-NER.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Recognize named entities on Twitter with LSTMs\n", 10 | "\n", 11 | "In this assignment, you will use a recurrent neural network to solve Named Entity Recognition (NER) problem. NER is a common task in natural language processing systems. It serves for extraction such entities from the text as persons, organizations, locations, etc. In this task you will experiment to recognize named entities from Twitter.\n", 12 | "\n", 13 | "For example, we want to extract persons' and organizations' names from the text. Than for the input text:\n", 14 | "\n", 15 | " Ian Goodfellow works for Google Brain\n", 16 | "\n", 17 | "a NER model needs to provide the following sequence of tags:\n", 18 | "\n", 19 | " B-PER I-PER O O B-ORG I-ORG\n", 20 | "\n", 21 | "Where *B-* and *I-* prefixes stand for the beginning and inside of the entity, while *O* stands for out of tag or no tag. Markup with the prefix scheme is called *BIO markup*. This markup is introduced for distinguishing of consequent entities with similar types.\n", 22 | "\n", 23 | "A solution of the task will be based on neural networks, particularly, on Bi-Directional Long Short-Term Memory Networks (Bi-LSTMs).\n", 24 | "\n", 25 | "### Libraries\n", 26 | "\n", 27 | "For this task you will need the following libraries:\n", 28 | " - [Tensorflow](https://www.tensorflow.org) — an open-source software library for Machine Intelligence.\n", 29 | " - [Numpy](http://www.numpy.org) — a package for scientific computing.\n", 30 | " \n", 31 | "If you have never worked with Tensorflow, you would probably need to read some tutorials during your work on this assignment, e.g. [this one](https://www.tensorflow.org/tutorials/recurrent) could be a good starting point. " 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "### Data\n", 39 | "\n", 40 | "The following cell will download all data required for this assignment into the folder `week2/data`." 
41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "import sys\n", 52 | "sys.path.append(\"..\")\n", 53 | "from common.download_utils import download_week2_resources\n", 54 | "\n", 55 | "download_week2_resources()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Load the Twitter Named Entity Recognition corpus\n", 63 | "\n", 64 | "We will work with a corpus, which contains twits with NE tags. Every line of a file contains a pair of a token (word/punctuation symbol) and a tag, separated by a whitespace. Different tweets are separated by an empty line.\n", 65 | "\n", 66 | "The function *read_data* reads a corpus from the *file_path* and returns two lists: one with tokens and one with the corresponding tags. You need to complete this function by adding a code, which will replace a user's nickname to `` token and any URL to `` token. You could think that a URL and a nickname are just strings which start with *http://* or *https://* in case of URLs and a *@* symbol for nicknames." 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "def read_data(file_path):\n", 78 | " tokens = []\n", 79 | " tags = []\n", 80 | " \n", 81 | " tweet_tokens = []\n", 82 | " tweet_tags = []\n", 83 | " for line in open(file_path, encoding='utf-8'):\n", 84 | " line = line.strip()\n", 85 | " if not line:\n", 86 | " if tweet_tokens:\n", 87 | " tokens.append(tweet_tokens)\n", 88 | " tags.append(tweet_tags)\n", 89 | " tweet_tokens = []\n", 90 | " tweet_tags = []\n", 91 | " else:\n", 92 | " token, tag = line.split()\n", 93 | " # Replace all urls with token\n", 94 | " # Replace all users with token\n", 95 | "\n", 96 | " ######################################\n", 97 | " ######### YOUR CODE HERE #############\n", 98 | " ######################################\n", 99 | " \n", 100 | " tweet_tokens.append(token)\n", 101 | " tweet_tags.append(tag)\n", 102 | " \n", 103 | " return tokens, tags" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "And now we can load three separate parts of the dataset:\n", 111 | " - *train* data for training the model;\n", 112 | " - *validation* data for evaluation and hyperparameters tuning;\n", 113 | " - *test* data for final evaluation of the model." 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "train_tokens, train_tags = read_data('data/train.txt')\n", 125 | "validation_tokens, validation_tags = read_data('data/validation.txt')\n", 126 | "test_tokens, test_tags = read_data('data/test.txt')" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "You should always understand what kind of data you deal with. 
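For the URL and nickname replacement in *read_data* above, a small helper along these lines would do. The literal placeholder strings were lost in this copy of the notebook, so '<URL>' and '<USR>' below are assumed names; use whatever tokens the assignment expects.

```python
def normalize_token(token, url_token='<URL>', usr_token='<USR>'):
    """Map URLs and @nicknames to placeholder tokens (placeholder names are assumptions)."""
    if token.startswith(('http://', 'https://')):
        return url_token
    if token.startswith('@'):
        return usr_token
    return token

# Inside the else-branch of read_data:
#     token = normalize_token(token)
```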
For this purpose, you can print the data running the following cell:" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "for i in range(3):\n", 145 | " for token, tag in zip(train_tokens[i], train_tags[i]):\n", 146 | " print('%s\\t%s' % (token, tag))\n", 147 | " print()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "### Prepare dictionaries\n", 155 | "\n", 156 | "To train a neural network, we will use two mappings: \n", 157 | "- {token}$\\to${token id}: address the row in embeddings matrix for the current token;\n", 158 | "- {tag}$\\to${tag id}: one-hot ground truth probability distribution vectors for computing the loss at the output of the network.\n", 159 | "\n", 160 | "Now you need to implement the function *build_dict* which will return {token or tag}$\\to${index} and vice versa. " 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "collapsed": true 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "from collections import defaultdict" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "collapsed": true 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "def build_dict(tokens_or_tags, special_tokens):\n", 183 | " \"\"\"\n", 184 | " tokens_or_tags: a list of lists of tokens or tags\n", 185 | " special_tokens: some special tokens\n", 186 | " \"\"\"\n", 187 | " # Create a dictionary with default value 0\n", 188 | " tok2idx = defaultdict(lambda: 0)\n", 189 | " idx2tok = []\n", 190 | " \n", 191 | " # Create mappings from tokens (or tags) to indices and vice versa.\n", 192 | " # At first, add special tokens (or tags) to the dictionaries.\n", 193 | " # The first special token must have index 0.\n", 194 | " \n", 195 | " # Mapping tok2idx should contain each token or tag only once. \n", 196 | " # To do so, you should:\n", 197 | " # 1. extract unique tokens/tags from the tokens_or_tags variable, which is not\n", 198 | " # occure in special_tokens (because they could have non-empty intersection)\n", 199 | " # 2. index them (for example, you can add them into the list idx2tok\n", 200 | " # 3. for each token/tag save the index into tok2idx).\n", 201 | " \n", 202 | " ######################################\n", 203 | " ######### YOUR CODE HERE #############\n", 204 | " ######################################\n", 205 | " \n", 206 | " return tok2idx, idx2tok" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "After implementing the function *build_dict* you can make dictionaries for tokens and tags. Special tokens in our case will be:\n", 214 | " - `` token for out of vocabulary tokens;\n", 215 | " - `` token for padding sentence to the same length when we create batches of sentences." 
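A possible *build_dict* implementation: special tokens are added first (so the very first one gets index 0, which is also the default value of the defaultdict), and every remaining token or tag is indexed only once.

```python
from collections import defaultdict

def build_dict(tokens_or_tags, special_tokens):
    """
    tokens_or_tags: a list of lists of tokens or tags
    special_tokens: some special tokens
    """
    tok2idx = defaultdict(lambda: 0)
    idx2tok = []

    # Special tokens first; the first one gets index 0.
    for token in special_tokens:
        if token not in tok2idx:
            tok2idx[token] = len(idx2tok)
            idx2tok.append(token)

    # Then every token/tag from the corpus, each added only once.
    for token_list in tokens_or_tags:
        for token in token_list:
            if token not in tok2idx:
                tok2idx[token] = len(idx2tok)
                idx2tok.append(token)

    return tok2idx, idx2tok
```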
216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "special_tokens = ['', '']\n", 227 | "special_tags = ['O']\n", 228 | "\n", 229 | "# Create dictionaries \n", 230 | "token2idx, idx2token = build_dict(train_tokens + validation_tokens, special_tokens)\n", 231 | "tag2idx, idx2tag = build_dict(train_tags, special_tags)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "The next additional functions will help you to create the mapping between tokens and ids for a sentence. " 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "collapsed": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "def words2idxs(tokens_list):\n", 250 | " return [token2idx[word] for word in tokens_list]\n", 251 | "\n", 252 | "def tags2idxs(tags_list):\n", 253 | " return [tag2idx[tag] for tag in tags_list]\n", 254 | "\n", 255 | "def idxs2words(idxs):\n", 256 | " return [idx2token[idx] for idx in idxs]\n", 257 | "\n", 258 | "def idxs2tags(idxs):\n", 259 | " return [idx2tag[idx] for idx in idxs]" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "### Generate batches\n", 267 | "\n", 268 | "Neural Networks are usually trained with batches. It means that weight updates of the network are based on several sequences at every single time. The tricky part is that all sequences within a batch need to have the same length. So we will pad them with a special `` token. It is also a good practice to provide RNN with sequence lengths, so it can skip computations for padding parts. We provide the batching function *batches_generator* readily available for you to save time. 
" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": true 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "def batches_generator(batch_size, tokens, tags,\n", 280 | " shuffle=True, allow_smaller_last_batch=True):\n", 281 | " \"\"\"Generates padded batches of tokens and tags.\"\"\"\n", 282 | " \n", 283 | " n_samples = len(tokens)\n", 284 | " if shuffle:\n", 285 | " order = np.random.permutation(n_samples)\n", 286 | " else:\n", 287 | " order = np.arange(n_samples)\n", 288 | "\n", 289 | " n_batches = n_samples // batch_size\n", 290 | " if allow_smaller_last_batch and n_samples % batch_size:\n", 291 | " n_batches += 1\n", 292 | "\n", 293 | " for k in range(n_batches):\n", 294 | " batch_start = k * batch_size\n", 295 | " batch_end = min((k + 1) * batch_size, n_samples)\n", 296 | " current_batch_size = batch_end - batch_start\n", 297 | " x_list = []\n", 298 | " y_list = []\n", 299 | " max_len_token = 0\n", 300 | " for idx in order[batch_start: batch_end]:\n", 301 | " x_list.append(words2idxs(tokens[idx]))\n", 302 | " y_list.append(tags2idxs(tags[idx]))\n", 303 | " max_len_token = max(max_len_token, len(tags[idx]))\n", 304 | " \n", 305 | " # Fill in the data into numpy nd-arrays filled with padding indices.\n", 306 | " x = np.ones([current_batch_size, max_len_token], dtype=np.int32) * token2idx['']\n", 307 | " y = np.ones([current_batch_size, max_len_token], dtype=np.int32) * tag2idx['O']\n", 308 | " lengths = np.zeros(current_batch_size, dtype=np.int32)\n", 309 | " for n in range(current_batch_size):\n", 310 | " utt_len = len(x_list[n])\n", 311 | " x[n, :utt_len] = x_list[n]\n", 312 | " lengths[n] = utt_len\n", 313 | " y[n, :utt_len] = y_list[n]\n", 314 | " yield x, y, lengths" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "## Build a recurrent neural network\n", 322 | "\n", 323 | "This is the most important part of the assignment. Here we will specify the network architecture based on TensorFlow building blocks. It's fun and easy as a lego constructor! We will create an LSTM network which will produce probability distribution over tags for each token in a sentence. To take into account both right and left contexts of the token, we will use Bi-Directional LSTM (Bi-LSTM). Dense layer will be used on top to perform tag classification. " 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "collapsed": true 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "import tensorflow as tf\n", 335 | "import numpy as np" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": { 342 | "collapsed": true 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "class BiLSTMModel():\n", 347 | " pass" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": { 353 | "collapsed": true 354 | }, 355 | "source": [ 356 | "First, we need to create [placeholders](https://www.tensorflow.org/versions/master/api_docs/python/tf/placeholder) to specify what data we are going to feed into the network during the execution time. 
For this task we will need the following placeholders:\n", 357 | " - *input_batch* — sequences of words (the shape equals to [batch_size, sequence_len]);\n", 358 | " - *ground_truth_tags* — sequences of tags (the shape equals to [batch_size, sequence_len]);\n", 359 | " - *lengths* — lengths of not padded sequences (the shape equals to [batch_size]);\n", 360 | " - *dropout_ph* — dropout keep probability; this placeholder has a predefined value 1;\n", 361 | " - *learning_rate_ph* — learning rate; we need this placeholder because we want to change the value during training.\n", 362 | "\n", 363 | "It could be noticed that we use *None* in the shapes in the declaration, which means that data of any size can be feeded. \n", 364 | "\n", 365 | "You need to complete the function *declare_placeholders*." 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": { 372 | "collapsed": true 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "def declare_placeholders(self):\n", 377 | " \"\"\"Specifies placeholders for the model.\"\"\"\n", 378 | "\n", 379 | " # Placeholders for input and ground truth output.\n", 380 | " self.input_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch') \n", 381 | " self.ground_truth_tags = ######### YOUR CODE HERE #############\n", 382 | " \n", 383 | " # Placeholder for lengths of the sequences.\n", 384 | " self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths') \n", 385 | " \n", 386 | " # Placeholder for a dropout keep probability. If we don't feed\n", 387 | " # a value for this placeholder, it will be equal to 1.0.\n", 388 | " self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])\n", 389 | " \n", 390 | " # Placeholder for a learning rate (tf.float32).\n", 391 | " self.learning_rate_ph = ######### YOUR CODE HERE #############" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": { 398 | "collapsed": true 399 | }, 400 | "outputs": [], 401 | "source": [ 402 | "BiLSTMModel.__declare_placeholders = classmethod(declare_placeholders)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "collapsed": true 409 | }, 410 | "source": [ 411 | "Now, let us specify the layers of the neural network. First, we need to perform some preparatory steps: \n", 412 | " \n", 413 | "- Create embeddings matrix with [tf.Variable](https://www.tensorflow.org/api_docs/python/tf/Variable). Specify its name (*embeddings_matrix*), type (*tf.float32*), and initialize with random values.\n", 414 | "- Create forward and backward LSTM cells. TensorFlow provides a number of [RNN cells](https://www.tensorflow.org/api_guides/python/contrib.rnn#Core_RNN_Cells_for_use_with_TensorFlow_s_core_RNN_methods) ready for you. We suggest that you use *BasicLSTMCell*, but you can also experiment with other types, e.g. GRU cells. [This](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) blogpost could be interesting if you want to learn more about the differences.\n", 415 | "- Wrap your cells with [DropoutWrapper](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/DropoutWrapper). Dropout is an important regularization technique for neural networks. 
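Before the layer construction below, here is one possible completion of *declare_placeholders* (TF 1.x API, as in the rest of the notebook): the ground truth tags share the [batch_size, sequence_len] shape of the input batch, and the learning rate is a scalar float placeholder.

```python
import tensorflow as tf

def declare_placeholders(self):
    """Specifies placeholders for the model."""
    self.input_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch')
    # Same shape as the input batch: one tag id per token.
    self.ground_truth_tags = tf.placeholder(dtype=tf.int32, shape=[None, None],
                                            name='ground_truth_tags')
    self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths')
    self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])
    # Scalar learning rate, fed anew on every training step.
    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name='learning_rate_ph')
```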
Specify all keep probabilities using the dropout placeholder that we created before.\n", 416 | " \n", 417 | "After that, you can build the computation graph that transforms an input_batch:\n", 418 | "\n", 419 | "- [Look up](https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup) embeddings for an *input_batch* in the prepared *embedding_matrix*.\n", 420 | "- Pass the embeddings through [Bidirectional Dynamic RNN](https://www.tensorflow.org/api_docs/python/tf/nn/bidirectional_dynamic_rnn) with the specified forward and backward cells. Use the lengths placeholder here to avoid computations for padding tokens inside the RNN.\n", 421 | "- Create a dense layer on top. Its output will be used directly in loss function. \n", 422 | " \n", 423 | "Fill in the code below. In case you need to debug something, the easiest way is to check that tensor shapes of each step match the expected ones. \n", 424 | " " 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": { 431 | "collapsed": true 432 | }, 433 | "outputs": [], 434 | "source": [ 435 | "def build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):\n", 436 | " \"\"\"Specifies bi-LSTM architecture and computes logits for inputs.\"\"\"\n", 437 | " \n", 438 | " # Create embedding variable (tf.Variable) with dtype tf.float32\n", 439 | " initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)\n", 440 | " embedding_matrix_variable = ######### YOUR CODE HERE #############\n", 441 | " \n", 442 | " # Create RNN cells (for example, tf.nn.rnn_cell.BasicLSTMCell) with n_hidden_rnn number of units \n", 443 | " # and dropout (tf.nn.rnn_cell.DropoutWrapper), initializing all *_keep_prob with dropout placeholder.\n", 444 | " forward_cell = ######### YOUR CODE HERE #############\n", 445 | " backward_cell = ######### YOUR CODE HERE #############\n", 446 | "\n", 447 | " # Look up embeddings for self.input_batch (tf.nn.embedding_lookup).\n", 448 | " # Shape: [batch_size, sequence_len, embedding_dim].\n", 449 | " embeddings = ######### YOUR CODE HERE #############\n", 450 | " \n", 451 | " # Pass them through Bidirectional Dynamic RNN (tf.nn.bidirectional_dynamic_rnn).\n", 452 | " # Shape: [batch_size, sequence_len, 2 * n_hidden_rnn]. \n", 453 | " # Also don't forget to initialize sequence_length as self.lengths and dtype as tf.float32.\n", 454 | " (rnn_output_fw, rnn_output_bw), _ = ######### YOUR CODE HERE #############\n", 455 | " rnn_output = tf.concat([rnn_output_fw, rnn_output_bw], axis=2)\n", 456 | "\n", 457 | " # Dense layer on top.\n", 458 | " # Shape: [batch_size, sequence_len, n_tags]. \n", 459 | " self.logits = tf.layers.dense(rnn_output, n_tags, activation=None)" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": { 466 | "collapsed": true 467 | }, 468 | "outputs": [], 469 | "source": [ 470 | "BiLSTMModel.__build_layers = classmethod(build_layers)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "To compute the actual predictions of the neural network, you need to apply [softmax](https://www.tensorflow.org/api_docs/python/tf/nn/softmax) to the last layer and find the most probable tags with [argmax](https://www.tensorflow.org/api_docs/python/tf/argmax)." 
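Returning to *build_layers* above, a sketch of how the missing pieces could look with the TF 1.x building blocks named in the instructions (BasicLSTMCell wrapped in DropoutWrapper, embedding lookup, bidirectional dynamic RNN).

```python
import numpy as np
import tensorflow as tf

def build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):
    """Specifies bi-LSTM architecture and computes logits for inputs."""
    initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)
    embedding_matrix_variable = tf.Variable(initial_embedding_matrix,
                                            name='embeddings_matrix', dtype=tf.float32)

    def lstm_cell():
        # LSTM cell with dropout on inputs, outputs and state, all driven by dropout_ph.
        return tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(n_hidden_rnn),
                                             input_keep_prob=self.dropout_ph,
                                             output_keep_prob=self.dropout_ph,
                                             state_keep_prob=self.dropout_ph)

    forward_cell = lstm_cell()
    backward_cell = lstm_cell()

    # Shape: [batch_size, sequence_len, embedding_dim].
    embeddings = tf.nn.embedding_lookup(embedding_matrix_variable, self.input_batch)

    # Shape: [batch_size, sequence_len, 2 * n_hidden_rnn]; lengths let TF skip padded positions.
    (rnn_output_fw, rnn_output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=forward_cell, cell_bw=backward_cell,
        inputs=embeddings, sequence_length=self.lengths, dtype=tf.float32)
    rnn_output = tf.concat([rnn_output_fw, rnn_output_bw], axis=2)

    # Shape: [batch_size, sequence_len, n_tags].
    self.logits = tf.layers.dense(rnn_output, n_tags, activation=None)
```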
478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "collapsed": true 485 | }, 486 | "outputs": [], 487 | "source": [ 488 | "def compute_predictions(self):\n", 489 | " \"\"\"Transforms logits to probabilities and finds the most probable tags.\"\"\"\n", 490 | " \n", 491 | " # Create softmax (tf.nn.softmax) function\n", 492 | " softmax_output = ######### YOUR CODE HERE #############\n", 493 | " \n", 494 | " # Use argmax (tf.argmax) to get the most probable tags\n", 495 | " # Don't forget to set axis=-1\n", 496 | " # otherwise argmax will be calculated in a wrong way\n", 497 | " self.predictions = ######### YOUR CODE HERE #############" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": { 504 | "collapsed": true 505 | }, 506 | "outputs": [], 507 | "source": [ 508 | "BiLSTMModel.__compute_predictions = classmethod(compute_predictions)" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": { 514 | "collapsed": true 515 | }, 516 | "source": [ 517 | "During training we do not need predictions of the network, but we need a loss function. We will use [cross-entropy loss](http://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html#cross-entropy), efficiently implemented in TF as \n", 518 | "[cross entropy with logits](https://www.tensorflow.org/api_docs/python/tf/nn/softmax_cross_entropy_with_logits). Note that it should be applied to logits of the model (not to softmax probabilities!). Also note, that we do not want to take into account loss terms coming from `` tokens. So we need to mask them out, before computing [mean](https://www.tensorflow.org/api_docs/python/tf/reduce_mean)." 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": { 525 | "collapsed": true 526 | }, 527 | "outputs": [], 528 | "source": [ 529 | "def compute_loss(self, n_tags, PAD_index):\n", 530 | " \"\"\"Computes masked cross-entopy loss with logits.\"\"\"\n", 531 | " \n", 532 | " # Create cross entropy function function (tf.nn.softmax_cross_entropy_with_logits)\n", 533 | " ground_truth_tags_one_hot = tf.one_hot(self.ground_truth_tags, n_tags)\n", 534 | " loss_tensor = ######### YOUR CODE HERE #############\n", 535 | " \n", 536 | " mask = tf.cast(tf.not_equal(self.input_batch, PAD_index), tf.float32)\n", 537 | " # Create loss function which doesn't operate with tokens (tf.reduce_mean)\n", 538 | " # Be careful that the argument of tf.reduce_mean should be\n", 539 | " # multiplication of mask and loss_tensor.\n", 540 | " self.loss = ######### YOUR CODE HERE #############" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": { 547 | "collapsed": true 548 | }, 549 | "outputs": [], 550 | "source": [ 551 | "BiLSTMModel.__compute_loss = classmethod(compute_loss)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "The last thing to specify is how we want to optimize the loss. \n", 559 | "We suggest that you use [Adam](https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer) optimizer with a learning rate from the corresponding placeholder. \n", 560 | "You will also need to apply [clipping](https://www.tensorflow.org/versions/r0.12/api_docs/python/train/gradient_clipping) to eliminate exploding gradients. It can be easily done with [clip_by_norm](https://www.tensorflow.org/api_docs/python/tf/clip_by_norm) function. 
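For reference, possible completions of *compute_predictions* and *compute_loss* above: softmax and argmax over the tag axis for the predictions, and cross-entropy on logits averaged only over non-padding positions for the loss.

```python
import tensorflow as tf

def compute_predictions(self):
    """Transforms logits to probabilities and finds the most probable tags."""
    softmax_output = tf.nn.softmax(self.logits)
    self.predictions = tf.argmax(softmax_output, axis=-1)

def compute_loss(self, n_tags, PAD_index):
    """Computes masked cross-entropy loss with logits."""
    ground_truth_tags_one_hot = tf.one_hot(self.ground_truth_tags, n_tags)
    loss_tensor = tf.nn.softmax_cross_entropy_with_logits(labels=ground_truth_tags_one_hot,
                                                          logits=self.logits)
    # Zero out the loss at padded positions before averaging.
    mask = tf.cast(tf.not_equal(self.input_batch, PAD_index), tf.float32)
    self.loss = tf.reduce_mean(mask * loss_tensor)
```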
" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": { 567 | "collapsed": true 568 | }, 569 | "outputs": [], 570 | "source": [ 571 | "def perform_optimization(self):\n", 572 | " \"\"\"Specifies the optimizer and train_op for the model.\"\"\"\n", 573 | " \n", 574 | " # Create an optimizer (tf.train.AdamOptimizer)\n", 575 | " self.optimizer = ######### YOUR CODE HERE #############\n", 576 | " self.grads_and_vars = self.optimizer.compute_gradients(self.loss)\n", 577 | " \n", 578 | " # Gradient clipping (tf.clip_by_norm) for self.grads_and_vars\n", 579 | " # Pay attention that you need to apply this operation only for gradients \n", 580 | " # because self.grads_and_vars contains also variables.\n", 581 | " # list comprehension might be useful in this case.\n", 582 | " clip_norm = tf.cast(1.0, tf.float32)\n", 583 | " self.grads_and_vars = ######### YOUR CODE HERE #############\n", 584 | " \n", 585 | " self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": null, 591 | "metadata": { 592 | "collapsed": true 593 | }, 594 | "outputs": [], 595 | "source": [ 596 | "BiLSTMModel.__perform_optimization = classmethod(perform_optimization)" 597 | ] 598 | }, 599 | { 600 | "cell_type": "markdown", 601 | "metadata": { 602 | "collapsed": true 603 | }, 604 | "source": [ 605 | "Congratulations! You have specified all the parts of your network. You may have noticed, that we didn't deal with any real data yet, so what you have written is just recipes on how the network should function.\n", 606 | "Now we will put them to the constructor of our Bi-LSTM class to use it in the next section. " 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "metadata": { 613 | "collapsed": true 614 | }, 615 | "outputs": [], 616 | "source": [ 617 | "def init_model(self, vocabulary_size, n_tags, embedding_dim, n_hidden_rnn, PAD_index):\n", 618 | " self.__declare_placeholders()\n", 619 | " self.__build_layers(vocabulary_size, embedding_dim, n_hidden_rnn, n_tags)\n", 620 | " self.__compute_predictions()\n", 621 | " self.__compute_loss(n_tags, PAD_index)\n", 622 | " self.__perform_optimization()" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": { 629 | "collapsed": true 630 | }, 631 | "outputs": [], 632 | "source": [ 633 | "BiLSTMModel.__init__ = classmethod(init_model)" 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 | "source": [ 640 | "## Train the network and predict tags" 641 | ] 642 | }, 643 | { 644 | "cell_type": "markdown", 645 | "metadata": { 646 | "collapsed": true 647 | }, 648 | "source": [ 649 | "[Session.run](https://www.tensorflow.org/api_docs/python/tf/Session#run) is a point which initiates computations in the graph that we have defined. To train the network, we need to compute *self.train_op*, which was declared in *perform_optimization*. To predict tags, we just need to compute *self.predictions*. Anyway, we need to feed actual data through the placeholders that we defined before. 
" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": null, 655 | "metadata": { 656 | "collapsed": true 657 | }, 658 | "outputs": [], 659 | "source": [ 660 | "def train_on_batch(self, session, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability):\n", 661 | " feed_dict = {self.input_batch: x_batch,\n", 662 | " self.ground_truth_tags: y_batch,\n", 663 | " self.learning_rate_ph: learning_rate,\n", 664 | " self.dropout_ph: dropout_keep_probability,\n", 665 | " self.lengths: lengths}\n", 666 | " \n", 667 | " session.run(self.train_op, feed_dict=feed_dict)" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": null, 673 | "metadata": { 674 | "collapsed": true 675 | }, 676 | "outputs": [], 677 | "source": [ 678 | "BiLSTMModel.train_on_batch = classmethod(train_on_batch)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "markdown", 683 | "metadata": {}, 684 | "source": [ 685 | "Implement the function *predict_for_batch* by initializing *feed_dict* with input *x_batch* and *lengths* and running the *session* for *self.predictions*." 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": null, 691 | "metadata": { 692 | "collapsed": true 693 | }, 694 | "outputs": [], 695 | "source": [ 696 | "def predict_for_batch(self, session, x_batch, lengths):\n", 697 | " ######################################\n", 698 | " ######### YOUR CODE HERE #############\n", 699 | " ######################################\n", 700 | " \n", 701 | " return predictions" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": { 708 | "collapsed": true 709 | }, 710 | "outputs": [], 711 | "source": [ 712 | "BiLSTMModel.predict_for_batch = classmethod(predict_for_batch)" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "We finished with necessary methods of our BiLSTMModel model and almost ready to start experimenting.\n", 720 | "\n", 721 | "### Evaluation \n", 722 | "To simplify the evaluation process we provide two functions for you:\n", 723 | " - *predict_tags*: uses a model to get predictions and transforms indices to tokens and tags;\n", 724 | " - *eval_conll*: calculates precision, recall and F1 for the results." 
725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": null, 730 | "metadata": { 731 | "collapsed": true 732 | }, 733 | "outputs": [], 734 | "source": [ 735 | "from evaluation import precision_recall_f1" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": { 742 | "collapsed": true 743 | }, 744 | "outputs": [], 745 | "source": [ 746 | "def predict_tags(model, session, token_idxs_batch, lengths):\n", 747 | " \"\"\"Performs predictions and transforms indices to tokens and tags.\"\"\"\n", 748 | " \n", 749 | " tag_idxs_batch = model.predict_for_batch(session, token_idxs_batch, lengths)\n", 750 | " \n", 751 | " tags_batch, tokens_batch = [], []\n", 752 | " for tag_idxs, token_idxs in zip(tag_idxs_batch, token_idxs_batch):\n", 753 | " tags, tokens = [], []\n", 754 | " for tag_idx, token_idx in zip(tag_idxs, token_idxs):\n", 755 | " tags.append(idx2tag[tag_idx])\n", 756 | " tokens.append(idx2token[token_idx])\n", 757 | " tags_batch.append(tags)\n", 758 | " tokens_batch.append(tokens)\n", 759 | " return tags_batch, tokens_batch\n", 760 | " \n", 761 | " \n", 762 | "def eval_conll(model, session, tokens, tags, short_report=True):\n", 763 | " \"\"\"Computes NER quality measures using the CONLL shared task script.\"\"\"\n", 764 | " \n", 765 | " y_true, y_pred = [], []\n", 766 | " for x_batch, y_batch, lengths in batches_generator(1, tokens, tags):\n", 767 | " tags_batch, tokens_batch = predict_tags(model, session, x_batch, lengths)\n", 768 | " if len(x_batch[0]) != len(tags_batch[0]):\n", 769 | " raise Exception(\"Incorrect length of prediction for the input, \"\n", 770 | " \"expected length: %i, got: %i\" % (len(x_batch[0]), len(tags_batch[0])))\n", 771 | " predicted_tags = []\n", 772 | " ground_truth_tags = []\n", 773 | " for gt_tag_idx, pred_tag, token in zip(y_batch[0], tags_batch[0], tokens_batch[0]): \n", 774 | " if token != '':\n", 775 | " ground_truth_tags.append(idx2tag[gt_tag_idx])\n", 776 | " predicted_tags.append(pred_tag)\n", 777 | "\n", 778 | " # We extend every prediction and ground truth sequence with the 'O' tag\n", 779 | " # to indicate a possible end of an entity.\n", 780 | " y_true.extend(ground_truth_tags + ['O'])\n", 781 | " y_pred.extend(predicted_tags + ['O'])\n", 782 | " \n", 783 | " results = precision_recall_f1(y_true, y_pred, print_results=True, short_report=short_report)\n", 784 | " return results" 785 | ] 786 | }, 787 | { 788 | "cell_type": "markdown", 789 | "metadata": {}, 790 | "source": [ 791 | "## Run your experiment" 792 | ] 793 | }, 794 | { 795 | "cell_type": "markdown", 796 | "metadata": {}, 797 | "source": [ 798 | "Create a *BiLSTMModel* model with the following parameters:\n", 799 | " - *vocabulary_size* — number of tokens;\n", 800 | " - *n_tags* — number of tags;\n", 801 | " - *embedding_dim* — dimension of embeddings, recommended value: 200;\n", 802 | " - *n_hidden_rnn* — size of hidden layers for RNN, recommended value: 200;\n", 803 | " - *PAD_index* — the index of the padding token (``).\n", 804 | "\n", 805 | "Set hyperparameters. You might want to start with the following recommended values:\n", 806 | "- *batch_size*: 32;\n", 807 | "- 4 epochs;\n", 808 | "- starting value of *learning_rate*: 0.005;\n", 809 | "- *learning_rate_decay*: the square root of 2;\n", 810 | "- *dropout_keep_probability*: try several values: 0.1, 0.5, 0.9.\n", 811 | "\n", 812 | "However, feel free to conduct more experiments to tune hyperparameters and earn extra points for the assignment." 
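As a starting point, the recommended settings above might translate into the experiment cell roughly as follows. This is a sketch only: *token2idx*, *tag2idx* and the `'<PAD>'` token name are assumed to match the dictionaries built earlier in the notebook, and you are encouraged to tune the hyperparameters further.

```python
tf.reset_default_graph()

# token2idx / tag2idx and the '<PAD>' token name are assumed from the
# data-preparation part of the notebook; adjust them to your own names.
model = BiLSTMModel(vocabulary_size=len(token2idx),
                    n_tags=len(tag2idx),
                    embedding_dim=200,
                    n_hidden_rnn=200,
                    PAD_index=token2idx['<PAD>'])

batch_size = 32
n_epochs = 4
learning_rate = 0.005
learning_rate_decay = 2 ** 0.5       # the square root of 2
dropout_keep_probability = 0.5       # also try 0.1 and 0.9
```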
813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": { 819 | "collapsed": true 820 | }, 821 | "outputs": [], 822 | "source": [ 823 | "tf.reset_default_graph()\n", 824 | "\n", 825 | "model = ######### YOUR CODE HERE #############\n", 826 | "\n", 827 | "batch_size = ######### YOUR CODE HERE #############\n", 828 | "n_epochs = ######### YOUR CODE HERE #############\n", 829 | "learning_rate = ######### YOUR CODE HERE #############\n", 830 | "learning_rate_decay = ######### YOUR CODE HERE #############\n", 831 | "dropout_keep_probability = ######### YOUR CODE HERE #############" 832 | ] 833 | }, 834 | { 835 | "cell_type": "markdown", 836 | "metadata": {}, 837 | "source": [ 838 | "If you get the error *\"Tensor conversion requested dtype float64 for Tensor with dtype float32\"* at this point, check whether any variables were initialised without an explicit dtype. Set dtype to *tf.float32* for such variables." 839 | ] 840 | }, 841 | { 842 | "cell_type": "markdown", 843 | "metadata": {}, 844 | "source": [ 845 | "Finally, we are ready to run the training!" 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "execution_count": null, 851 | "metadata": { 852 | "collapsed": true 853 | }, 854 | "outputs": [], 855 | "source": [ 856 | "sess = tf.Session()\n", 857 | "sess.run(tf.global_variables_initializer())\n", 858 | "\n", 859 | "print('Start training... \\n')\n", 860 | "for epoch in range(n_epochs):\n", 861 | " # For each epoch evaluate the model on train and validation data\n", 862 | " print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(n_epochs) + '-' * 20)\n", 863 | " print('Train data evaluation:')\n", 864 | " eval_conll(model, sess, train_tokens, train_tags, short_report=True)\n", 865 | " print('Validation data evaluation:')\n", 866 | " eval_conll(model, sess, validation_tokens, validation_tags, short_report=True)\n", 867 | " \n", 868 | " # Train the model\n", 869 | " for x_batch, y_batch, lengths in batches_generator(batch_size, train_tokens, train_tags):\n", 870 | " model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability)\n", 871 | " \n", 872 | " # Decay the learning rate\n", 873 | " learning_rate = learning_rate / learning_rate_decay\n", 874 | " \n", 875 | "print('...training finished.')" 876 | ] 877 | }, 878 | { 879 | "cell_type": "markdown", 880 | "metadata": {}, 881 | "source": [ 882 | "Now let us see the full quality reports for the final model on the train, validation, and test sets. 
To give you a hint about whether you have implemented everything correctly, you might expect an F-score of about 40% on the validation set.\n", 883 | "\n", 884 | "**The output of the cell below (as well as the output of all the other cells) should be present in the notebook for peer review!**" 885 | ] 886 | }, 887 | { 888 | "cell_type": "code", 889 | "execution_count": null, 890 | "metadata": { 891 | "collapsed": true 892 | }, 893 | "outputs": [], 894 | "source": [ 895 | "print('-' * 20 + ' Train set quality: ' + '-' * 20)\n", 896 | "train_results = eval_conll(model, sess, train_tokens, train_tags, short_report=False)\n", 897 | "\n", 898 | "print('-' * 20 + ' Validation set quality: ' + '-' * 20)\n", 899 | "validation_results = ######### YOUR CODE HERE #############\n", 900 | "\n", 901 | "print('-' * 20 + ' Test set quality: ' + '-' * 20)\n", 902 | "test_results = ######### YOUR CODE HERE #############" 903 | ] 904 | }, 905 | { 906 | "cell_type": "markdown", 907 | "metadata": {}, 908 | "source": [ 909 | "### Conclusions\n", 910 | "\n", 911 | "Could we say that our model is state of the art and the results are acceptable for the task? Definitely, we can say so. Nowadays, Bi-LSTM is one of the state-of-the-art approaches for solving the NER problem and it outperforms other classical methods. Despite the fact that we used small training corpora (in comparison with the usual sizes of corpora in Deep Learning), our results are quite good. In addition, in this task there are many possible named entities, and for some of them we have only several dozen training examples, which is definitely small. However, the implemented model outperforms classical CRFs for this task. Even better results could be obtained by combining several types of methods, e.g. see [this](https://arxiv.org/abs/1603.01354) paper if you are interested." 
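For completeness, the two evaluation placeholders in the quality-report cell above might be filled in along the following lines (a sketch; *test_tokens* and *test_tags* are assumed to be the names given to the test split when the data was loaded earlier in the notebook):

```python
print('-' * 20 + ' Validation set quality: ' + '-' * 20)
validation_results = eval_conll(model, sess, validation_tokens, validation_tags, short_report=False)

print('-' * 20 + ' Test set quality: ' + '-' * 20)
test_results = eval_conll(model, sess, test_tokens, test_tags, short_report=False)
```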
912 | ] 913 | } 914 | ], 915 | "metadata": { 916 | "kernelspec": { 917 | "display_name": "Python 3", 918 | "language": "python", 919 | "name": "python3" 920 | }, 921 | "language_info": { 922 | "codemirror_mode": { 923 | "name": "ipython", 924 | "version": 3 925 | }, 926 | "file_extension": ".py", 927 | "mimetype": "text/x-python", 928 | "name": "python", 929 | "nbconvert_exporter": "python", 930 | "pygments_lexer": "ipython3", 931 | "version": "3.4.3" 932 | } 933 | }, 934 | "nbformat": 4, 935 | "nbformat_minor": 1 936 | } 937 | -------------------------------------------------------------------------------- /week3/.gitignore: -------------------------------------------------------------------------------- 1 | GoogleNews-vectors-negative300.* 2 | starspace_embedding 3 | starspace_embedding.* 4 | -------------------------------------------------------------------------------- /week3/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | class Grader(object): 7 | def __init__(self): 8 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 9 | self.assignment_key = '7DdYfMQFEeevjw7-W7Fr0A' 10 | self.parts = OrderedDict([('98mDT', 'Question2Vec'), 11 | ('nc7RP', 'HitsCount'), 12 | ('bNp90', 'DCGScore'), 13 | ('3gRlQ', 'W2VTokenizedRanks'), 14 | ('mX6wS', 'StarSpaceRanks')]) 15 | self.answers = {key: None for key in self.parts} 16 | 17 | @staticmethod 18 | def ravel_output(output): 19 | ''' 20 | If student accidentally submitted np.array with one 21 | element instead of number, this function will submit 22 | this number instead 23 | ''' 24 | if isinstance(output, np.ndarray) and output.size == 1: 25 | output = output.item(0) 26 | return output 27 | 28 | def submit(self, email, token): 29 | submission = { 30 | "assignmentKey": self.assignment_key, 31 | "submitterEmail": email, 32 | "secret": token, 33 | "parts": {} 34 | } 35 | for part, output in self.answers.items(): 36 | if output is not None: 37 | submission["parts"][part] = {"output": output} 38 | else: 39 | submission["parts"][part] = dict() 40 | request = requests.post(self.submission_page, data=json.dumps(submission)) 41 | response = request.json() 42 | if request.status_code == 201: 43 | print('Submitted to Coursera platform. 
See results on assignment page!') 44 | elif u'details' in response and u'learnerMessage' in response[u'details']: 45 | print(response[u'details'][u'learnerMessage']) 46 | else: 47 | print("Unknown response from Coursera: {}".format(request.status_code)) 48 | print(response) 49 | 50 | def status(self): 51 | print("You want to submit these parts:") 52 | for part_id, part_name in self.parts.items(): 53 | answer = self.answers[part_id] 54 | if answer is None: 55 | answer = '-'*10 56 | print("Task {}: {}".format(part_name, answer[:100] + '...')) 57 | 58 | def submit_part(self, part, output): 59 | self.answers[part] = output 60 | print("Current answer for task {} is: {}".format(self.parts[part], output[:100] + '...')) 61 | 62 | def submit_tag(self, tag, output): 63 | part_id = [k for k, v in self.parts.items() if v == tag] 64 | if len(part_id) != 1: 65 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 66 | part_id = part_id[0] 67 | self.submit_part(part_id, str(self.ravel_output(output))) 68 | -------------------------------------------------------------------------------- /week3/util.py: -------------------------------------------------------------------------------- 1 | import re 2 | from nltk.corpus import stopwords 3 | 4 | REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]') 5 | GOOD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]') 6 | STOPWORDS = set(stopwords.words('english')) 7 | def text_prepare(text): 8 | text = text.lower() 9 | text = REPLACE_BY_SPACE_RE.sub(' ', text) 10 | text = GOOD_SYMBOLS_RE.sub('', text) 11 | text = ' '.join([x for x in text.split() if x and x not in STOPWORDS]) 12 | return text.strip() 13 | 14 | def array_to_string(arr): 15 | return '\n'.join(str(num) for num in arr) 16 | 17 | def matrix_to_string(matrix): 18 | return '\n'.join('\t'.join(str(num) for num in line) for line in matrix) -------------------------------------------------------------------------------- /week4/encoder-decoder-pic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nsanghi/HSE-NLP-Coursera/9df88e63eba6dbb38cabd87bd88fff25f4abcda6/week4/encoder-decoder-pic.png --------------------------------------------------------------------------------