├── .gitignore ├── AWS-tutorial.md ├── Docker-tutorial.md ├── README.md ├── common ├── README.md ├── __init__.py ├── download_utils.py ├── requirements_colab.txt └── tqdm_utils.py ├── docker ├── Dockerfile ├── requirements.txt └── welcome_message.txt ├── honor ├── README.md ├── datasets.py ├── download_cornell.sh ├── download_opensubs.sh └── example.py ├── optional ├── README.md └── telegram_bot │ ├── README.md │ ├── dialogue_manager.py │ ├── main_bot.py │ ├── utils.py │ └── week5-project.ipynb ├── setup_google_colab.py ├── week1 ├── grader.py ├── lemmatization_demo.ipynb ├── metrics.py ├── tfidf_demo.ipynb └── week1-MultilabelClassification.ipynb ├── week2 ├── evaluation.py └── week2-NER.ipynb ├── week3 ├── grader.py ├── util.py └── week3-Embeddings.ipynb ├── week4 ├── encoder-decoder-pic.png └── week4-seq2seq.ipynb └── week5 ├── dialogue_manager.py ├── utils.py └── week5-project.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # Data for assignments 104 | data/ 105 | 106 | .idea -------------------------------------------------------------------------------- /AWS-tutorial.md: -------------------------------------------------------------------------------- 1 | # Tutorial for setting up an AWS Virtual Machine 2 | 3 | This tutorial will teach you how to set up an AWS Virtual Machine for the final project of our course. 4 | 5 | ### 1. Register with AWS and launch an EC2 instance 6 | 7 | First, you need to perform several preparatory steps (if you have already done this before, you can skip them): 8 | - [Sign up for AWS](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#sign-up-for-aws). You will need to specify your credit card details, but for our project we will use Free Tier instances only, so you should not be charged. 
9 | - [Create a key pair for authentication](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#create-a-key-pair). If you use Windows, you will also need to install [PuTTY](https://www.chiark.greenend.org.uk/~sgtatham/putty/) to use SSH. 10 | - [Create security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#create-a-base-security-group). You must add rules to a security group to allow you to connect to your future instance from your IP address using SSH. You might want to allow SSH access from all IPv4 addresses (set to 0.0.0.0/0), because your IP might change. 11 | 12 | Next, you are ready to create your first EC2 instance: 13 | - [Launch a free tier instance](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance). For Amazon Machine Image (AMI) choose **Ubuntu Server 16.04 LTS**. 14 | - [Connect to your instance](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-connect-to-instance-linux) using SSH. If you have problems connecting to the instance, try following this [troubleshooting guide](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/TroubleshootingInstancesConnecting.html). 15 | - Later on you can [start and stop](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Stop_Start.html) your instance when needed, and [terminate](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-clean-up-your-instance) it in the end. 16 | 17 | ### 2. Set up dependencies and run your project 18 | 19 | - Install Docker container for Ubuntu with course dependencies. Follow our Docker instructions. 20 | 21 | - To be able to access IPython notebooks running on AWS, you might want to SSH with port tunneling: 22 | ```sh 23 | ssh -L 8080:localhost:8080 -i path/to/private_key ubuntu@ec2-XX-XXX-X-XX.us-east-2.compute.amazonaws.com 24 | ``` 25 | Then you will be able to see the notebooks on *localhost:8080* from your browser on the local machine. 26 | 27 | If you're using PuTTY, before you start your SSH connection, go to the PuTTY Tunnels panel. Make sure the «Local» and «Auto» radio buttons are set. Enter the local port 8080 number into the «Source port» box. Enter the destination host name and port number into the «Destination» box, separated by a colon ubuntu@ec2-XX-XXX-X-XX.us-east-2.compute.amazonaws.com:8080. 28 | For more details see [this guide](https://www.akadia.com/services/ssh_putty.html). 29 | 30 | - Bring code and data to AWS instance, e.g. 31 | ```sh 32 | scp -i path/to/your_key.pem -r path/to/local_directory ubuntu@ec2-XX-XXX-X-XX.us-east-2.compute.amazonaws.com:path/to/remote_file 33 | ``` 34 | You might need to install [WinSCP](https://winscp.net/eng/docs/lang:ru) for data transfer if you are using Windows. 35 | 36 | - It is also a good practice to use [tmux](https://medium.com/@peterxjang/a-minimalist-guide-to-tmux-13675fb160fa) to keep your remote session running even if you disconnect from the machine, e.g. by closing your laptop. 37 | 38 | Thus, to run your scripts on the machine, we suggest that you run: ssh -> tmux -> Docker -> Python. 39 | -------------------------------------------------------------------------------- /Docker-tutorial.md: -------------------------------------------------------------------------------- 1 | # Docker container with course dependencies 2 | 3 | This file describes how to use a Docker container with Jupyter notebook and 4 | all dependencies required for the course. 
5 | 6 | The image is located at https://hub.docker.com/r/akashin/coursera-aml-nlp/. 7 | 8 | ## Install Stable Docker Community Edition (CE) 9 | 10 | - For Mac: 11 | https://docs.docker.com/docker-for-mac/install/ 12 | 13 | - For Ubuntu: 14 | https://docs.docker.com/engine/installation/linux/docker-ce/ubuntu/ (see also other Linux distributions in the menu). 15 | 16 | - For Windows (64bit Windows 10 Pro, Enterprise and Education): 17 | https://docs.docker.com/docker-for-windows/install/ 18 | 19 | - For Windows (older versions): 20 | https://docs.docker.com/toolbox/toolbox_install_windows/ 21 | 22 | 23 | 24 | ## Get container image 25 | 26 | To get the latest version of the container image run: 27 | ```sh 28 | docker pull akashin/coursera-aml-nlp 29 | ``` 30 | It contains the Ubuntu 16.04 Linux distribution and all dependencies that you need for our course. The downloaded image takes approximately 2.3GB. 31 | 32 | **Note:** If you are getting an error "Got permission denied while trying to connect to the Docker daemon socket...", you need to add the current user to the docker group: 33 | ```sh 34 | sudo usermod -a -G docker $USER 35 | sudo service docker restart 36 | ``` 37 | Then you need to log out and log in again (disconnect and reconnect to your AWS instance if you are setting up Docker there). 38 | 39 | 40 | ## Run container for the first time 41 | 42 | Now you can start a new container from this image with the following command: 43 | ```sh 44 | docker run -it -p 8080:8080 --name coursera-aml-nlp -v path_on_your_machine:path_within_docker akashin/coursera-aml-nlp 45 | ``` 46 | This will start the Ubuntu instance and give you access to its command line. You can type `run_notebook` to launch the IPython notebook server. 47 | 48 | Note that we are using the `-p 8080:8080` argument to set up port forwarding to make the IPython notebook accessible at http://localhost:8080. If you're using AWS, make sure that you've [set up the port forwarding](https://github.com/hse-aml/natural-language-processing/blob/master/AWS-tutorial.md#2-set-up-dependencies-and-run-your-project) there as well. 49 | 50 | **Important:** The Docker image only contains system dependencies for the project (e.g. TensorFlow, Starspace). 51 | All other project-related files (e.g. input data) need to be exposed to the container manually through [Docker volumes](https://docs.docker.com/storage/volumes/). To do this, we mount a directory from your machine into the container using the `-v` option. 52 | 53 | On Linux and OSX, an example command looks like: 54 | ```sh 55 | docker run -it -p 8080:8080 --name coursera-aml-nlp -v $PWD:/root/coursera akashin/coursera-aml-nlp 56 | ``` 57 | This uses the shell variable `$PWD` to mount the current directory to the folder `/root/coursera` in the container. Alternatively, you can mount an arbitrary directory by replacing `$PWD` with a custom path. 58 | 59 | **On Windows**, there are some extra [steps](https://rominirani.com/docker-on-windows-mounting-host-directories-d96f3f056a2c) involved, and the launch command looks like 60 | ```sh 61 | docker run -it -p 8080:8080 --name coursera-aml-nlp --user root -v /c/Users/$YOUR_USERNAME:/root/coursera akashin/coursera-aml-nlp 62 | ``` 63 | where `/c/Users/$YOUR_USERNAME` is the path to your user's home folder. 64 | 65 | If you're using Docker Toolbox on Windows, the command given above might not work because of the additional VirtualBox layer involved.
Instead, we recommend that you follow the guidance in http://blog.shahinrostami.com/2017/11/docker-toolbox-windows-7-shared-volumes/. 66 | 67 | ## Stop and resume container 68 | 69 | To stop the container use: 70 | ```sh 71 | docker stop coursera-aml-nlp 72 | ``` 73 | All the changes that were made within the container will be saved. 74 | 75 | To resume the stopped container use: 76 | ```sh 77 | docker start -i coursera-aml-nlp 78 | ``` 79 | ## Other operations on the container 80 | 81 | There are many other operations that you can perform on the container. To show all of them, run: 82 | ```sh 83 | docker container 84 | ``` 85 | Some particularly useful ones are **showing a list of containers** and **removing a container**. 86 | 87 | To show currently running and stopped containers with their status: 88 | ```sh 89 | docker ps -a 90 | ``` 91 | 92 | To connect to a Bash shell in the already running container with the name `coursera-aml-nlp` run: 93 | ``` 94 | docker exec -it coursera-aml-nlp bash 95 | ``` 96 | This will drop you into the standard Linux Bash shell that supports common commands like `ls`, `wget` or `python3`. 97 | 98 | To remove the container and all data associated with it: 99 | ```sh 100 | docker rm coursera-aml-nlp 101 | ``` 102 | Note that this will remove all the internal data of the container (e.g. installed packages), but the data written to your locally mounted folder (`-v` option) will not be affected. 103 | 104 | ## Install more packages 105 | 106 | You can install more packages in the container if needed: 107 | ```sh 108 | docker exec coursera-aml-nlp pip3 install PACKAGE_NAME 109 | ``` 110 | 111 | ## Change RAM limits of the container 112 | 113 | Your container might have memory limits that are different from the actual limits of your physical machine, which might lead to a crash of your code due to memory shortage. 114 | 115 | - If you're running Windows or OSX, the default limit is 2GB, but you can change it by following these tutorials: 116 | - For Windows: https://docs.docker.com/docker-for-windows/#advanced 117 | - For Mac OSX: https://docs.docker.com/docker-for-mac/#advanced 118 | 119 | - If you're running Linux, you're all set, as the memory limits are the same as the physical memory of your machine. 120 | 121 | 122 | ## Further reading 123 | 124 | If you are interested in learning more about Docker, check out these articles: 125 | - Using Jupyter notebook from Docker: https://www.dataquest.io/blog/docker-data-science/ 126 | - General introduction to Docker: https://docker-curriculum.com/ 127 | 128 | ## Troubleshooting 129 | 130 | ### Verify your Docker installation by running the "Hello World" application 131 | - Run `docker pull hello-world`. You should see a message that ends with 132 | “Status: Downloaded newer image for hello-world:latest”. 133 | - Run `docker run hello-world`. You should see a message that starts with 134 | “Hello from Docker! 135 | This message shows that your installation appears to be working correctly.” 136 | 137 | If you see any errors, follow the relevant troubleshooting steps. 138 | 139 | ### “Unauthorized: authentication required” when trying to pull Docker image 140 | Run `docker logout` and try pulling again. If this doesn't help, make sure the system date is set correctly and try again. If this still doesn't help, reinstall Docker and try again.
141 | 142 | ### Can't open Jupyter notebook in the browser 143 | If you try to open "http://localhost:8080" or "http://127.0.0.1:8080" in your browser while the `run_notebook` command is running and you can't access your notebooks, here are some tips: 144 | - If you're using Docker Toolbox on Windows, try accessing "http://192.168.99.100:8080" instead. If this doesn't work, follow the instructions [on official Docker docs](https://docs.docker.com/docker-for-windows/troubleshoot/#limitations-of-windows-containers-for-localhost-and-published-ports) and on [Stackoverflow](https://stackoverflow.com/questions/42866013/docker-toolbox-localhost-not-working). 145 | - Make sure that you're running the container with the `-p` flag as described [here](#run-container-for-the-first-time) and that the output of `docker ps` contains a line like this: 146 | ``` 147 | CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 148 | e5b7bcd85a1b akashin/coursera-aml-nlp "/bin/bash" Less than a second ago Up 2 seconds 8080/tcp peaceful_lamarr 149 | ``` 150 | If the part about `PORTS` differs, remove the current container following the [instructions](#other-operations-on-the-container) and start it again. 151 | - Make sure that browser proxy settings don't interfere with accessing local websites. 152 | - If you're running Docker on AWS, make sure you've set up port forwarding as described [here](https://github.com/hse-aml/natural-language-processing/blob/master/AWS-tutorial.md#2-set-up-dependencies-and-run-your-project). 153 | 154 | ### How do I load data into the Docker container? 155 | To access the data in the container, we recommend using the `-v` flag described [here](#run-container-for-the-first-time) to mount a local directory from your computer into the container filesystem. For more details, read the [Docker documentation](https://docs.docker.com/storage/volumes/). 156 | 157 | Alternatively, you can download data using the Jupyter "Upload" button or the `wget` command in the [Bash shell](#other-operations-on-the-container) of the container. 158 | 159 | ### Can't run `run_notebook` or `starspace` command 160 | Make sure that you're executing it in the context of the Docker container as described [here](#run-container-for-the-first-time). 161 | 162 | ### "Name is already in use by container" when trying to run the container 163 | This means that a container with this name already exists. You can connect to this container or remove it by following the [instructions](#other-operations-on-the-container). 164 | 165 | ### StarSpace/Jupyter notebook crashes in Docker 166 | This usually happens due to the low default 2GB memory limit on Windows and OSX. Follow these [instructions](#change-ram-limits-of-the-container) to fix this. 167 | 168 | ### "This computer doesn't have VT-X/AMD-v enabled", when trying to run the container 169 | This usually happens if you're using Docker Toolbox, which needs VirtualBox support and hence hardware virtualization enabled in BIOS. 170 | Try to turn on the VT-X support in BIOS as described in the [Microsoft documentation](https://blogs.technet.microsoft.com/canitpro/2015/09/08/step-by-step-enabling-hyper-v-for-use-on-windows-10/) or on [GitHub](https://github.com/docker/machine/issues/4271). 171 | 172 | ## Reporting the issue to the Coursera forum 173 | Before reporting the issue to the Coursera forum, please make sure that you've checked the [troubleshooting](#troubleshooting) steps.
Only if they don't help, create a post with all relevant error messages, troubleshooting results, and the following information: 174 | 175 | - Your operating system (e.g. Windows 7, Ubuntu Linux, OSX 10.13.3) 176 | - Your Docker version (e.g. Docker Toolbox, Docker for Windows, output of `docker --version`) 177 | - Output of `docker ps -a`, `docker info`, `docker version -f "{{ .Server.Os }}"` (share through https://gist.github.com/ or https://pastebin.com/) 178 | - Output of `wget http://localhost:8080` (or `wget http://192.168.99.100:8080` for Docker Toolbox), executed from within the Docker container and outside of it 179 | 180 | ## Credits 181 | 182 | The template for this Dockerfile was taken from https://github.com/ZEMUSHKA/coursera-aml-docker 183 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Natural Language Processing course resources 2 | This repository contains practical assignments for the Natural Language Processing course by the Higher School of Economics: 3 | https://www.coursera.org/learn/language-processing. 4 | In this course you will learn how to solve common NLP problems using classical and deep learning approaches. 5 | 6 | On the practical side, we expect familiarity with Python, since we will use it for all assignments in the course. Two of the assignments will also involve TensorFlow. You will work with many other libraries, including NLTK, Scikit-learn, and Gensim. You have several options for setting everything up. 7 | 8 | ## 1. Running on Google Colab 9 | Google has released its own flavour of Jupyter called Colab, which has free GPUs! 10 | 11 | Here's how you can use it: 12 | 1. Open https://colab.research.google.com, click **Sign in** in the upper right corner, and use your Google credentials to sign in. 13 | 2. Click the **GITHUB** tab, paste https://github.com/hse-aml/natural-language-processing and press Enter. 14 | 3. Choose the notebook you want to open, e.g. week1/week1-MultilabelClassification.ipynb. 15 | 4. Click **File -> Save a copy in Drive...** to save your progress in Google Drive. 16 | 5. _If you need a GPU_, click **Runtime -> Change runtime type** and select **GPU** in the Hardware accelerator box. 17 | 6. **Execute** the following code, which downloads dependencies, in the first cell (change for your week number): 18 | ```python 19 | ! wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py -O setup_google_colab.py 20 | import setup_google_colab 21 | # please, uncomment the week you're working on 22 | # setup_google_colab.setup_week1() 23 | # setup_google_colab.setup_week2() 24 | # setup_google_colab.setup_week3() 25 | # setup_google_colab.setup_week4() 26 | # setup_google_colab.setup_project() 27 | # setup_google_colab.setup_honor() 28 | ``` 29 | 7. If you run many notebooks on Colab, they can continue to eat up memory. 30 | You can kill them with `! pkill -9 python3` and check with `! nvidia-smi` that GPU memory is freed. 31 | 32 | **Known issues:** 33 | * No support for `ipywidgets`, so we cannot use fancy `tqdm` progress bars. 34 | For now, we use a simplified version of a progress bar suitable for Colab. 35 | * Blinking animation with `IPython.display.clear_output()`. 36 | It's usable, but we are still looking for a workaround.
37 | * If you see an error "No module named 'common'", make sure you've uncommented the assignment-specific line in step 6, then restart your kernel and execute all cells again. 38 | 39 | ## 2. Running locally 40 | 41 | Two options here: 42 | 43 | 1. Use the Docker container of our course. It already has all the libraries that you will need. The setup is very simple: install the Docker application for your OS, download our container image, and run everything within the container. Please see this [detailed Docker tutorial](Docker-tutorial.md). 44 | 45 | 2. Manually install all the libraries depending on your OS (each task contains a list of the needed libraries at the very beginning). If you use Windows/macOS, you might find the Anaconda distribution useful, as it allows you to easily install most of the needed libraries. However, some tools, like StarSpace for week 2, are not compatible with Windows, so it's likely that you will have to use Docker anyway if you go for these tasks. 46 | 47 | It might take a significant amount of time and resources to run the assignments' code, but we expect that an average laptop is enough to accomplish the tasks. All assignments were tested in Docker on a Mac with 8GB RAM. If you run into memory errors, they could be caused by untested configurations or inefficient code. Consider reporting these cases or double-checking your code. 48 | 49 | If you want to run the code of the course on an AWS machine, we've prepared the [AWS tutorial here](AWS-tutorial.md). 50 | -------------------------------------------------------------------------------- /common/README.md: -------------------------------------------------------------------------------- 1 | # Common utils 2 | 3 | This folder stores a collection of functions that are shared across different assignments. 4 | 5 | - `download_utils.py`: Functions for downloading data for the assignments.
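These helpers are meant to be imported from the week folders. A minimal usage sketch (assuming it is run from a `weekN/` directory with internet access; the exact `download_week*_resources` function depends on the assignment):

```python
import sys
sys.path.append("..")  # make the top-level `common` package importable from a weekN/ folder

from common.download_utils import download_week1_resources
from common import tqdm_utils

# Downloads train.tsv, validation.tsv, test.tsv, ... into a local data/ folder,
# skipping files that were already downloaded.
download_week1_resources()

# tqdm_notebook_failsafe falls back to a simple text-based progress bar
# when the ipywidgets-based tqdm is unavailable (e.g. on Google Colab).
for _ in tqdm_utils.tqdm_notebook_failsafe(range(100)):
    pass
```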
6 | -------------------------------------------------------------------------------- /common/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | -------------------------------------------------------------------------------- /common/download_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import shutil 5 | import requests 6 | from common import tqdm_utils 7 | 8 | 9 | REPOSITORY_PATH = "https://github.com/hse-aml/natural-language-processing" 10 | 11 | 12 | def download_file(url, file_path): 13 | r = requests.get(url, stream=True) 14 | total_size = int(r.headers.get('content-length')) 15 | try: 16 | with open(file_path, 'wb', buffering=16*1024*1024) as f: 17 | bar = tqdm_utils.tqdm_notebook_failsafe(total=total_size, unit='B', unit_scale=True) 18 | bar.set_description(os.path.split(file_path)[-1]) 19 | for chunk in r.iter_content(32 * 1024): 20 | f.write(chunk) 21 | bar.update(len(chunk)) 22 | bar.close() 23 | except Exception: 24 | print("Download failed") 25 | finally: 26 | if os.path.getsize(file_path) != total_size: 27 | os.remove(file_path) 28 | print("Removed incomplete download") 29 | 30 | 31 | def download_from_github(version, fn, target_dir, force=False): 32 | url = REPOSITORY_PATH + "/releases/download/{0}/{1}".format(version, fn) 33 | file_path = os.path.join(target_dir, fn) 34 | if os.path.exists(file_path) and not force: 35 | print("File {} is already downloaded.".format(file_path)) 36 | return 37 | download_file(url, file_path) 38 | 39 | 40 | def sequential_downloader(version, fns, target_dir, force=False): 41 | os.makedirs(target_dir, exist_ok=True) 42 | for fn in fns: 43 | download_from_github(version, fn, target_dir, force=force) 44 | 45 | 46 | def download_week1_resources(force=False): 47 | sequential_downloader( 48 | "week1", 49 | [ 50 | "train.tsv", 51 | "validation.tsv", 52 | "test.tsv", 53 | "text_prepare_tests.tsv", 54 | ], 55 | "data", 56 | force=force 57 | ) 58 | 59 | 60 | def download_week2_resources(force=False): 61 | sequential_downloader( 62 | "week2", 63 | [ 64 | "train.txt", 65 | "validation.txt", 66 | "test.txt", 67 | ], 68 | "data", 69 | force=force 70 | ) 71 | 72 | 73 | def download_week3_resources(force=False): 74 | sequential_downloader( 75 | "week3", 76 | [ 77 | "train.tsv", 78 | "validation.tsv", 79 | "test.tsv", 80 | "test_embeddings.tsv", 81 | ], 82 | "data", 83 | force=force 84 | ) 85 | print("Downloading GoogleNews-vectors-negative300.bin.gz (1.5G) for you, it will take a while...") 86 | download_file("https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz", 87 | "GoogleNews-vectors-negative300.bin.gz") 88 | 89 | 90 | def download_project_resources(force=False): 91 | sequential_downloader( 92 | "project", 93 | [ 94 | "dialogues.tsv", 95 | "tagged_posts.tsv", 96 | ], 97 | "data", 98 | force=force 99 | ) 100 | -------------------------------------------------------------------------------- /common/requirements_colab.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | backports.weakref==1.0.post1 3 | ChatterBot==0.7.6 4 | enum34==1.1.6 5 | funcsigs==1.0.2 6 | gensim==3.8.0 7 | jedi==0.11.0 8 | libarchive==0.4.4 9 | mock==2.0.0 10 | parso==0.1.0 11 | pbr==3.1.1 12 | regex==2017.11.9 13 | -------------------------------------------------------------------------------- 
/common/tqdm_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import print_function 4 | 5 | 6 | class SimpleTqdm(): 7 | def __init__(self, iterable=None, total=None, **kwargs): 8 | self.iterable = list(iterable) if iterable is not None else None 9 | self.total = len(self.iterable) if self.iterable is not None else total 10 | assert self.iterable is not None or self.total is not None 11 | self.current_step = 0 12 | self.print_frequency = max(self.total // 50, 1) 13 | self.desc = "" 14 | 15 | def set_description_str(self, desc): 16 | self.desc = desc 17 | 18 | def set_description(self, desc): 19 | self.desc = desc 20 | 21 | def update(self, steps): 22 | last_print_step = (self.current_step // self.print_frequency) * self.print_frequency 23 | i = 1 24 | while last_print_step + i * self.print_frequency <= self.current_step + steps: 25 | print("*", end='') 26 | i += 1 27 | self.current_step += steps 28 | 29 | def close(self): 30 | print("\n" + self.desc) 31 | 32 | def __iter__(self): 33 | assert self.iterable is not None 34 | self.index = 0 35 | return self 36 | 37 | def __next__(self): 38 | if self.index < self.total: 39 | element = self.iterable[self.index] 40 | self.update(1) 41 | self.index += 1 42 | return element 43 | else: 44 | self.close() 45 | raise StopIteration 46 | 47 | 48 | def tqdm_notebook_failsafe(*args, **kwargs): 49 | try: 50 | import tqdm 51 | tqdm.monitor_interval = 0 # workaround for https://github.com/tqdm/tqdm/issues/481 52 | return tqdm.tqdm_notebook(*args, **kwargs) 53 | except: 54 | # tqdm is broken on Google Colab 55 | return SimpleTqdm(*args, **kwargs) 56 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | LABEL maintainer="Andrei Kashin " 3 | 4 | RUN apt-get update && apt-get install -yq \ 5 | python3 python3-pip htop nano git wget \ 6 | libglib2.0-0 autoconf automake \ 7 | libtool build-essential unzip \ 8 | libarchive-dev vim 9 | 10 | # Install Starspace. 11 | RUN wget https://dl.bintray.com/boostorg/release/1.63.0/source/boost_1_63_0.zip && \ 12 | unzip boost_1_63_0.zip && \ 13 | mv boost_1_63_0 /usr/local/bin 14 | 15 | RUN git clone https://github.com/facebookresearch/Starspace.git && \ 16 | cd Starspace && \ 17 | make && \ 18 | cp -Rf starspace /usr/local/bin 19 | 20 | # Install Python dependencies. 21 | ADD requirements.txt / 22 | RUN pip3 install --upgrade pip 23 | RUN pip3 install -r requirements.txt 24 | 25 | # Install Jupyter. 26 | RUN jupyter nbextension enable --py --sys-prefix widgetsnbextension 27 | RUN jupyter contrib nbextension install 28 | RUN jupyter nbextension enable codefolding/main 29 | RUN echo "c.NotebookApp.ip = '*'" >> /root/.jupyter/jupyter_notebook_config.py 30 | RUN echo "c.NotebookApp.port = 8080" >> /root/.jupyter/jupyter_notebook_config.py 31 | RUN echo "c.NotebookApp.token = ''" >> /root/.jupyter/jupyter_notebook_config.py 32 | RUN echo "jupyter notebook --no-browser --allow-root" >> /usr/local/bin/run_notebook && chmod +x /usr/local/bin/run_notebook 33 | 34 | # Welcome message. 35 | ADD welcome_message.txt / 36 | RUN echo '[ ! 
-z "$TERM" -a -r /etc/motd ] && cat /etc/motd' \ 37 | >> /etc/bash.bashrc \ 38 | ; cat welcome_message.txt > /etc/motd 39 | 40 | WORKDIR /root 41 | EXPOSE 8080 42 | -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | backports.weakref==1.0.post1 2 | bleach==1.5.0 3 | certifi==2017.11.5 4 | chardet==3.0.4 5 | ChatterBot==0.7.6 6 | decorator==4.1.2 7 | entrypoints==0.2.3 8 | enum34==1.1.6 9 | funcsigs==1.0.2 10 | gensim==3.1.0 11 | html5lib==0.9999999 12 | idna==2.6 13 | ipykernel==4.6.1 14 | ipython==6.2.1 15 | ipython-genutils==0.2.0 16 | ipywidgets==7.0.5 17 | jedi==0.11.0 18 | Jinja2==2.10 19 | jsonschema==2.6.0 20 | jupyter==1.0.0 21 | jupyter-client==5.1.0 22 | jupyter-console==5.2.0 23 | jupyter-contrib-core==0.3.3 24 | jupyter-contrib-nbextensions==0.3.3 25 | jupyter-core==4.4.0 26 | jupyter-highlight-selected-word==0.1.0 27 | jupyter-latex-envs==1.3.8.4 28 | jupyter-nbextensions-configurator==0.2.8 29 | libarchive==0.4.4 30 | Markdown==2.6.9 31 | MarkupSafe==1.0 32 | matplotlib==2.1.0 33 | mistune==0.8.1 34 | mock==2.0.0 35 | nbconvert==5.3.1 36 | nbformat==4.4.0 37 | nltk==3.4.5 38 | notebook==5.7.8 39 | numpy==1.13.3 40 | pandas==0.21.0 41 | pandocfilters==1.4.2 42 | parso==0.1.0 43 | pbr==3.1.1 44 | pexpect==4.3.0 45 | pickleshare==0.7.4 46 | prompt-toolkit==1.0.15 47 | protobuf==3.5.0.post1 48 | ptyprocess==0.5.2 49 | Pygments==2.2.0 50 | python-dateutil==2.6.1 51 | pyzmq==16.0.3 52 | qtconsole==4.3.1 53 | regex==2017.11.9 54 | requests==2.18.4 55 | scikit-learn==0.19.1 56 | scipy==1.0.0 57 | simplegeneric==0.8.1 58 | six==1.11.0 59 | tensorflow==1.15.0 60 | tensorflow-tensorboard==0.4.0rc3 61 | terminado==0.7 62 | testpath==0.3.1 63 | tornado==4.5.2 64 | tqdm==4.19.4 65 | traitlets==4.3.2 66 | urllib3==1.22 67 | wcwidth==0.1.7 68 | Werkzeug==0.12.2 69 | widgetsnbextension==3.0.8 70 | -------------------------------------------------------------------------------- /docker/welcome_message.txt: -------------------------------------------------------------------------------- 1 | 2 | =================================================================== 3 | Welcome to the Docker container for the Coursera NLP course. 4 | 5 | This container contains dependencies that you might need 6 | to complete course assignments. 
7 | 8 | You can also install any additional system dependencies with 9 | > apt-get install PACKAGE_NAME 10 | 11 | And Python dependencies with 12 | > pip3 install PACKAGE_NAME 13 | 14 | To run Jupyter Notebook in the container just type 15 | > run_notebook 16 | =================================================================== 17 | 18 | -------------------------------------------------------------------------------- /honor/README.md: -------------------------------------------------------------------------------- 1 | # Utils to download and read data for chat-bot training 2 | 3 | This folder contains scripts for downloading, reading and preprocessing data for chat-bot training: 4 | - `download_cornell.sh` - downloads Cornell movie dialogues dataset (small size) 5 | - `download_opensubs.sh` - downloads Opensubs movie subtitles dataset (large size) 6 | - `datasets.py` - module to be imported in your scripts, that exports functions for reading a dataset 7 | - `example.py` - example of reading the dataset 8 | -------------------------------------------------------------------------------- /honor/datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 Conchylicultor. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import ast 17 | import os 18 | import random 19 | import re 20 | from time import time 21 | 22 | import nltk 23 | from tqdm import tqdm 24 | 25 | """ 26 | Load the cornell movie dialog corpus. 27 | 28 | Available from here: 29 | http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html 30 | 31 | """ 32 | 33 | class CornellData: 34 | """ 35 | 36 | """ 37 | 38 | def __init__(self, dirName): 39 | """ 40 | Args: 41 | dirName (string): directory where to load the corpus 42 | """ 43 | self.lines = {} 44 | self.conversations = [] 45 | 46 | MOVIE_LINES_FIELDS = ["lineID","characterID","movieID","character","text"] 47 | MOVIE_CONVERSATIONS_FIELDS = ["character1ID","character2ID","movieID","utteranceIDs"] 48 | 49 | self.lines = self.loadLines(os.path.join(dirName, "movie_lines.txt"), MOVIE_LINES_FIELDS) 50 | self.conversations = self.loadConversations(os.path.join(dirName, "movie_conversations.txt"), MOVIE_CONVERSATIONS_FIELDS) 51 | 52 | # TODO: Cleaner program (merge copy-paste) !! 53 | 54 | def loadLines(self, fileName, fields): 55 | """ 56 | Args: 57 | fileName (str): file to load 58 | field (set): fields to extract 59 | Return: 60 | dict>: the extracted fields for each line 61 | """ 62 | lines = {} 63 | 64 | with open(fileName, 'r', encoding='iso-8859-1') as f: # TODO: Solve Iso encoding pb ! 
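            # Each line of movie_lines.txt is one " +++$+++ "-separated record whose
            # fields follow MOVIE_LINES_FIELDS above (lineID, characterID, movieID,
            # character, text); an illustrative example:
            #   L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!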
65 | for line in f: 66 | values = line.split(" +++$+++ ") 67 | 68 | # Extract fields 69 | lineObj = {} 70 | for i, field in enumerate(fields): 71 | lineObj[field] = values[i] 72 | 73 | lines[lineObj['lineID']] = lineObj 74 | 75 | return lines 76 | 77 | def loadConversations(self, fileName, fields): 78 | """ 79 | Args: 80 | fileName (str): file to load 81 | field (set): fields to extract 82 | Return: 83 | list>: the extracted fields for each line 84 | """ 85 | conversations = [] 86 | 87 | with open(fileName, 'r', encoding='iso-8859-1') as f: # TODO: Solve Iso encoding pb ! 88 | for line in f: 89 | values = line.split(" +++$+++ ") 90 | 91 | # Extract fields 92 | convObj = {} 93 | for i, field in enumerate(fields): 94 | convObj[field] = values[i] 95 | 96 | # Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]") 97 | lineIds = ast.literal_eval(convObj["utteranceIDs"]) 98 | 99 | # Reassemble lines 100 | convObj["lines"] = [] 101 | for lineId in lineIds: 102 | convObj["lines"].append(self.lines[lineId]) 103 | 104 | conversations.append(convObj) 105 | 106 | return conversations 107 | 108 | def getConversations(self): 109 | return self.conversations 110 | 111 | 112 | # Based on code from https://github.com/AlJohri/OpenSubtitles 113 | # by Al Johri 114 | 115 | import xml.etree.ElementTree as ET 116 | import datetime 117 | import os 118 | import sys 119 | import json 120 | import re 121 | import pprint 122 | 123 | from gzip import GzipFile 124 | 125 | """ 126 | Load the opensubtitles dialog corpus. 127 | """ 128 | 129 | class OpensubsData: 130 | """ 131 | """ 132 | 133 | def __init__(self, dirName): 134 | """ 135 | Args: 136 | dirName (string): directory where to load the corpus 137 | """ 138 | 139 | # Hack this to filter on subset of Opensubtitles 140 | # dirName = "%s/en/Action" % dirName 141 | 142 | print("Loading OpenSubtitles conversations in %s." % dirName) 143 | self.conversations = [] 144 | self.tag_re = re.compile(r'(|<[^>]*>)') 145 | self.conversations = self.loadConversations(dirName) 146 | 147 | def loadConversations(self, dirName): 148 | """ 149 | Args: 150 | dirName (str): folder to load 151 | Return: 152 | array(question, answer): the extracted QA pairs 153 | """ 154 | conversations = [] 155 | dirList = self.filesInDir(dirName) 156 | for filepath in tqdm(dirList, "OpenSubtitles data files"): 157 | if filepath.endswith('gz'): 158 | try: 159 | doc = self.getXML(filepath) 160 | conversations.extend(self.genList(doc)) 161 | except ValueError: 162 | tqdm.write("Skipping file %s with errors." 
% filepath) 163 | except: 164 | print("Unexpected error:", sys.exc_info()[0]) 165 | raise 166 | return conversations 167 | 168 | def getConversations(self): 169 | return self.conversations 170 | 171 | def genList(self, tree): 172 | root = tree.getroot() 173 | 174 | timeFormat = '%H:%M:%S' 175 | maxDelta = datetime.timedelta(seconds=1) 176 | 177 | startTime = datetime.datetime.min 178 | strbuf = '' 179 | sentList = [] 180 | 181 | for child in root: 182 | for elem in child: 183 | if elem.tag == 'time': 184 | elemID = elem.attrib['id'] 185 | elemVal = elem.attrib['value'][:-4] 186 | if elemID[-1] == 'S': 187 | startTime = datetime.datetime.strptime(elemVal, timeFormat) 188 | else: 189 | sentList.append((strbuf.strip(), startTime, datetime.datetime.strptime(elemVal, timeFormat))) 190 | strbuf = '' 191 | else: 192 | try: 193 | strbuf = strbuf + " " + elem.text 194 | except: 195 | pass 196 | 197 | conversations = [] 198 | for idx in range(0, len(sentList) - 1): 199 | cur = sentList[idx] 200 | nxt = sentList[idx + 1] 201 | if nxt[1] - cur[2] <= maxDelta and cur and nxt: 202 | tmp = {} 203 | tmp["lines"] = [] 204 | tmp["lines"].append(self.getLine(cur[0])) 205 | tmp["lines"].append(self.getLine(nxt[0])) 206 | if self.filter(tmp): 207 | conversations.append(tmp) 208 | 209 | return conversations 210 | 211 | def getLine(self, sentence): 212 | line = {} 213 | line["text"] = self.tag_re.sub('', sentence).replace('\\\'','\'').strip().lower() 214 | return line 215 | 216 | def filter(self, lines): 217 | # Use the followint to customize filtering of QA pairs 218 | # 219 | # startwords = ("what", "how", "when", "why", "where", "do", "did", "is", "are", "can", "could", "would", "will") 220 | # question = lines["lines"][0]["text"] 221 | # if not question.endswith('?'): 222 | # return False 223 | # if not question.split(' ')[0] in startwords: 224 | # return False 225 | # 226 | return True 227 | 228 | def getXML(self, filepath): 229 | fext = os.path.splitext(filepath)[1] 230 | if fext == '.gz': 231 | tmp = GzipFile(filename=filepath) 232 | return ET.parse(tmp) 233 | else: 234 | return ET.parse(filepath) 235 | 236 | def filesInDir(self, dirname): 237 | result = [] 238 | for dirpath, dirs, files in os.walk(dirname): 239 | for filename in files: 240 | fname = os.path.join(dirpath, filename) 241 | result.append(fname) 242 | return result 243 | 244 | 245 | def extractText(line, fast_preprocessing=True): 246 | if fast_preprocessing: 247 | GOOD_SYMBOLS_RE = re.compile('[^0-9a-z ]') 248 | REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;#+_]') 249 | REPLACE_SEVERAL_SPACES = re.compile('\s+') 250 | 251 | line = line.lower() 252 | line = REPLACE_BY_SPACE_RE.sub(' ', line) 253 | line = GOOD_SYMBOLS_RE.sub('', line) 254 | line = REPLACE_SEVERAL_SPACES.sub(' ', line) 255 | return line.strip() 256 | else: 257 | return nltk.word_tokenize(line) 258 | 259 | 260 | def splitConversations(conversations, max_len=20, fast_preprocessing=True): 261 | data = [] 262 | for i, conversation in enumerate(tqdm(conversations)): 263 | lines = conversation['lines'] 264 | for i in range(len(lines) - 1): 265 | request = extractText(lines[i]['text']) 266 | reply = extractText(lines[i + 1]['text']) 267 | if 0 < len(request) <= max_len and 0 < len(reply) <= max_len: 268 | data += [(request, reply)] 269 | return data 270 | 271 | 272 | def readCornellData(path, max_len=20, fast_preprocessing=True): 273 | dataset = CornellData(path) 274 | conversations = dataset.getConversations() 275 | return splitConversations(conversations, max_len=max_len, 
fast_preprocessing=fast_preprocessing) 276 | 277 | 278 | def readOpensubsData(path, max_len=20, fast_preprocessing=True): 279 | dataset = OpensubsData(path) 280 | conversations = dataset.getConversations() 281 | return splitConversations(conversations, max_len=max_len, fast_preprocessing=fast_preprocessing) 282 | -------------------------------------------------------------------------------- /honor/download_cornell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p data/cornell 4 | cd data/cornell 5 | wget https://github.com/Conchylicultor/DeepQA/raw/master/data/cornell/movie_conversations.txt 6 | wget https://github.com/Conchylicultor/DeepQA/raw/master/data/cornell/movie_lines.txt 7 | -------------------------------------------------------------------------------- /honor/download_opensubs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p data/opensubs 4 | cd data/opensubs 5 | wget -O en.tar.gz http://opus.lingfil.uu.se/download.php?f=OpenSubtitles/en.tar.gz 6 | tar -xf en.tar.gz 7 | rm en.tar.gz 8 | -------------------------------------------------------------------------------- /honor/example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import datasets 4 | import argparse 5 | import os 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("dataset", choices=["cornell", "opensubs"], help="Name of the dataset.") 10 | parser.add_argument("--max_len", type=int, default=10, help="Max length of sentences to consider.") 11 | args = parser.parse_args() 12 | 13 | dataset_path = os.path.join("data", args.dataset) 14 | if args.dataset == "cornell": 15 | data = datasets.readCornellData(dataset_path, max_len=args.max_len) 16 | elif args.dataset == "opensubs": 17 | data = datasets.readOpensubsData(dataset_path, max_len=args.max_len) 18 | else: 19 | raise ValueError("Unrecognized dataset: {!r}".format(args.dataset)) 20 | 21 | print("Size of dataset: {}".format(len(data))) 22 | print("First 10 training pairs:") 23 | for item in data[:10]: 24 | print(item) 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /optional/README.md: -------------------------------------------------------------------------------- 1 | # Optional projects 2 | 3 | This folder contains optional projects available in this course. 4 | -------------------------------------------------------------------------------- /optional/telegram_bot/README.md: -------------------------------------------------------------------------------- 1 | # [Optional] Telegram bot 2 | 3 | This folder contains the starting code for the optional Telegram bot extension of the project. 4 | 5 | If you want to permanently host your bot, you can follow our [AWS tutorial](../../AWS-tutorial.md). 6 | 7 | ## Troubleshooting 8 | 9 | ### Bot crashes with the unicode error 10 | 11 | If your bot code crashes with the error that ends with `UnicodeEncodeError: 'ascii' codec can't encode character`, 12 | your terminal likely has problems showing unicode symbols. 
To fix this you can change your terminal local by adding 13 | the following lines to you `~/.bashrc` file (or any other shell configuration): 14 | 15 | ``` 16 | export LC_ALL=en_US.UTF-8 17 | export LANG=en_US.UTF-8 18 | export LANGUAGE=en_US.UTF-8 19 | ``` 20 | 21 | To verify the effect, you can run the following command end check that it outputs 'utf-8' 22 | ```python 23 | > python -c 'import locale; print(locale.getpreferredencoding())' 24 | utf-8 25 | ``` 26 | 27 | You can find more details in this [article](https://perlgeek.de/en/article/set-up-a-clean-utf8-environment). 28 | 29 | If this doesn't work, you can explicitly specify the encoding when opening files: 30 | ```python 31 | with open(filename, 'r', encoding="utf-8") as file: 32 | ... 33 | ``` 34 | -------------------------------------------------------------------------------- /optional/telegram_bot/dialogue_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sklearn.metrics.pairwise import pairwise_distances_argmin 3 | 4 | from chatterbot import ChatBot 5 | from utils import * 6 | 7 | 8 | class ThreadRanker(object): 9 | def __init__(self, paths): 10 | self.word_embeddings, self.embeddings_dim = load_embeddings(paths['WORD_EMBEDDINGS']) 11 | self.thread_embeddings_folder = paths['THREAD_EMBEDDINGS_FOLDER'] 12 | 13 | def __load_embeddings_by_tag(self, tag_name): 14 | embeddings_path = os.path.join(self.thread_embeddings_folder, tag_name + ".pkl") 15 | thread_ids, thread_embeddings = unpickle_file(embeddings_path) 16 | return thread_ids, thread_embeddings 17 | 18 | def get_best_thread(self, question, tag_name): 19 | """ Returns id of the most similar thread for the question. 20 | The search is performed across the threads with a given tag. 21 | """ 22 | thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name) 23 | 24 | # HINT: you have already implemented a similar routine in the 3rd assignment. 25 | 26 | question_vec = #### YOUR CODE HERE #### 27 | best_thread = #### YOUR CODE HERE #### 28 | 29 | return thread_ids[best_thread] 30 | 31 | 32 | class DialogueManager(object): 33 | def __init__(self, paths): 34 | print("Loading resources...") 35 | 36 | # Intent recognition: 37 | self.intent_recognizer = unpickle_file(paths['INTENT_RECOGNIZER']) 38 | self.tfidf_vectorizer = unpickle_file(paths['TFIDF_VECTORIZER']) 39 | 40 | self.ANSWER_TEMPLATE = 'I think its about %s\nThis thread might help you: https://stackoverflow.com/questions/%s' 41 | 42 | # Goal-oriented part: 43 | self.tag_classifier = unpickle_file(paths['TAG_CLASSIFIER']) 44 | self.thread_ranker = ThreadRanker(paths) 45 | 46 | def create_chitchat_bot(self): 47 | """Initializes self.chitchat_bot with some conversational model.""" 48 | 49 | # Hint: you might want to create and train chatterbot.ChatBot here. 50 | # It could be done by creating ChatBot with the *trainer* parameter equals 51 | # "chatterbot.trainers.ChatterBotCorpusTrainer" 52 | # and then calling *train* function with "chatterbot.corpus.english" param 53 | 54 | ######################## 55 | #### YOUR CODE HERE #### 56 | ######################## 57 | 58 | def generate_answer(self, question): 59 | """Combines stackoverflow and chitchat parts using intent recognition.""" 60 | 61 | # Recognize intent of the question using `intent_recognizer`. 62 | # Don't forget to prepare question and calculate features for the question. 
63 | 64 | prepared_question = #### YOUR CODE HERE #### 65 | features = #### YOUR CODE HERE #### 66 | intent = #### YOUR CODE HERE #### 67 | 68 | # Chit-chat part: 69 | if intent == 'dialogue': 70 | # Pass question to chitchat_bot to generate a response. 71 | response = #### YOUR CODE HERE #### 72 | return response 73 | 74 | # Goal-oriented part: 75 | else: 76 | # Pass features to tag_classifier to get predictions. 77 | tag = #### YOUR CODE HERE #### 78 | 79 | # Pass prepared_question to thread_ranker to get predictions. 80 | thread_id = #### YOUR CODE HERE #### 81 | 82 | return self.ANSWER_TEMPLATE % (tag, thread_id) 83 | 84 | -------------------------------------------------------------------------------- /optional/telegram_bot/main_bot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import requests 4 | import time 5 | import argparse 6 | import os 7 | import json 8 | 9 | from requests.compat import urljoin 10 | 11 | 12 | class BotHandler(object): 13 | """ 14 | BotHandler is a class which implements all back-end of the bot. 15 | It has tree main functions: 16 | 'get_updates' — checks for new messages 17 | 'send_message' – posts new message to user 18 | 'get_answer' — computes the most relevant on a user's question 19 | """ 20 | 21 | def __init__(self, token, dialogue_manager): 22 | self.token = token 23 | self.api_url = "https://api.telegram.org/bot{}/".format(token) 24 | self.dialogue_manager = dialogue_manager 25 | 26 | def get_updates(self, offset=None, timeout=30): 27 | params = {"timeout": timeout, "offset": offset} 28 | raw_resp = requests.get(urljoin(self.api_url, "getUpdates"), params) 29 | try: 30 | resp = raw_resp.json() 31 | except json.decoder.JSONDecodeError as e: 32 | print("Failed to parse response {}: {}.".format(raw_resp.content, e)) 33 | return [] 34 | 35 | if "result" not in resp: 36 | return [] 37 | return resp["result"] 38 | 39 | def send_message(self, chat_id, text): 40 | params = {"chat_id": chat_id, "text": text} 41 | return requests.post(urljoin(self.api_url, "sendMessage"), params) 42 | 43 | def get_answer(self, question): 44 | if question == '/start': 45 | return "Hi, I am your project bot. How can I help you today?" 46 | return self.dialogue_manager.generate_answer(question) 47 | 48 | 49 | def parse_args(): 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument('--token', type=str, default='') 52 | return parser.parse_args() 53 | 54 | 55 | def is_unicode(text): 56 | return len(text) == len(text.encode()) 57 | 58 | 59 | class SimpleDialogueManager(object): 60 | """ 61 | This is the simplest dialogue manager to test the telegram bot. 62 | Your task is to create a more advanced one in dialogue_manager.py." 63 | """ 64 | 65 | def generate_answer(self, question): 66 | return "Hello, world!" 67 | 68 | 69 | def main(): 70 | args = parse_args() 71 | token = args.token 72 | 73 | if not token: 74 | if not "TELEGRAM_TOKEN" in os.environ: 75 | print("Please, set bot token through --token or TELEGRAM_TOKEN env variable") 76 | return 77 | token = os.environ["TELEGRAM_TOKEN"] 78 | 79 | ################################################################# 80 | 81 | # Your task is to complete dialogue_manager.py and use your 82 | # advanced DialogueManager instead of SimpleDialogueManager. 83 | 84 | # This is the point where you plug it into the Telegram bot. 85 | # Do not forget to import all needed dependencies when you do so. 
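    # A hypothetical sketch of that step (assuming dialogue_manager.py and utils.py
    # are already completed and the pickled resources listed in RESOURCE_PATH exist):
    #
    #   from dialogue_manager import DialogueManager
    #   from utils import RESOURCE_PATH
    #   manager = DialogueManager(RESOURCE_PATH)
    #   bot = BotHandler(token, manager)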
86 | 87 | simple_manager = SimpleDialogueManager() 88 | bot = BotHandler(token, simple_manager) 89 | 90 | ############################################################### 91 | 92 | print("Ready to talk!") 93 | offset = 0 94 | while True: 95 | updates = bot.get_updates(offset=offset) 96 | for update in updates: 97 | print("An update received.") 98 | if "message" in update: 99 | chat_id = update["message"]["chat"]["id"] 100 | if "text" in update["message"]: 101 | text = update["message"]["text"] 102 | if is_unicode(text): 103 | print("Update content: {}".format(update)) 104 | bot.send_message(chat_id, bot.get_answer(update["message"]["text"])) 105 | else: 106 | bot.send_message(chat_id, "Hmm, you are sending some weird characters to me...") 107 | offset = max(offset, update['update_id'] + 1) 108 | time.sleep(1) 109 | 110 | if __name__ == "__main__": 111 | main() 112 | -------------------------------------------------------------------------------- /optional/telegram_bot/utils.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import pickle 3 | import re 4 | import numpy as np 5 | 6 | nltk.download('stopwords') 7 | from nltk.corpus import stopwords 8 | 9 | # Paths for all resources for the bot. 10 | RESOURCE_PATH = { 11 | 'INTENT_RECOGNIZER': 'intent_recognizer.pkl', 12 | 'TAG_CLASSIFIER': 'tag_classifier.pkl', 13 | 'TFIDF_VECTORIZER': 'tfidf_vectorizer.pkl', 14 | 'THREAD_EMBEDDINGS_FOLDER': 'thread_embeddings_by_tags', 15 | 'WORD_EMBEDDINGS': 'word_embeddings.tsv', 16 | } 17 | 18 | 19 | def text_prepare(text): 20 | """Performs tokenization and simple preprocessing.""" 21 | 22 | replace_by_space_re = re.compile('[/(){}\[\]\|@,;]') 23 | bad_symbols_re = re.compile('[^0-9a-z #+_]') 24 | stopwords_set = set(stopwords.words('english')) 25 | 26 | text = text.lower() 27 | text = replace_by_space_re.sub(' ', text) 28 | text = bad_symbols_re.sub('', text) 29 | text = ' '.join([x for x in text.split() if x and x not in stopwords_set]) 30 | 31 | return text.strip() 32 | 33 | 34 | def load_embeddings(embeddings_path): 35 | """Loads pre-trained word embeddings from tsv file. 36 | 37 | Args: 38 | embeddings_path - path to the embeddings file. 39 | 40 | Returns: 41 | embeddings - dict mapping words to vectors; 42 | embeddings_dim - dimension of the vectors. 43 | """ 44 | 45 | # Hint: you have already implemented a similar routine in the 3rd assignment. 46 | # Note that here you also need to know the dimension of the loaded embeddings. 47 | # When you load the embeddings, use numpy.float32 type as dtype 48 | 49 | ######################## 50 | #### YOUR CODE HERE #### 51 | ######################## 52 | 53 | # remove this when you're done 54 | raise NotImplementedError( 55 | "Open utils.py and fill with your code. In case of Google Colab, download" 56 | "(https://github.com/hse-aml/natural-language-processing/blob/master/project/utils.py), " 57 | "edit locally and upload using '> arrow on the left edge' -> Files -> UPLOAD") 58 | 59 | 60 | def question_to_vec(question, embeddings, dim): 61 | """Transforms a string to an embedding by averaging word embeddings.""" 62 | 63 | # Hint: you have already implemented exactly this function in the 3rd assignment. 64 | 65 | ######################## 66 | #### YOUR CODE HERE #### 67 | ######################## 68 | 69 | # remove this when you're done 70 | raise NotImplementedError( 71 | "Open utils.py and fill with your code. 
In case of Google Colab, download" 72 | "(https://github.com/hse-aml/natural-language-processing/blob/master/project/utils.py), " 73 | "edit locally and upload using '> arrow on the left edge' -> Files -> UPLOAD") 74 | 75 | 76 | def unpickle_file(filename): 77 | """Returns the result of unpickling the file content.""" 78 | with open(filename, 'rb') as f: 79 | return pickle.load(f) 80 | -------------------------------------------------------------------------------- /optional/telegram_bot/week5-project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Final project: StackOverflow assistant bot\n", 8 | "\n", 9 | "Congratulations on coming this far and solving the programming assignments! In this final project, we will combine everything we have learned about Natural Language Processing to construct a *dialogue chat bot*, which will be able to:\n", 10 | "* answer programming-related questions (using StackOverflow dataset);\n", 11 | "* chit-chat and simulate dialogue on all non programming-related questions.\n", 12 | "\n", 13 | "For a chit-chat mode we will use a pre-trained neural network engine available from [ChatterBot](https://github.com/gunthercox/ChatterBot).\n", 14 | "Those who aim at honor certificates for our course or are just curious, will train their own models for chit-chat.\n", 15 | "![](https://imgs.xkcd.com/comics/twitter_bot.png)\n", 16 | "©[xkcd](https://xkcd.com)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Data description\n", 24 | "\n", 25 | "To detect *intent* of users questions we will need two text collections:\n", 26 | "- `tagged_posts.tsv` — StackOverflow posts, tagged with one programming language (*positive samples*).\n", 27 | "- `dialogues.tsv` — dialogue phrases from movie subtitles (*negative samples*).\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "try:\n", 39 | " import google.colab\n", 40 | " IN_COLAB = True\n", 41 | "except:\n", 42 | " IN_COLAB = False\n", 43 | "\n", 44 | "if IN_COLAB:\n", 45 | " ! wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py -O setup_google_colab.py\n", 46 | " import setup_google_colab\n", 47 | " setup_google_colab.setup_project()\n", 48 | "\n", 49 | "import sys\n", 50 | "sys.path.append(\"..\")\n", 51 | "from common.download_utils import download_project_resources\n", 52 | "\n", 53 | "download_project_resources()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "For those questions, that have programming-related intent, we will proceed as follow predict programming language (only one tag per question allowed here) and rank candidates within the tag using embeddings.\n", 61 | "For the ranking part, you will need:\n", 62 | "- `word_embeddings.tsv` — word embeddings, that you trained with StarSpace in the 3rd assignment. It's not a problem if you didn't do it, because we can offer an alternative solution for you." 
63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "As a result of this notebook, you should obtain the following new objects that you will then use in the running bot:\n", 70 | "\n", 71 | "- `intent_recognizer.pkl` — intent recognition model;\n", 72 | "- `tag_classifier.pkl` — programming language classification model;\n", 73 | "- `tfidf_vectorizer.pkl` — vectorizer used during training;\n", 74 | "- `thread_embeddings_by_tags` — folder with thread embeddings, arranged by tags.\n", 75 | " " 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "Some functions will be reused by this notebook and the scripts, so we put them into the *utils.py* file. Don't forget to open it and fill in the gaps!" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "from utils import *" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "## Part I. Intent and language recognition" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "We want to write a bot that will not only **answer programming-related questions**, but will also be able to **maintain a dialogue**. We would also like to detect the *intent* of the user from the question (we could have had a 'Question answering mode' check-box in the bot, but it wouldn't be fun at all, would it?). So the first thing we need to do is to **distinguish programming-related questions from general ones**.\n", 108 | "\n", 109 | "It would also be good to predict which programming language a particular question refers to. By doing so, we will speed up the question search by a factor of the number of languages (10 here), and exercise our *text classification* skills a bit. :)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "import numpy as np\n", 121 | "import pandas as pd\n", 122 | "import pickle\n", 123 | "import re\n", 124 | "\n", 125 | "from sklearn.feature_extraction.text import TfidfVectorizer" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "### Data preparation" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "In the first assignment (Predict tags on StackOverflow with linear models), you have already learnt how to preprocess texts and do TF-IDF transformations. Reuse your code here. In addition, you will also need to [dump](https://docs.python.org/3/library/pickle.html#pickle.dump) the TF-IDF vectorizer with pickle to use it later in the running bot."
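The `tfidf_features` gap in the next cell could be filled roughly as below; the vectorizer parameters are illustrative assumptions, and only the fit-on-train, transform-both, pickle-the-vectorizer pattern is prescribed by the text:

```python
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_test, vectorizer_path):
    """Performs TF-IDF transformation and dumps the fitted vectorizer (sketch)."""
    # Illustrative hyperparameters; tune min_df/max_df/ngram_range for your data.
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9,
                                       ngram_range=(1, 2), token_pattern=r'(\S+)')
    X_train = tfidf_vectorizer.fit_transform(X_train)  # fit on the train part only
    X_test = tfidf_vectorizer.transform(X_test)
    with open(vectorizer_path, 'wb') as f:              # bytes mode, as the cell comment asks
        pickle.dump(tfidf_vectorizer, f)
    return X_train, X_test
```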
140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "def tfidf_features(X_train, X_test, vectorizer_path):\n", 151 | " \"\"\"Performs TF-IDF transformation and dumps the model.\"\"\"\n", 152 | " \n", 153 | " # Train a vectorizer on X_train data.\n", 154 | " # Transform X_train and X_test data.\n", 155 | " \n", 156 | " # Pickle the trained vectorizer to 'vectorizer_path'\n", 157 | " # Don't forget to open the file in writing bytes mode.\n", 158 | " \n", 159 | " ######################################\n", 160 | " ######### YOUR CODE HERE #############\n", 161 | " ######################################\n", 162 | " \n", 163 | " return X_train, X_test" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Now, load examples of two classes. Use a subsample of stackoverflow data to balance the classes. You will need the full data later." 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "sample_size = 200000\n", 182 | "\n", 183 | "dialogue_df = pd.read_csv('data/dialogues.tsv', sep='\\t').sample(sample_size, random_state=0)\n", 184 | "stackoverflow_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t').sample(sample_size, random_state=0)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "Check how the data look like:" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "dialogue_df.head()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "stackoverflow_df.head()" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "Apply *text_prepare* function to preprocess the data:" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": true 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "from utils import text_prepare" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "dialogue_df['text'] = ######### YOUR CODE HERE #############\n", 243 | "stackoverflow_df['title'] = ######### YOUR CODE HERE #############" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "### Intent recognition" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "We will do a binary classification on TF-IDF representations of texts. Labels will be either `dialogue` for general questions or `stackoverflow` for programming-related questions. 
First, prepare the data for this task:\n", 258 | "- concatenate `dialogue` and `stackoverflow` examples into one sample\n", 259 | "- split it into train and test in proportion 9:1, use *random_state=0* for reproducibility\n", 260 | "- transform it into TF-IDF features" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "collapsed": true 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "from sklearn.model_selection import train_test_split" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": { 278 | "collapsed": true 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "X = np.concatenate([dialogue_df['text'].values, stackoverflow_df['title'].values])\n", 283 | "y = ['dialogue'] * dialogue_df.shape[0] + ['stackoverflow'] * stackoverflow_df.shape[0]\n", 284 | "\n", 285 | "X_train, X_test, y_train, y_test = ######### YOUR CODE HERE ##########\n", 286 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))\n", 287 | "\n", 288 | "X_train_tfidf, X_test_tfidf = ######### YOUR CODE HERE ###########" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "Train the **intent recognizer** using LogisticRegression on the train set with the following parameters: *penalty='l2'*, *C=10*, *random_state=0*. Print out the accuracy on the test set to check whether everything looks good." 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": true 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "from sklearn.linear_model import LogisticRegression\n", 307 | "from sklearn.metrics import accuracy_score" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "collapsed": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "######################################\n", 319 | "######### YOUR CODE HERE #############\n", 320 | "######################################" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": { 327 | "collapsed": true 328 | }, 329 | "outputs": [], 330 | "source": [ 331 | "# Check test accuracy.\n", 332 | "y_test_pred = intent_recognizer.predict(X_test_tfidf)\n", 333 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 334 | "print('Test accuracy = {}'.format(test_accuracy))" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "Dump the classifier to use it in the running bot." 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": { 348 | "collapsed": true 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "pickle.dump(intent_recognizer, open(RESOURCE_PATH['INTENT_RECOGNIZER'], 'wb'))" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "### Programming language classification " 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "We will train one more classifier for the programming-related questions. It will predict exactly one tag (=programming language) and will be also based on Logistic Regression with TF-IDF features. \n", 367 | "\n", 368 | "First, let us prepare the data for this task." 
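For the intent-recognition cells above, one possible filling is sketched here; it relies on `X`, `y`, `tfidf_features` and `RESOURCE_PATH` defined earlier in the notebook and uses the split proportion and LogisticRegression parameters stated in the instructions:

```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# 9:1 train/test split with random_state=0, as suggested above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
X_train_tfidf, X_test_tfidf = tfidf_features(X_train, X_test,
                                             RESOURCE_PATH['TFIDF_VECTORIZER'])

# Parameters given in the task description.
intent_recognizer = LogisticRegression(penalty='l2', C=10, random_state=0)
intent_recognizer.fit(X_train_tfidf, y_train)
```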
369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": { 375 | "collapsed": true 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "X = stackoverflow_df['title'].values\n", 380 | "y = stackoverflow_df['tag'].values" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": { 387 | "collapsed": true 388 | }, 389 | "outputs": [], 390 | "source": [ 391 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n", 392 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "Let us reuse the TF-IDF vectorizer that we have already created above. It should not make a huge difference which data was used to train it." 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": { 406 | "collapsed": true 407 | }, 408 | "outputs": [], 409 | "source": [ 410 | "vectorizer = pickle.load(open(RESOURCE_PATH['TFIDF_VECTORIZER'], 'rb'))\n", 411 | "\n", 412 | "X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "Train the **tag classifier** using OneVsRestClassifier wrapper over LogisticRegression. Use the following parameters: *penalty='l2'*, *C=5*, *random_state=0*." 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "collapsed": true 427 | }, 428 | "outputs": [], 429 | "source": [ 430 | "from sklearn.multiclass import OneVsRestClassifier" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": { 437 | "collapsed": true 438 | }, 439 | "outputs": [], 440 | "source": [ 441 | "######################################\n", 442 | "######### YOUR CODE HERE #############\n", 443 | "######################################" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": { 450 | "collapsed": true 451 | }, 452 | "outputs": [], 453 | "source": [ 454 | "# Check test accuracy.\n", 455 | "y_test_pred = tag_classifier.predict(X_test_tfidf)\n", 456 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 457 | "print('Test accuracy = {}'.format(test_accuracy))" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": {}, 463 | "source": [ 464 | "Dump the classifier to use it in the running bot." 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": { 471 | "collapsed": true 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "pickle.dump(tag_classifier, open(RESOURCE_PATH['TAG_CLASSIFIER'], 'wb'))" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": {}, 481 | "source": [ 482 | "## Part II. Ranking questions with embeddings" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "To find a relevant answer (a thread from StackOverflow) on a question you will use vector representations to calculate similarity between the question and existing threads. We already had `question_to_vec` function from the assignment 3, which can create such a representation based on word vectors. \n", 490 | "\n", 491 | "However, it would be costly to compute such a representation for all possible answers in *online mode* of the bot (e.g. 
when the bot is running and answering questions from many users). This is the reason why you will create a *database* with pre-computed representations. These representations will be arranged by non-overlapping tags (programming languages), so that the search for an answer can be performed within a single tag each time. This will make our bot even more efficient and allow us to avoid keeping the whole database in RAM. " 492 | ] 493 | }, 494 | { 495 | "cell_type": "markdown", 496 | "metadata": {}, 497 | "source": [ 498 | "Load the StarSpace embeddings which were trained on Stack Overflow posts. These embeddings were trained in *supervised mode* for duplicate detection on the same corpus that is used in search. We can count on these representations to help us find closely related answers to a question. \n", 499 | "\n", 500 | "If for some reason you didn't train StarSpace embeddings in assignment 3, you can use [pre-trained word vectors](https://code.google.com/archive/p/word2vec/) from Google. All instructions about how to work with these vectors were provided in the same assignment. However, we highly recommend using the StarSpace embeddings, because they are better suited to this data. If you choose to use Google's embeddings, drop the words that do not appear in the StackOverflow data." 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": { 507 | "collapsed": true 508 | }, 509 | "outputs": [], 510 | "source": [ 511 | "starspace_embeddings, embeddings_dim = load_embeddings('data/word_embeddings.tsv')" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "Since we want to precompute representations for all possible answers, we need to load the whole posts dataset, unlike what we did for the intent classifier:" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": { 525 | "collapsed": true 526 | }, 527 | "outputs": [], 528 | "source": [ 529 | "posts_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t')" 530 | ] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "metadata": {}, 535 | "source": [ 536 | "Look at the distribution of posts over programming languages (tags) and find the most common ones. \n", 537 | "You might want to use the pandas [groupby](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html) and [count](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.count.html) methods:" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": { 544 | "collapsed": true 545 | }, 546 | "outputs": [], 547 | "source": [ 548 | "counts_by_tag = ######### YOUR CODE HERE #############" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "Now for each `tag` you need to create two data structures, which will serve as an online search index:\n", 556 | "* `tag_post_ids` — a list of post_ids with shape `(counts_by_tag[tag],)`. It will be needed to show the title and link to the thread;\n", 557 | "* `tag_vectors` — a matrix with shape `(counts_by_tag[tag], embeddings_dim)` where the embeddings for each answer are stored.\n", 558 | "\n", 559 | "Implement the code which calculates these structures and dumps them to files. It should take several minutes to run."
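The aggregation hinted at above and the `question_to_vec` helper from *utils.py*, both used in the next cell, might look like this sketch (assuming `posts_df` has `tag` and `post_id` columns as loaded above, and that out-of-vocabulary words are simply skipped):

```python
import numpy as np

# Number of posts per tag, following the groupby/count hint above.
counts_by_tag = posts_df.groupby('tag')['post_id'].count().to_dict()

def question_to_vec(question, embeddings, dim):
    """Averages the word vectors of a question; returns zeros if no word is known (sketch)."""
    word_vectors = [embeddings[word] for word in question.split() if word in embeddings]
    if not word_vectors:
        return np.zeros(dim, dtype=np.float32)
    return np.mean(word_vectors, axis=0)
```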
560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": { 566 | "collapsed": true 567 | }, 568 | "outputs": [], 569 | "source": [ 570 | "import os\n", 571 | "os.makedirs(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], exist_ok=True)\n", 572 | "\n", 573 | "for tag, count in counts_by_tag.items():\n", 574 | " tag_posts = posts_df[posts_df['tag'] == tag]\n", 575 | " \n", 576 | " tag_post_ids = ######### YOUR CODE HERE #############\n", 577 | " \n", 578 | " tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)\n", 579 | " for i, title in enumerate(tag_posts['title']):\n", 580 | " tag_vectors[i, :] = ######### YOUR CODE HERE #############\n", 581 | "\n", 582 | " # Dump post ids and vectors to a file.\n", 583 | " filename = os.path.join(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % tag))\n", 584 | " pickle.dump((tag_post_ids, tag_vectors), open(filename, 'wb'))" 585 | ] 586 | } 587 | ], 588 | "metadata": { 589 | "kernelspec": { 590 | "display_name": "Python 3", 591 | "language": "python", 592 | "name": "python3" 593 | }, 594 | "language_info": { 595 | "codemirror_mode": { 596 | "name": "ipython", 597 | "version": 3 598 | }, 599 | "file_extension": ".py", 600 | "mimetype": "text/x-python", 601 | "name": "python", 602 | "nbconvert_exporter": "python", 603 | "pygments_lexer": "ipython3", 604 | "version": "3.4.3" 605 | }, 606 | "latex_envs": { 607 | "bibliofile": "biblio.bib", 608 | "cite_by": "apalike", 609 | "current_citInitial": 1, 610 | "eqLabelWithNumbers": true, 611 | "eqNumInitial": 0 612 | } 613 | }, 614 | "nbformat": 4, 615 | "nbformat_minor": 2 616 | } 617 | -------------------------------------------------------------------------------- /setup_google_colab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | 5 | 6 | def download_github_code(path): 7 | filename = path.rsplit("/")[-1] 8 | os.system("wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/{} -O {}".format(path, filename)) 9 | 10 | 11 | def setup_common(): 12 | download_github_code("common/requirements_colab.txt") 13 | download_github_code("common/download_utils.py") 14 | download_github_code("common/tqdm_utils.py") 15 | download_github_code("common/__init__.py") 16 | os.system("mkdir common") 17 | os.system("mv download_utils.py tqdm_utils.py __init__.py common/") 18 | os.system("mv requirements_colab.txt common/") 19 | 20 | os.system("pip install -r common/requirements_colab.txt --force-reinstall") 21 | 22 | 23 | def setup_starspace(): 24 | if not os.path.exists("/usr/local/bin/starspace"): 25 | os.system("wget https://dl.bintray.com/boostorg/release/1.63.0/source/boost_1_63_0.zip") 26 | os.system("unzip boost_1_63_0.zip && mv boost_1_63_0 /usr/local/bin") 27 | os.system("git clone https://github.com/facebookresearch/Starspace.git") 28 | os.system("cd Starspace && make && cp -Rf starspace /usr/local/bin") 29 | 30 | 31 | def setup_week1(): 32 | setup_common() 33 | download_github_code("week1/grader.py") 34 | download_github_code("week1/metrics.py") 35 | 36 | 37 | def setup_week2(): 38 | setup_common() 39 | download_github_code("week2/evaluation.py") 40 | 41 | 42 | def setup_week3(): 43 | setup_common() 44 | download_github_code("week3/grader.py") 45 | download_github_code("week3/util.py") 46 | setup_starspace() 47 | 48 | 49 | def setup_week4(): 50 | setup_common() 51 | 52 | 53 | def setup_project(): 54 | setup_common() 55 
| download_github_code("week5/dialogue_manager.py") 56 | download_github_code("week5/utils.py") 57 | setup_starspace() 58 | 59 | 60 | def setup_honor(): 61 | setup_common() 62 | download_github_code("honor/datasets.py") 63 | download_github_code("honor/example.py") 64 | download_github_code("honor/download_cornell.sh") 65 | download_github_code("honor/download_opensubs.sh") 66 | -------------------------------------------------------------------------------- /week1/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | class Grader(object): 7 | def __init__(self): 8 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 9 | self.assignment_key = 'MSsYBMLgEeesWhJPHRLG5g' 10 | self.parts = OrderedDict([('f5nXa', 'TextPrepare'), 11 | ('hTrz8', 'WordsTagsCount'), 12 | ('0kUjR', 'BagOfWords'), 13 | ('tLJV1', 'MultilabelClassification')]) 14 | self.answers = {key: None for key in self.parts} 15 | 16 | @staticmethod 17 | def ravel_output(output): 18 | ''' 19 | If student accidentally submitted np.array with one 20 | element instead of number, this function will submit 21 | this number instead 22 | ''' 23 | if isinstance(output, np.ndarray) and output.size == 1: 24 | output = output.item(0) 25 | return output 26 | 27 | def submit(self, email, token): 28 | submission = { 29 | "assignmentKey": self.assignment_key, 30 | "submitterEmail": email, 31 | "secret": token, 32 | "parts": {} 33 | } 34 | for part, output in self.answers.items(): 35 | if output is not None: 36 | submission["parts"][part] = {"output": output} 37 | else: 38 | submission["parts"][part] = dict() 39 | request = requests.post(self.submission_page, data=json.dumps(submission)) 40 | response = request.json() 41 | if request.status_code == 201: 42 | print('Submitted to Coursera platform. 
See results on assignment page!') 43 | elif u'details' in response and u'learnerMessage' in response[u'details']: 44 | print(response[u'details'][u'learnerMessage']) 45 | else: 46 | print("Unknown response from Coursera: {}".format(request.status_code)) 47 | print(response) 48 | 49 | def status(self): 50 | print("You want to submit these parts:") 51 | for part_id, part_name in self.parts.items(): 52 | answer = self.answers[part_id] 53 | if answer is None: 54 | answer = '-'*10 55 | print("Task {}:\n {}".format(part_name, answer[:100] + '...')) 56 | 57 | def submit_part(self, part, output): 58 | self.answers[part] = output 59 | print("Current answer for task {} is:\n {}".format(self.parts[part], output[:100] + '...')) 60 | 61 | def submit_tag(self, tag, output): 62 | part_id = [k for k, v in self.parts.items() if v == tag] 63 | if len(part_id) != 1: 64 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 65 | part_id = part_id[0] 66 | self.submit_part(part_id, str(self.ravel_output(output))) 67 | -------------------------------------------------------------------------------- /week1/lemmatization_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tokenization" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "[nltk_data] Downloading package wordnet to /Users/anton/nltk_data...\n", 20 | "[nltk_data] Package wordnet is already up-to-date!\n" 21 | ] 22 | }, 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "True" 27 | ] 28 | }, 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "import nltk\n", 36 | "nltk.download('wordnet')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": { 43 | "ExecuteTime": { 44 | "end_time": "2017-11-05T18:16:27.608310Z", 45 | "start_time": "2017-11-05T18:16:26.423528Z" 46 | } 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "text = \"This is Andrew's text, isn't it?\"" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": { 57 | "ExecuteTime": { 58 | "end_time": "2017-11-05T18:16:27.633134Z", 59 | "start_time": "2017-11-05T18:16:27.610910Z" 60 | } 61 | }, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "['This', 'is', \"Andrew's\", 'text,', \"isn't\", 'it?']" 67 | ] 68 | }, 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "tokenizer = nltk.tokenize.WhitespaceTokenizer()\n", 76 | "tokenizer.tokenize(text)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": { 83 | "ExecuteTime": { 84 | "end_time": "2017-11-05T18:16:27.647746Z", 85 | "start_time": "2017-11-05T18:16:27.637909Z" 86 | } 87 | }, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | "['This', 'is', 'Andrew', \"'s\", 'text', ',', 'is', \"n't\", 'it', '?']" 93 | ] 94 | }, 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "tokenizer = nltk.tokenize.TreebankWordTokenizer()\n", 102 | "tokenizer.tokenize(text)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 5, 108 | "metadata": { 109 | "ExecuteTime": { 110 | 
"end_time": "2017-11-05T18:16:27.660827Z", 111 | "start_time": "2017-11-05T18:16:27.651961Z" 112 | } 113 | }, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "['This', 'is', 'Andrew', \"'\", 's', 'text', ',', 'isn', \"'\", 't', 'it', '?']" 119 | ] 120 | }, 121 | "execution_count": 5, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "tokenizer = nltk.tokenize.WordPunctTokenizer()\n", 128 | "tokenizer.tokenize(text)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "# Stemming (further in the video)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 6, 141 | "metadata": { 142 | "ExecuteTime": { 143 | "end_time": "2017-11-05T18:16:27.674332Z", 144 | "start_time": "2017-11-05T18:16:27.666509Z" 145 | } 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "\n", 150 | "text = \"feet wolves cats talked\"\n", 151 | "tokenizer = nltk.tokenize.TreebankWordTokenizer()\n", 152 | "tokens = tokenizer.tokenize(text)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 7, 158 | "metadata": { 159 | "ExecuteTime": { 160 | "end_time": "2017-11-05T18:16:27.693761Z", 161 | "start_time": "2017-11-05T18:16:27.677877Z" 162 | } 163 | }, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/plain": [ 168 | "u'feet wolv cat talk'" 169 | ] 170 | }, 171 | "execution_count": 7, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "stemmer = nltk.stem.PorterStemmer()\n", 178 | "\" \".join(stemmer.stem(token) for token in tokens)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 8, 184 | "metadata": { 185 | "ExecuteTime": { 186 | "end_time": "2017-11-05T18:16:30.840117Z", 187 | "start_time": "2017-11-05T18:16:27.698683Z" 188 | } 189 | }, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/plain": [ 194 | "u'foot wolf cat talked'" 195 | ] 196 | }, 197 | "execution_count": 8, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "stemmer = nltk.stem.WordNetLemmatizer()\n", 204 | "\" \".join(stemmer.lemmatize(token) for token in tokens)" 205 | ] 206 | } 207 | ], 208 | "metadata": { 209 | "kernelspec": { 210 | "display_name": "Python 2", 211 | "language": "python", 212 | "name": "python2" 213 | }, 214 | "language_info": { 215 | "codemirror_mode": { 216 | "name": "ipython", 217 | "version": 2 218 | }, 219 | "file_extension": ".py", 220 | "mimetype": "text/x-python", 221 | "name": "python", 222 | "nbconvert_exporter": "python", 223 | "pygments_lexer": "ipython2", 224 | "version": "2.7.15" 225 | } 226 | }, 227 | "nbformat": 4, 228 | "nbformat_minor": 2 229 | } 230 | -------------------------------------------------------------------------------- /week1/metrics.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.metrics import roc_curve, auc 4 | from scipy import interp 5 | from itertools import cycle 6 | 7 | def roc_auc(y_test, y_score, n_classes): 8 | """Plots ROC curve for micro and macro averaging.""" 9 | 10 | # Compute ROC curve and ROC area for each class 11 | fpr = {} 12 | tpr = {} 13 | roc_auc = {} 14 | for i in range(n_classes): 15 | fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) 16 | roc_auc[i] = auc(fpr[i], tpr[i]) 17 | 18 | # Compute micro-average ROC curve and ROC area 19 | fpr["micro"], 
tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) 20 | roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) 21 | 22 | # Compute macro-average ROC curve and ROC area 23 | all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)])) 24 | mean_tpr = np.zeros_like(all_fpr) 25 | for i in range(n_classes): 26 | mean_tpr += interp(all_fpr, fpr[i], tpr[i]) 27 | mean_tpr /= n_classes 28 | fpr["macro"] = all_fpr 29 | tpr["macro"] = mean_tpr 30 | roc_auc["macro"] = auc(fpr["macro"], tpr["macro"]) 31 | 32 | # Plot all ROC curves 33 | plt.figure() 34 | plt.plot(fpr["micro"], tpr["micro"], 35 | label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]), 36 | color='deeppink', linestyle=':', linewidth=4) 37 | 38 | plt.plot(fpr["macro"], tpr["macro"], 39 | label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]), 40 | color='navy', linestyle=':', linewidth=4) 41 | 42 | colors = cycle(['aqua', 'darkorange', 'cornflowerblue']) 43 | for i, color in zip(range(0,3), colors): 44 | plt.plot(fpr[i], tpr[i], color=color, lw=2, 45 | label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])) 46 | 47 | plt.plot([0, 1], [0, 1], 'k--', lw=2) 48 | plt.xlim([0.0, 1.0]) 49 | plt.ylim([0.0, 1.05]) 50 | plt.xlabel('False Positive Rate') 51 | plt.ylabel('True Positive Rate') 52 | plt.title('Some extension of ROC to multi-class') 53 | plt.legend(loc="lower right") 54 | plt.show() -------------------------------------------------------------------------------- /week1/tfidf_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tf-Idf example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": [ 18 | "
<div>\n", 19 | "[HTML rendering of the 5 × 4 TF-IDF DataFrame (columns 'good movie', 'like', 'movie', 'not'), identical to the text/plain output below]\n", 80 | "</div>
" 81 | ], 82 | "text/plain": [ 83 | " good movie like movie not\n", 84 | "0 0.707107 0.000000 0.707107 0.000000\n", 85 | "1 0.577350 0.000000 0.577350 0.577350\n", 86 | "2 0.000000 0.707107 0.000000 0.707107\n", 87 | "3 0.000000 1.000000 0.000000 0.000000\n", 88 | "4 0.000000 0.000000 0.000000 0.000000" 89 | ] 90 | }, 91 | "execution_count": 1, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 98 | "import pandas as pd\n", 99 | "texts = [\n", 100 | " \"good movie\", \"not a good movie\", \"did not like\", \n", 101 | " \"i like it\", \"good one\"\n", 102 | "]\n", 103 | "# using default tokenizer in TfidfVectorizer\n", 104 | "tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))\n", 105 | "features = tfidf.fit_transform(texts)\n", 106 | "pd.DataFrame(\n", 107 | " features.todense(),\n", 108 | " columns=tfidf.get_feature_names()\n", 109 | ")" 110 | ] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 2", 116 | "language": "python", 117 | "name": "python2" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 2 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython2", 129 | "version": "2.7.14" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 2 134 | } 135 | -------------------------------------------------------------------------------- /week1/week1-MultilabelClassification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Predict tags on StackOverflow with linear models" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this assignment you will learn how to predict tags for posts from [StackOverflow](https://stackoverflow.com). To solve this task you will use multilabel classification approach.\n", 15 | "\n", 16 | "### Libraries\n", 17 | "\n", 18 | "In this task you will need the following libraries:\n", 19 | "- [Numpy](http://www.numpy.org) — a package for scientific computing.\n", 20 | "- [Pandas](https://pandas.pydata.org) — a library providing high-performance, easy-to-use data structures and data analysis tools for the Python\n", 21 | "- [scikit-learn](http://scikit-learn.org/stable/index.html) — a tool for data mining and data analysis.\n", 22 | "- [NLTK](http://www.nltk.org) — a platform to work with natural language." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Data\n", 30 | "\n", 31 | "The following cell will download all data required for this assignment into the folder `week1/data`." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "try:\n", 41 | " import google.colab\n", 42 | " IN_COLAB = True\n", 43 | "except:\n", 44 | " IN_COLAB = False\n", 45 | "\n", 46 | "if IN_COLAB:\n", 47 | " ! 
wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py -O setup_google_colab.py\n", 48 | " import setup_google_colab\n", 49 | " setup_google_colab.setup_week1() \n", 50 | " \n", 51 | "import sys\n", 52 | "sys.path.append(\"..\")\n", 53 | "from common.download_utils import download_week1_resources\n", 54 | "\n", 55 | "download_week1_resources()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Grading\n", 63 | "We will create a grader instance below and use it to collect your answers. Note that these outputs will be stored locally inside grader and will be uploaded to platform only after running submitting function in the last part of this assignment. If you want to make partial submission, you can run that cell any time you want." 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "from grader import Grader" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "grader = Grader()" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Text preprocessing" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "For this and most of the following assignments you will need to use a list of stop words. It can be downloaded from *nltk*:" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "import nltk\n", 111 | "nltk.download('stopwords')\n", 112 | "from nltk.corpus import stopwords" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "In this task you will deal with a dataset of post titles from StackOverflow. You are provided a split to 3 sets: *train*, *validation* and *test*. All corpora (except for *test*) contain titles of the posts and corresponding tags (100 tags are available). The *test* set is provided for Coursera's grading and doesn't contain answers. 
Upload the corpora using *pandas* and look at the data:" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "from ast import literal_eval\n", 131 | "import pandas as pd\n", 132 | "import numpy as np" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "def read_data(filename):\n", 144 | " data = pd.read_csv(filename, sep='\\t')\n", 145 | " data['tags'] = data['tags'].apply(literal_eval)\n", 146 | " return data" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "collapsed": true 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "train = read_data('data/train.tsv')\n", 158 | "validation = read_data('data/validation.tsv')\n", 159 | "test = pd.read_csv('data/test.tsv', sep='\\t')" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "train.head()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "As you can see, *title* column contains titles of the posts and *tags* column contains the tags. It could be noticed that a number of tags for a post is not fixed and could be as many as necessary." 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "For a more comfortable usage, initialize *X_train*, *X_val*, *X_test*, *y_train*, *y_val*." 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "X_train, y_train = train['title'].values, train['tags'].values\n", 196 | "X_val, y_val = validation['title'].values, validation['tags'].values\n", 197 | "X_test = test['title'].values" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "One of the most known difficulties when working with natural data is that it's unstructured. For example, if you use it \"as is\" and extract tokens just by splitting the titles by whitespaces, you will see that there are many \"weird\" tokens like *3.5?*, *\"Flip*, etc. To prevent the problems, it's usually useful to prepare the data somehow. In this task you'll write a function, which will be also used in the other assignments. \n", 205 | "\n", 206 | "**Task 1 (TextPrepare).** Implement the function *text_prepare* following the instructions. After that, run the function *test_text_prepare* to test it on tiny cases and submit it to Coursera." 
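One possible `text_prepare`, consistent with the basic tests below, is a self-contained sketch that mirrors the constants defined in the cells that follow (lowercase, replace bracket and punctuation symbols with spaces, drop other bad symbols, drop stop words):

```python
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords', quiet=True)

REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """Lowercases, normalizes symbols and removes stop words (sketch)."""
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
```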
207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "collapsed": true 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "import re" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "collapsed": true 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "REPLACE_BY_SPACE_RE = re.compile('[/(){}\\[\\]\\|@,;]')\n", 229 | "BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')\n", 230 | "STOPWORDS = set(stopwords.words('english'))\n", 231 | "\n", 232 | "def text_prepare(text):\n", 233 | " \"\"\"\n", 234 | " text: a string\n", 235 | " \n", 236 | " return: modified initial string\n", 237 | " \"\"\"\n", 238 | " text = # lowercase text\n", 239 | " text = # replace REPLACE_BY_SPACE_RE symbols by space in text\n", 240 | " text = # delete symbols which are in BAD_SYMBOLS_RE from text\n", 241 | " text = # delete stopwords from text\n", 242 | " return text" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "def test_text_prepare():\n", 254 | " examples = [\"SQL Server - any equivalent of Excel's CHOOSE function?\",\n", 255 | " \"How to free c++ memory vector * arr?\"]\n", 256 | " answers = [\"sql server equivalent excels choose function\", \n", 257 | " \"free c++ memory vectorint arr\"]\n", 258 | " for ex, ans in zip(examples, answers):\n", 259 | " if text_prepare(ex) != ans:\n", 260 | " return \"Wrong answer for the case: '%s'\" % ex\n", 261 | " return 'Basic tests are passed.'" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": { 268 | "collapsed": true 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "print(test_text_prepare())" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "Run your implementation for questions from file *text_prepare_tests.tsv* to earn the points." 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "collapsed": true 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "prepared_questions = []\n", 291 | "for line in open('data/text_prepare_tests.tsv', encoding='utf-8'):\n", 292 | " line = text_prepare(line.strip())\n", 293 | " prepared_questions.append(line)\n", 294 | "text_prepare_results = '\\n'.join(prepared_questions)\n", 295 | "\n", 296 | "grader.submit_tag('TextPrepare', text_prepare_results)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "Now we can preprocess the titles using function *text_prepare* and making sure that the headers don't have bad symbols:" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "collapsed": true 311 | }, 312 | "outputs": [], 313 | "source": [ 314 | "X_train = [text_prepare(x) for x in X_train]\n", 315 | "X_val = [text_prepare(x) for x in X_val]\n", 316 | "X_test = [text_prepare(x) for x in X_test]" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "X_train[:3]" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "For each tag and for each word calculate how many times they occur in the train corpus. 
\n", 335 | "\n", 336 | "**Task 2 (WordsTagsCount).** Find 3 most popular tags and 3 most popular words in the train data and submit the results to earn the points." 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": { 343 | "collapsed": true 344 | }, 345 | "outputs": [], 346 | "source": [ 347 | "# Dictionary of all tags from train corpus with their counts.\n", 348 | "tags_counts = {}\n", 349 | "# Dictionary of all words from train corpus with their counts.\n", 350 | "words_counts = {}\n", 351 | "\n", 352 | "######################################\n", 353 | "######### YOUR CODE HERE #############\n", 354 | "######################################" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "We are assuming that *tags_counts* and *words_counts* are dictionaries like `{'some_word_or_tag': frequency}`. After applying the sorting procedure, results will be look like this: `[('most_popular_word_or_tag', frequency), ('less_popular_word_or_tag', frequency), ...]`. The grader gets the results in the following format (two comma-separated strings with line break):\n", 362 | "\n", 363 | " tag1,tag2,tag3\n", 364 | " word1,word2,word3\n", 365 | "\n", 366 | "Pay attention that in this assignment you should not submit frequencies or some additional information." 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "collapsed": true 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "most_common_tags = sorted(tags_counts.items(), key=lambda x: x[1], reverse=True)[:3]\n", 378 | "most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:3]\n", 379 | "\n", 380 | "grader.submit_tag('WordsTagsCount', '%s\\n%s' % (','.join(tag for tag, _ in most_common_tags), \n", 381 | " ','.join(word for word, _ in most_common_words)))" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "### Transforming text to a vector\n", 389 | "\n", 390 | "Machine Learning algorithms work with numeric data and we cannot use the provided text data \"as is\". There are many ways to transform text data to numeric vectors. In this task you will try to use two of them.\n", 391 | "\n", 392 | "#### Bag of words\n", 393 | "\n", 394 | "One of the well-known approaches is a *bag-of-words* representation. To create this transformation, follow the steps:\n", 395 | "1. Find *N* most popular words in train corpus and numerate them. Now we have a dictionary of the most popular words.\n", 396 | "2. For each title in the corpora create a zero vector with the dimension equals to *N*.\n", 397 | "3. For each text in the corpora iterate over words which are in the dictionary and increase by 1 the corresponding coordinate.\n", 398 | "\n", 399 | "Let's try to do it for a toy example. 
Imagine that we have *N* = 4 and the list of the most popular words is \n", 400 | "\n", 401 | " ['hi', 'you', 'me', 'are']\n", 402 | "\n", 403 | "Then we need to numerate them, for example, like this: \n", 404 | "\n", 405 | " {'hi': 0, 'you': 1, 'me': 2, 'are': 3}\n", 406 | "\n", 407 | "And we have the text, which we want to transform to the vector:\n", 408 | "\n", 409 | " 'hi how are you'\n", 410 | "\n", 411 | "For this text we create a corresponding zero vector \n", 412 | "\n", 413 | " [0, 0, 0, 0]\n", 414 | " \n", 415 | "And iterate over all words, and if the word is in the dictionary, we increase the value of the corresponding position in the vector:\n", 416 | "\n", 417 | " 'hi': [1, 0, 0, 0]\n", 418 | " 'how': [1, 0, 0, 0] # word 'how' is not in our dictionary\n", 419 | " 'are': [1, 0, 0, 1]\n", 420 | " 'you': [1, 1, 0, 1]\n", 421 | "\n", 422 | "The resulting vector will be \n", 423 | "\n", 424 | " [1, 1, 0, 1]\n", 425 | " \n", 426 | "Implement the described encoding in the function *my_bag_of_words* with the size of the dictionary equals to 5000. To find the most common words use train data. You can test your code using the function *test_my_bag_of_words*." 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": { 433 | "collapsed": true 434 | }, 435 | "outputs": [], 436 | "source": [ 437 | "DICT_SIZE = 5000\n", 438 | "WORDS_TO_INDEX = ####### YOUR CODE HERE #######\n", 439 | "INDEX_TO_WORDS = ####### YOUR CODE HERE #######\n", 440 | "ALL_WORDS = WORDS_TO_INDEX.keys()\n", 441 | "\n", 442 | "def my_bag_of_words(text, words_to_index, dict_size):\n", 443 | " \"\"\"\n", 444 | " text: a string\n", 445 | " dict_size: size of the dictionary\n", 446 | " \n", 447 | " return a vector which is a bag-of-words representation of 'text'\n", 448 | " \"\"\"\n", 449 | " result_vector = np.zeros(dict_size)\n", 450 | " ######################################\n", 451 | " ######### YOUR CODE HERE #############\n", 452 | " ######################################\n", 453 | " return result_vector" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": { 460 | "collapsed": true 461 | }, 462 | "outputs": [], 463 | "source": [ 464 | "def test_my_bag_of_words():\n", 465 | " words_to_index = {'hi': 0, 'you': 1, 'me': 2, 'are': 3}\n", 466 | " examples = ['hi how are you']\n", 467 | " answers = [[1, 1, 0, 1]]\n", 468 | " for ex, ans in zip(examples, answers):\n", 469 | " if (my_bag_of_words(ex, words_to_index, 4) != ans).any():\n", 470 | " return \"Wrong answer for the case: '%s'\" % ex\n", 471 | " return 'Basic tests are passed.'" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": { 478 | "collapsed": true 479 | }, 480 | "outputs": [], 481 | "source": [ 482 | "print(test_my_bag_of_words())" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "Now apply the implemented function to all samples (this might take up to a minute):" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": { 496 | "collapsed": true 497 | }, 498 | "outputs": [], 499 | "source": [ 500 | "from scipy import sparse as sp_sparse" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": { 507 | "collapsed": true 508 | }, 509 | "outputs": [], 510 | "source": [ 511 | "X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for 
text in X_train])\n", 512 | "X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])\n", 513 | "X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])\n", 514 | "print('X_train shape ', X_train_mybag.shape)\n", 515 | "print('X_val shape ', X_val_mybag.shape)\n", 516 | "print('X_test shape ', X_test_mybag.shape)" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "As you might notice, we transform the data to a sparse representation to store the useful information efficiently. There are many [types](https://docs.scipy.org/doc/scipy/reference/sparse.html) of such representations; however, sklearn algorithms can work only with the [csr](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix) matrix, so we will use this one." 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "**Task 3 (BagOfWords).** For the 11th row in *X_train_mybag* find how many non-zero elements it has. In this task the answer (variable *non_zero_elements_count*) should be an integer number, e.g. 20." 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": { 537 | "collapsed": true 538 | }, 539 | "outputs": [], 540 | "source": [ 541 | "row = X_train_mybag[10].toarray()[0]\n", 542 | "non_zero_elements_count = ####### YOUR CODE HERE #######\n", 543 | "\n", 544 | "grader.submit_tag('BagOfWords', str(non_zero_elements_count))" 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": {}, 550 | "source": [ 551 | "#### TF-IDF\n", 552 | "\n", 553 | "The second approach extends the bag-of-words framework by taking into account the total frequencies of words in the corpora. It helps to penalize too frequent words and provides a better feature space. \n", 554 | "\n", 555 | "Implement the function *tfidf_features* using the class [TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) from *scikit-learn*. Use the *train* corpus to train the vectorizer. Don't forget to take a look at the arguments that you can pass to it. We suggest that you filter out too rare words (occurring in fewer than 5 titles) and too frequent words (occurring in more than 90% of the titles). Also, use bigrams along with unigrams in your vocabulary. 
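A sketch of `tfidf_features` with the parameter choices recommended above; the `token_pattern` argument anticipates the adjustment discussed a few cells below, so that tokens such as c++ and c# are kept:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_val, X_test):
    """Returns TF-IDF features for train/val/test and the fitted vocabulary (sketch)."""
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9,      # rare/frequent word filters
                                       ngram_range=(1, 2),        # unigrams and bigrams
                                       token_pattern=r'(\S+)')    # keep tokens like 'c++', 'c#'
    X_train = tfidf_vectorizer.fit_transform(X_train)  # fit on the train corpus only
    X_val = tfidf_vectorizer.transform(X_val)
    X_test = tfidf_vectorizer.transform(X_test)
    return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_
```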
" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": { 562 | "collapsed": true 563 | }, 564 | "outputs": [], 565 | "source": [ 566 | "from sklearn.feature_extraction.text import TfidfVectorizer" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "metadata": { 573 | "collapsed": true 574 | }, 575 | "outputs": [], 576 | "source": [ 577 | "def tfidf_features(X_train, X_val, X_test):\n", 578 | " \"\"\"\n", 579 | " X_train, X_val, X_test — samples \n", 580 | " return TF-IDF vectorized representation of each sample and vocabulary\n", 581 | " \"\"\"\n", 582 | " # Create TF-IDF vectorizer with a proper parameters choice\n", 583 | " # Fit the vectorizer on the train set\n", 584 | " # Transform the train, test, and val sets and return the result\n", 585 | " \n", 586 | " \n", 587 | " tfidf_vectorizer = ####### YOUR CODE HERE #######\n", 588 | " \n", 589 | " ######################################\n", 590 | " ######### YOUR CODE HERE #############\n", 591 | " ######################################\n", 592 | " \n", 593 | " return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_" 594 | ] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": {}, 599 | "source": [ 600 | "Once you have done text preprocessing, always have a look at the results. Be very careful at this step, because the performance of future models will drastically depend on it. \n", 601 | "\n", 602 | "In this case, check whether you have c++ or c# in your vocabulary, as they are obviously important tokens in our tags prediction task:" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": { 609 | "collapsed": true 610 | }, 611 | "outputs": [], 612 | "source": [ 613 | "X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)\n", 614 | "tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": null, 620 | "metadata": { 621 | "collapsed": true 622 | }, 623 | "outputs": [], 624 | "source": [ 625 | "######### YOUR CODE HERE #############" 626 | ] 627 | }, 628 | { 629 | "cell_type": "markdown", 630 | "metadata": {}, 631 | "source": [ 632 | "If you can't find it, we need to understand how did it happen that we lost them? It happened during the built-in tokenization of TfidfVectorizer. Luckily, we can influence on this process. Get back to the function above and use '(\\S+)' regexp as a *token_pattern* in the constructor of the vectorizer. " 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": {}, 638 | "source": [ 639 | "Now, use this transormation for the data and check again." 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": { 646 | "collapsed": true 647 | }, 648 | "outputs": [], 649 | "source": [ 650 | "######### YOUR CODE HERE #############" 651 | ] 652 | }, 653 | { 654 | "cell_type": "markdown", 655 | "metadata": {}, 656 | "source": [ 657 | "### MultiLabel classifier\n", 658 | "\n", 659 | "As we have noticed before, in this task each example can have multiple tags. To deal with such kind of prediction, we need to transform labels in a binary form and the prediction will be a mask of 0s and 1s. For this purpose it is convenient to use [MultiLabelBinarizer](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html) from *sklearn*." 
660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "metadata": { 666 | "collapsed": true 667 | }, 668 | "outputs": [], 669 | "source": [ 670 | "from sklearn.preprocessing import MultiLabelBinarizer" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": null, 676 | "metadata": { 677 | "collapsed": true 678 | }, 679 | "outputs": [], 680 | "source": [ 681 | "mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))\n", 682 | "y_train = mlb.fit_transform(y_train)\n", 683 | "y_val = mlb.fit_transform(y_val)" 684 | ] 685 | }, 686 | { 687 | "cell_type": "markdown", 688 | "metadata": {}, 689 | "source": [ 690 | "Implement the function *train_classifier* for training a classifier. In this task we suggest to use One-vs-Rest approach, which is implemented in [OneVsRestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) class. In this approach *k* classifiers (= number of tags) are trained. As a basic classifier, use [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html). It is one of the simplest methods, but often it performs good enough in text classification tasks. It might take some time, because a number of classifiers to train is large." 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "metadata": { 697 | "collapsed": true 698 | }, 699 | "outputs": [], 700 | "source": [ 701 | "from sklearn.multiclass import OneVsRestClassifier\n", 702 | "from sklearn.linear_model import LogisticRegression, RidgeClassifier" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "metadata": { 709 | "collapsed": true 710 | }, 711 | "outputs": [], 712 | "source": [ 713 | "def train_classifier(X_train, y_train):\n", 714 | " \"\"\"\n", 715 | " X_train, y_train — training data\n", 716 | " \n", 717 | " return: trained classifier\n", 718 | " \"\"\"\n", 719 | " \n", 720 | " # Create and fit LogisticRegression wraped into OneVsRestClassifier.\n", 721 | "\n", 722 | " ######################################\n", 723 | " ######### YOUR CODE HERE #############\n", 724 | " ###################################### " 725 | ] 726 | }, 727 | { 728 | "cell_type": "markdown", 729 | "metadata": {}, 730 | "source": [ 731 | "Train the classifiers for different data transformations: *bag-of-words* and *tf-idf*.\n", 732 | "\n", 733 | "If you receive a convergence warning, please set parameter *max_iter* in LogisticRegression to a larger value (the default is 100)." 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": null, 739 | "metadata": { 740 | "collapsed": true 741 | }, 742 | "outputs": [], 743 | "source": [ 744 | "classifier_mybag = train_classifier(X_train_mybag, y_train)\n", 745 | "classifier_tfidf = train_classifier(X_train_tfidf, y_train)" 746 | ] 747 | }, 748 | { 749 | "cell_type": "markdown", 750 | "metadata": {}, 751 | "source": [ 752 | "Now you can create predictions for the data. You will need two types of predictions: labels and scores." 
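A minimal sketch of `train_classifier` as described above (One-vs-Rest over LogisticRegression); the regularization arguments are exposed so the experiments suggested later can reuse it, and `max_iter` is raised in line with the convergence note above:

```python
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

def train_classifier(X_train, y_train, penalty='l2', C=1.0):
    """Fits a One-vs-Rest LogisticRegression on multilabel targets (sketch)."""
    clf = OneVsRestClassifier(
        LogisticRegression(penalty=penalty, C=C, max_iter=1000, random_state=0))
    clf.fit(X_train, y_train)
    return clf
```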
753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": null, 758 | "metadata": { 759 | "collapsed": true 760 | }, 761 | "outputs": [], 762 | "source": [ 763 | "y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)\n", 764 | "y_val_predicted_scores_mybag = classifier_mybag.decision_function(X_val_mybag)\n", 765 | "\n", 766 | "y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)\n", 767 | "y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)" 768 | ] 769 | }, 770 | { 771 | "cell_type": "markdown", 772 | "metadata": {}, 773 | "source": [ 774 | "Now take a look at how classifier, which uses TF-IDF, works for a few examples:" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": null, 780 | "metadata": { 781 | "collapsed": true 782 | }, 783 | "outputs": [], 784 | "source": [ 785 | "y_val_pred_inversed = mlb.inverse_transform(y_val_predicted_labels_tfidf)\n", 786 | "y_val_inversed = mlb.inverse_transform(y_val)\n", 787 | "for i in range(3):\n", 788 | " print('Title:\\t{}\\nTrue labels:\\t{}\\nPredicted labels:\\t{}\\n\\n'.format(\n", 789 | " X_val[i],\n", 790 | " ','.join(y_val_inversed[i]),\n", 791 | " ','.join(y_val_pred_inversed[i])\n", 792 | " ))" 793 | ] 794 | }, 795 | { 796 | "cell_type": "markdown", 797 | "metadata": {}, 798 | "source": [ 799 | "Now, we would need to compare the results of different predictions, e.g. to see whether TF-IDF transformation helps or to try different regularization techniques in logistic regression. For all these experiments, we need to setup evaluation procedure. " 800 | ] 801 | }, 802 | { 803 | "cell_type": "markdown", 804 | "metadata": {}, 805 | "source": [ 806 | "### Evaluation\n", 807 | "\n", 808 | "To evaluate the results we will use several classification metrics:\n", 809 | " - [Accuracy](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)\n", 810 | " - [F1-score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)\n", 811 | " - [Area under ROC-curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html)\n", 812 | " - [Area under precision-recall curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html#sklearn.metrics.average_precision_score) \n", 813 | " \n", 814 | "Make sure you are familiar with all of them. How would you expect the things work for the multi-label scenario? Read about micro/macro/weighted averaging following the sklearn links provided above." 
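To make the averaging options concrete, here is a small sketch of the `average=` argument in *sklearn*; the same argument works for the other metrics listed above. Which metrics you report in *print_evaluation_scores* is up to you, this only illustrates the API.

```python
from sklearn.metrics import accuracy_score, f1_score

def print_scores_sketch(y_val, predicted):
    # Subset accuracy for the multi-label case.
    print('Accuracy:', accuracy_score(y_val, predicted))
    for average in ['macro', 'micro', 'weighted']:
        print('F1 ({}): {:.4f}'.format(
            average, f1_score(y_val, predicted, average=average)))
```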
815 | ] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": null, 820 | "metadata": { 821 | "collapsed": true 822 | }, 823 | "outputs": [], 824 | "source": [ 825 | "from sklearn.metrics import accuracy_score\n", 826 | "from sklearn.metrics import f1_score\n", 827 | "from sklearn.metrics import roc_auc_score \n", 828 | "from sklearn.metrics import average_precision_score\n", 829 | "from sklearn.metrics import recall_score" 830 | ] 831 | }, 832 | { 833 | "cell_type": "markdown", 834 | "metadata": {}, 835 | "source": [ 836 | "Implement the function *print_evaluation_scores* which calculates and prints to stdout:\n", 837 | " - *accuracy*\n", 838 | " - *F1-score macro/micro/weighted*\n", 839 | " - *Precision macro/micro/weighted*" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": null, 845 | "metadata": { 846 | "collapsed": true 847 | }, 848 | "outputs": [], 849 | "source": [ 850 | "def print_evaluation_scores(y_val, predicted):\n", 851 | " \n", 852 | " ######################################\n", 853 | " ######### YOUR CODE HERE #############\n", 854 | " ######################################" 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "metadata": { 861 | "collapsed": true 862 | }, 863 | "outputs": [], 864 | "source": [ 865 | "print('Bag-of-words')\n", 866 | "print_evaluation_scores(y_val, y_val_predicted_labels_mybag)\n", 867 | "print('Tfidf')\n", 868 | "print_evaluation_scores(y_val, y_val_predicted_labels_tfidf)" 869 | ] 870 | }, 871 | { 872 | "cell_type": "markdown", 873 | "metadata": {}, 874 | "source": [ 875 | "You might also want to plot some generalization of the [ROC curve](http://scikit-learn.org/stable/modules/model_evaluation.html#receiver-operating-characteristic-roc) for the case of multi-label classification. Provided function *roc_auc* can make it for you. The input parameters of this function are:\n", 876 | " - true labels\n", 877 | " - decision functions scores\n", 878 | " - number of classes" 879 | ] 880 | }, 881 | { 882 | "cell_type": "code", 883 | "execution_count": null, 884 | "metadata": { 885 | "collapsed": true 886 | }, 887 | "outputs": [], 888 | "source": [ 889 | "from metrics import roc_auc\n", 890 | "%matplotlib inline" 891 | ] 892 | }, 893 | { 894 | "cell_type": "code", 895 | "execution_count": null, 896 | "metadata": { 897 | "collapsed": true 898 | }, 899 | "outputs": [], 900 | "source": [ 901 | "n_classes = len(tags_counts)\n", 902 | "roc_auc(y_val, y_val_predicted_scores_mybag, n_classes)" 903 | ] 904 | }, 905 | { 906 | "cell_type": "code", 907 | "execution_count": null, 908 | "metadata": { 909 | "collapsed": true 910 | }, 911 | "outputs": [], 912 | "source": [ 913 | "n_classes = len(tags_counts)\n", 914 | "roc_auc(y_val, y_val_predicted_scores_tfidf, n_classes)" 915 | ] 916 | }, 917 | { 918 | "cell_type": "markdown", 919 | "metadata": {}, 920 | "source": [ 921 | "**Task 4 (MultilabelClassification).** Once we have the evaluation set up, we suggest that you experiment a bit with training your classifiers. We will use *F1-score weighted* as an evaluation metric. Our recommendation:\n", 922 | "- compare the quality of the bag-of-words and TF-IDF approaches and chose one of them.\n", 923 | "- for the chosen one, try *L1* and *L2*-regularization techniques in Logistic Regression with different coefficients (e.g. C equal to 0.1, 1, 10, 100).\n", 924 | "\n", 925 | "You also could try other improvements of the preprocessing / model, if you want. 
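One possible experiment loop for the regularization comparison is sketched below. It assumes the notebook variables `X_train_tfidf`, `X_val_tfidf`, `y_train`, and `y_val` from the previous cells; `liblinear` is chosen only because recent *sklearn* defaults to a solver that handles L2 but not L1.

```python
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

for penalty in ['l1', 'l2']:
    for C in [0.1, 1, 10, 100]:
        clf = OneVsRestClassifier(
            LogisticRegression(C=C, penalty=penalty, solver='liblinear'))
        clf.fit(X_train_tfidf, y_train)
        y_val_pred = clf.predict(X_val_tfidf)
        print('penalty={}, C={}: weighted F1 = {:.4f}'.format(
            penalty, C, f1_score(y_val, y_val_pred, average='weighted')))
```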
" 926 | ] 927 | }, 928 | { 929 | "cell_type": "code", 930 | "execution_count": null, 931 | "metadata": { 932 | "collapsed": true 933 | }, 934 | "outputs": [], 935 | "source": [ 936 | "######################################\n", 937 | "######### YOUR CODE HERE #############\n", 938 | "######################################" 939 | ] 940 | }, 941 | { 942 | "cell_type": "markdown", 943 | "metadata": {}, 944 | "source": [ 945 | "When you are happy with the quality, create predictions for *test* set, which you will submit to Coursera." 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": null, 951 | "metadata": { 952 | "collapsed": true 953 | }, 954 | "outputs": [], 955 | "source": [ 956 | "test_predictions = ######### YOUR CODE HERE #############\n", 957 | "test_pred_inversed = mlb.inverse_transform(test_predictions)\n", 958 | "\n", 959 | "test_predictions_for_submission = '\\n'.join('%i\\t%s' % (i, ','.join(row)) for i, row in enumerate(test_pred_inversed))\n", 960 | "grader.submit_tag('MultilabelClassification', test_predictions_for_submission)" 961 | ] 962 | }, 963 | { 964 | "cell_type": "markdown", 965 | "metadata": {}, 966 | "source": [ 967 | "### Analysis of the most important features" 968 | ] 969 | }, 970 | { 971 | "cell_type": "markdown", 972 | "metadata": {}, 973 | "source": [ 974 | "Finally, it is usually a good idea to look at the features (words or n-grams) that are used with the largest weigths in your logistic regression model." 975 | ] 976 | }, 977 | { 978 | "cell_type": "markdown", 979 | "metadata": {}, 980 | "source": [ 981 | "Implement the function *print_words_for_tag* to find them. Get back to sklearn documentation on [OneVsRestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) and [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) if needed." 982 | ] 983 | }, 984 | { 985 | "cell_type": "code", 986 | "execution_count": null, 987 | "metadata": { 988 | "collapsed": true 989 | }, 990 | "outputs": [], 991 | "source": [ 992 | "def print_words_for_tag(classifier, tag, tags_classes, index_to_words, all_words):\n", 993 | " \"\"\"\n", 994 | " classifier: trained classifier\n", 995 | " tag: particular tag\n", 996 | " tags_classes: a list of classes names from MultiLabelBinarizer\n", 997 | " index_to_words: index_to_words transformation\n", 998 | " all_words: all words in the dictionary\n", 999 | " \n", 1000 | " return nothing, just print top 5 positive and top 5 negative words for current tag\n", 1001 | " \"\"\"\n", 1002 | " print('Tag:\\t{}'.format(tag))\n", 1003 | " \n", 1004 | " # Extract an estimator from the classifier for the given tag.\n", 1005 | " # Extract feature coefficients from the estimator. 
\n", 1006 | " \n", 1007 | " ######################################\n", 1008 | " ######### YOUR CODE HERE #############\n", 1009 | " ######################################\n", 1010 | " \n", 1011 | " top_positive_words = # top-5 words sorted by the coefficiens.\n", 1012 | " top_negative_words = # bottom-5 words sorted by the coefficients.\n", 1013 | " print('Top positive words:\\t{}'.format(', '.join(top_positive_words)))\n", 1014 | " print('Top negative words:\\t{}\\n'.format(', '.join(top_negative_words)))" 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "code", 1019 | "execution_count": null, 1020 | "metadata": { 1021 | "collapsed": true 1022 | }, 1023 | "outputs": [], 1024 | "source": [ 1025 | "print_words_for_tag(classifier_tfidf, 'c', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)\n", 1026 | "print_words_for_tag(classifier_tfidf, 'c++', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)\n", 1027 | "print_words_for_tag(classifier_tfidf, 'linux', mlb.classes, tfidf_reversed_vocab, ALL_WORDS)" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "markdown", 1032 | "metadata": {}, 1033 | "source": [ 1034 | "### Authorization & Submission\n", 1035 | "To submit the assignmnent to Cousera platform, please, enter your e-mail and token into variables below. You can generate the token on this programming assignment page. Note: The token expires 30 minutes after generation.\n", 1036 | "\n", 1037 | "Please, submit your solutions for the assignments **only** to Coursera platform, **do not create a Pull request on Github**." 1038 | ] 1039 | }, 1040 | { 1041 | "cell_type": "code", 1042 | "execution_count": null, 1043 | "metadata": { 1044 | "collapsed": true 1045 | }, 1046 | "outputs": [], 1047 | "source": [ 1048 | "grader.status()" 1049 | ] 1050 | }, 1051 | { 1052 | "cell_type": "code", 1053 | "execution_count": null, 1054 | "metadata": { 1055 | "collapsed": true 1056 | }, 1057 | "outputs": [], 1058 | "source": [ 1059 | "STUDENT_EMAIL = # EMAIL \n", 1060 | "STUDENT_TOKEN = # TOKEN \n", 1061 | "grader.status()" 1062 | ] 1063 | }, 1064 | { 1065 | "cell_type": "markdown", 1066 | "metadata": {}, 1067 | "source": [ 1068 | "If you want to submit these answers, run cell below" 1069 | ] 1070 | }, 1071 | { 1072 | "cell_type": "code", 1073 | "execution_count": null, 1074 | "metadata": { 1075 | "collapsed": true 1076 | }, 1077 | "outputs": [], 1078 | "source": [ 1079 | "grader.submit(STUDENT_EMAIL, STUDENT_TOKEN)" 1080 | ] 1081 | } 1082 | ], 1083 | "metadata": { 1084 | "kernelspec": { 1085 | "display_name": "Python 3", 1086 | "language": "python", 1087 | "name": "python3" 1088 | }, 1089 | "language_info": { 1090 | "codemirror_mode": { 1091 | "name": "ipython", 1092 | "version": 3 1093 | }, 1094 | "file_extension": ".py", 1095 | "mimetype": "text/x-python", 1096 | "name": "python", 1097 | "nbconvert_exporter": "python", 1098 | "pygments_lexer": "ipython3", 1099 | "version": "3.4.3" 1100 | } 1101 | }, 1102 | "nbformat": 4, 1103 | "nbformat_minor": 2 1104 | } 1105 | -------------------------------------------------------------------------------- /week2/evaluation.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | def _update_chunk(candidate, prev, current_tag, current_chunk, current_pos, prediction=False): 4 | if candidate == 'B-' + current_tag: 5 | if len(current_chunk) > 0 and len(current_chunk[-1]) == 1: 6 | current_chunk[-1].append(current_pos - 1) 7 | current_chunk.append([current_pos]) 8 | elif candidate == 'I-' + current_tag: 9 | if 
prediction and (current_pos == 0 or current_pos > 0 and prev.split('-', 1)[-1] != current_tag): 10 | current_chunk.append([current_pos]) 11 | if not prediction and (current_pos == 0 or current_pos > 0 and prev == 'O'): 12 | current_chunk.append([current_pos]) 13 | elif current_pos > 0 and prev.split('-', 1)[-1] == current_tag: 14 | if len(current_chunk) > 0: 15 | current_chunk[-1].append(current_pos - 1) 16 | 17 | def _update_last_chunk(current_chunk, current_pos): 18 | if len(current_chunk) > 0 and len(current_chunk[-1]) == 1: 19 | current_chunk[-1].append(current_pos - 1) 20 | 21 | def _tag_precision_recall_f1(tp, fp, fn): 22 | precision, recall, f1 = 0, 0, 0 23 | if tp + fp > 0: 24 | precision = tp / (tp + fp) * 100 25 | if tp + fn > 0: 26 | recall = tp / (tp + fn) * 100 27 | if precision + recall > 0: 28 | f1 = 2 * precision * recall / (precision + recall) 29 | return precision, recall, f1 30 | 31 | def _aggregate_metrics(results, total_correct): 32 | total_true_entities = 0 33 | total_predicted_entities = 0 34 | total_precision = 0 35 | total_recall = 0 36 | total_f1 = 0 37 | for tag, tag_metrics in results.items(): 38 | n_pred = tag_metrics['n_predicted_entities'] 39 | n_true = tag_metrics['n_true_entities'] 40 | total_true_entities += n_true 41 | total_predicted_entities += n_pred 42 | total_precision += tag_metrics['precision'] * n_pred 43 | total_recall += tag_metrics['recall'] * n_true 44 | 45 | accuracy = 0 46 | if total_true_entities > 0: 47 | accuracy = total_correct / total_true_entities * 100 48 | else: 49 | print('CAUTION! Accuracy equals zero because there are no '\ 50 | 'correct entities. Check the correctness of your data.') 51 | if total_predicted_entities > 0: 52 | total_precision = total_precision / total_predicted_entities 53 | total_recall = total_recall / total_true_entities 54 | if total_precision + total_recall > 0: 55 | total_f1 = 2 * total_precision * total_recall / (total_precision + total_recall) 56 | return total_true_entities, total_predicted_entities, \ 57 | total_precision, total_recall, total_f1, accuracy 58 | 59 | def _print_info(n_tokens, total_true_entities, total_predicted_entities, total_correct): 60 | print('processed {len} tokens ' \ 61 | 'with {tot_true} phrases; ' \ 62 | 'found: {tot_pred} phrases; ' \ 63 | 'correct: {tot_cor}.\n'.format(len=n_tokens, 64 | tot_true=total_true_entities, 65 | tot_pred=total_predicted_entities, 66 | tot_cor=total_correct)) 67 | 68 | def _print_metrics(accuracy, total_precision, total_recall, total_f1): 69 | print('precision: {tot_prec:.2f}%; ' \ 70 | 'recall: {tot_recall:.2f}%; ' \ 71 | 'F1: {tot_f1:.2f}\n'.format(acc=accuracy, 72 | tot_prec=total_precision, 73 | tot_recall=total_recall, 74 | tot_f1=total_f1)) 75 | 76 | def _print_tag_metrics(tag, tag_results): 77 | print(('\t%12s' % tag) + ': precision: {tot_prec:6.2f}%; ' \ 78 | 'recall: {tot_recall:6.2f}%; ' \ 79 | 'F1: {tot_f1:6.2f}; ' \ 80 | 'predicted: {tot_predicted:4d}\n'.format(tot_prec=tag_results['precision'], 81 | tot_recall=tag_results['recall'], 82 | tot_f1=tag_results['f1'], 83 | tot_predicted=tag_results['n_predicted_entities'])) 84 | 85 | def precision_recall_f1(y_true, y_pred, print_results=True, short_report=False): 86 | # Find all tags 87 | tags = sorted(set(tag[2:] for tag in y_true + y_pred if tag != 'O')) 88 | 89 | results = OrderedDict((tag, OrderedDict()) for tag in tags) 90 | n_tokens = len(y_true) 91 | total_correct = 0 92 | 93 | # For eval_conll_try we find all chunks in the ground truth and prediction 94 | # For each chunk we store 
starting and ending indices 95 | for tag in tags: 96 | true_chunk = list() 97 | predicted_chunk = list() 98 | for position in range(n_tokens): 99 | _update_chunk(y_true[position], y_true[position - 1], tag, true_chunk, position) 100 | _update_chunk(y_pred[position], y_pred[position - 1], tag, predicted_chunk, position, True) 101 | 102 | _update_last_chunk(true_chunk, position) 103 | _update_last_chunk(predicted_chunk, position) 104 | 105 | # Then we find all correctly classified intervals 106 | # True positive results 107 | tp = sum(chunk in predicted_chunk for chunk in true_chunk) 108 | total_correct += tp 109 | 110 | # And then just calculate errors of the first and second kind 111 | # False negative 112 | fn = len(true_chunk) - tp 113 | # False positive 114 | fp = len(predicted_chunk) - tp 115 | precision, recall, f1 = _tag_precision_recall_f1(tp, fp, fn) 116 | 117 | results[tag]['precision'] = precision 118 | results[tag]['recall'] = recall 119 | results[tag]['f1'] = f1 120 | results[tag]['n_predicted_entities'] = len(predicted_chunk) 121 | results[tag]['n_true_entities'] = len(true_chunk) 122 | 123 | total_true_entities, total_predicted_entities, \ 124 | total_precision, total_recall, total_f1, accuracy = _aggregate_metrics(results, total_correct) 125 | 126 | if print_results: 127 | _print_info(n_tokens, total_true_entities, total_predicted_entities, total_correct) 128 | _print_metrics(accuracy, total_precision, total_recall, total_f1) 129 | 130 | if not short_report: 131 | for tag, tag_results in results.items(): 132 | _print_tag_metrics(tag, tag_results) 133 | return results 134 | -------------------------------------------------------------------------------- /week2/week2-NER.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Recognize named entities on Twitter with LSTMs\n", 10 | "\n", 11 | "In this assignment, you will use a recurrent neural network to solve Named Entity Recognition (NER) problem. NER is a common task in natural language processing systems. It serves for extraction such entities from the text as persons, organizations, locations, etc. In this task you will experiment to recognize named entities from Twitter.\n", 12 | "\n", 13 | "For example, we want to extract persons' and organizations' names from the text. Than for the input text:\n", 14 | "\n", 15 | " Ian Goodfellow works for Google Brain\n", 16 | "\n", 17 | "a NER model needs to provide the following sequence of tags:\n", 18 | "\n", 19 | " B-PER I-PER O O B-ORG I-ORG\n", 20 | "\n", 21 | "Where *B-* and *I-* prefixes stand for the beginning and inside of the entity, while *O* stands for out of tag or no tag. Markup with the prefix scheme is called *BIO markup*. This markup is introduced for distinguishing of consequent entities with similar types.\n", 22 | "\n", 23 | "A solution of the task will be based on neural networks, particularly, on Bi-Directional Long Short-Term Memory Networks (Bi-LSTMs).\n", 24 | "\n", 25 | "### Libraries\n", 26 | "\n", 27 | "For this task you will need the following libraries:\n", 28 | " - [Tensorflow](https://www.tensorflow.org) — an open-source software library for Machine Intelligence.\n", 29 | " \n", 30 | "In this assignment, we use Tensorflow 1.15.0. 
You can install it with pip:\n", 31 | "\n", 32 | " !pip install tensorflow==1.15.0\n", 33 | " \n", 34 | " - [Numpy](http://www.numpy.org) — a package for scientific computing.\n", 35 | " \n", 36 | "If you have never worked with Tensorflow, you would probably need to read some tutorials during your work on this assignment, e.g. [this one](https://www.tensorflow.org/tutorials/recurrent) could be a good starting point. " 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Data\n", 44 | "\n", 45 | "The following cell will download all data required for this assignment into the folder `week2/data`." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "try:\n", 57 | " import google.colab\n", 58 | " IN_COLAB = True\n", 59 | "except:\n", 60 | " IN_COLAB = False\n", 61 | "\n", 62 | "if IN_COLAB:\n", 63 | " ! wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py -O setup_google_colab.py\n", 64 | " import setup_google_colab\n", 65 | " setup_google_colab.setup_week2()\n", 66 | "\n", 67 | "import sys\n", 68 | "sys.path.append(\"..\")\n", 69 | "from common.download_utils import download_week2_resources\n", 70 | "\n", 71 | "download_week2_resources()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### Load the Twitter Named Entity Recognition corpus\n", 79 | "\n", 80 | "We will work with a corpus, which contains tweets with NE tags. Every line of a file contains a pair of a token (word/punctuation symbol) and a tag, separated by a whitespace. Different tweets are separated by an empty line.\n", 81 | "\n", 82 | "The function *read_data* reads a corpus from the *file_path* and returns two lists: one with tokens and one with the corresponding tags. You need to complete this function by adding a code, which will replace a user's nickname to `` token and any URL to `` token. You could think that a URL and a nickname are just strings which start with *http://* or *https://* in case of URLs and a *@* symbol for nicknames." 
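One straightforward way to do the replacement described above is a small helper like the sketch below. The special token strings are written here as `<URL>` and `<USR>`; these names are assumptions, so use whatever tokens the assignment expects.

```python
def normalize_token(token):
    """Map URLs and user mentions to special tokens (token names are assumptions)."""
    if token.lower().startswith(('http://', 'https://')):
        return '<URL>'
    if token.startswith('@'):
        return '<USR>'
    return token

assert normalize_token('@mary') == '<USR>'
assert normalize_token('https://t.co/xyz') == '<URL>'
assert normalize_token('hello') == 'hello'
```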
83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "def read_data(file_path):\n", 94 | " tokens = []\n", 95 | " tags = []\n", 96 | " \n", 97 | " tweet_tokens = []\n", 98 | " tweet_tags = []\n", 99 | " for line in open(file_path, encoding='utf-8'):\n", 100 | " line = line.strip()\n", 101 | " if not line:\n", 102 | " if tweet_tokens:\n", 103 | " tokens.append(tweet_tokens)\n", 104 | " tags.append(tweet_tags)\n", 105 | " tweet_tokens = []\n", 106 | " tweet_tags = []\n", 107 | " else:\n", 108 | " token, tag = line.split()\n", 109 | " # Replace all urls with token\n", 110 | " # Replace all users with token\n", 111 | "\n", 112 | " ######################################\n", 113 | " ######### YOUR CODE HERE #############\n", 114 | " ######################################\n", 115 | " \n", 116 | " tweet_tokens.append(token)\n", 117 | " tweet_tags.append(tag)\n", 118 | " \n", 119 | " return tokens, tags" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "And now we can load three separate parts of the dataset:\n", 127 | " - *train* data for training the model;\n", 128 | " - *validation* data for evaluation and hyperparameters tuning;\n", 129 | " - *test* data for final evaluation of the model." 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "train_tokens, train_tags = read_data('data/train.txt')\n", 141 | "validation_tokens, validation_tags = read_data('data/validation.txt')\n", 142 | "test_tokens, test_tags = read_data('data/test.txt')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "You should always understand what kind of data you deal with. For this purpose, you can print the data running the following cell:" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "for i in range(3):\n", 161 | " for token, tag in zip(train_tokens[i], train_tags[i]):\n", 162 | " print('%s\\t%s' % (token, tag))\n", 163 | " print()" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "### Prepare dictionaries\n", 171 | "\n", 172 | "To train a neural network, we will use two mappings: \n", 173 | "- {token}$\\to${token id}: address the row in embeddings matrix for the current token;\n", 174 | "- {tag}$\\to${tag id}: one-hot ground truth probability distribution vectors for computing the loss at the output of the network.\n", 175 | "\n", 176 | "Now you need to implement the function *build_dict* which will return {token or tag}$\\to${index} and vice versa. 
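A possible implementation of such a mapping is sketched below; it is only a reference for the structure, with special tokens added first so that the first one gets index 0.

```python
from collections import defaultdict

def build_dict_sketch(tokens_or_tags, special_tokens):
    tok2idx = defaultdict(lambda: 0)   # unseen items fall back to index 0
    idx2tok = []
    for token in special_tokens:
        tok2idx[token] = len(idx2tok)
        idx2tok.append(token)
    for sequence in tokens_or_tags:
        for token in sequence:
            if token not in tok2idx:
                tok2idx[token] = len(idx2tok)
                idx2tok.append(token)
    return tok2idx, idx2tok
```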
" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": { 183 | "collapsed": true 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "from collections import defaultdict" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "collapsed": true 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "def build_dict(tokens_or_tags, special_tokens):\n", 199 | " \"\"\"\n", 200 | " tokens_or_tags: a list of lists of tokens or tags\n", 201 | " special_tokens: some special tokens\n", 202 | " \"\"\"\n", 203 | " # Create a dictionary with default value 0\n", 204 | " tok2idx = defaultdict(lambda: 0)\n", 205 | " idx2tok = []\n", 206 | " \n", 207 | " # Create mappings from tokens (or tags) to indices and vice versa.\n", 208 | " # At first, add special tokens (or tags) to the dictionaries.\n", 209 | " # The first special token must have index 0.\n", 210 | " \n", 211 | " # Mapping tok2idx should contain each token or tag only once. \n", 212 | " # To do so, you should:\n", 213 | " # 1. extract unique tokens/tags from the tokens_or_tags variable, which is not\n", 214 | " # occur in special_tokens (because they could have non-empty intersection)\n", 215 | " # 2. index them (for example, you can add them into the list idx2tok\n", 216 | " # 3. for each token/tag save the index into tok2idx).\n", 217 | " \n", 218 | " ######################################\n", 219 | " ######### YOUR CODE HERE #############\n", 220 | " ######################################\n", 221 | " \n", 222 | " return tok2idx, idx2tok" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "After implementing the function *build_dict* you can make dictionaries for tokens and tags. Special tokens in our case will be:\n", 230 | " - `` token for out of vocabulary tokens;\n", 231 | " - `` token for padding sentence to the same length when we create batches of sentences." 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "special_tokens = ['', '']\n", 243 | "special_tags = ['O']\n", 244 | "\n", 245 | "# Create dictionaries \n", 246 | "token2idx, idx2token = build_dict(train_tokens + validation_tokens, special_tokens)\n", 247 | "tag2idx, idx2tag = build_dict(train_tags, special_tags)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "The next additional functions will help you to create the mapping between tokens and ids for a sentence. " 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "collapsed": true 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "def words2idxs(tokens_list):\n", 266 | " return [token2idx[word] for word in tokens_list]\n", 267 | "\n", 268 | "def tags2idxs(tags_list):\n", 269 | " return [tag2idx[tag] for tag in tags_list]\n", 270 | "\n", 271 | "def idxs2words(idxs):\n", 272 | " return [idx2token[idx] for idx in idxs]\n", 273 | "\n", 274 | "def idxs2tags(idxs):\n", 275 | " return [idx2tag[idx] for idx in idxs]" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "### Generate batches\n", 283 | "\n", 284 | "Neural Networks are usually trained with batches. It means that weight updates of the network are based on several sequences at every single time. 
The tricky part is that all sequences within a batch need to have the same length. So we will pad them with a special `` token. It is also a good practice to provide RNN with sequence lengths, so it can skip computations for padding parts. We provide the batching function *batches_generator* readily available for you to save time. " 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": { 291 | "collapsed": true 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "def batches_generator(batch_size, tokens, tags,\n", 296 | " shuffle=True, allow_smaller_last_batch=True):\n", 297 | " \"\"\"Generates padded batches of tokens and tags.\"\"\"\n", 298 | " \n", 299 | " n_samples = len(tokens)\n", 300 | " if shuffle:\n", 301 | " order = np.random.permutation(n_samples)\n", 302 | " else:\n", 303 | " order = np.arange(n_samples)\n", 304 | "\n", 305 | " n_batches = n_samples // batch_size\n", 306 | " if allow_smaller_last_batch and n_samples % batch_size:\n", 307 | " n_batches += 1\n", 308 | "\n", 309 | " for k in range(n_batches):\n", 310 | " batch_start = k * batch_size\n", 311 | " batch_end = min((k + 1) * batch_size, n_samples)\n", 312 | " current_batch_size = batch_end - batch_start\n", 313 | " x_list = []\n", 314 | " y_list = []\n", 315 | " max_len_token = 0\n", 316 | " for idx in order[batch_start: batch_end]:\n", 317 | " x_list.append(words2idxs(tokens[idx]))\n", 318 | " y_list.append(tags2idxs(tags[idx]))\n", 319 | " max_len_token = max(max_len_token, len(tags[idx]))\n", 320 | " \n", 321 | " # Fill in the data into numpy nd-arrays filled with padding indices.\n", 322 | " x = np.ones([current_batch_size, max_len_token], dtype=np.int32) * token2idx['']\n", 323 | " y = np.ones([current_batch_size, max_len_token], dtype=np.int32) * tag2idx['O']\n", 324 | " lengths = np.zeros(current_batch_size, dtype=np.int32)\n", 325 | " for n in range(current_batch_size):\n", 326 | " utt_len = len(x_list[n])\n", 327 | " x[n, :utt_len] = x_list[n]\n", 328 | " lengths[n] = utt_len\n", 329 | " y[n, :utt_len] = y_list[n]\n", 330 | " yield x, y, lengths" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "## Build a recurrent neural network\n", 338 | "\n", 339 | "This is the most important part of the assignment. Here we will specify the network architecture based on TensorFlow building blocks. It's fun and easy as a lego constructor! We will create an LSTM network which will produce probability distribution over tags for each token in a sentence. To take into account both right and left contexts of the token, we will use Bi-Directional LSTM (Bi-LSTM). Dense layer will be used on top to perform tag classification. " 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "import tensorflow as tf\n", 351 | "import numpy as np" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": { 358 | "collapsed": true 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "class BiLSTMModel():\n", 363 | " pass" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": { 369 | "collapsed": true 370 | }, 371 | "source": [ 372 | "First, we need to create [placeholders](https://www.tensorflow.org/api_docs/python/tf/compat/v1/placeholder) to specify what data we are going to feed into the network during the execution time. 
For this task we will need the following placeholders:\n", 373 | " - *input_batch* — sequences of words (the shape equals to [batch_size, sequence_len]);\n", 374 | " - *ground_truth_tags* — sequences of tags (the shape equals to [batch_size, sequence_len]);\n", 375 | " - *lengths* — lengths of not padded sequences (the shape equals to [batch_size]);\n", 376 | " - *dropout_ph* — dropout keep probability; this placeholder has a predefined value 1;\n", 377 | " - *learning_rate_ph* — learning rate; we need this placeholder because we want to change the value during training.\n", 378 | "\n", 379 | "It could be noticed that we use *None* in the shapes in the declaration, which means that data of any size can be feeded. \n", 380 | "\n", 381 | "You need to complete the function *declare_placeholders*." 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": { 388 | "collapsed": true 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "def declare_placeholders(self):\n", 393 | " \"\"\"Specifies placeholders for the model.\"\"\"\n", 394 | "\n", 395 | " # Placeholders for input and ground truth output.\n", 396 | " self.input_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch') \n", 397 | " self.ground_truth_tags = ######### YOUR CODE HERE #############\n", 398 | " \n", 399 | " # Placeholder for lengths of the sequences.\n", 400 | " self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths') \n", 401 | " \n", 402 | " # Placeholder for a dropout keep probability. If we don't feed\n", 403 | " # a value for this placeholder, it will be equal to 1.0.\n", 404 | " self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])\n", 405 | " \n", 406 | " # Placeholder for a learning rate (tf.float32).\n", 407 | " self.learning_rate_ph = ######### YOUR CODE HERE #############" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": { 414 | "collapsed": true 415 | }, 416 | "outputs": [], 417 | "source": [ 418 | "BiLSTMModel.__declare_placeholders = classmethod(declare_placeholders)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": { 424 | "collapsed": true 425 | }, 426 | "source": [ 427 | "Now, let us specify the layers of the neural network. First, we need to perform some preparatory steps: \n", 428 | " \n", 429 | "- Create embeddings matrix with [tf.Variable](https://www.tensorflow.org/api_docs/python/tf/Variable). Specify its name (*embeddings_matrix*), type (*tf.float32*), and initialize with random values.\n", 430 | "- Create forward and backward LSTM cells. TensorFlow provides a number of RNN cells ready for you. We suggest that you use *LSTMCell*, but you can also experiment with other types, e.g. GRU cells. [This](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) blogpost could be interesting if you want to learn more about the differences.\n", 431 | "- Wrap your cells with [DropoutWrapper](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/DropoutWrapper). Dropout is an important regularization technique for neural networks. 
Specify all keep probabilities using the dropout placeholder that we created before.\n", 432 | " \n", 433 | "After that, you can build the computation graph that transforms an input_batch:\n", 434 | "\n", 435 | "- [Look up](https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup) embeddings for an *input_batch* in the prepared *embedding_matrix*.\n", 436 | "- Pass the embeddings through [Bidirectional Dynamic RNN](https://www.tensorflow.org/api_docs/python/tf/nn/bidirectional_dynamic_rnn) with the specified forward and backward cells. Use the lengths placeholder here to avoid computations for padding tokens inside the RNN.\n", 437 | "- Create a dense layer on top. Its output will be used directly in loss function. \n", 438 | " \n", 439 | "Fill in the code below. In case you need to debug something, the easiest way is to check that tensor shapes of each step match the expected ones. \n", 440 | " " 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": { 447 | "collapsed": true 448 | }, 449 | "outputs": [], 450 | "source": [ 451 | "def build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):\n", 452 | " \"\"\"Specifies bi-LSTM architecture and computes logits for inputs.\"\"\"\n", 453 | " \n", 454 | " # Create embedding variable (tf.Variable) with dtype tf.float32\n", 455 | " initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)\n", 456 | " embedding_matrix_variable = ######### YOUR CODE HERE #############\n", 457 | " \n", 458 | " # Create RNN cells (for example, tf.nn.rnn_cell.BasicLSTMCell) with n_hidden_rnn number of units \n", 459 | " # and dropout (tf.nn.rnn_cell.DropoutWrapper), initializing all *_keep_prob with dropout placeholder.\n", 460 | " forward_cell = ######### YOUR CODE HERE #############\n", 461 | " backward_cell = ######### YOUR CODE HERE #############\n", 462 | "\n", 463 | " # Look up embeddings for self.input_batch (tf.nn.embedding_lookup).\n", 464 | " # Shape: [batch_size, sequence_len, embedding_dim].\n", 465 | " embeddings = ######### YOUR CODE HERE #############\n", 466 | " \n", 467 | " # Pass them through Bidirectional Dynamic RNN (tf.nn.bidirectional_dynamic_rnn).\n", 468 | " # Shape: [batch_size, sequence_len, 2 * n_hidden_rnn]. \n", 469 | " # Also don't forget to initialize sequence_length as self.lengths and dtype as tf.float32.\n", 470 | " (rnn_output_fw, rnn_output_bw), _ = ######### YOUR CODE HERE #############\n", 471 | " rnn_output = tf.concat([rnn_output_fw, rnn_output_bw], axis=2)\n", 472 | "\n", 473 | " # Dense layer on top.\n", 474 | " # Shape: [batch_size, sequence_len, n_tags]. \n", 475 | " self.logits = tf.layers.dense(rnn_output, n_tags, activation=None)" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "metadata": { 482 | "collapsed": true 483 | }, 484 | "outputs": [], 485 | "source": [ 486 | "BiLSTMModel.__build_layers = classmethod(build_layers)" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "To compute the actual predictions of the neural network, you need to apply [softmax](https://www.tensorflow.org/api_docs/python/tf/nn/softmax) to the last layer and find the most probable tags with [argmax](https://www.tensorflow.org/api_docs/python/tf/argmax)." 
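To see why *axis=-1* matters, here is a tiny self-contained example with toy logits for one sentence of two tokens and three tags (TF 1.x API, as used in this notebook):

```python
import tensorflow as tf

logits = tf.constant([[[2.0, 0.5, 0.1],
                       [0.2, 3.0, 0.3]]], dtype=tf.float32)
probs = tf.nn.softmax(logits)        # shape: [batch, seq_len, n_tags]
preds = tf.argmax(probs, axis=-1)    # most probable tag id for each token

with tf.Session() as sess:
    print(sess.run(preds))           # prints [[0 1]]
```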
494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": { 500 | "collapsed": true 501 | }, 502 | "outputs": [], 503 | "source": [ 504 | "def compute_predictions(self):\n", 505 | " \"\"\"Transforms logits to probabilities and finds the most probable tags.\"\"\"\n", 506 | " \n", 507 | " # Create softmax (tf.nn.softmax) function\n", 508 | " softmax_output = ######### YOUR CODE HERE #############\n", 509 | " \n", 510 | " # Use argmax (tf.argmax) to get the most probable tags\n", 511 | " # Don't forget to set axis=-1\n", 512 | " # otherwise argmax will be calculated in a wrong way\n", 513 | " self.predictions = ######### YOUR CODE HERE #############" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": { 520 | "collapsed": true 521 | }, 522 | "outputs": [], 523 | "source": [ 524 | "BiLSTMModel.__compute_predictions = classmethod(compute_predictions)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": { 530 | "collapsed": true 531 | }, 532 | "source": [ 533 | "During training we do not need predictions of the network, but we need a loss function. We will use [cross-entropy loss](http://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html#cross-entropy), efficiently implemented in TF as \n", 534 | "[cross entropy with logits](https://www.tensorflow.org/api_docs/python/tf/nn/softmax_cross_entropy_with_logits_v2). Note that it should be applied to logits of the model (not to softmax probabilities!). Also note, that we do not want to take into account loss terms coming from `` tokens. So we need to mask them out, before computing [mean](https://www.tensorflow.org/api_docs/python/tf/reduce_mean)." 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": { 541 | "collapsed": true 542 | }, 543 | "outputs": [], 544 | "source": [ 545 | "def compute_loss(self, n_tags, PAD_index):\n", 546 | " \"\"\"Computes masked cross-entopy loss with logits.\"\"\"\n", 547 | " \n", 548 | " # Create cross entropy function function (tf.nn.softmax_cross_entropy_with_logits_v2)\n", 549 | " ground_truth_tags_one_hot = tf.one_hot(self.ground_truth_tags, n_tags)\n", 550 | " loss_tensor = ######### YOUR CODE HERE #############\n", 551 | " \n", 552 | " mask = tf.cast(tf.not_equal(self.input_batch, PAD_index), tf.float32)\n", 553 | " # Create loss function which doesn't operate with tokens (tf.reduce_mean)\n", 554 | " # Be careful that the argument of tf.reduce_mean should be\n", 555 | " # multiplication of mask and loss_tensor.\n", 556 | " self.loss = ######### YOUR CODE HERE #############" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": { 563 | "collapsed": true 564 | }, 565 | "outputs": [], 566 | "source": [ 567 | "BiLSTMModel.__compute_loss = classmethod(compute_loss)" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": {}, 573 | "source": [ 574 | "The last thing to specify is how we want to optimize the loss. \n", 575 | "We suggest that you use [Adam](https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer) optimizer with a learning rate from the corresponding placeholder. \n", 576 | "You will also need to apply clipping to eliminate exploding gradients. It can be easily done with [clip_by_norm](https://www.tensorflow.org/api_docs/python/tf/clip_by_norm) function. 
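For orientation, the two blanks in *perform_optimization* below could be filled roughly as follows. This is a fragment meant to live inside that method (it relies on `self`, `clip_norm`, and the `compute_gradients` call already present in the skeleton), not standalone code.

```python
# Adam optimizer driven by the learning-rate placeholder:
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)

# After self.grads_and_vars = self.optimizer.compute_gradients(self.loss),
# clip only the gradient part of each (gradient, variable) pair:
self.grads_and_vars = [(tf.clip_by_norm(grad, clip_norm), var)
                       for grad, var in self.grads_and_vars]
```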
" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": { 583 | "collapsed": true 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "def perform_optimization(self):\n", 588 | " \"\"\"Specifies the optimizer and train_op for the model.\"\"\"\n", 589 | " \n", 590 | " # Create an optimizer (tf.train.AdamOptimizer)\n", 591 | " self.optimizer = ######### YOUR CODE HERE #############\n", 592 | " self.grads_and_vars = self.optimizer.compute_gradients(self.loss)\n", 593 | " \n", 594 | " # Gradient clipping (tf.clip_by_norm) for self.grads_and_vars\n", 595 | " # Pay attention that you need to apply this operation only for gradients \n", 596 | " # because self.grads_and_vars also contains variables.\n", 597 | " # list comprehension might be useful in this case.\n", 598 | " clip_norm = tf.cast(1.0, tf.float32)\n", 599 | " self.grads_and_vars = ######### YOUR CODE HERE #############\n", 600 | " \n", 601 | " self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "metadata": { 608 | "collapsed": true 609 | }, 610 | "outputs": [], 611 | "source": [ 612 | "BiLSTMModel.__perform_optimization = classmethod(perform_optimization)" 613 | ] 614 | }, 615 | { 616 | "cell_type": "markdown", 617 | "metadata": { 618 | "collapsed": true 619 | }, 620 | "source": [ 621 | "Congratulations! You have specified all the parts of your network. You may have noticed, that we didn't deal with any real data yet, so what you have written is just recipes on how the network should function.\n", 622 | "Now we will put them to the constructor of our Bi-LSTM class to use it in the next section. " 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": { 629 | "collapsed": true 630 | }, 631 | "outputs": [], 632 | "source": [ 633 | "def init_model(self, vocabulary_size, n_tags, embedding_dim, n_hidden_rnn, PAD_index):\n", 634 | " self.__declare_placeholders()\n", 635 | " self.__build_layers(vocabulary_size, embedding_dim, n_hidden_rnn, n_tags)\n", 636 | " self.__compute_predictions()\n", 637 | " self.__compute_loss(n_tags, PAD_index)\n", 638 | " self.__perform_optimization()" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": { 645 | "collapsed": true 646 | }, 647 | "outputs": [], 648 | "source": [ 649 | "BiLSTMModel.__init__ = classmethod(init_model)" 650 | ] 651 | }, 652 | { 653 | "cell_type": "markdown", 654 | "metadata": {}, 655 | "source": [ 656 | "## Train the network and predict tags" 657 | ] 658 | }, 659 | { 660 | "cell_type": "markdown", 661 | "metadata": { 662 | "collapsed": true 663 | }, 664 | "source": [ 665 | "[Session.run](https://www.tensorflow.org/api_docs/python/tf/Session#run) is a point which initiates computations in the graph that we have defined. To train the network, we need to compute *self.train_op*, which was declared in *perform_optimization*. To predict tags, we just need to compute *self.predictions*. Anyway, we need to feed actual data through the placeholders that we defined before. 
" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": null, 671 | "metadata": { 672 | "collapsed": true 673 | }, 674 | "outputs": [], 675 | "source": [ 676 | "def train_on_batch(self, session, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability):\n", 677 | " feed_dict = {self.input_batch: x_batch,\n", 678 | " self.ground_truth_tags: y_batch,\n", 679 | " self.learning_rate_ph: learning_rate,\n", 680 | " self.dropout_ph: dropout_keep_probability,\n", 681 | " self.lengths: lengths}\n", 682 | " \n", 683 | " session.run(self.train_op, feed_dict=feed_dict)" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": { 690 | "collapsed": true 691 | }, 692 | "outputs": [], 693 | "source": [ 694 | "BiLSTMModel.train_on_batch = classmethod(train_on_batch)" 695 | ] 696 | }, 697 | { 698 | "cell_type": "markdown", 699 | "metadata": {}, 700 | "source": [ 701 | "Implement the function *predict_for_batch* by initializing *feed_dict* with input *x_batch* and *lengths* and running the *session* for *self.predictions*." 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": { 708 | "collapsed": true 709 | }, 710 | "outputs": [], 711 | "source": [ 712 | "def predict_for_batch(self, session, x_batch, lengths):\n", 713 | " ######################################\n", 714 | " ######### YOUR CODE HERE #############\n", 715 | " ######################################\n", 716 | " \n", 717 | " return predictions" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": null, 723 | "metadata": { 724 | "collapsed": true 725 | }, 726 | "outputs": [], 727 | "source": [ 728 | "BiLSTMModel.predict_for_batch = classmethod(predict_for_batch)" 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "metadata": {}, 734 | "source": [ 735 | "We finished with necessary methods of our BiLSTMModel model and almost ready to start experimenting.\n", 736 | "\n", 737 | "### Evaluation \n", 738 | "To simplify the evaluation process we provide two functions for you:\n", 739 | " - *predict_tags*: uses a model to get predictions and transforms indices to tokens and tags;\n", 740 | " - *eval_conll*: calculates precision, recall and F1 for the results." 
741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": null, 746 | "metadata": { 747 | "collapsed": true 748 | }, 749 | "outputs": [], 750 | "source": [ 751 | "from evaluation import precision_recall_f1" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "metadata": { 758 | "collapsed": true 759 | }, 760 | "outputs": [], 761 | "source": [ 762 | "def predict_tags(model, session, token_idxs_batch, lengths):\n", 763 | " \"\"\"Performs predictions and transforms indices to tokens and tags.\"\"\"\n", 764 | " \n", 765 | " tag_idxs_batch = model.predict_for_batch(session, token_idxs_batch, lengths)\n", 766 | " \n", 767 | " tags_batch, tokens_batch = [], []\n", 768 | " for tag_idxs, token_idxs in zip(tag_idxs_batch, token_idxs_batch):\n", 769 | " tags, tokens = [], []\n", 770 | " for tag_idx, token_idx in zip(tag_idxs, token_idxs):\n", 771 | " tags.append(idx2tag[tag_idx])\n", 772 | " tokens.append(idx2token[token_idx])\n", 773 | " tags_batch.append(tags)\n", 774 | " tokens_batch.append(tokens)\n", 775 | " return tags_batch, tokens_batch\n", 776 | " \n", 777 | " \n", 778 | "def eval_conll(model, session, tokens, tags, short_report=True):\n", 779 | " \"\"\"Computes NER quality measures using CONLL shared task script.\"\"\"\n", 780 | " \n", 781 | " y_true, y_pred = [], []\n", 782 | " for x_batch, y_batch, lengths in batches_generator(1, tokens, tags):\n", 783 | " tags_batch, tokens_batch = predict_tags(model, session, x_batch, lengths)\n", 784 | " if len(x_batch[0]) != len(tags_batch[0]):\n", 785 | " raise Exception(\"Incorrect length of prediction for the input, \"\n", 786 | " \"expected length: %i, got: %i\" % (len(x_batch[0]), len(tags_batch[0])))\n", 787 | " predicted_tags = []\n", 788 | " ground_truth_tags = []\n", 789 | " for gt_tag_idx, pred_tag, token in zip(y_batch[0], tags_batch[0], tokens_batch[0]): \n", 790 | " if token != '':\n", 791 | " ground_truth_tags.append(idx2tag[gt_tag_idx])\n", 792 | " predicted_tags.append(pred_tag)\n", 793 | "\n", 794 | " # We extend every prediction and ground truth sequence with 'O' tag\n", 795 | " # to indicate a possible end of entity.\n", 796 | " y_true.extend(ground_truth_tags + ['O'])\n", 797 | " y_pred.extend(predicted_tags + ['O'])\n", 798 | " \n", 799 | " results = precision_recall_f1(y_true, y_pred, print_results=True, short_report=short_report)\n", 800 | " return results" 801 | ] 802 | }, 803 | { 804 | "cell_type": "markdown", 805 | "metadata": {}, 806 | "source": [ 807 | "## Run your experiment" 808 | ] 809 | }, 810 | { 811 | "cell_type": "markdown", 812 | "metadata": {}, 813 | "source": [ 814 | "Create *BiLSTMModel* model with the following parameters:\n", 815 | " - *vocabulary_size* — number of tokens;\n", 816 | " - *n_tags* — number of tags;\n", 817 | " - *embedding_dim* — dimension of embeddings, recommended value: 200;\n", 818 | " - *n_hidden_rnn* — size of hidden layers for RNN, recommended value: 200;\n", 819 | " - *PAD_index* — an index of the padding token (``).\n", 820 | "\n", 821 | "Set hyperparameters. You might want to start with the following recommended values:\n", 822 | "- *batch_size*: 32;\n", 823 | "- 4 epochs;\n", 824 | "- starting value of *learning_rate*: 0.005\n", 825 | "- *learning_rate_decay*: a square root of 2;\n", 826 | "- *dropout_keep_probability*: try several values: 0.1, 0.5, 0.9.\n", 827 | "\n", 828 | "However, feel free to conduct more experiments to tune hyperparameters and earn extra points for the assignment." 
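One possible instantiation with the recommended values is sketched below. It assumes the `token2idx` and `tag2idx` dictionaries built earlier; the padding token is written here as `'<PAD>'`, so substitute whatever name you actually put into `special_tokens`. The dropout keep probability of 0.5 is just one of the suggested values to try.

```python
import numpy as np

model = BiLSTMModel(vocabulary_size=len(token2idx),
                    n_tags=len(tag2idx),
                    embedding_dim=200,
                    n_hidden_rnn=200,
                    PAD_index=token2idx['<PAD>'])

batch_size = 32
n_epochs = 4
learning_rate = 0.005
learning_rate_decay = np.sqrt(2)
dropout_keep_probability = 0.5
```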
829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": null, 834 | "metadata": { 835 | "collapsed": true 836 | }, 837 | "outputs": [], 838 | "source": [ 839 | "tf.reset_default_graph()\n", 840 | "\n", 841 | "model = ######### YOUR CODE HERE #############\n", 842 | "\n", 843 | "batch_size = ######### YOUR CODE HERE #############\n", 844 | "n_epochs = ######### YOUR CODE HERE #############\n", 845 | "learning_rate = ######### YOUR CODE HERE #############\n", 846 | "learning_rate_decay = ######### YOUR CODE HERE #############\n", 847 | "dropout_keep_probability = ######### YOUR CODE HERE #############" 848 | ] 849 | }, 850 | { 851 | "cell_type": "markdown", 852 | "metadata": {}, 853 | "source": [ 854 | "If you got an error *\"Tensor conversion requested dtype float64 for Tensor with dtype float32\"* in this point, check if there are variables without dtype initialised. Set the value of dtype equals to *tf.float32* for such variables." 855 | ] 856 | }, 857 | { 858 | "cell_type": "markdown", 859 | "metadata": {}, 860 | "source": [ 861 | "Finally, we are ready to run the training!" 862 | ] 863 | }, 864 | { 865 | "cell_type": "code", 866 | "execution_count": null, 867 | "metadata": { 868 | "collapsed": true 869 | }, 870 | "outputs": [], 871 | "source": [ 872 | "sess = tf.Session()\n", 873 | "sess.run(tf.global_variables_initializer())\n", 874 | "\n", 875 | "print('Start training... \\n')\n", 876 | "for epoch in range(n_epochs):\n", 877 | " # For each epoch evaluate the model on train and validation data\n", 878 | " print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(n_epochs) + '-' * 20)\n", 879 | " print('Train data evaluation:')\n", 880 | " eval_conll(model, sess, train_tokens, train_tags, short_report=True)\n", 881 | " print('Validation data evaluation:')\n", 882 | " eval_conll(model, sess, validation_tokens, validation_tags, short_report=True)\n", 883 | " \n", 884 | " # Train the model\n", 885 | " for x_batch, y_batch, lengths in batches_generator(batch_size, train_tokens, train_tags):\n", 886 | " model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability)\n", 887 | " \n", 888 | " # Decaying the learning rate\n", 889 | " learning_rate = learning_rate / learning_rate_decay\n", 890 | " \n", 891 | "print('...training finished.')" 892 | ] 893 | }, 894 | { 895 | "cell_type": "markdown", 896 | "metadata": {}, 897 | "source": [ 898 | "Now let us see full quality reports for the final model on train, validation, and test sets. 
To give you a hint whether you have implemented everything correctly, you might expect F-score about 40% on the validation set.\n", 899 | "\n", 900 | "**The output of the cell below (as well as the output of all the other cells) should be present in the notebook for peer2peer review!**" 901 | ] 902 | }, 903 | { 904 | "cell_type": "code", 905 | "execution_count": null, 906 | "metadata": { 907 | "collapsed": true 908 | }, 909 | "outputs": [], 910 | "source": [ 911 | "print('-' * 20 + ' Train set quality: ' + '-' * 20)\n", 912 | "train_results = eval_conll(model, sess, train_tokens, train_tags, short_report=False)\n", 913 | "\n", 914 | "print('-' * 20 + ' Validation set quality: ' + '-' * 20)\n", 915 | "validation_results = ######### YOUR CODE HERE #############\n", 916 | "\n", 917 | "print('-' * 20 + ' Test set quality: ' + '-' * 20)\n", 918 | "test_results = ######### YOUR CODE HERE #############" 919 | ] 920 | }, 921 | { 922 | "cell_type": "markdown", 923 | "metadata": {}, 924 | "source": [ 925 | "### Conclusions\n", 926 | "\n", 927 | "Could we say that our model is state of the art and the results are acceptable for the task? Definately, we can say so. Nowadays, Bi-LSTM is one of the state of the art approaches for solving NER problem and it outperforms other classical methods. Despite the fact that we used small training corpora (in comparison with usual sizes of corpora in Deep Learning), our results are quite good. In addition, in this task there are many possible named entities and for some of them we have only several dozens of trainig examples, which is definately small. However, the implemented model outperforms classical CRFs for this task. Even better results could be obtained by some combinations of several types of methods, e.g. see [this](https://arxiv.org/abs/1603.01354) paper if you are interested." 
928 | ] 929 | } 930 | ], 931 | "metadata": { 932 | "kernelspec": { 933 | "display_name": "Python 3", 934 | "language": "python", 935 | "name": "python3" 936 | }, 937 | "language_info": { 938 | "codemirror_mode": { 939 | "name": "ipython", 940 | "version": 3 941 | }, 942 | "file_extension": ".py", 943 | "mimetype": "text/x-python", 944 | "name": "python", 945 | "nbconvert_exporter": "python", 946 | "pygments_lexer": "ipython3", 947 | "version": "3.4.3" 948 | } 949 | }, 950 | "nbformat": 4, 951 | "nbformat_minor": 1 952 | } 953 | -------------------------------------------------------------------------------- /week3/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | class Grader(object): 7 | def __init__(self): 8 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 9 | self.assignment_key = '7DdYfMQFEeevjw7-W7Fr0A' 10 | self.parts = OrderedDict([('98mDT', 'Question2Vec'), 11 | ('nc7RP', 'HitsCount'), 12 | ('bNp90', 'DCGScore'), 13 | ('3gRlQ', 'W2VTokenizedRanks'), 14 | ('mX6wS', 'StarSpaceRanks')]) 15 | self.answers = {key: None for key in self.parts} 16 | 17 | @staticmethod 18 | def ravel_output(output): 19 | ''' 20 | If student accidentally submitted np.array with one 21 | element instead of number, this function will submit 22 | this number instead 23 | ''' 24 | if isinstance(output, np.ndarray) and output.size == 1: 25 | output = output.item(0) 26 | return output 27 | 28 | def submit(self, email, token): 29 | submission = { 30 | "assignmentKey": self.assignment_key, 31 | "submitterEmail": email, 32 | "secret": token, 33 | "parts": {} 34 | } 35 | for part, output in self.answers.items(): 36 | if output is not None: 37 | submission["parts"][part] = {"output": output} 38 | else: 39 | submission["parts"][part] = dict() 40 | request = requests.post(self.submission_page, data=json.dumps(submission)) 41 | response = request.json() 42 | if request.status_code == 201: 43 | print('Submitted to Coursera platform. 
See results on assignment page!') 44 | elif u'details' in response and u'learnerMessage' in response[u'details']: 45 | print(response[u'details'][u'learnerMessage']) 46 | else: 47 | print("Unknown response from Coursera: {}".format(request.status_code)) 48 | print(response) 49 | 50 | def status(self): 51 | print("You want to submit these parts:") 52 | for part_id, part_name in self.parts.items(): 53 | answer = self.answers[part_id] 54 | if answer is None: 55 | answer = '-'*10 56 | print("Task {}: {}".format(part_name, answer[:100] + '...')) 57 | 58 | def submit_part(self, part, output): 59 | self.answers[part] = output 60 | print("Current answer for task {} is: {}".format(self.parts[part], output[:100] + '...')) 61 | 62 | def submit_tag(self, tag, output): 63 | part_id = [k for k, v in self.parts.items() if v == tag] 64 | if len(part_id) != 1: 65 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 66 | part_id = part_id[0] 67 | self.submit_part(part_id, str(self.ravel_output(output))) 68 | -------------------------------------------------------------------------------- /week3/util.py: -------------------------------------------------------------------------------- 1 | import re 2 | from nltk.corpus import stopwords 3 | 4 | REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]') 5 | GOOD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]') 6 | STOPWORDS = set(stopwords.words('english')) 7 | def text_prepare(text): 8 | text = text.lower() 9 | text = REPLACE_BY_SPACE_RE.sub(' ', text) 10 | text = GOOD_SYMBOLS_RE.sub('', text) 11 | text = ' '.join([x for x in text.split() if x and x not in STOPWORDS]) 12 | return text.strip() 13 | 14 | def array_to_string(arr): 15 | return '\n'.join(str(num) for num in arr) 16 | 17 | def matrix_to_string(matrix): 18 | return '\n'.join('\t'.join(str(num) for num in line) for line in matrix) -------------------------------------------------------------------------------- /week4/encoder-decoder-pic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hse-aml/natural-language-processing/5b06a1ac8918af5720117b1ebdc8c55de13bae59/week4/encoder-decoder-pic.png -------------------------------------------------------------------------------- /week5/dialogue_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sklearn.metrics.pairwise import pairwise_distances_argmin 3 | 4 | from chatterbot import ChatBot 5 | from chatterbot.trainers import ChatterBotCorpusTrainer 6 | from utils import * 7 | 8 | 9 | class ThreadRanker(object): 10 | def __init__(self, paths): 11 | self.word_embeddings, self.embeddings_dim = load_embeddings(paths['WORD_EMBEDDINGS']) 12 | self.thread_embeddings_folder = paths['THREAD_EMBEDDINGS_FOLDER'] 13 | 14 | def __load_embeddings_by_tag(self, tag_name): 15 | embeddings_path = os.path.join(self.thread_embeddings_folder, tag_name + ".pkl") 16 | thread_ids, thread_embeddings = unpickle_file(embeddings_path) 17 | return thread_ids, thread_embeddings 18 | 19 | def get_best_thread(self, question, tag_name): 20 | """ Returns id of the most similar thread for the question. 21 | The search is performed across the threads with a given tag. 22 | """ 23 | thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name) 24 | 25 | # HINT: you have already implemented a similar routine in the 3rd assignment. 
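        # A possible sketch, not the official solution: with question_to_vec from
        # utils and pairwise_distances_argmin imported above, the two gaps below
        # could be filled roughly like this (the cosine metric is an assumption,
        # any reasonable distance works):
        #   question_vec = question_to_vec(question, self.word_embeddings, self.embeddings_dim).reshape(1, -1)
        #   best_thread = pairwise_distances_argmin(question_vec, thread_embeddings, metric='cosine')[0]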
26 | 27 | question_vec = #### YOUR CODE HERE #### 28 | best_thread = #### YOUR CODE HERE #### 29 | 30 | return thread_ids[best_thread] 31 | 32 | 33 | class DialogueManager(object): 34 | def __init__(self, paths): 35 | print("Loading resources...") 36 | 37 | # Intent recognition: 38 | self.intent_recognizer = unpickle_file(paths['INTENT_RECOGNIZER']) 39 | self.tfidf_vectorizer = unpickle_file(paths['TFIDF_VECTORIZER']) 40 | 41 | self.ANSWER_TEMPLATE = 'I think its about %s\nThis thread might help you: https://stackoverflow.com/questions/%s' 42 | 43 | # Goal-oriented part: 44 | self.tag_classifier = unpickle_file(paths['TAG_CLASSIFIER']) 45 | self.thread_ranker = ThreadRanker(paths) 46 | self.__init_chitchat_bot() 47 | 48 | def __init_chitchat_bot(self): 49 | """Initializes self.chitchat_bot with some conversational model.""" 50 | 51 | # Hint: you might want to create and train chatterbot.ChatBot here. 52 | # Create an instance of the ChatBot class. 53 | # Set a trainer set_trainer(ChatterBotCorpusTrainer) for the ChatBot. 54 | # Train the ChatBot with "chatterbot.corpus.english" param. 55 | # Note that we use chatterbot==0.7.6 in this project. 56 | # You are welcome to experiment with other versions but they might have slightly different API. 57 | 58 | ######################## 59 | #### YOUR CODE HERE #### 60 | ######################## 61 | 62 | # remove this when you're done 63 | raise NotImplementedError( 64 | "Open dialogue_manager.py and fill with your code. In case of Google Colab, download" 65 | "(https://github.com/hse-aml/natural-language-processing/blob/master/project/dialogue_manager.py), " 66 | "edit locally and upload using '> arrow on the left edge' -> Files -> UPLOAD") 67 | 68 | def generate_answer(self, question): 69 | """Combines stackoverflow and chitchat parts using intent recognition.""" 70 | 71 | # Recognize intent of the question using `intent_recognizer`. 72 | # Don't forget to prepare question and calculate features for the question. 73 | 74 | prepared_question = #### YOUR CODE HERE #### 75 | features = #### YOUR CODE HERE #### 76 | intent = #### YOUR CODE HERE #### 77 | 78 | # Chit-chat part: 79 | if intent == 'dialogue': 80 | # Pass question to chitchat_bot to generate a response. 81 | response = #### YOUR CODE HERE #### 82 | return response 83 | 84 | # Goal-oriented part: 85 | else: 86 | # Pass features to tag_classifier to get predictions. 87 | tag = #### YOUR CODE HERE #### 88 | 89 | # Pass prepared_question to thread_ranker to get predictions. 90 | thread_id = #### YOUR CODE HERE #### 91 | 92 | return self.ANSWER_TEMPLATE % (tag, thread_id) 93 | -------------------------------------------------------------------------------- /week5/utils.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import pickle 3 | import re 4 | import numpy as np 5 | 6 | nltk.download('stopwords') 7 | from nltk.corpus import stopwords 8 | 9 | # Paths for all resources for the bot. 
10 | RESOURCE_PATH = { 11 | 'INTENT_RECOGNIZER': 'intent_recognizer.pkl', 12 | 'TAG_CLASSIFIER': 'tag_classifier.pkl', 13 | 'TFIDF_VECTORIZER': 'tfidf_vectorizer.pkl', 14 | 'THREAD_EMBEDDINGS_FOLDER': 'thread_embeddings_by_tags', 15 | 'WORD_EMBEDDINGS': 'data/word_embeddings.tsv', 16 | } 17 | 18 | 19 | def text_prepare(text): 20 | """Performs tokenization and simple preprocessing.""" 21 | replace_by_space_re = re.compile('[/(){}\[\]\|@,;]') 22 | good_symbols_re = re.compile('[^0-9a-z #+_]') 23 | stopwords_set = set(stopwords.words('english')) 24 | 25 | text = text.lower() 26 | text = replace_by_space_re.sub(' ', text) 27 | text = good_symbols_re.sub('', text) 28 | text = ' '.join([x for x in text.split() if x and x not in stopwords_set]) 29 | 30 | return text.strip() 31 | 32 | 33 | def load_embeddings(embeddings_path): 34 | """Loads pre-trained word embeddings from tsv file. 35 | Args: 36 | embeddings_path - path to the embeddings file. 37 | Returns: 38 | embeddings - dict mapping words to vectors; 39 | embeddings_dim - dimension of the vectors. 40 | """ 41 | 42 | # Hint: you have already implemented a similar routine in the 3rd assignment. 43 | # Note that here you also need to know the dimension of the loaded embeddings. 44 | # When you load the embeddings, use numpy.float32 type as dtype 45 | 46 | ######################## 47 | #### YOUR CODE HERE #### 48 | ######################## 49 | 50 | # remove this when you're done 51 | raise NotImplementedError( 52 | "Open utils.py and fill with your code. In case of Google Colab, download" 53 | "(https://github.com/hse-aml/natural-language-processing/blob/master/project/utils.py), " 54 | "edit locally and upload using '> arrow on the left edge' -> Files -> UPLOAD") 55 | 56 | 57 | def question_to_vec(question, embeddings, dim): 58 | """Transforms a string to an embedding by averaging word embeddings.""" 59 | 60 | # Hint: you have already implemented exactly this function in the 3rd assignment. 61 | 62 | ######################## 63 | #### YOUR CODE HERE #### 64 | ######################## 65 | 66 | # remove this when you're done 67 | raise NotImplementedError( 68 | "Open utils.py and fill with your code. In case of Google Colab, download" 69 | "(https://github.com/hse-aml/natural-language-processing/blob/master/project/utils.py), " 70 | "edit locally and upload using '> arrow on the left edge' -> Files -> UPLOAD") 71 | 72 | 73 | def unpickle_file(filename): 74 | """Returns the result of unpickling the file content.""" 75 | with open(filename, 'rb') as f: 76 | return pickle.load(f) 77 | -------------------------------------------------------------------------------- /week5/week5-project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Final project: StackOverflow assistant bot\n", 8 | "\n", 9 | "Congratulations on coming this far and solving the programming assignments! 
In this final project, we will combine everything we have learned about Natural Language Processing to construct a *dialogue chat bot*, which will be able to:\n", 10 | "* answer programming-related questions (using a StackOverflow dataset);\n", 11 | "* chit-chat and simulate dialogue on all non-programming-related questions.\n", 12 | "\n", 13 | "For the chit-chat mode we will use a pre-trained neural network engine available from [ChatterBot](https://github.com/gunthercox/ChatterBot).\n", 14 | "Those who aim at an honor certificate for our course, or are just curious, will train their own models for chit-chat.\n", 15 | "![](https://imgs.xkcd.com/comics/twitter_bot.png)\n", 16 | "©[xkcd](https://xkcd.com)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Data description\n", 24 | "\n", 25 | "To detect the *intent* of users' questions we will need two text collections:\n", 26 | "- `tagged_posts.tsv` — StackOverflow posts, tagged with one programming language (*positive samples*).\n", 27 | "- `dialogues.tsv` — dialogue phrases from movie subtitles (*negative samples*).\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "try:\n", 37 | " import google.colab\n", 38 | " IN_COLAB = True\n", 39 | "except:\n", 40 | " IN_COLAB = False\n", 41 | "\n", 42 | "if IN_COLAB:\n", 43 | " ! wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py -O setup_google_colab.py\n", 44 | " import setup_google_colab\n", 45 | " setup_google_colab.setup_project()\n", 46 | "\n", 47 | "import sys\n", 48 | "sys.path.append(\"..\")\n", 49 | "from common.download_utils import download_project_resources\n", 50 | "\n", 51 | "download_project_resources()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "For those questions that have a programming-related intent, we will proceed as follows: predict the programming language (only one tag per question is allowed here) and rank candidates within that tag using embeddings.\n", 59 | "For the ranking part, you will need:\n", 60 | "- `word_embeddings.tsv` — word embeddings that you trained with StarSpace in the 3rd assignment. It's not a problem if you didn't do it, because we can offer you an alternative solution." 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "As a result of this notebook, you should obtain the following new objects that you will then use in the running bot:\n", 68 | "\n", 69 | "- `intent_recognizer.pkl` — intent recognition model;\n", 70 | "- `tag_classifier.pkl` — programming language classification model;\n", 71 | "- `tfidf_vectorizer.pkl` — vectorizer used during training;\n", 72 | "- `thread_embeddings_by_tags` — folder with thread embeddings, arranged by tags.\n", 73 | " " 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "Some functions will be reused by this notebook and the scripts, so we put them into the *utils.py* file. Don't forget to open it and fill in the gaps!" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "from utils import *" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## Part I. 
Intent and language recognition" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "We want to write a bot which will not only **answer programming-related questions**, but will also be able to **maintain a dialogue**. We would also like to detect the *intent* of the user from the question (we could have had a 'Question answering mode' check-box in the bot, but it wouldn't be fun at all, would it?). So the first thing we need to do is to **distinguish programming-related questions from general ones**.\n", 104 | "\n", 105 | "It would also be good to predict which programming language a particular question refers to. By doing so, we will speed up the question search by a factor of the number of languages (10 here), and exercise our *text classification* skills a bit. :)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "import numpy as np\n", 115 | "import pandas as pd\n", 116 | "import pickle\n", 117 | "import re\n", 118 | "\n", 119 | "from sklearn.feature_extraction.text import TfidfVectorizer" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "### Data preparation" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "In the first assignment (Predict tags on StackOverflow with linear models), you have already learnt how to preprocess texts and do TF-IDF transformations. Reuse your code here. In addition, you will also need to [dump](https://docs.python.org/3/library/pickle.html#pickle.dump) the TF-IDF vectorizer with pickle to use it later in the running bot." 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "def tfidf_features(X_train, X_test, vectorizer_path):\n", 143 | " \"\"\"Performs TF-IDF transformation and dumps the model.\"\"\"\n", 144 | " \n", 145 | " # Train a vectorizer on X_train data.\n", 146 | " # Transform X_train and X_test data.\n", 147 | " \n", 148 | " # Pickle the trained vectorizer to 'vectorizer_path'.\n", 149 | " # Don't forget to open the file in binary write mode.\n", 150 | " \n", 151 | " ######################################\n", 152 | " ######### YOUR CODE HERE #############\n", 153 | " ######################################\n", 154 | " \n", 155 | " return X_train, X_test" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "Now, load examples of the two classes. Use a subsample of the StackOverflow data to balance the classes. You will need the full data later."
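One possible way to complete the `tfidf_features` stub above is sketched below. This is only an illustrative sketch, not the required solution; the `TfidfVectorizer` hyperparameters shown here are assumptions that you may well want to tune differently.

```python
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_test, vectorizer_path):
    """Performs TF-IDF transformation and dumps the model."""
    # Example hyperparameters; adjust them to your own experiments.
    vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2), token_pattern=r'(\S+)')
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    # Pickle the trained vectorizer; note the binary write mode.
    with open(vectorizer_path, 'wb') as f:
        pickle.dump(vectorizer, f)
    return X_train, X_test
```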
163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "sample_size = 200000\n", 172 | "\n", 173 | "dialogue_df = pd.read_csv('data/dialogues.tsv', sep='\\t').sample(sample_size, random_state=0)\n", 174 | "stackoverflow_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t').sample(sample_size, random_state=0)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "Check how the data look like:" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "dialogue_df.head()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "stackoverflow_df.head()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "Apply *text_prepare* function to preprocess the data.\n", 207 | "\n", 208 | "If you filled in the file, but NotImplementedError is still displayed, please refer to [this thread](https://github.com/hse-aml/natural-language-processing/issues/27)." 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "from utils import text_prepare" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "dialogue_df['text'] = ######### YOUR CODE HERE #############\n", 227 | "stackoverflow_df['title'] = ######### YOUR CODE HERE #############" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "### Intent recognition" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "We will do a binary classification on TF-IDF representations of texts. Labels will be either `dialogue` for general questions or `stackoverflow` for programming-related questions. First, prepare the data for this task:\n", 242 | "- concatenate `dialogue` and `stackoverflow` examples into one sample\n", 243 | "- split it into train and test in proportion 9:1, use *random_state=0* for reproducibility\n", 244 | "- transform it into TF-IDF features" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "from sklearn.model_selection import train_test_split" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "X = np.concatenate([dialogue_df['text'].values, stackoverflow_df['title'].values])\n", 263 | "y = ['dialogue'] * dialogue_df.shape[0] + ['stackoverflow'] * stackoverflow_df.shape[0]\n", 264 | "\n", 265 | "X_train, X_test, y_train, y_test = ######### YOUR CODE HERE ##########\n", 266 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))\n", 267 | "\n", 268 | "X_train_tfidf, X_test_tfidf = ######### YOUR CODE HERE ###########" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "Train the **intent recognizer** using LogisticRegression on the train set with the following parameters: *penalty='l2'*, *C=10*, *random_state=0*. Print out the accuracy on the test set to check whether everything looks good." 
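A minimal sketch of this training step is shown below; `intent_recognizer` is the variable name expected by the following cells, and the hyperparameters are the ones stated above.

```python
from sklearn.linear_model import LogisticRegression

# Fit the intent recognizer on the TF-IDF features of the training set.
intent_recognizer = LogisticRegression(penalty='l2', C=10, random_state=0)
intent_recognizer.fit(X_train_tfidf, y_train)
```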
276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "from sklearn.linear_model import LogisticRegression\n", 285 | "from sklearn.metrics import accuracy_score" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "######################################\n", 295 | "######### YOUR CODE HERE #############\n", 296 | "######################################" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "# Check test accuracy.\n", 306 | "y_test_pred = intent_recognizer.predict(X_test_tfidf)\n", 307 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 308 | "print('Test accuracy = {}'.format(test_accuracy))" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "Dump the classifier to use it in the running bot." 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "pickle.dump(intent_recognizer, open(RESOURCE_PATH['INTENT_RECOGNIZER'], 'wb'))" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "### Programming language classification " 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "We will train one more classifier for the programming-related questions. It will predict exactly one tag (=programming language) and will be also based on Logistic Regression with TF-IDF features. \n", 339 | "\n", 340 | "First, let us prepare the data for this task." 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "X = stackoverflow_df['title'].values\n", 350 | "y = stackoverflow_df['tag'].values" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n", 360 | "print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "Let us reuse the TF-IDF vectorizer that we have already created above. It should not make a huge difference which data was used to train it." 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "vectorizer = pickle.load(open(RESOURCE_PATH['TFIDF_VECTORIZER'], 'rb'))\n", 377 | "\n", 378 | "X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "Train the **tag classifier** using OneVsRestClassifier wrapper over LogisticRegression. Use the following parameters: *penalty='l2'*, *C=5*, *random_state=0*." 
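One straightforward way to realize this step is sketched below; again, `tag_classifier` is the name the following cells rely on, and the hyperparameters are the ones stated above.

```python
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Wrap a logistic regression in a one-vs-rest scheme to predict a single tag.
tag_classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=5, random_state=0))
tag_classifier.fit(X_train_tfidf, y_train)
```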
386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "from sklearn.multiclass import OneVsRestClassifier" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "######################################\n", 404 | "######### YOUR CODE HERE #############\n", 405 | "######################################" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "# Check test accuracy.\n", 415 | "y_test_pred = tag_classifier.predict(X_test_tfidf)\n", 416 | "test_accuracy = accuracy_score(y_test, y_test_pred)\n", 417 | "print('Test accuracy = {}'.format(test_accuracy))" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "Dump the classifier to use it in the running bot." 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "pickle.dump(tag_classifier, open(RESOURCE_PATH['TAG_CLASSIFIER'], 'wb'))" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "## Part II. Ranking questions with embeddings" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "To find a relevant answer (a thread from StackOverflow) to a question, you will use vector representations to calculate the similarity between the question and existing threads. We already have the `question_to_vec` function from assignment 3, which can create such a representation based on word vectors. \n", 448 | "\n", 449 | "However, it would be costly to compute such a representation for all possible answers in the *online mode* of the bot (e.g. when the bot is running and answering questions from many users). This is the reason why you will create a *database* with pre-computed representations. These representations will be arranged by non-overlapping tags (programming languages), so that the search for an answer is performed within a single tag each time. This will make our bot even more efficient and allow us to avoid keeping the whole database in RAM. " 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "Load the StarSpace embeddings which were trained on StackOverflow posts. These embeddings were trained in *supervised mode* for duplicate detection on the same corpus that is used in the search. We can therefore count on these representations to help us find closely related answers to a question. \n", 457 | "\n", 458 | "If for some reason you didn't train StarSpace embeddings in assignment 3, you can use [pre-trained word vectors](https://code.google.com/archive/p/word2vec/) from Google. All instructions on how to work with these vectors were provided in the same assignment. However, we highly recommend using the StarSpace embeddings, because they are more appropriate for this task. If you choose to use Google's embeddings, delete the words that do not appear in the StackOverflow data."
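The `load_embeddings` gap in *utils.py* could be filled roughly as sketched below. This assumes the TSV file stores one word per line followed by its vector components, which is how StarSpace writes its output; adapt it if your file differs.

```python
import numpy as np

def load_embeddings(embeddings_path):
    """Loads pre-trained word embeddings from a tsv file."""
    embeddings = {}
    with open(embeddings_path) as f:
        for line in f:
            parts = line.strip().split('\t')
            # The first column is the word, the rest are the vector components.
            embeddings[parts[0]] = np.array(parts[1:], dtype=np.float32)
    embeddings_dim = len(next(iter(embeddings.values())))
    return embeddings, embeddings_dim
```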
459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "starspace_embeddings, embeddings_dim = load_embeddings('data/word_embeddings.tsv')" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "Since we want to precompute representations for all possible answers, we need to load the whole posts dataset, unlike we did for the intent classifier:" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": {}, 481 | "outputs": [], 482 | "source": [ 483 | "posts_df = pd.read_csv('data/tagged_posts.tsv', sep='\\t')" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "Look at the distribution of posts for programming languages (tags) and find the most common ones. \n", 491 | "You might want to use pandas [groupby](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html) and [count](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.count.html) methods:" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "counts_by_tag = ######### YOUR CODE HERE #############" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "Now for each `tag` you need to create two data structures, which will serve as online search index:\n", 508 | "* `tag_post_ids` — a list of post_ids with shape `(counts_by_tag[tag],)`. It will be needed to show the title and link to the thread;\n", 509 | "* `tag_vectors` — a matrix with shape `(counts_by_tag[tag], embeddings_dim)` where embeddings for each answer are stored.\n", 510 | "\n", 511 | "Implement the code which will calculate the mentioned structures and dump it to files. It should take several minutes to compute it." 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "import os\n", 521 | "os.makedirs(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], exist_ok=True)\n", 522 | "\n", 523 | "for tag, count in counts_by_tag.items():\n", 524 | " tag_posts = posts_df[posts_df['tag'] == tag]\n", 525 | " \n", 526 | " tag_post_ids = ######### YOUR CODE HERE #############\n", 527 | " \n", 528 | " tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)\n", 529 | " for i, title in enumerate(tag_posts['title']):\n", 530 | " tag_vectors[i, :] = ######### YOUR CODE HERE ############# \n", 531 | "\n", 532 | " # Dump post ids and vectors to a file.\n", 533 | " filename = os.path.join(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % tag))\n", 534 | " pickle.dump((tag_post_ids, tag_vectors), open(filename, 'wb'))" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": {}, 540 | "source": [ 541 | "## Part III. Putting all together" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "Now let's combine everything that we have done and enable the bot to maintain a dialogue. We will teach the bot to sequentially determine the intent and, depending on the intent, select the best answer. As soon as we do this, we will have the opportunity to chat with the bot and check how well it answers questions." 
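Once the gaps in *dialogue_manager.py* and *utils.py* are filled in, wiring everything together can look roughly like the sketch below (`RESOURCE_PATH` comes from *utils.py*; the sample questions are arbitrary).

```python
from dialogue_manager import DialogueManager

# Build the manager from the resource paths defined in utils.py
# and ask it questions of different intent.
dialogue_manager = DialogueManager(RESOURCE_PATH)
for question in ["How are you doing?", "How to write a loop in python?"]:
    print('Q: %s\nA: %s\n' % (question, dialogue_manager.generate_answer(question)))
```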
549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "Implement Dialogue Manager that will generate the best answer. In order to do this, you should open *dialogue_manager.py* and fill in the gaps." 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [ 564 | "from dialogue_manager import DialogueManager" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "dialogue_manager = ######### YOUR CODE HERE #############" 574 | ] 575 | }, 576 | { 577 | "cell_type": "markdown", 578 | "metadata": {}, 579 | "source": [ 580 | "Now we are ready to test our chat bot! Let's chat with the bot and ask it some questions. Check that the answers are reasonable." 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": {}, 587 | "outputs": [], 588 | "source": [ 589 | "questions = [\n", 590 | " \"Hey\", \n", 591 | " \"How are you doing?\", \n", 592 | " \"What's your hobby?\", \n", 593 | " \"How to write a loop in python?\",\n", 594 | " \"How to delete rows in pandas?\",\n", 595 | " \"python3 re\",\n", 596 | " \"What is the difference between c and c++\",\n", 597 | " \"Multithreading in Java\",\n", 598 | " \"Catch exceptions C++\",\n", 599 | " \"What is AI?\",\n", 600 | "]\n", 601 | "\n", 602 | "for question in questions:\n", 603 | " answer = ######### YOUR CODE HERE #############\n", 604 | " print('Q: %s\\nA: %s \\n' % (question, answer))" 605 | ] 606 | } 607 | ], 608 | "metadata": { 609 | "kernelspec": { 610 | "display_name": "Python 3", 611 | "language": "python", 612 | "name": "python3" 613 | }, 614 | "language_info": { 615 | "codemirror_mode": { 616 | "name": "ipython", 617 | "version": 3 618 | }, 619 | "file_extension": ".py", 620 | "mimetype": "text/x-python", 621 | "name": "python", 622 | "nbconvert_exporter": "python", 623 | "pygments_lexer": "ipython3", 624 | "version": "3.5.2" 625 | }, 626 | "latex_envs": { 627 | "bibliofile": "biblio.bib", 628 | "cite_by": "apalike", 629 | "current_citInitial": 1, 630 | "eqLabelWithNumbers": true, 631 | "eqNumInitial": 0 632 | } 633 | }, 634 | "nbformat": 4, 635 | "nbformat_minor": 2 636 | } 637 | --------------------------------------------------------------------------------