├── common
│   ├── __init__.py
│   ├── tqdm_utils.py
│   └── download_utils.py
├── week 3
│   ├── .ipynb_checkpoints
│   │   └── Untitled-checkpoint.ipynb
│   ├── util.py
│   └── grader.py
├── README.md
├── week 1
│   ├── metrics.py
│   ├── setup_google_colab.py
│   ├── grader.py
│   └── .ipynb_checkpoints
│       └── Predicting Tags-checkpoint.ipynb
├── week 2
│   ├── .ipynb_checkpoints
│   │   └── Named Entity Recognition-checkpoint.ipynb
│   ├── evaluation.py
│   └── week2_NER.ipynb
└── week 4
    └── week4_seq2seq.ipynb

/common/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/week 3/.ipynb_checkpoints/Untitled-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 2
6 | }
7 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AML - NLP Coursework
2 | 
3 | All notebooks are run in Google Colab.
4 | 
5 | 
6 | # Reference
7 | 
8 | - [Coursera](https://www.coursera.org/learn/language-processing)
9 | - [GitHub](https://github.com/hse-aml/natural-language-processing)
--------------------------------------------------------------------------------
/week 3/util.py:
--------------------------------------------------------------------------------
1 | import re
2 | from nltk.corpus import stopwords
3 | 
4 | REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
5 | GOOD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
6 | STOPWORDS = set(stopwords.words('english'))
7 | def text_prepare(text):
8 |     text = text.lower()
9 |     text = REPLACE_BY_SPACE_RE.sub(' ', text)
10 |     text = GOOD_SYMBOLS_RE.sub('', text)
11 |     text = ' '.join([x for x in text.split() if x and x not in STOPWORDS])
12 |     return text.strip()
13 | 
14 | def array_to_string(arr):
15 |     return '\n'.join(str(num) for num in arr)
16 | 
17 | def matrix_to_string(matrix):
18 |     return '\n'.join('\t'.join(str(num) for num in line) for line in matrix)
--------------------------------------------------------------------------------
/common/tqdm_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from __future__ import print_function
4 | 
5 | 
6 | class SimpleTqdm():
7 |     def __init__(self, iterable=None, total=None, **kwargs):
8 |         self.iterable = list(iterable) if iterable is not None else None
9 |         self.total = len(self.iterable) if self.iterable is not None else total
10 |         assert self.iterable is not None or self.total is not None
11 |         self.current_step = 0
12 |         self.print_frequency = max(self.total // 50, 1)
13 |         self.desc = ""
14 | 
15 |     def set_description_str(self, desc):
16 |         self.desc = desc
17 | 
18 |     def set_description(self, desc):
19 |         self.desc = desc
20 | 
21 |     def update(self, steps):
22 |         last_print_step = (self.current_step // self.print_frequency) * self.print_frequency
23 |         i = 1
24 |         while last_print_step + i * self.print_frequency <= self.current_step + steps:
25 |             print("*", end='')
26 |             i += 1
27 |         self.current_step += steps
28 | 
29 |     def close(self):
30 |         print("\n" + self.desc)
31 | 
32 |     def __iter__(self):
33 |         assert self.iterable is not None
34 |         self.index = 0
35 |         return self
36 | 
37 |     def __next__(self):
38 |         if self.index < self.total:
39 |             element = self.iterable[self.index]
40 |             self.update(1)
41 |             self.index += 1
42 |             return element
43 |         else:
44 |             self.close()
45 |             raise StopIteration
46 | 
47 | 
48 | def tqdm_notebook_failsafe(*args, **kwargs):
49 |     try:
50 |         import tqdm
51 |         tqdm.monitor_interval = 0  # workaround for https://github.com/tqdm/tqdm/issues/481
52 |         return tqdm.tqdm_notebook(*args, **kwargs)
53 |     except Exception:
54 |         # tqdm is broken on Google Colab
55 |         return SimpleTqdm(*args, **kwargs)
--------------------------------------------------------------------------------
/week 1/metrics.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn.metrics import roc_curve, auc
4 | from itertools import cycle
5 | 
6 | def roc_auc(y_test, y_score, n_classes):
7 |     """Plots ROC curves with micro and macro averaging."""
8 | 
9 |     # Compute ROC curve and ROC area for each class
10 |     fpr = {}
11 |     tpr = {}
12 |     roc_auc = {}
13 |     for i in range(n_classes):
14 |         fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
15 |         roc_auc[i] = auc(fpr[i], tpr[i])
16 | 
17 |     # Compute micro-average ROC curve and ROC area
18 |     fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
19 |     roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
20 | 
21 |     # Compute macro-average ROC curve and ROC area
22 |     all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
23 |     mean_tpr = np.zeros_like(all_fpr)
24 |     for i in range(n_classes):
25 |         mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
26 |     mean_tpr /= n_classes
27 |     fpr["macro"] = all_fpr
28 |     tpr["macro"] = mean_tpr
29 |     roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
30 | 
31 |     # Plot all ROC curves
32 |     plt.figure()
33 |     plt.plot(fpr["micro"], tpr["micro"],
34 |              label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
35 |              color='deeppink', linestyle=':', linewidth=4)
36 | 
37 |     plt.plot(fpr["macro"], tpr["macro"],
38 |              label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
39 |              color='navy', linestyle=':', linewidth=4)
40 | 
41 |     colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
42 |     for i, color in zip(range(n_classes), colors):
43 |         plt.plot(fpr[i], tpr[i], color=color, lw=2,
44 |                  label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
45 | 
46 |     plt.plot([0, 1], [0, 1], 'k--', lw=2)
47 |     plt.xlim([0.0, 1.0])
48 |     plt.ylim([0.0, 1.05])
49 |     plt.xlabel('False Positive Rate')
50 |     plt.ylabel('True Positive Rate')
51 |     plt.title('Multi-class extension of the ROC curve')
52 |     plt.legend(loc="lower right")
53 |     plt.show()
--------------------------------------------------------------------------------
/week 1/setup_google_colab.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import os
4 | 
5 | 
6 | def download_github_code(path):
7 |     filename = path.rsplit("/")[-1]
8 |     os.system("wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/{} -O {}".format(path, filename))
9 | 
10 | 
11 | def setup_common():
12 |     os.system("pip install tqdm")
13 |     os.system("pip install backports.weakref==1.0.post1")
14 |     os.system("pip install ChatterBot==0.7.6")
15 |     os.system("pip install enum34==1.1.6")
16 |     os.system("pip install funcsigs==1.0.2")
17 |     os.system("pip install gensim==3.1.0")
18 |     os.system("pip install jedi==0.11.0")
19 |     os.system("pip install libarchive==0.4.4")
20 |     os.system("pip install mock==2.0.0")
21 |     
os.system("pip install parso==0.1.0") 22 | os.system("pip install pbr==3.1.1") 23 | os.system("pip install regex==2017.11.9") 24 | 25 | download_github_code("common/download_utils.py") 26 | download_github_code("common/tqdm_utils.py") 27 | download_github_code("common/__init__.py") 28 | os.system("mkdir common") 29 | os.system("mv download_utils.py tqdm_utils.py __init__.py common/") 30 | 31 | 32 | def setup_starspace(): 33 | if not os.path.exists("/usr/local/bin/starspace"): 34 | os.system("wget https://dl.bintray.com/boostorg/release/1.63.0/source/boost_1_63_0.zip") 35 | os.system("unzip boost_1_63_0.zip && mv boost_1_63_0 /usr/local/bin") 36 | os.system("git clone https://github.com/facebookresearch/Starspace.git") 37 | os.system("cd Starspace && make && cp -Rf starspace /usr/local/bin") 38 | 39 | 40 | def setup_week1(): 41 | setup_common() 42 | download_github_code("week1/grader.py") 43 | download_github_code("week1/metrics.py") 44 | 45 | 46 | def setup_week2(): 47 | setup_common() 48 | download_github_code("week2/evaluation.py") 49 | 50 | 51 | def setup_week3(): 52 | setup_common() 53 | download_github_code("week3/grader.py") 54 | download_github_code("week3/util.py") 55 | setup_starspace() 56 | 57 | 58 | def setup_week4(): 59 | setup_common() 60 | 61 | 62 | def setup_project(): 63 | setup_common() 64 | download_github_code("project/dialogue_manager.py") 65 | download_github_code("project/main_bot.py") 66 | download_github_code("project/utils.py") 67 | setup_starspace() 68 | 69 | 70 | def setup_honor(): 71 | setup_common() 72 | download_github_code("honor/datasets.py") 73 | download_github_code("honor/example.py") 74 | download_github_code("honor/download_cornell.sh") 75 | download_github_code("honor/download_opensubs.sh") 76 | -------------------------------------------------------------------------------- /week 1/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | class Grader(object): 7 | def __init__(self): 8 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 9 | self.assignment_key = 'MSsYBMLgEeesWhJPHRLG5g' 10 | self.parts = OrderedDict([('f5nXa', 'TextPrepare'), 11 | ('hTrz8', 'WordsTagsCount'), 12 | ('0kUjR', 'BagOfWords'), 13 | ('tLJV1', 'MultilabelClassification')]) 14 | self.answers = {key: None for key in self.parts} 15 | 16 | @staticmethod 17 | def ravel_output(output): 18 | ''' 19 | If student accidentally submitted np.array with one 20 | element instead of number, this function will submit 21 | this number instead 22 | ''' 23 | if isinstance(output, np.ndarray) and output.size == 1: 24 | output = output.item(0) 25 | return output 26 | 27 | def submit(self, email, token): 28 | submission = { 29 | "assignmentKey": self.assignment_key, 30 | "submitterEmail": email, 31 | "secret": token, 32 | "parts": {} 33 | } 34 | for part, output in self.answers.items(): 35 | if output is not None: 36 | submission["parts"][part] = {"output": output} 37 | else: 38 | submission["parts"][part] = dict() 39 | request = requests.post(self.submission_page, data=json.dumps(submission)) 40 | response = request.json() 41 | if request.status_code == 201: 42 | print('Submitted to Coursera platform. 
See results on assignment page!') 43 | elif u'details' in response and u'learnerMessage' in response[u'details']: 44 | print(response[u'details'][u'learnerMessage']) 45 | else: 46 | print("Unknown response from Coursera: {}".format(request.status_code)) 47 | print(response) 48 | 49 | def status(self): 50 | print("You want to submit these parts:") 51 | for part_id, part_name in self.parts.items(): 52 | answer = self.answers[part_id] 53 | if answer is None: 54 | answer = '-'*10 55 | print("Task {}:\n {}".format(part_name, answer[:100] + '...')) 56 | 57 | def submit_part(self, part, output): 58 | self.answers[part] = output 59 | print("Current answer for task {} is:\n {}".format(self.parts[part], output[:100] + '...')) 60 | 61 | def submit_tag(self, tag, output): 62 | part_id = [k for k, v in self.parts.items() if v == tag] 63 | if len(part_id) != 1: 64 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 65 | part_id = part_id[0] 66 | self.submit_part(part_id, str(self.ravel_output(output))) 67 | -------------------------------------------------------------------------------- /week 3/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | class Grader(object): 7 | def __init__(self): 8 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 9 | self.assignment_key = '7DdYfMQFEeevjw7-W7Fr0A' 10 | self.parts = OrderedDict([('98mDT', 'Question2Vec'), 11 | ('nc7RP', 'HitsCount'), 12 | ('bNp90', 'DCGScore'), 13 | ('3gRlQ', 'W2VTokenizedRanks'), 14 | ('mX6wS', 'StarSpaceRanks')]) 15 | self.answers = {key: None for key in self.parts} 16 | 17 | @staticmethod 18 | def ravel_output(output): 19 | ''' 20 | If student accidentally submitted np.array with one 21 | element instead of number, this function will submit 22 | this number instead 23 | ''' 24 | if isinstance(output, np.ndarray) and output.size == 1: 25 | output = output.item(0) 26 | return output 27 | 28 | def submit(self, email, token): 29 | submission = { 30 | "assignmentKey": self.assignment_key, 31 | "submitterEmail": email, 32 | "secret": token, 33 | "parts": {} 34 | } 35 | for part, output in self.answers.items(): 36 | if output is not None: 37 | submission["parts"][part] = {"output": output} 38 | else: 39 | submission["parts"][part] = dict() 40 | request = requests.post(self.submission_page, data=json.dumps(submission)) 41 | response = request.json() 42 | if request.status_code == 201: 43 | print('Submitted to Coursera platform. 
See results on assignment page!') 44 | elif u'details' in response and u'learnerMessage' in response[u'details']: 45 | print(response[u'details'][u'learnerMessage']) 46 | else: 47 | print("Unknown response from Coursera: {}".format(request.status_code)) 48 | print(response) 49 | 50 | def status(self): 51 | print("You want to submit these parts:") 52 | for part_id, part_name in self.parts.items(): 53 | answer = self.answers[part_id] 54 | if answer is None: 55 | answer = '-'*10 56 | print("Task {}: {}".format(part_name, answer[:100] + '...')) 57 | 58 | def submit_part(self, part, output): 59 | self.answers[part] = output 60 | print("Current answer for task {} is: {}".format(self.parts[part], output[:100] + '...')) 61 | 62 | def submit_tag(self, tag, output): 63 | part_id = [k for k, v in self.parts.items() if v == tag] 64 | if len(part_id) != 1: 65 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 66 | part_id = part_id[0] 67 | self.submit_part(part_id, str(self.ravel_output(output))) 68 | -------------------------------------------------------------------------------- /common/download_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import shutil 5 | import requests 6 | from common import tqdm_utils 7 | 8 | 9 | REPOSITORY_PATH = "https://github.com/hse-aml/natural-language-processing" 10 | 11 | 12 | def download_file(url, file_path): 13 | r = requests.get(url, stream=True) 14 | total_size = int(r.headers.get('content-length')) 15 | try: 16 | with open(file_path, 'wb', buffering=16*1024*1024) as f: 17 | bar = tqdm_utils.tqdm_notebook_failsafe(total=total_size, unit='B', unit_scale=True) 18 | bar.set_description(os.path.split(file_path)[-1]) 19 | for chunk in r.iter_content(32 * 1024): 20 | f.write(chunk) 21 | bar.update(len(chunk)) 22 | bar.close() 23 | except Exception: 24 | print("Download failed") 25 | finally: 26 | if os.path.getsize(file_path) != total_size: 27 | os.remove(file_path) 28 | print("Removed incomplete download") 29 | 30 | 31 | def download_from_github(version, fn, target_dir, force=False): 32 | url = REPOSITORY_PATH + "/releases/download/{0}/{1}".format(version, fn) 33 | file_path = os.path.join(target_dir, fn) 34 | if os.path.exists(file_path) and not force: 35 | print("File {} is already downloaded.".format(file_path)) 36 | return 37 | download_file(url, file_path) 38 | 39 | 40 | def sequential_downloader(version, fns, target_dir, force=False): 41 | os.makedirs(target_dir, exist_ok=True) 42 | for fn in fns: 43 | download_from_github(version, fn, target_dir, force=force) 44 | 45 | 46 | def download_week1_resources(force=False): 47 | sequential_downloader( 48 | "week1", 49 | [ 50 | "train.tsv", 51 | "validation.tsv", 52 | "test.tsv", 53 | "text_prepare_tests.tsv", 54 | ], 55 | "data", 56 | force=force 57 | ) 58 | 59 | 60 | def download_week2_resources(force=False): 61 | sequential_downloader( 62 | "week2", 63 | [ 64 | "train.txt", 65 | "validation.txt", 66 | "test.txt", 67 | ], 68 | "data", 69 | force=force 70 | ) 71 | 72 | 73 | def download_week3_resources(force=False): 74 | sequential_downloader( 75 | "week3", 76 | [ 77 | "train.tsv", 78 | "validation.tsv", 79 | "test.tsv", 80 | "test_embeddings.tsv", 81 | ], 82 | "data", 83 | force=force 84 | ) 85 | print("Downloading GoogleNews-vectors-negative300.bin.gz (1.5G) for you, it will take a while...") 86 | 
download_file("https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz",
87 |                   "GoogleNews-vectors-negative300.bin.gz")
88 | 
89 | 
90 | def download_project_resources(force=False):
91 |     sequential_downloader(
92 |         "project",
93 |         [
94 |             "dialogues.tsv",
95 |             "tagged_posts.tsv",
96 |         ],
97 |         "data",
98 |         force=force
99 |     )
100 | 
--------------------------------------------------------------------------------
/week 2/.ipynb_checkpoints/Named Entity Recognition-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "1. Load the tokens and tags\n",
8 |     "2. Identify the train, test, and validation datasets\n",
9 |     "3. Print the data to see what kind of data we are dealing with\n",
10 |     "4. Build a dictionary to prepare the token -> tag relationship, or vice-versa\n",
11 |     "5. Create placeholders to specify what data we are going to feed into the network during the execution time.\n",
12 |     "    input_batch — sequences of words (the shape equals to [batch_size, sequence_len]);\n",
13 |     "    ground_truth_tags — sequences of tags (the shape equals to [batch_size, sequence_len]);\n",
14 |     "    lengths — lengths of not padded sequences (the shape equals to [batch_size]);\n",
15 |     "    dropout_ph — dropout keep probability; this placeholder has a predefined value 1;\n",
16 |     "    learning_rate_ph — learning rate; we need this placeholder because we want to change the value during training.\n",
17 |     "6. Build the NN with the following observations: \n",
18 |     "\n",
19 |     "    - Create embeddings matrix with tf.Variable. Specify its name (embeddings_matrix), type (tf.float32), and initialize with random values.\n",
20 |     "    - Create forward and backward LSTM cells. TensorFlow provides a number of RNN cells ready for you. We suggest that you use BasicLSTMCell, but you can also experiment with other types, e.g. GRU cells. This blogpost could be interesting if you want to learn more about the differences.\n",
21 |     "    - Wrap your cells with DropoutWrapper. Dropout is an important regularization technique for neural networks. Specify all keep probabilities using the dropout placeholder that we created before.\n",
22 |     "    - After that, you can build the computation graph that transforms an input_batch:\n",
23 |     "\n",
24 |     "    * Look up embeddings for an input_batch in the prepared embedding_matrix.\n",
25 |     "    * Pass the embeddings through Bidirectional Dynamic RNN with the specified forward & backward cells.\n",
26 |     "    * Use the lengths placeholder here to avoid computations for padding tokens inside the RNN.\n",
27 |     "    * Create a dense layer on top. Its output will be used directly in the loss function.\n",
28 |     "    \n",
29 |     "    - Apply softmax to the last layer\n",
30 |     "7. Use cross-entropy loss on the training data (applied to the logits, not to the softmax probabilities)\n",
31 |     "    - Mask out the unnecessary loss terms, such as those coming from padding tokens\n",
32 |     "8. Optimize the loss by using the Adam optimizer. Use clipping to avoid exploding gradients\n",
33 |     "9. Train the network\n",
34 |     "10. Predict the tags\n",
35 |     "11. Evaluate the model\n",
36 |     "12. Run the model"
37 |    ]
38 |   }
39 |  ],
40 |  "metadata": {
41 |   "kernelspec": {
42 |    "display_name": "Python 3",
43 |    "language": "python",
44 |    "name": "python3"
45 |   },
46 |   "language_info": {
47 |    "codemirror_mode": {
48 |     "name": "ipython",
49 |     "version": 3
50 |    },
51 |    "file_extension": ".py",
52 |    "mimetype": "text/x-python",
53 |    "name": "python",
54 |    "nbconvert_exporter": "python",
55 |    "pygments_lexer": "ipython3",
56 |    "version": "3.6.3"
57 |   }
58 |  },
59 |  "nbformat": 4,
60 |  "nbformat_minor": 2
61 | }
62 | 
--------------------------------------------------------------------------------
/week 1/.ipynb_checkpoints/Predicting Tags-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# MultiLabel Text Classification"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "markdown",
12 |    "metadata": {},
13 |    "source": [
14 |     "### Step-wise implementation of the whole process:"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "markdown",
19 |    "metadata": {},
20 |    "source": [
21 |     "1\\. Import numpy, pandas, nltk and other related dependencies"
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "markdown",
26 |    "metadata": {},
27 |    "source": [
28 |     "2\\. Import the files: the train, test, and validation sets\n",
29 |     "   - Initialize X_Train, X_Test, X_Val, y_train, y_val"
30 |    ]
31 |   },
32 |   {
33 |    "cell_type": "markdown",
34 |    "metadata": {},
35 |    "source": [
36 |     "3\\. Natural data is unstructured, so to remove the redundancy, we preprocess the text data\n",
37 |     "   - convert everything to lower-case\n",
38 |     "   - unnecessary symbols, like brackets, at signs, commas, etc., need to be replaced by spaces\n",
39 |     "   - bad symbols (anything apart from alphanumeric characters and spaces) should be removed\n",
40 |     "   - remove stop-words"
41 |    ]
42 |   },
43 |   {
44 |    "cell_type": "markdown",
45 |    "metadata": {},
46 |    "source": [
47 |     "4\\. Run the text preprocessing on X_Train, X_Val and X_Test"
48 |    ]
49 |   },
50 |   {
51 |    "cell_type": "markdown",
52 |    "metadata": {},
53 |    "source": [
54 |     "5\\. Show the most popular tags and words from the train data (#EDA)"
55 |    ]
56 |   },
57 |   {
58 |    "cell_type": "markdown",
59 |    "metadata": {},
60 |    "source": [
61 |     "6\\. Computers don't work on raw text, so we transform all titles into numeric vectors. There are two procedures:"
62 |    ]
63 |   },
64 |   {
65 |    "cell_type": "markdown",
66 |    "metadata": {},
67 |    "source": [
68 |     "A. Bag of Words Model\n",
69 |     "    \n",
70 |     "    - Find N most popular words in train corpus.\n",
71 |     "    - Then we need to numerate them, for example, like this: {'hi': 0, 'you': 1, 'me': 2, 'are': 3}\n",
72 |     "    - For each title in the corpus create a zero vector with dimension equal to N, like this: [0, 0, 0, 0]\n",
73 |     "    - For each text in the corpus iterate over the words which are in the dictionary and increase the corresponding coordinate by 1.\n",
74 |     "    text: 'hi how are you'\n",
75 |     "    corpus: 'hi you me are'\n",
76 |     "    vector: [1, 1, 0, 1]\n",
77 |     "    \n",
78 |     "    - Use a dictionary size of 5000, generated from the most common words in the train data\n",
79 |     "    - Apply the transformation to all three available sets (train, validation, test)\n",
80 |     "    - Show non-zero elements in the newly generated values of X-Train, X-Test, and X-Val (#EDA)"
81 |    ]
82 |   },
83 |   {
84 |    "cell_type": "markdown",
85 |    "metadata": {},
86 |    "source": [
87 |     "B. TF-IDF Model (penalize too frequent words)\n",
88 |     "    \n",
89 |     "    - Create a TF-IDF vectorizer with a proper choice of parameters and fit it on the train set (most important part: filter out the most frequent and the rarest words, and use unigrams and bigrams in the model)\n",
90 |     "    - Transform the train, test, and val sets and return the result\n",
91 |     "    - Check the results (#EDA)\n",
92 |     "    - Check if the tags are still present in the newly formed vocabulary"
93 |    ]
94 |   },
95 |   {
96 |    "cell_type": "markdown",
97 |    "metadata": {},
98 |    "source": [
99 |     "7\\. Multi-Label Classifier (for multiple tags in a title)\n",
100 |     "\n",
101 |     "    - Import dependencies and binarize y_train and y_val\n",
102 |     "    - Fit the transformation on the set of tags from the train data\n",
103 |     "    - Train the classifiers for different data transformations: bag-of-words and tf-idf.\n",
104 |     "    - Now you can create predictions for the data. You will need two types of predictions: labels and scores."
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "8\\. Evaluation: \n",
112 |     "    - check which model is better\n",
113 |     "    - check whether or not to use regularization techniques\n",
114 |     "\n",
115 |     "    - Accuracy (http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)\n",
116 |     "    - F1-score (http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)\n",
117 |     "    - Area under ROC-curve (http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html)\n",
118 |     "    - Area under precision-recall curve (http://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html#sklearn.metrics.average_precision_score)\n"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "markdown",
123 |    "metadata": {},
124 |    "source": [
125 |     "ROC: \n",
126 |     "- You might also want to plot some generalization of the ROC curve for the case of multi-label classification.\n",
127 |     "- The provided function roc_auc can do it for you. The input parameters of this function are:\n",
128 |     "\n",
129 |     "    - true labels\n",
130 |     "    - decision function scores\n",
131 |     "    - number of classes"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "markdown",
136 |    "metadata": {},
137 |    "source": [
138 |     "9\\. 
After evaluating the model, experiment a bit with the classifier\n", 139 | "- Use L1 and L2 regularization techniques" 140 | ] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython3", 159 | "version": "3.6.3" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 2 164 | } 165 | -------------------------------------------------------------------------------- /week 2/evaluation.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | def _update_chunk(candidate, prev, current_tag, current_chunk, current_pos, prediction=False): 4 | if candidate == 'B-' + current_tag: 5 | if len(current_chunk) > 0 and len(current_chunk[-1]) == 1: 6 | current_chunk[-1].append(current_pos - 1) 7 | current_chunk.append([current_pos]) 8 | elif candidate == 'I-' + current_tag: 9 | if prediction and (current_pos == 0 or current_pos > 0 and prev.split('-', 1)[-1] != current_tag): 10 | current_chunk.append([current_pos]) 11 | if not prediction and (current_pos == 0 or current_pos > 0 and prev == 'O'): 12 | current_chunk.append([current_pos]) 13 | elif current_pos > 0 and prev.split('-', 1)[-1] == current_tag: 14 | if len(current_chunk) > 0: 15 | current_chunk[-1].append(current_pos - 1) 16 | 17 | def _update_last_chunk(current_chunk, current_pos): 18 | if len(current_chunk) > 0 and len(current_chunk[-1]) == 1: 19 | current_chunk[-1].append(current_pos - 1) 20 | 21 | def _tag_precision_recall_f1(tp, fp, fn): 22 | precision, recall, f1 = 0, 0, 0 23 | if tp + fp > 0: 24 | precision = tp / (tp + fp) * 100 25 | if tp + fn > 0: 26 | recall = tp / (tp + fn) * 100 27 | if precision + recall > 0: 28 | f1 = 2 * precision * recall / (precision + recall) 29 | return precision, recall, f1 30 | 31 | def _aggregate_metrics(results, total_correct): 32 | total_true_entities = 0 33 | total_predicted_entities = 0 34 | total_precision = 0 35 | total_recall = 0 36 | total_f1 = 0 37 | for tag, tag_metrics in results.items(): 38 | n_pred = tag_metrics['n_predicted_entities'] 39 | n_true = tag_metrics['n_true_entities'] 40 | total_true_entities += n_true 41 | total_predicted_entities += n_pred 42 | total_precision += tag_metrics['precision'] * n_pred 43 | total_recall += tag_metrics['recall'] * n_true 44 | 45 | accuracy = 0 46 | if total_true_entities > 0: 47 | accuracy = total_correct / total_true_entities * 100 48 | else: 49 | print('CAUTION! Accuracy equals zero because there are no '\ 50 | 'correct entities. 
Check the correctness of your data.')
51 |     if total_predicted_entities > 0:
52 |         total_precision = total_precision / total_predicted_entities
53 |         total_recall = total_recall / total_true_entities
54 |     if total_precision + total_recall > 0:
55 |         total_f1 = 2 * total_precision * total_recall / (total_precision + total_recall)
56 |     return total_true_entities, total_predicted_entities, \
57 |            total_precision, total_recall, total_f1, accuracy
58 | 
59 | def _print_info(n_tokens, total_true_entities, total_predicted_entities, total_correct):
60 |     print('processed {len} tokens ' \
61 |           'with {tot_true} phrases; ' \
62 |           'found: {tot_pred} phrases; ' \
63 |           'correct: {tot_cor}.\n'.format(len=n_tokens,
64 |                                          tot_true=total_true_entities,
65 |                                          tot_pred=total_predicted_entities,
66 |                                          tot_cor=total_correct))
67 | 
68 | def _print_metrics(accuracy, total_precision, total_recall, total_f1):
69 |     print('accuracy: {acc:.2f}%; precision: {tot_prec:.2f}%; ' \
70 |           'recall: {tot_recall:.2f}%; ' \
71 |           'F1: {tot_f1:.2f}\n'.format(acc=accuracy,
72 |                                       tot_prec=total_precision,
73 |                                       tot_recall=total_recall,
74 |                                       tot_f1=total_f1))
75 | 
76 | def _print_tag_metrics(tag, tag_results):
77 |     print(('\t%12s' % tag) + ': precision: {tot_prec:6.2f}%; ' \
78 |           'recall: {tot_recall:6.2f}%; ' \
79 |           'F1: {tot_f1:6.2f}; ' \
80 |           'predicted: {tot_predicted:4d}\n'.format(tot_prec=tag_results['precision'],
81 |                                                    tot_recall=tag_results['recall'],
82 |                                                    tot_f1=tag_results['f1'],
83 |                                                    tot_predicted=tag_results['n_predicted_entities']))
84 | 
85 | def precision_recall_f1(y_true, y_pred, print_results=True, short_report=False):
86 |     # Find all tags
87 |     tags = sorted(set(tag[2:] for tag in y_true + y_pred if tag != 'O'))
88 | 
89 |     results = OrderedDict((tag, OrderedDict()) for tag in tags)
90 |     n_tokens = len(y_true)
91 |     total_correct = 0
92 | 
93 |     # For the evaluation we find all chunks in the ground truth and the prediction
94 |     # For each chunk we store starting and ending indices
95 |     for tag in tags:
96 |         true_chunk = list()
97 |         predicted_chunk = list()
98 |         for position in range(n_tokens):
99 |             _update_chunk(y_true[position], y_true[position - 1], tag, true_chunk, position)
100 |             _update_chunk(y_pred[position], y_pred[position - 1], tag, predicted_chunk, position, True)
101 | 
102 |         _update_last_chunk(true_chunk, position)
103 |         _update_last_chunk(predicted_chunk, position)
104 | 
105 |         # Then we find all correctly classified intervals
106 |         # True positive results
107 |         tp = sum(chunk in predicted_chunk for chunk in true_chunk)
108 |         total_correct += tp
109 | 
110 |         # And then just calculate errors of the first and second kind
111 |         # False negative
112 |         fn = len(true_chunk) - tp
113 |         # False positive
114 |         fp = len(predicted_chunk) - tp
115 |         precision, recall, f1 = _tag_precision_recall_f1(tp, fp, fn)
116 | 
117 |         results[tag]['precision'] = precision
118 |         results[tag]['recall'] = recall
119 |         results[tag]['f1'] = f1
120 |         results[tag]['n_predicted_entities'] = len(predicted_chunk)
121 |         results[tag]['n_true_entities'] = len(true_chunk)
122 | 
123 |     total_true_entities, total_predicted_entities, \
124 |     total_precision, total_recall, total_f1, accuracy = _aggregate_metrics(results, total_correct)
125 | 
126 |     if print_results:
127 |         _print_info(n_tokens, total_true_entities, total_predicted_entities, total_correct)
128 |         _print_metrics(accuracy, total_precision, total_recall, total_f1)
129 | 
130 |         if not short_report:
131 |             for tag, tag_results in results.items():
132 |                 _print_tag_metrics(tag, tag_results)
133 |     return results
134 | 
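
The chunk-based scoring above is easiest to see on a tiny example. A minimal usage sketch for precision_recall_f1 (the BIO sequences and tag names below are made up for illustration, and the import assumes evaluation.py is on the path):

    from evaluation import precision_recall_f1

    # Two true entities; the model finds one of them and predicts nothing spurious.
    y_true = ['B-person', 'I-person', 'O', 'B-company', 'O']
    y_pred = ['B-person', 'I-person', 'O', 'O', 'O']

    # Expected aggregate report: precision 100.00%, recall 50.00%, F1 66.67
    # (the predicted "person" span matches exactly; the "company" span is missed).
    results = precision_recall_f1(y_true, y_pred, print_results=True, short_report=True)
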
--------------------------------------------------------------------------------
/week 2/week2_NER.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "nbformat": 4,
3 |  "nbformat_minor": 0,
4 |  "metadata": {
5 |   "colab": {
6 |    "name": "week2-NER.ipynb",
7 |    "version": "0.3.2",
8 |    "provenance": [],
9 |    "toc_visible": true
10 |   },
11 |   "kernelspec": {
12 |    "display_name": "Python 3",
13 |    "language": "python",
14 |    "name": "python3"
15 |   }
16 |  },
17 |  "cells": [
18 |   {
19 |    "metadata": {
20 |     "id": "csbagiiEClai",
21 |     "colab_type": "text"
22 |    },
23 |    "cell_type": "markdown",
24 |    "source": [
25 |     "# Recognize named entities on Twitter with LSTMs\n",
26 |     "\n",
27 |     "In this assignment, you will use a recurrent neural network to solve the Named Entity Recognition (NER) problem. NER is a common task in natural language processing systems. It serves for extracting entities such as persons, organizations, and locations from text. In this task you will experiment with recognizing named entities in Twitter data.\n",
28 |     "\n",
29 |     "For example, we want to extract persons' and organizations' names from the text. Then for the input text:\n",
30 |     "\n",
31 |     "    Ian Goodfellow works for Google Brain\n",
32 |     "\n",
33 |     "a NER model needs to provide the following sequence of tags:\n",
34 |     "\n",
35 |     "    B-PER I-PER O O B-ORG I-ORG\n",
36 |     "\n",
37 |     "Where *B-* and *I-* prefixes stand for the beginning and inside of the entity, while *O* stands for out of tag or no tag. Markup with the prefix scheme is called *BIO markup*. This markup is introduced to distinguish consecutive entities of the same type.\n",
38 |     "\n",
39 |     "A solution of the task will be based on neural networks, particularly on Bi-Directional Long Short-Term Memory Networks (Bi-LSTMs).\n",
40 |     "\n",
41 |     "### Libraries\n",
42 |     "\n",
43 |     "For this task you will need the following libraries:\n",
44 |     " - [Tensorflow](https://www.tensorflow.org) — an open-source software library for Machine Intelligence.\n",
45 |     " - [Numpy](http://www.numpy.org) — a package for scientific computing.\n",
46 |     " \n",
47 |     "If you have never worked with Tensorflow, you will probably need to read some tutorials during your work on this assignment, e.g. [this one](https://www.tensorflow.org/tutorials/recurrent) could be a good starting point. "
48 |    ]
49 |   },
50 |   {
51 |    "metadata": {
52 |     "id": "NYnNbdtcC4DG",
53 |     "colab_type": "code",
54 |     "colab": {
55 |      "base_uri": "https://localhost:8080/",
56 |      "height": 217
57 |     },
58 |     "outputId": "d30701ac-cadf-4a86-e356-7c03a0bd860c"
59 |    },
60 |    "cell_type": "code",
61 |    "source": [
62 |     "! wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py -O setup_google_colab.py\n",
63 |     "import setup_google_colab\n",
64 |     "setup_google_colab.setup_week2() # change to the week you're working on"
65 |    ],
66 |    "execution_count": 1,
67 |    "outputs": [
68 |     {
69 |      "output_type": "stream",
70 |      "text": [
71 |       "--2018-09-02 12:00:19--  https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py\n",
72 |       "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n",
73 |       "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n",
74 |       "HTTP request sent, awaiting response... 200 OK\n",
75 |       "Length: 2330 (2.3K) [text/plain]\n",
76 |       "Saving to: ‘setup_google_colab.py’\n",
77 |       "\n",
78 |       "setup_google_colab. 100%[===================>]   2.28K  --.-KB/s    in 0s      \n",
79 |       "\n",
80 |       "2018-09-02 12:00:19 (39.7 MB/s) - ‘setup_google_colab.py’ saved [2330/2330]\n",
81 |       "\n"
82 |      ],
83 |      "name": "stdout"
84 |     }
85 |    ]
86 |   },
87 |   {
88 |    "metadata": {
89 |     "id": "EHhvLTZoClak",
90 |     "colab_type": "text"
91 |    },
92 |    "cell_type": "markdown",
93 |    "source": [
94 |     "### Data\n",
95 |     "\n",
96 |     "The following cell will download all data required for this assignment into the folder `week2/data`."
97 |    ]
98 |   },
99 |   {
100 |    "metadata": {
101 |     "id": "YsDh-YRcClal",
102 |     "colab_type": "code",
103 |     "colab": {
104 |      "base_uri": "https://localhost:8080/",
105 |      "height": 126
106 |     },
107 |     "outputId": "aa213d7a-fe2d-46bd-b3f7-f98c45aa749f"
108 |    },
109 |    "cell_type": "code",
110 |    "source": [
111 |     "import sys\n",
112 |     "sys.path.append(\"..\")\n",
113 |     "from common.download_utils import download_week2_resources\n",
114 |     "\n",
115 |     "download_week2_resources()"
116 |    ],
117 |    "execution_count": 2,
118 |    "outputs": [
119 |     {
120 |      "output_type": "stream",
121 |      "text": [
122 |       "**************************************************\n",
123 |       "train.txt\n",
124 |       "**************************************************\n",
125 |       "validation.txt\n",
126 |       "**************************************************\n",
127 |       "test.txt\n"
128 |      ],
129 |      "name": "stdout"
130 |     }
131 |    ]
132 |   },
133 |   {
134 |    "metadata": {
135 |     "id": "f8JMCoFYClap",
136 |     "colab_type": "text"
137 |    },
138 |    "cell_type": "markdown",
139 |    "source": [
140 |     "### Load the Twitter Named Entity Recognition corpus\n",
141 |     "\n",
142 |     "We will work with a corpus, which contains tweets with NE tags. Every line of a file contains a pair of a token (word/punctuation symbol) and a tag, separated by a whitespace. Different tweets are separated by an empty line.\n",
143 |     "\n",
144 |     "The function *read_data* reads a corpus from the *file_path* and returns two lists: one with tokens and one with the corresponding tags. You need to complete this function by adding code which will replace any user's nickname with the `<USR>` token and any URL with the `<URL>` token. You can assume that a URL is just a string which starts with *http://* or *https://*, and a nickname is a string which starts with the *@* symbol."
145 |    ]
146 |   },
147 |   {
148 |    "metadata": {
149 |     "id": "0Y-rD4p9Claq",
150 |     "colab_type": "code",
151 |     "colab": {}
152 |    },
153 |    "cell_type": "code",
154 |    "source": [
155 |     "def read_data(file_path):\n",
156 |     "    tokens = []\n",
157 |     "    tags = []\n",
158 |     "    \n",
159 |     "    tweet_tokens = []\n",
160 |     "    tweet_tags = []\n",
161 |     "    for line in open(file_path, encoding='utf-8'):\n",
162 |     "        line = line.strip()\n",
163 |     "        if not line:\n",
164 |     "            if tweet_tokens:\n",
165 |     "                tokens.append(tweet_tokens)\n",
166 |     "                tags.append(tweet_tags)\n",
167 |     "            tweet_tokens = []\n",
168 |     "            tweet_tags = []\n",
169 |     "        else:\n",
170 |     "            token, tag = line.split()\n",
171 |     "            # Replace all urls with <URL> token\n",
172 |     "            # Replace all users with <USR> token\n",
173 |     "\n",
174 |     "            ######################################\n",
175 |     "            ######### YOUR CODE HERE #############\n",
176 |     "            ######################################\n",
177 |     "            \n",
178 |     "            tweet_tokens.append(token)\n",
179 |     "            tweet_tags.append(tag)\n",
180 |     "    \n",
181 |     "    return tokens, tags"
182 |    ],
183 |    "execution_count": 0,
184 |    "outputs": []
185 |   },
186 |   {
187 |    "metadata": {
188 |     "id": "VsFeALYsClas",
189 |     "colab_type": "text"
190 |    },
191 |    "cell_type": "markdown",
192 |    "source": [
193 |     "And now we can load three separate parts of the dataset:\n",
194 |     " - *train* data for training the model;\n",
195 |     " - *validation* data for evaluation and hyperparameters tuning;\n",
196 |     " - *test* data for final evaluation of the model."
197 |    ]
198 |   },
199 |   {
200 |    "metadata": {
201 |     "id": "oUFL1-H9Clat",
202 |     "colab_type": "code",
203 |     "colab": {}
204 |    },
205 |    "cell_type": "code",
206 |    "source": [
207 |     "train_tokens, train_tags = read_data('data/train.txt')\n",
208 |     "validation_tokens, validation_tags = read_data('data/validation.txt')\n",
209 |     "test_tokens, test_tags = read_data('data/test.txt')"
210 |    ],
211 |    "execution_count": 0,
212 |    "outputs": []
213 |   },
214 |   {
215 |    "metadata": {
216 |     "id": "eqNFZ58iClaw",
217 |     "colab_type": "text"
218 |    },
219 |    "cell_type": "markdown",
220 |    "source": [
221 |     "You should always understand what kind of data you deal with. For this purpose, you can print the data running the following cell:"
222 |    ]
223 |   },
224 |   {
225 |    "metadata": {
226 |     "id": "tqiIRJHDClaw",
227 |     "colab_type": "code",
228 |     "colab": {
229 |      "base_uri": "https://localhost:8080/",
230 |      "height": 1181
231 |     },
232 |     "outputId": "26388ece-778b-45f1-d6a1-ff475f78da84"
233 |    },
234 |    "cell_type": "code",
235 |    "source": [
236 |     "for i in range(3):\n",
237 |     "    for token, tag in zip(train_tokens[i], train_tags[i]):\n",
238 |     "        print('%s\\t%s' % (token, tag))\n",
239 |     "    print()"
240 |    ],
241 |    "execution_count": 5,
242 |    "outputs": [
243 |     {
244 |      "output_type": "stream",
245 |      "text": [
246 |       "RT\tO\n",
247 |       "@TheValarium\tO\n",
248 |       ":\tO\n",
249 |       "Online\tO\n",
250 |       "ticket\tO\n",
251 |       "sales\tO\n",
252 |       "for\tO\n",
253 |       "Ghostland\tB-musicartist\n",
254 |       "Observatory\tI-musicartist\n",
255 |       "extended\tO\n",
256 |       "until\tO\n",
257 |       "6\tO\n",
258 |       "PM\tO\n",
259 |       "EST\tO\n",
260 |       "due\tO\n",
261 |       "to\tO\n",
262 |       "high\tO\n",
263 |       "demand\tO\n",
264 |       ".\tO\n",
265 |       "Get\tO\n",
266 |       "them\tO\n",
267 |       "before\tO\n",
268 |       "they\tO\n",
269 |       "sell\tO\n",
270 |       "out\tO\n",
271 |       "...\tO\n",
272 |       "\n",
273 |       "Apple\tB-product\n",
274 |       "MacBook\tI-product\n",
275 |       "Pro\tI-product\n",
276 |       "A1278\tI-product\n",
277 |       "13.3\tI-product\n",
278 |       "\"\tI-product\n",
279 |       "Laptop\tI-product\n",
280 |       "-\tI-product\n",
281 |       "MD101LL/A\tI-product\n",
282 |       "(\tO\n",
283 |       "June\tO\n",
284 |       ",\tO\n",
285 |       "2012\tO\n",
286 |       ")\tO\n",
287 |       "-\tO\n",
288 |       "Full\tO\n",
289 |       "read\tO\n",
290 |       "by\tO\n",
291 |       "eBay\tB-company\n",
292 |       "http://t.co/2zgQ99nmuf\tO\n",
293 |       "http://t.co/eQmogqqABK\tO\n",
294 |       "\n",
295 |       "Happy\tO\n",
296 |       "Birthday\tO\n",
297 |       "@AshForeverAshey\tO\n",
298 |       "!\tO\n",
299 |       "May\tO\n",
300 |       "Allah\tB-person\n",
301 |       "s.w.t\tO\n",
302 |       "bless\tO\n",
303 |       "you\tO\n",
304 |       "with\tO\n",
305 |       "goodness\tO\n",
306 |       "and\tO\n",
307 |       "happiness\tO\n",
308 |       ".\tO\n",
309 |       "\n"
310 |      ],
311 |      "name": "stdout"
312 |     }
313 |    ]
314 |   },
315 |   {
316 |    "metadata": {
317 |     "id": "4YbemazyClaz",
318 |     "colab_type": "text"
319 |    },
320 |    "cell_type": "markdown",
321 |    "source": [
322 |     "### Prepare dictionaries\n",
323 |     "\n",
324 |     "To train a neural network, we will use two mappings: \n",
325 |     "- {token}$\\to${token id}: addresses the row in the embeddings matrix for the current token;\n",
326 |     "- {tag}$\\to${tag id}: one-hot ground truth probability distribution vectors for computing the loss at the output of the network.\n",
327 |     "\n",
328 |     "Now you need to implement the function *build_dict* which will return {token or tag}$\\to${index} and vice versa. "
329 |    ]
330 |   },
331 |   {
332 |    "metadata": {
333 |     "id": "xRoPdVWgCla0",
334 |     "colab_type": "code",
335 |     "colab": {}
336 |    },
337 |    "cell_type": "code",
338 |    "source": [
339 |     "from collections import defaultdict"
340 |    ],
341 |    "execution_count": 0,
342 |    "outputs": []
343 |   },
344 |   {
345 |    "metadata": {
346 |     "id": "1XzBKhTaCla2",
347 |     "colab_type": "code",
348 |     "colab": {}
349 |    },
350 |    "cell_type": "code",
351 |    "source": [
352 |     "def build_dict(tokens_or_tags, special_tokens):\n",
353 |     "    \"\"\"\n",
354 |     "        tokens_or_tags: a list of lists of tokens or tags\n",
355 |     "        special_tokens: some special tokens\n",
356 |     "    \"\"\"\n",
357 |     "    # Create a dictionary with default value 0\n",
358 |     "    tok2idx = defaultdict(lambda: 0)\n",
359 |     "    idx2tok = []\n",
360 |     "    \n",
361 |     "    # Create mappings from tokens (or tags) to indices and vice versa.\n",
362 |     "    # At first, add special tokens (or tags) to the dictionaries.\n",
363 |     "    # The first special token must have index 0.\n",
364 |     "    \n",
365 |     "    # Mapping tok2idx should contain each token or tag only once. \n",
366 |     "    # To do so, you should:\n",
367 |     "    # 1. extract unique tokens/tags from the tokens_or_tags variable, which do not\n",
368 |     "    #    occur in special_tokens (because they could have non-empty intersection)\n",
369 |     "    # 2. index them (for example, you can add them into the list idx2tok\n",
370 |     "    # 3. for each token/tag save the index into tok2idx).\n",
371 |     "    \n",
372 |     "    ######################################\n",
373 |     "    ######### YOUR CODE HERE #############\n",
374 |     "    ######################################\n",
375 |     "    for twt in tokens_or_tags:\n",
376 |     "        for tok in twt:\n",
377 |     "            idx2tok.append(tok)\n",
378 |     "    idx2tok = [tok for tok in set(idx2tok) if tok not in special_tokens]\n",
379 |     "    idx2tok = special_tokens + idx2tok\n",
380 |     "    for i, v in enumerate(idx2tok):\n",
381 |     "        tok2idx[v] = i\n",
382 |     "    \n",
383 |     "    return tok2idx, idx2tok"
384 |    ],
385 |    "execution_count": 0,
386 |    "outputs": []
387 |   },
388 |   {
389 |    "metadata": {
390 |     "id": "oCG8Xsl2Cla5",
391 |     "colab_type": "text"
392 |    },
393 |    "cell_type": "markdown",
394 |    "source": [
395 |     "After implementing the function *build_dict* you can make dictionaries for tokens and tags. Special tokens in our case will be:\n",
396 |     " - `<UNK>` token for out of vocabulary tokens;\n",
397 |     " - `<PAD>` token for padding sentences to the same length when we create batches of sentences."
398 |    ]
399 |   },
400 |   {
401 |    "metadata": {
402 |     "id": "WPEspWxCCla6",
403 |     "colab_type": "code",
404 |     "colab": {}
405 |    },
406 |    "cell_type": "code",
407 |    "source": [
408 |     "special_tokens = ['<UNK>', '<PAD>']\n",
409 |     "special_tags = ['O']\n",
410 |     "\n",
411 |     "# Create dictionaries \n",
412 |     "token2idx, idx2token = build_dict(train_tokens + validation_tokens, special_tokens)\n",
413 |     "tag2idx, idx2tag = build_dict(train_tags, special_tags)"
414 |    ],
415 |    "execution_count": 0,
416 |    "outputs": []
417 |   },
418 |   {
419 |    "metadata": {
420 |     "id": "tN0c0PpaCla9",
421 |     "colab_type": "text"
422 |    },
423 |    "cell_type": "markdown",
424 |    "source": [
425 |     "The next additional functions will help you to create the mapping between tokens and ids for a sentence. "
426 |    ]
427 |   },
428 |   {
429 |    "metadata": {
430 |     "id": "7bB_K_MOCla-",
431 |     "colab_type": "code",
432 |     "colab": {}
433 |    },
434 |    "cell_type": "code",
435 |    "source": [
436 |     "def words2idxs(tokens_list):\n",
437 |     "    return [token2idx[word] for word in tokens_list]\n",
438 |     "\n",
439 |     "def tags2idxs(tags_list):\n",
440 |     "    return [tag2idx[tag] for tag in tags_list]\n",
441 |     "\n",
442 |     "def idxs2words(idxs):\n",
443 |     "    return [idx2token[idx] for idx in idxs]\n",
444 |     "\n",
445 |     "def idxs2tags(idxs):\n",
446 |     "    return [idx2tag[idx] for idx in idxs]"
447 |    ],
448 |    "execution_count": 0,
449 |    "outputs": []
450 |   },
451 |   {
452 |    "metadata": {
453 |     "id": "nsUfuMHIClbB",
454 |     "colab_type": "text"
455 |    },
456 |    "cell_type": "markdown",
457 |    "source": [
458 |     "### Generate batches\n",
459 |     "\n",
460 |     "Neural Networks are usually trained with batches. It means that weight updates of the network are based on several sequences at every single step. The tricky part is that all sequences within a batch need to have the same length. So we will pad them with a special `<PAD>` token. It is also a good practice to provide RNN with sequence lengths, so it can skip computations for padding parts. We provide the batching function *batches_generator* readily available for you to save time. "
461 |    ]
462 |   },
463 |   {
464 |    "metadata": {
465 |     "id": "2__r2T8PClbB",
466 |     "colab_type": "code",
467 |     "colab": {}
468 |    },
469 |    "cell_type": "code",
470 |    "source": [
471 |     "def batches_generator(batch_size, tokens, tags,\n",
472 |     "                      shuffle=True, allow_smaller_last_batch=True):\n",
473 |     "    \"\"\"Generates padded batches of tokens and tags.\"\"\"\n",
474 |     "    \n",
475 |     "    n_samples = len(tokens)\n",
476 |     "    if shuffle:\n",
477 |     "        order = np.random.permutation(n_samples)\n",
478 |     "    else:\n",
479 |     "        order = np.arange(n_samples)\n",
480 |     "\n",
481 |     "    n_batches = n_samples // batch_size\n",
482 |     "    if allow_smaller_last_batch and n_samples % batch_size:\n",
483 |     "        n_batches += 1\n",
484 |     "\n",
485 |     "    for k in range(n_batches):\n",
486 |     "        batch_start = k * batch_size\n",
487 |     "        batch_end = min((k + 1) * batch_size, n_samples)\n",
488 |     "        current_batch_size = batch_end - batch_start\n",
489 |     "        x_list = []\n",
490 |     "        y_list = []\n",
491 |     "        max_len_token = 0\n",
492 |     "        for idx in order[batch_start: batch_end]:\n",
493 |     "            x_list.append(words2idxs(tokens[idx]))\n",
494 |     "            y_list.append(tags2idxs(tags[idx]))\n",
495 |     "            max_len_token = max(max_len_token, len(tags[idx]))\n",
496 |     "            \n",
497 |     "        # Fill in the data into numpy nd-arrays filled with padding indices.\n",
498 |     "        x = np.ones([current_batch_size, max_len_token], dtype=np.int32) * token2idx['<PAD>']\n",
499 |     "        y = np.ones([current_batch_size, max_len_token], dtype=np.int32) * tag2idx['O']\n",
500 |     "        lengths = np.zeros(current_batch_size, dtype=np.int32)\n",
501 |     "        for n in range(current_batch_size):\n",
502 |     "            utt_len = len(x_list[n])\n",
503 |     "            x[n, :utt_len] = x_list[n]\n",
504 |     "            lengths[n] = utt_len\n",
505 |     "            y[n, :utt_len] = y_list[n]\n",
506 |     "        yield x, y, lengths"
507 |    ],
508 |    "execution_count": 0,
509 |    "outputs": []
510 |   },
511 |   {
512 |    "metadata": {
513 |     "id": "KSw36ieUClbE",
514 |     "colab_type": "text"
515 |    },
516 |    "cell_type": "markdown",
517 |    "source": [
518 |     "## Build a recurrent neural network\n",
519 |     "\n",
520 |     "This is the most important part of the assignment. Here we will specify the network architecture based on TensorFlow building blocks. It's fun and easy, like a Lego constructor! We will create an LSTM network which will produce a probability distribution over tags for each token in a sentence. To take into account both right and left contexts of the token, we will use Bi-Directional LSTM (Bi-LSTM). A dense layer will be used on top to perform tag classification. "
521 |    ]
522 |   },
523 |   {
524 |    "metadata": {
525 |     "id": "tbiWnb1HClbE",
526 |     "colab_type": "code",
527 |     "colab": {}
528 |    },
529 |    "cell_type": "code",
530 |    "source": [
531 |     "import tensorflow as tf\n",
532 |     "import numpy as np"
533 |    ],
534 |    "execution_count": 0,
535 |    "outputs": []
536 |   },
537 |   {
538 |    "metadata": {
539 |     "id": "i8kpPBDDClbH",
540 |     "colab_type": "code",
541 |     "colab": {}
542 |    },
543 |    "cell_type": "code",
544 |    "source": [
545 |     "class BiLSTMModel():\n",
546 |     "    pass"
547 |    ],
548 |    "execution_count": 0,
549 |    "outputs": []
550 |   },
551 |   {
552 |    "metadata": {
553 |     "id": "n1QILdZkClbK",
554 |     "colab_type": "text"
555 |    },
556 |    "cell_type": "markdown",
557 |    "source": [
558 |     "First, we need to create [placeholders](https://www.tensorflow.org/versions/master/api_docs/python/tf/placeholder) to specify what data we are going to feed into the network during the execution time. For this task we will need the following placeholders:\n",
559 |     " - *input_batch* — sequences of words (the shape equals to [batch_size, sequence_len]);\n",
560 |     " - *ground_truth_tags* — sequences of tags (the shape equals to [batch_size, sequence_len]);\n",
561 |     " - *lengths* — lengths of not padded sequences (the shape equals to [batch_size]);\n",
562 |     " - *dropout_ph* — dropout keep probability; this placeholder has a predefined value 1;\n",
563 |     " - *learning_rate_ph* — learning rate; we need this placeholder because we want to change the value during training.\n",
564 |     "\n",
565 |     "Note that we use *None* in the shapes in the declaration, which means that data of any size can be fed. \n",
566 |     "\n",
567 |     "You need to complete the function *declare_placeholders*."
568 |    ]
569 |   },
570 |   {
571 |    "metadata": {
572 |     "id": "4xitqhY3ClbK",
573 |     "colab_type": "code",
574 |     "colab": {}
575 |    },
576 |    "cell_type": "code",
577 |    "source": [
578 |     "def declare_placeholders(self):\n",
579 |     "    \"\"\"Specifies placeholders for the model.\"\"\"\n",
580 |     "\n",
581 |     "    # Placeholders for input and ground truth output.\n",
582 |     "    self.input_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch') \n",
583 |     "    self.ground_truth_tags = tf.placeholder(dtype=tf.int32, shape=[None, None], name='ground_truth_tags') ######### YOUR CODE HERE #############\n",
584 |     "    \n",
585 |     "    # Placeholder for lengths of the sequences.\n",
586 |     "    self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths') \n",
587 |     "    \n",
588 |     "    # Placeholder for a dropout keep probability. 
If we don't feed\n", 589 | " # a value for this placeholder, it will be equal to 1.0.\n", 590 | " self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])\n", 591 | " \n", 592 | " # Placeholder for a learning rate (tf.float32).\n", 593 | " self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[]) ######### YOUR CODE HERE #############" 594 | ], 595 | "execution_count": 0, 596 | "outputs": [] 597 | }, 598 | { 599 | "metadata": { 600 | "id": "YFtwWRdBClbN", 601 | "colab_type": "code", 602 | "colab": {} 603 | }, 604 | "cell_type": "code", 605 | "source": [ 606 | "BiLSTMModel.__declare_placeholders = classmethod(declare_placeholders)" 607 | ], 608 | "execution_count": 0, 609 | "outputs": [] 610 | }, 611 | { 612 | "metadata": { 613 | "id": "rCxr1142ClbP", 614 | "colab_type": "text" 615 | }, 616 | "cell_type": "markdown", 617 | "source": [ 618 | "Now, let us specify the layers of the neural network. First, we need to perform some preparatory steps: \n", 619 | " \n", 620 | "- Create embeddings matrix with [tf.Variable](https://www.tensorflow.org/api_docs/python/tf/Variable). Specify its name (*embeddings_matrix*), type (*tf.float32*), and initialize with random values.\n", 621 | "- Create forward and backward LSTM cells. TensorFlow provides a number of [RNN cells](https://www.tensorflow.org/api_guides/python/contrib.rnn#Core_RNN_Cells_for_use_with_TensorFlow_s_core_RNN_methods) ready for you. We suggest that you use *BasicLSTMCell*, but you can also experiment with other types, e.g. GRU cells. [This](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) blogpost could be interesting if you want to learn more about the differences.\n", 622 | "- Wrap your cells with [DropoutWrapper](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/DropoutWrapper). Dropout is an important regularization technique for neural networks. Specify all keep probabilities using the dropout placeholder that we created before.\n", 623 | " \n", 624 | "After that, you can build the computation graph that transforms an input_batch:\n", 625 | "\n", 626 | "- [Look up](https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup) embeddings for an *input_batch* in the prepared *embedding_matrix*.\n", 627 | "- Pass the embeddings through [Bidirectional Dynamic RNN](https://www.tensorflow.org/api_docs/python/tf/nn/bidirectional_dynamic_rnn) with the specified forward and backward cells. Use the lengths placeholder here to avoid computations for padding tokens inside the RNN.\n", 628 | "- Create a dense layer on top. Its output will be used directly in loss function. \n", 629 | " \n", 630 | "Fill in the code below. In case you need to debug something, the easiest way is to check that tensor shapes of each step match the expected ones. 
\n", 631 | " " 632 | ] 633 | }, 634 | { 635 | "metadata": { 636 | "id": "a236YXAkClbQ", 637 | "colab_type": "code", 638 | "colab": {} 639 | }, 640 | "cell_type": "code", 641 | "source": [ 642 | "def build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):\n", 643 | " \"\"\"Specifies bi-LSTM architecture and computes logits for inputs.\"\"\"\n", 644 | " \n", 645 | " # Create embedding variable (tf.Variable) with dtype tf.float32\n", 646 | " initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)\n", 647 | " embedding_matrix_variable = tf.Variable(initial_embedding_matrix, dtype=tf.float32) ######### YOUR CODE HERE #############\n", 648 | " \n", 649 | " # Create RNN cells (for example, tf.nn.rnn_cell.BasicLSTMCell) with n_hidden_rnn number of units \n", 650 | " # and dropout (tf.nn.rnn_cell.DropoutWrapper), initializing all *_keep_prob with dropout placeholder.\n", 651 | " forward_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(n_hidden_rnn),\n", 652 | " input_keep_prob=self.dropout_ph,\n", 653 | " output_keep_prob=self.dropout_ph,\n", 654 | " state_keep_prob=self.dropout_ph) ######### YOUR CODE HERE #############\n", 655 | " \n", 656 | " backward_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(n_hidden_rnn),\n", 657 | " input_keep_prob=self.dropout_ph,\n", 658 | " output_keep_prob=self.dropout_ph,\n", 659 | " state_keep_prob=self.dropout_ph) ######### YOUR CODE HERE #############\n", 660 | "\n", 661 | " # Look up embeddings for self.input_batch (tf.nn.embedding_lookup).\n", 662 | " # Shape: [batch_size, sequence_len, embedding_dim].\n", 663 | " embeddings = tf.nn.embedding_lookup(embedding_matrix_variable, self.input_batch) ######### YOUR CODE HERE #############\n", 664 | " \n", 665 | " # Pass them through Bidirectional Dynamic RNN (tf.nn.bidirectional_dynamic_rnn).\n", 666 | " # Shape: [batch_size, sequence_len, 2 * n_hidden_rnn]. \n", 667 | " # Also don't forget to initialize sequence_length as self.lengths and dtype as tf.float32.\n", 668 | " (rnn_output_fw, rnn_output_bw), _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,embeddings,\n", 669 | " dtype=tf.float32,sequence_length=self.lengths)######### YOUR CODE HERE #############\n", 670 | " rnn_output = tf.concat([rnn_output_fw, rnn_output_bw], axis=2)\n", 671 | "\n", 672 | " # Dense layer on top.\n", 673 | " # Shape: [batch_size, sequence_len, n_tags]. \n", 674 | " self.logits = tf.layers.dense(rnn_output, n_tags, activation=None)" 675 | ], 676 | "execution_count": 0, 677 | "outputs": [] 678 | }, 679 | { 680 | "metadata": { 681 | "id": "hG8k1dNMClbT", 682 | "colab_type": "code", 683 | "colab": {} 684 | }, 685 | "cell_type": "code", 686 | "source": [ 687 | "BiLSTMModel.__build_layers = classmethod(build_layers)" 688 | ], 689 | "execution_count": 0, 690 | "outputs": [] 691 | }, 692 | { 693 | "metadata": { 694 | "id": "H71asa6yClba", 695 | "colab_type": "text" 696 | }, 697 | "cell_type": "markdown", 698 | "source": [ 699 | "To compute the actual predictions of the neural network, you need to apply [softmax](https://www.tensorflow.org/api_docs/python/tf/nn/softmax) to the last layer and find the most probable tags with [argmax](https://www.tensorflow.org/api_docs/python/tf/argmax)." 
700 | ] 701 | }, 702 | { 703 | "metadata": { 704 | "id": "83o_vtxeClba", 705 | "colab_type": "code", 706 | "colab": {} 707 | }, 708 | "cell_type": "code", 709 | "source": [ 710 | "def compute_predictions(self):\n", 711 | "    \"\"\"Transforms logits to probabilities and finds the most probable tags.\"\"\"\n", 712 | "    \n", 713 | "    # Create softmax (tf.nn.softmax) function\n", 714 | "    softmax_output = tf.nn.softmax(logits=self.logits) ######### YOUR CODE HERE #############\n", 715 | "    \n", 716 | "    # Use argmax (tf.argmax) to get the most probable tags\n", 717 | "    # Don't forget to set axis=-1\n", 718 | "    # otherwise argmax will be calculated over the wrong axis\n", 719 | "    self.predictions = tf.argmax(softmax_output, axis=-1) ######### YOUR CODE HERE #############" 720 | ], 721 | "execution_count": 0, 722 | "outputs": [] 723 | }, 724 | { 725 | "metadata": { 726 | "id": "GyLdLgYlClbd", 727 | "colab_type": "code", 728 | "colab": {} 729 | }, 730 | "cell_type": "code", 731 | "source": [ 732 | "BiLSTMModel.__compute_predictions = classmethod(compute_predictions)" 733 | ], 734 | "execution_count": 0, 735 | "outputs": [] 736 | }, 737 | { 738 | "metadata": { 739 | "id": "8BwNBstdClbg", 740 | "colab_type": "text" 741 | }, 742 | "cell_type": "markdown", 743 | "source": [ 744 | "During training we do not need the predictions of the network, but we do need a loss function. We will use [cross-entropy loss](http://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html#cross-entropy), efficiently implemented in TF as \n", 745 | "[cross entropy with logits](https://www.tensorflow.org/api_docs/python/tf/nn/softmax_cross_entropy_with_logits). Note that it should be applied to the logits of the model (not to softmax probabilities!). Also note that we do not want to take into account loss terms coming from `<PAD>` tokens, so we need to mask them out before computing the [mean](https://www.tensorflow.org/api_docs/python/tf/reduce_mean)." 746 | ] 747 | }, 748 | { 749 | "metadata": { 750 | "id": "64aK1J9BClbi", 751 | "colab_type": "code", 752 | "colab": {} 753 | }, 754 | "cell_type": "code", 755 | "source": [ 756 | "def compute_loss(self, n_tags, PAD_index):\n", 757 | "    \"\"\"Computes masked cross-entropy loss with logits.\"\"\"\n", 758 | "    \n", 759 | "    # Create cross entropy function (tf.nn.softmax_cross_entropy_with_logits)\n", 760 | "    ground_truth_tags_one_hot = tf.one_hot(self.ground_truth_tags, n_tags)\n", 761 | "    loss_tensor = tf.nn.softmax_cross_entropy_with_logits(labels=ground_truth_tags_one_hot,\n", 762 | "                                                          logits=self.logits) ######### YOUR CODE HERE #############\n", 763 | "    \n", 764 | "    mask = tf.cast(tf.not_equal(self.input_batch, PAD_index), tf.float32)\n", 765 | "    # Create loss function which doesn't operate with <PAD> tokens (tf.reduce_mean)\n", 766 | "    # Be careful: the argument of tf.reduce_mean should be\n", 767 | "    # the multiplication of mask and loss_tensor.\n", 768 | "    self.loss = tf.reduce_mean(mask * loss_tensor) ######### YOUR CODE HERE #############" 769 | ], 770 | "execution_count": 0, 771 | "outputs": [] 772 | }, 773 | { 774 | "metadata": { 775 | "id": "8VwcMg4DClbk", 776 | "colab_type": "code", 777 | "colab": {} 778 | }, 779 | "cell_type": "code", 780 | "source": [ 781 | "BiLSTMModel.__compute_loss = classmethod(compute_loss)" 782 | ], 783 | "execution_count": 0, 784 | "outputs": [] 785 | }, 786 | { 787 | "metadata": { 788 | "id": "IpIG_HXcClbn", 789 | "colab_type": "text" 790 | }, 791 | "cell_type": "markdown", 792 | "source": [ 793 | "The last thing to specify is how we want to optimize the loss. 
\n", 794 | "We suggest that you use [Adam](https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer) optimizer with a learning rate from the corresponding placeholder. \n", 795 | "You will also need to apply [clipping](https://www.tensorflow.org/api_guides/python/train#Gradient_Clipping) to eliminate exploding gradients. It can be easily done with [clip_by_norm](https://www.tensorflow.org/api_docs/python/tf/clip_by_norm) function. " 796 | ] 797 | }, 798 | { 799 | "metadata": { 800 | "id": "0MAtU6pIClbo", 801 | "colab_type": "code", 802 | "colab": {} 803 | }, 804 | "cell_type": "code", 805 | "source": [ 806 | "def perform_optimization(self):\n", 807 | " \"\"\"Specifies the optimizer and train_op for the model.\"\"\"\n", 808 | " \n", 809 | " # Create an optimizer (tf.train.AdamOptimizer)\n", 810 | " self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) ######### YOUR CODE HERE #############\n", 811 | " self.grads_and_vars = self.optimizer.compute_gradients(self.loss)\n", 812 | " \n", 813 | " # Gradient clipping (tf.clip_by_norm) for self.grads_and_vars\n", 814 | " # Pay attention that you need to apply this operation only for gradients \n", 815 | " # because self.grads_and_vars also contains variables.\n", 816 | " # list comprehension might be useful in this case.\n", 817 | " clip_norm = tf.cast(1.0, tf.float32)\n", 818 | " self.grads_and_vars = [(tf.clip_by_norm(g, clip_norm), v) for g,v in self.grads_and_vars] ######### YOUR CODE HERE #############\n", 819 | " \n", 820 | " self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)" 821 | ], 822 | "execution_count": 0, 823 | "outputs": [] 824 | }, 825 | { 826 | "metadata": { 827 | "id": "QpmcMY_GClbq", 828 | "colab_type": "code", 829 | "colab": {} 830 | }, 831 | "cell_type": "code", 832 | "source": [ 833 | "BiLSTMModel.__perform_optimization = classmethod(perform_optimization)" 834 | ], 835 | "execution_count": 0, 836 | "outputs": [] 837 | }, 838 | { 839 | "metadata": { 840 | "id": "p0y-LNqZClbs", 841 | "colab_type": "text" 842 | }, 843 | "cell_type": "markdown", 844 | "source": [ 845 | "Congratulations! You have specified all the parts of your network. You may have noticed, that we didn't deal with any real data yet, so what you have written is just recipes on how the network should function.\n", 846 | "Now we will put them to the constructor of our Bi-LSTM class to use it in the next section. 
" 847 | ] 848 | }, 849 | { 850 | "metadata": { 851 | "id": "UfiDFVc8Clbs", 852 | "colab_type": "code", 853 | "colab": {} 854 | }, 855 | "cell_type": "code", 856 | "source": [ 857 | "def init_model(self, vocabulary_size, n_tags, embedding_dim, n_hidden_rnn, PAD_index):\n", 858 | " self.__declare_placeholders()\n", 859 | " self.__build_layers(vocabulary_size, embedding_dim, n_hidden_rnn, n_tags)\n", 860 | " self.__compute_predictions()\n", 861 | " self.__compute_loss(n_tags, PAD_index)\n", 862 | " self.__perform_optimization()" 863 | ], 864 | "execution_count": 0, 865 | "outputs": [] 866 | }, 867 | { 868 | "metadata": { 869 | "id": "gjKuKdutClbv", 870 | "colab_type": "code", 871 | "colab": {} 872 | }, 873 | "cell_type": "code", 874 | "source": [ 875 | "BiLSTMModel.__init__ = classmethod(init_model)" 876 | ], 877 | "execution_count": 0, 878 | "outputs": [] 879 | }, 880 | { 881 | "metadata": { 882 | "id": "RxFYxAcdClbx", 883 | "colab_type": "text" 884 | }, 885 | "cell_type": "markdown", 886 | "source": [ 887 | "## Train the network and predict tags" 888 | ] 889 | }, 890 | { 891 | "metadata": { 892 | "id": "OPUTtwn-Clby", 893 | "colab_type": "text" 894 | }, 895 | "cell_type": "markdown", 896 | "source": [ 897 | "[Session.run](https://www.tensorflow.org/api_docs/python/tf/Session#run) is a point which initiates computations in the graph that we have defined. To train the network, we need to compute *self.train_op*, which was declared in *perform_optimization*. To predict tags, we just need to compute *self.predictions*. Anyway, we need to feed actual data through the placeholders that we defined before. " 898 | ] 899 | }, 900 | { 901 | "metadata": { 902 | "id": "H5O66R4fClbz", 903 | "colab_type": "code", 904 | "colab": {} 905 | }, 906 | "cell_type": "code", 907 | "source": [ 908 | "def train_on_batch(self, session, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability):\n", 909 | " feed_dict = {self.input_batch: x_batch,\n", 910 | " self.ground_truth_tags: y_batch,\n", 911 | " self.learning_rate_ph: learning_rate,\n", 912 | " self.dropout_ph: dropout_keep_probability,\n", 913 | " self.lengths: lengths}\n", 914 | " \n", 915 | " session.run(self.train_op, feed_dict=feed_dict)" 916 | ], 917 | "execution_count": 0, 918 | "outputs": [] 919 | }, 920 | { 921 | "metadata": { 922 | "id": "FE5YcNvXClb1", 923 | "colab_type": "code", 924 | "colab": {} 925 | }, 926 | "cell_type": "code", 927 | "source": [ 928 | "BiLSTMModel.train_on_batch = classmethod(train_on_batch)" 929 | ], 930 | "execution_count": 0, 931 | "outputs": [] 932 | }, 933 | { 934 | "metadata": { 935 | "id": "6gPxoxROClb4", 936 | "colab_type": "text" 937 | }, 938 | "cell_type": "markdown", 939 | "source": [ 940 | "Implement the function *predict_for_batch* by initializing *feed_dict* with input *x_batch* and *lengths* and running the *session* for *self.predictions*." 
941 | ] 942 | }, 943 | { 944 | "metadata": { 945 | "id": "37N9hiG5Clb4", 946 | "colab_type": "code", 947 | "colab": {} 948 | }, 949 | "cell_type": "code", 950 | "source": [ 951 | "def predict_for_batch(self, session, x_batch, lengths):\n", 952 | "    ######################################\n", 953 | "    ######### YOUR CODE HERE #############\n", 954 | "    ######################################\n", 955 | "    \n", 956 | "    predictions = session.run(self.predictions, feed_dict={\n", 957 | "        self.input_batch: x_batch,\n", 958 | "        self.lengths: lengths\n", 959 | "    })\n", 960 | "    \n", 961 | "    return predictions" 962 | ], 963 | "execution_count": 0, 964 | "outputs": [] 965 | }, 966 | { 967 | "metadata": { 968 | "id": "nPQX4_MUClb6", 969 | "colab_type": "code", 970 | "colab": {} 971 | }, 972 | "cell_type": "code", 973 | "source": [ 974 | "BiLSTMModel.predict_for_batch = classmethod(predict_for_batch)" 975 | ], 976 | "execution_count": 0, 977 | "outputs": [] 978 | }, 979 | { 980 | "metadata": { 981 | "id": "jY8vIF10Clb_", 982 | "colab_type": "text" 983 | }, 984 | "cell_type": "markdown", 985 | "source": [ 986 | "We have finished the necessary methods of our BiLSTMModel and are almost ready to start experimenting.\n", 987 | "\n", 988 | "### Evaluation \n", 989 | "To simplify the evaluation process we provide two functions for you:\n", 990 | " - *predict_tags*: uses a model to get predictions and transforms indices to tokens and tags;\n", 991 | " - *eval_conll*: calculates precision, recall and F1 for the results." 992 | ] 993 | }, 994 | { 995 | "metadata": { 996 | "id": "24cvaPYHClcA", 997 | "colab_type": "code", 998 | "colab": {} 999 | }, 1000 | "cell_type": "code", 1001 | "source": [ 1002 | "from evaluation import precision_recall_f1" 1003 | ], 1004 | "execution_count": 0, 1005 | "outputs": [] 1006 | }, 1007 | { 1008 | "metadata": { 1009 | "id": "z69gl_EUClcC", 1010 | "colab_type": "code", 1011 | "colab": {} 1012 | }, 1013 | "cell_type": "code", 1014 | "source": [ 1015 | "def predict_tags(model, session, token_idxs_batch, lengths):\n", 1016 | "    \"\"\"Performs predictions and transforms indices to tokens and tags.\"\"\"\n", 1017 | "    \n", 1018 | "    tag_idxs_batch = model.predict_for_batch(session, token_idxs_batch, lengths)\n", 1019 | "    \n", 1020 | "    tags_batch, tokens_batch = [], []\n", 1021 | "    for tag_idxs, token_idxs in zip(tag_idxs_batch, token_idxs_batch):\n", 1022 | "        tags, tokens = [], []\n", 1023 | "        for tag_idx, token_idx in zip(tag_idxs, token_idxs):\n", 1024 | "            tags.append(idx2tag[tag_idx])\n", 1025 | "            tokens.append(idx2token[token_idx])\n", 1026 | "        tags_batch.append(tags)\n", 1027 | "        tokens_batch.append(tokens)\n", 1028 | "    return tags_batch, tokens_batch\n", 1029 | "    \n", 1030 | "    \n", 1031 | "def eval_conll(model, session, tokens, tags, short_report=True):\n", 1032 | "    \"\"\"Computes NER quality measures using CONLL shared task script.\"\"\"\n", 1033 | "    \n", 1034 | "    y_true, y_pred = [], []\n", 1035 | "    for x_batch, y_batch, lengths in batches_generator(1, tokens, tags):\n", 1036 | "        tags_batch, tokens_batch = predict_tags(model, session, x_batch, lengths)\n", 1037 | "        if len(x_batch[0]) != len(tags_batch[0]):\n", 1038 | "            raise Exception(\"Incorrect length of prediction for the input, \"\n", 1039 | "                            \"expected length: %i, got: %i\" % (len(x_batch[0]), len(tags_batch[0])))\n", 1040 | "        predicted_tags = []\n", 1041 | "        ground_truth_tags = []\n", 1042 | "        for gt_tag_idx, pred_tag, token in zip(y_batch[0], tags_batch[0], tokens_batch[0]): \n", 1043 | "            if token != '<PAD>':\n", 1044 | "                
ground_truth_tags.append(idx2tag[gt_tag_idx])\n", 1045 | "                predicted_tags.append(pred_tag)\n", 1046 | "\n", 1047 | "        # We extend every prediction and ground truth sequence with the 'O' tag\n", 1048 | "        # to indicate a possible end of entity.\n", 1049 | "        y_true.extend(ground_truth_tags + ['O'])\n", 1050 | "        y_pred.extend(predicted_tags + ['O'])\n", 1051 | "    \n", 1052 | "    results = precision_recall_f1(y_true, y_pred, print_results=True, short_report=short_report)\n", 1053 | "    return results" 1054 | ], 1055 | "execution_count": 0, 1056 | "outputs": [] 1057 | }, 1058 | { 1059 | "metadata": { 1060 | "id": "6d4gejKKClcH", 1061 | "colab_type": "text" 1062 | }, 1063 | "cell_type": "markdown", 1064 | "source": [ 1065 | "## Run your experiment" 1066 | ] 1067 | }, 1068 | { 1069 | "metadata": { 1070 | "id": "Lg1ZaBdAClcH", 1071 | "colab_type": "text" 1072 | }, 1073 | "cell_type": "markdown", 1074 | "source": [ 1075 | "Create a *BiLSTMModel* model with the following parameters:\n", 1076 | " - *vocabulary_size* — number of tokens;\n", 1077 | " - *n_tags* — number of tags;\n", 1078 | " - *embedding_dim* — dimension of embeddings, recommended value: 200;\n", 1079 | " - *n_hidden_rnn* — size of hidden layers for RNN, recommended value: 200;\n", 1080 | " - *PAD_index* — an index of the padding token (`<PAD>`).\n", 1081 | "\n", 1082 | "Set hyperparameters. You might want to start with the following recommended values:\n", 1083 | "- *batch_size*: 32;\n", 1084 | "- 4 epochs;\n", 1085 | "- starting value of *learning_rate*: 0.005;\n", 1086 | "- *learning_rate_decay*: the square root of 2;\n", 1087 | "- *dropout_keep_probability*: try several values: 0.1, 0.5, 0.9.\n", 1088 | "\n", 1089 | "However, feel free to conduct more experiments to tune hyperparameters and earn extra points for the assignment."
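, "\n", "With the recommended values, the learning rate is divided by $\\sqrt{2}$ after every epoch, so over 4 epochs it decays roughly as follows (a quick sanity check of the schedule):\n", "\n", "```python\n", "lr = 0.005\n", "for epoch in range(4):\n", "    print(round(lr, 5))  # 0.005, 0.00354, 0.0025, 0.00177\n", "    lr /= np.sqrt(2)\n", "```"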
1090 | ] 1091 | }, 1092 | { 1093 | "metadata": { 1094 | "id": "eeS2HiH6ClcH", 1095 | "colab_type": "code", 1096 | "colab": { 1097 | "base_uri": "https://localhost:8080/", 1098 | "height": 292 1099 | }, 1100 | "outputId": "62f541e8-8fd1-497f-94c5-bb93900924bc" 1101 | }, 1102 | "cell_type": "code", 1103 | "source": [ 1104 | "tf.reset_default_graph()\n", 1105 | "\n", 1106 | "model = BiLSTMModel(len(token2idx), len(tag2idx), 200, 200, token2idx['<PAD>'])\n", 1107 | "######### YOUR CODE HERE #############\n", 1108 | "\n", 1109 | "batch_size = 32  ######### YOUR CODE HERE #############\n", 1110 | "n_epochs = 4  ######### YOUR CODE HERE #############\n", 1111 | "learning_rate = 0.005  ######### YOUR CODE HERE #############\n", 1112 | "learning_rate_decay = np.sqrt(2)  ######### YOUR CODE HERE #############\n", 1113 | "dropout_keep_probability = 0.5  # 0.1, 0.5, 0.9  ######### YOUR CODE HERE #############" 1114 | ], 1115 | "execution_count": 31, 1116 | "outputs": [ 1117 | { 1118 | "output_type": "stream", 1119 | "text": [ 1120 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/rnn.py:430: calling reverse_sequence (from tensorflow.python.ops.array_ops) with seq_dim is deprecated and will be removed in a future version.\n", 1121 | "Instructions for updating:\n", 1122 | "seq_dim is deprecated, use seq_axis instead\n", 1123 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/util/deprecation.py:454: calling reverse_sequence (from tensorflow.python.ops.array_ops) with batch_dim is deprecated and will be removed in a future version.\n", 1124 | "Instructions for updating:\n", 1125 | "batch_dim is deprecated, use batch_axis instead\n", 1126 | "WARNING:tensorflow:From :7: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.\n", 1127 | "Instructions for updating:\n", 1128 | "\n", 1129 | "Future major versions of TensorFlow will allow gradients to flow\n", 1130 | "into the labels input on backprop by default.\n", 1131 | "\n", 1132 | "See @{tf.nn.softmax_cross_entropy_with_logits_v2}.\n", 1133 | "\n" 1134 | ], 1135 | "name": "stdout" 1136 | } 1137 | ] 1138 | }, 1139 | { 1140 | "metadata": { 1141 | "id": "dqo6aAczClcJ", 1142 | "colab_type": "text" 1143 | }, 1144 | "cell_type": "markdown", 1145 | "source": [ 1146 | "If you get the error *\"Tensor conversion requested dtype float64 for Tensor with dtype float32\"* at this point, check whether any of your variables were initialized without a dtype, and set dtype to *tf.float32* for them." 1147 | ] 1148 | }, 1149 | { 1150 | "metadata": { 1151 | "id": "X1jNZq0kClcK", 1152 | "colab_type": "text" 1153 | }, 1154 | "cell_type": "markdown", 1155 | "source": [ 1156 | "Finally, we are ready to run the training!" 1157 | ] 1158 | }, 1159 | { 1160 | "metadata": { 1161 | "id": "NM3ivTGvClcK", 1162 | "colab_type": "code", 1163 | "colab": { 1164 | "base_uri": "https://localhost:8080/", 1165 | "height": 872 1166 | }, 1167 | "outputId": "63ed5ebc-1869-4bb2-fa1b-0d30a52d39db" 1168 | }, 1169 | "cell_type": "code", 1170 | "source": [ 1171 | "sess = tf.Session()\n", 1172 | "sess.run(tf.global_variables_initializer())\n", 1173 | "\n", 1174 | "print('Start training... 
\\n')\n", 1175 | "for epoch in range(n_epochs):\n", 1176 | " # For each epoch evaluate the model on train and validation data\n", 1177 | " print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(n_epochs) + '-' * 20)\n", 1178 | " print('Train data evaluation:')\n", 1179 | " eval_conll(model, sess, train_tokens, train_tags, short_report=True)\n", 1180 | " print('Validation data evaluation:')\n", 1181 | " eval_conll(model, sess, validation_tokens, validation_tags, short_report=True)\n", 1182 | " \n", 1183 | " # Train the model\n", 1184 | " for x_batch, y_batch, lengths in batches_generator(batch_size, train_tokens, train_tags):\n", 1185 | " model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability)\n", 1186 | " \n", 1187 | " # Decaying the learning rate\n", 1188 | " learning_rate = learning_rate / learning_rate_decay\n", 1189 | " \n", 1190 | "print('...training finished.')" 1191 | ], 1192 | "execution_count": 32, 1193 | "outputs": [ 1194 | { 1195 | "output_type": "stream", 1196 | "text": [ 1197 | "Start training... \n", 1198 | "\n", 1199 | "-------------------- Epoch 1 of 4 --------------------\n", 1200 | "Train data evaluation:\n", 1201 | "processed 105778 tokens with 4489 phrases; found: 65993 phrases; correct: 153.\n", 1202 | "\n", 1203 | "precision: 0.23%; recall: 3.41%; F1: 0.43\n", 1204 | "\n", 1205 | "Validation data evaluation:\n", 1206 | "processed 12836 tokens with 537 phrases; found: 7922 phrases; correct: 21.\n", 1207 | "\n", 1208 | "precision: 0.27%; recall: 3.91%; F1: 0.50\n", 1209 | "\n", 1210 | "-------------------- Epoch 2 of 4 --------------------\n", 1211 | "Train data evaluation:\n", 1212 | "processed 105778 tokens with 4489 phrases; found: 778 phrases; correct: 241.\n", 1213 | "\n", 1214 | "precision: 30.98%; recall: 5.37%; F1: 9.15\n", 1215 | "\n", 1216 | "Validation data evaluation:\n", 1217 | "processed 12836 tokens with 537 phrases; found: 78 phrases; correct: 26.\n", 1218 | "\n", 1219 | "precision: 33.33%; recall: 4.84%; F1: 8.46\n", 1220 | "\n", 1221 | "-------------------- Epoch 3 of 4 --------------------\n", 1222 | "Train data evaluation:\n", 1223 | "processed 105778 tokens with 4489 phrases; found: 4403 phrases; correct: 1112.\n", 1224 | "\n", 1225 | "precision: 25.26%; recall: 24.77%; F1: 25.01\n", 1226 | "\n", 1227 | "Validation data evaluation:\n", 1228 | "processed 12836 tokens with 537 phrases; found: 308 phrases; correct: 94.\n", 1229 | "\n", 1230 | "precision: 30.52%; recall: 17.50%; F1: 22.25\n", 1231 | "\n", 1232 | "-------------------- Epoch 4 of 4 --------------------\n", 1233 | "Train data evaluation:\n", 1234 | "processed 105778 tokens with 4489 phrases; found: 5112 phrases; correct: 1554.\n", 1235 | "\n", 1236 | "precision: 30.40%; recall: 34.62%; F1: 32.37\n", 1237 | "\n", 1238 | "Validation data evaluation:\n", 1239 | "processed 12836 tokens with 537 phrases; found: 502 phrases; correct: 127.\n", 1240 | "\n", 1241 | "precision: 25.30%; recall: 23.65%; F1: 24.45\n", 1242 | "\n", 1243 | "...training finished.\n" 1244 | ], 1245 | "name": "stdout" 1246 | } 1247 | ] 1248 | }, 1249 | { 1250 | "metadata": { 1251 | "id": "lVgPcracClcM", 1252 | "colab_type": "text" 1253 | }, 1254 | "cell_type": "markdown", 1255 | "source": [ 1256 | "Now let us see full quality reports for the final model on train, validation, and test sets. 
To give you a hint whether you have implemented everything correctly, you might expect an F-score of about 40% on the validation set.\n", 1257 | "\n", 1258 | "**The output of the cell below (as well as the output of all the other cells) should be present in the notebook for peer-to-peer review!**" 1259 | ] 1260 | }, 1261 | { 1262 | "metadata": { 1263 | "id": "NJgEfgueClcN", 1264 | "colab_type": "code", 1265 | "colab": { 1266 | "base_uri": "https://localhost:8080/", 1267 | "height": 1381 1268 | }, 1269 | "outputId": "2b0713f0-b66e-41fc-df7d-7e35b7bde26b" 1270 | }, 1271 | "cell_type": "code", 1272 | "source": [ 1273 | "print('-' * 20 + ' Train set quality: ' + '-' * 20)\n", 1274 | "train_results = eval_conll(model, sess, train_tokens, train_tags, short_report=False)\n", 1275 | "\n", 1276 | "print('-' * 20 + ' Validation set quality: ' + '-' * 20)\n", 1277 | "validation_results = eval_conll(model, sess, validation_tokens, validation_tags, short_report=False)\n", 1278 | "######### YOUR CODE HERE #############\n", 1279 | "\n", 1280 | "print('-' * 20 + ' Test set quality: ' + '-' * 20)\n", 1281 | "test_results = eval_conll(model, sess, test_tokens, test_tags, short_report=False)\n", 1282 | "######### YOUR CODE HERE #############" 1283 | ], 1284 | "execution_count": 34, 1285 | "outputs": [ 1286 | { 1287 | "output_type": "stream", 1288 | "text": [ 1289 | "-------------------- Train set quality: --------------------\n", 1290 | "processed 105778 tokens with 4489 phrases; found: 5072 phrases; correct: 2110.\n", 1291 | "\n", 1292 | "precision: 41.60%; recall: 47.00%; F1: 44.14\n", 1293 | "\n", 1294 | "\t company: precision: 63.60%; recall: 75.27%; F1: 68.95; predicted: 761\n", 1295 | "\n", 1296 | "\t facility: precision: 41.79%; recall: 53.50%; F1: 46.93; predicted: 402\n", 1297 | "\n", 1298 | "\t geo-loc: precision: 65.13%; recall: 89.46%; F1: 75.38; predicted: 1368\n", 1299 | "\n", 1300 | "\t movie: precision: 0.00%; recall: 0.00%; F1: 0.00; predicted: 0\n", 1301 | "\n", 1302 | "\t musicartist: precision: 0.00%; recall: 0.00%; F1: 0.00; predicted: 0\n", 1303 | "\n", 1304 | "\t other: precision: 27.98%; recall: 62.09%; F1: 38.57; predicted: 1680\n", 1305 | "\n", 1306 | "\t person: precision: 1.04%; recall: 0.68%; F1: 0.82; predicted: 579\n", 1307 | "\n", 1308 | "\t product: precision: 22.48%; recall: 15.41%; F1: 18.28; predicted: 218\n", 1309 | "\n", 1310 | "\t sportsteam: precision: 65.62%; recall: 19.35%; F1: 29.89; predicted: 64\n", 1311 | "\n", 1312 | "\t tvshow: precision: 0.00%; recall: 0.00%; F1: 0.00; predicted: 0\n", 1313 | "\n", 1314 | "-------------------- Validation set quality: --------------------\n", 1315 | "processed 12836 tokens with 537 phrases; found: 644 phrases; correct: 138.\n", 1316 | "\n", 1317 | "precision: 21.43%; recall: 25.70%; F1: 23.37\n", 1318 | "\n", 1319 | "\t company: precision: 44.55%; recall: 47.12%; F1: 45.79; predicted: 110\n", 1320 | "\n", 1321 | "\t facility: precision: 22.50%; recall: 26.47%; F1: 24.32; predicted: 40\n", 1322 | "\n", 1323 | "\t geo-loc: precision: 52.73%; recall: 51.33%; F1: 52.02; predicted: 110\n", 1324 | "\n", 1325 | "\t movie: precision: 0.00%; recall: 0.00%; F1: 0.00; predicted: 0\n", 1326 | "\n", 1327 | "\t musicartist: precision: 0.00%; recall: 0.00%; F1: 0.00; predicted: 1\n", 1328 | "\n", 1329 | "\t other: precision: 6.93%; recall: 23.46%; F1: 10.70; predicted: 274\n", 1330 | "\n", 1331 | "\t person: precision: 1.18%; recall: 0.89%; F1: 1.02; predicted: 85\n", 1332 | "\n", 1333 | "\t product: precision: 0.00%; recall: 0.00%; F1: 0.00; 
predicted: 20\n", 1334 | "\n", 1335 | "\t sportsteam: precision: 50.00%; recall: 10.00%; F1: 16.67; predicted: 4\n", 1336 | "\n", 1337 | "\t tvshow: precision: 0.00%; recall: 0.00%; F1: 0.00; predicted: 0\n", 1338 | "\n", 1339 | "-------------------- Test set quality: --------------------\n", 1340 | "processed 13258 tokens with 604 phrases; found: 767 phrases; correct: 166.\n", 1341 | "\n", 1342 | "precision: 21.64%; recall: 27.48%; F1: 24.22\n", 1343 | "\n", 1344 | "\t company: precision: 39.08%; recall: 40.48%; F1: 39.77; predicted: 87\n", 1345 | "\n", 1346 | "\t facility: precision: 19.67%; recall: 25.53%; F1: 22.22; predicted: 61\n", 1347 | "\n", 1348 | "\t geo-loc: precision: 62.86%; recall: 53.33%; F1: 57.70; predicted: 140\n", 1349 | "\n", 1350 | "\t movie: precision: 0.00%; recall: 0.00%; F1: 0.00; predicted: 0\n", 1351 | "\n", 1352 | "\t musicartist: precision: 0.00%; recall: 0.00%; F1: 0.00; predicted: 1\n", 1353 | "\n", 1354 | "\t other: precision: 8.01%; recall: 30.10%; F1: 12.65; predicted: 387\n", 1355 | "\n", 1356 | "\t person: precision: 0.00%; recall: 0.00%; F1: 0.00; predicted: 55\n", 1357 | "\n", 1358 | "\t product: precision: 0.00%; recall: 0.00%; F1: 0.00; predicted: 32\n", 1359 | "\n", 1360 | "\t sportsteam: precision: 25.00%; recall: 3.23%; F1: 5.71; predicted: 4\n", 1361 | "\n", 1362 | "\t tvshow: precision: 0.00%; recall: 0.00%; F1: 0.00; predicted: 0\n", 1363 | "\n" 1364 | ], 1365 | "name": "stdout" 1366 | } 1367 | ] 1368 | }, 1369 | { 1370 | "metadata": { 1371 | "id": "xFadtd70ClcQ", 1372 | "colab_type": "text" 1373 | }, 1374 | "cell_type": "markdown", 1375 | "source": [ 1376 | "### Conclusions\n", 1377 | "\n", 1378 | "Could we say that our model is state of the art and the results are acceptable for the task? Definitely, we can say so. Nowadays, Bi-LSTM is one of the state-of-the-art approaches for solving the NER problem and it outperforms other classical methods. Despite the fact that we used a small training corpus (in comparison with the usual corpora sizes in Deep Learning), our results are quite good. In addition, in this task there are many possible named entities, and for some of them we have only several dozen training examples, which is definitely not much. However, the implemented model outperforms classical CRFs for this task. Even better results could be obtained by combining several types of methods, e.g. see [this](https://arxiv.org/abs/1603.01354) paper if you are interested." 1379 | ] 1380 | } 1381 | ] 1382 | } -------------------------------------------------------------------------------- /week 4/week4_seq2seq.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "week4-seq2seq.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "toc_visible": true 10 | }, 11 | "kernelspec": { 12 | "display_name": "Python 3", 13 | "language": "python", 14 | "name": "python3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "metadata": { 20 | "id": "-g_T1vgOlKJ5", 21 | "colab_type": "text" 22 | }, 23 | "cell_type": "markdown", 24 | "source": [ 25 | "# Learn to calculate with seq2seq model\n", 26 | "\n", 27 | "In this assignment, you will learn how to use neural networks to solve sequence-to-sequence prediction tasks. 
Seq2Seq models are very popular these days because they achieve great results in Machine Translation, Text Summarization, Conversational Modeling, and more.\n", 28 | "\n", 29 | "Using sequence-to-sequence modeling, you are going to build a calculator for evaluating arithmetic expressions, taking an equation as the input to the neural network and producing an answer as its output.\n", 30 | "\n", 31 | "The resulting solution for this problem will be based on state-of-the-art approaches for sequence-to-sequence learning and you should be able to easily adapt it to solve other tasks. However, if you want to train your own machine translation system or an intelligent chatbot, it would be useful to have access to compute resources like a GPU, and to be patient, because training such systems is usually time-consuming. \n", 32 | "\n", 33 | "### Libraries\n", 34 | "\n", 35 | "For this task you will need the following libraries:\n", 36 | " - [TensorFlow](https://www.tensorflow.org) — an open-source software library for Machine Intelligence.\n", 37 | " - [scikit-learn](http://scikit-learn.org/stable/index.html) — a tool for data mining and data analysis.\n", 38 | " \n", 39 | "If you have never worked with TensorFlow, you will probably want to read some tutorials during your work on this assignment, e.g. the [Neural Machine Translation](https://www.tensorflow.org/tutorials/seq2seq) tutorial deals with a very similar task and can explain some concepts to you. " 40 | ] 41 | }, 42 | { 43 | "metadata": { 44 | "id": "3ZKpV6dFlKJ7", 45 | "colab_type": "text" 46 | }, 47 | "cell_type": "markdown", 48 | "source": [ 49 | "### Data\n", 50 | "\n", 51 | "One benefit of this task is that you don't need to download any data — you will generate it on your own! We will use two operators (addition and subtraction) and work with positive integer numbers in some range. Here are examples of correct inputs and outputs:\n", 52 | "\n", 53 | "    Input: '1+2'\n", 54 | "    Output: '3'\n", 55 | "    \n", 56 | "    Input: '0-99'\n", 57 | "    Output: '-99'\n", 58 | "\n", 59 | "*Note that there are no spaces between operators and operands.*\n", 60 | "\n", 61 | "\n", 62 | "Now you need to implement the function *generate_equations*, which will be used to generate the data." 63 | ] 64 | }, 65 | { 66 | "metadata": { 67 | "id": "Mb0x0WXRmDaS", 68 | "colab_type": "code", 69 | "colab": { 70 | "base_uri": "https://localhost:8080/", 71 | "height": 217 72 | }, 73 | "outputId": "3af234ca-156a-4519-a8d8-1d1d93d0c6a4" 74 | }, 75 | "cell_type": "code", 76 | "source": [ 77 | "! wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py -O setup_google_colab.py\n", 78 | "import setup_google_colab\n", 79 | "setup_google_colab.setup_week4()  # change to the week you're working on" 80 | ], 81 | "execution_count": 1, 82 | "outputs": [ 83 | { 84 | "output_type": "stream", 85 | "text": [ 86 | "--2018-09-02 14:03:45--  https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py\n", 87 | "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", 88 | "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", 89 | "HTTP request sent, awaiting response... 200 OK\n", 90 | "Length: 2330 (2.3K) [text/plain]\n", 91 | "Saving to: ‘setup_google_colab.py’\n", 92 | "\n", 93 | "setup_google_colab. 
100%[===================>]   2.28K  --.-KB/s    in 0s      \n", 94 | "\n", 95 | "2018-09-02 14:03:45 (34.6 MB/s) - ‘setup_google_colab.py’ saved [2330/2330]\n", 96 | "\n" 97 | ], 98 | "name": "stdout" 99 | } 100 | ] 101 | }, 102 | { 103 | "metadata": { 104 | "id": "XfFj8o3NlKJ7", 105 | "colab_type": "code", 106 | "colab": {} 107 | }, 108 | "cell_type": "code", 109 | "source": [ 110 | "import random" 111 | ], 112 | "execution_count": 0, 113 | "outputs": [] 114 | }, 115 | { 116 | "metadata": { 117 | "id": "J7Ig895wlKJ_", 118 | "colab_type": "code", 119 | "colab": {} 120 | }, 121 | "cell_type": "code", 122 | "source": [ 123 | "def generate_equations(allowed_operators, dataset_size, min_value, max_value):\n", 124 | "    \"\"\"Generates pairs of equations and solutions to them.\n", 125 | "    \n", 126 | "    Each equation has a form of two integers with an operator in between.\n", 127 | "    Each solution is an integer with the result of the operation.\n", 128 | "    \n", 129 | "    allowed_operators: list of strings, allowed operators.\n", 130 | "    dataset_size: an integer, number of equations to be generated.\n", 131 | "    min_value: an integer, min value of each operand.\n", 132 | "    max_value: an integer, max value of each operand.\n", 133 | "\n", 134 | "    result: a list of tuples of strings (equation, solution).\n", 135 | "    \"\"\"\n", 136 | "    sample = []\n", 137 | "    for _ in range(dataset_size):\n", 138 | "        ######################################\n", 139 | "        ######### YOUR CODE HERE #############\n", 140 | "        ######################################\n", 141 | "        l = random.randint(a=min_value, b=max_value)  # left operand\n", 142 | "        r = random.randint(a=min_value, b=max_value)  # right operand\n", 143 | "        o = random.choice(allowed_operators)\n", 144 | "        if o == '-':\n", 145 | "            solu = l - r\n", 146 | "        else:\n", 147 | "            solu = l + r\n", 148 | "        sample.append((str(l) + o + str(r), str(solu)))\n", 149 | "    return sample" 150 | ], 151 | "execution_count": 0, 152 | "outputs": [] 153 | }, 154 | { 155 | "metadata": { 156 | "id": "_V8FhDUtlKKD", 157 | "colab_type": "text" 158 | }, 159 | "cell_type": "markdown", 160 | "source": [ 161 | "To check the correctness of your implementation, use the *test_generate_equations* function:" 162 | ] 163 | }, 164 | { 165 | "metadata": { 166 | "id": "dHPLB_nIlKKD", 167 | "colab_type": "code", 168 | "colab": {} 169 | }, 170 | "cell_type": "code", 171 | "source": [ 172 | "def test_generate_equations():\n", 173 | "    allowed_operators = ['+', '-']\n", 174 | "    dataset_size = 10\n", 175 | "    for (input_, output_) in generate_equations(allowed_operators, dataset_size, 0, 100):\n", 176 | "        if not (type(input_) is str and type(output_) is str):\n", 177 | "            return \"Both parts should be strings.\"\n", 178 | "        if eval(input_) != int(output_):\n", 179 | "            return \"The (equation: {!r}, solution: {!r}) pair is incorrect.\".format(input_, output_)\n", 180 | "    return \"Tests passed.\"" 181 | ], 182 | "execution_count": 0, 183 | "outputs": [] 184 | }, 185 | { 186 | "metadata": { 187 | "id": "GpJgfQXvlKKG", 188 | "colab_type": "code", 189 | "colab": { 190 | "base_uri": "https://localhost:8080/", 191 | "height": 35 192 | }, 193 | "outputId": "a1834b78-6b51-44e2-ecfb-8d4b1151c51d" 194 | }, 195 | "cell_type": "code", 196 | "source": [ 197 | "print(test_generate_equations())" 198 | ], 199 | "execution_count": 5, 200 | "outputs": [ 201 | { 202 | "output_type": "stream", 203 | "text": [ 204 | "Tests passed.\n" 205 | ], 206 | "name": "stdout" 207 | } 208 | ] 209 | }, 210 | { 211 | "metadata": { 212 | "id": "lj53a66vlKKI", 213 | "colab_type": "text" 214 | }, 215 | 
"cell_type": "markdown", 216 | "source": [ 217 | "Finally, we are ready to generate the train and test data for the neural network:" 218 | ] 219 | }, 220 | { 221 | "metadata": { 222 | "id": "9oGLqHe3lKKK", 223 | "colab_type": "code", 224 | "colab": {} 225 | }, 226 | "cell_type": "code", 227 | "source": [ 228 | "from sklearn.model_selection import train_test_split" 229 | ], 230 | "execution_count": 0, 231 | "outputs": [] 232 | }, 233 | { 234 | "metadata": { 235 | "id": "3lyXkKMKlKKM", 236 | "colab_type": "code", 237 | "colab": {} 238 | }, 239 | "cell_type": "code", 240 | "source": [ 241 | "allowed_operators = ['+', '-']\n", 242 | "dataset_size = 100000\n", 243 | "data = generate_equations(allowed_operators, dataset_size, min_value=0, max_value=9999)\n", 244 | "\n", 245 | "train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)" 246 | ], 247 | "execution_count": 0, 248 | "outputs": [] 249 | }, 250 | { 251 | "metadata": { 252 | "id": "2hw7ujoulKKP", 253 | "colab_type": "text" 254 | }, 255 | "cell_type": "markdown", 256 | "source": [ 257 | "## Prepare data for the neural network\n", 258 | "\n", 259 | "The next stage of data preparation is creating mappings of the characters to their indices in some vocabulary. Since in our task we already know which symbols will appear in the inputs and outputs, generating the vocabulary is a simple step.\n", 260 | "\n", 261 | "#### How to create dictionaries for other task\n", 262 | "\n", 263 | "First of all, you need to understand what is the basic unit of the sequence in your task. In our case, we operate on symbols and the basic unit is a symbol. The number of symbols is small, so we don't need to think about filtering/normalization steps. However, in other tasks, the basic unit is often a word, and in this case the mapping would be *word $\\to$ integer*. The number of words might be huge, so it would be reasonable to filter them, for example, by frequency and leave only the frequent ones. Other strategies that your should consider are: data normalization (lowercasing, tokenization, how to consider punctuation marks), separate vocabulary for input and for output (e.g. for machine translation), some specifics of the task." 264 | ] 265 | }, 266 | { 267 | "metadata": { 268 | "id": "tXs4ODA-lKKQ", 269 | "colab_type": "code", 270 | "colab": {} 271 | }, 272 | "cell_type": "code", 273 | "source": [ 274 | "word2id = {symbol:i for i, symbol in enumerate('#^$+-1234567890')}\n", 275 | "id2word = {i:symbol for symbol, i in word2id.items()}" 276 | ], 277 | "execution_count": 0, 278 | "outputs": [] 279 | }, 280 | { 281 | "metadata": { 282 | "id": "o25frF3XlKKT", 283 | "colab_type": "text" 284 | }, 285 | "cell_type": "markdown", 286 | "source": [ 287 | "#### Special symbols" 288 | ] 289 | }, 290 | { 291 | "metadata": { 292 | "id": "V1FzRlgOlKKU", 293 | "colab_type": "code", 294 | "colab": {} 295 | }, 296 | "cell_type": "code", 297 | "source": [ 298 | "start_symbol = '^'\n", 299 | "end_symbol = '$'\n", 300 | "padding_symbol = '#'" 301 | ], 302 | "execution_count": 0, 303 | "outputs": [] 304 | }, 305 | { 306 | "metadata": { 307 | "id": "RBbI2w3NlKKX", 308 | "colab_type": "text" 309 | }, 310 | "cell_type": "markdown", 311 | "source": [ 312 | "You could notice that we have added 3 special symbols: '^', '\\$' and '#':\n", 313 | "- '^' symbol will be passed to the network to indicate the beginning of the decoding procedure. 
We will discuss this one in more detail later.\n", 314 | "- '\\$' symbol will be used to indicate the *end of a string*, both for input and output sequences. \n", 315 | "- '#' symbol will be used as a *padding* character to make lengths of all strings equal within one training batch.\n", 316 | "\n", 317 | "Conventions for special symbols in encoder-decoder networks differ a bit from author to author, so don't get confused if you come across other variants in the tutorials you read. " 318 | ] 319 | }, 320 | { 321 | "metadata": { 322 | "id": "G0c8CNTllKKY", 323 | "colab_type": "text" 324 | }, 325 | "cell_type": "markdown", 326 | "source": [ 327 | "#### Padding" 328 | ] 329 | }, 330 | { 331 | "metadata": { 332 | "id": "Vi9tWslAlKKZ", 333 | "colab_type": "text" 334 | }, 335 | "cell_type": "markdown", 336 | "source": [ 337 | "When vocabularies are ready, we need to be able to convert a sentence to a list of vocabulary word indices and back. At the same time, let's take care of padding. We are going to preprocess each sequence from the input (and output ground truth) in such a way that:\n", 338 | "- it has a predefined length *padded_len*\n", 339 | "- it is cut off or padded with the *padding symbol* '#' as needed\n", 340 | "- it *always* ends with the *end symbol* '$'\n", 341 | "\n", 342 | "We will treat the original characters of the sequence **and the end symbol** as the valid part of the input. We will store *the actual length* of the sequence, which includes the end symbol, but does not include the padding symbols. " 343 | ] 344 | }, 345 | { 346 | "metadata": { 347 | "id": "NFYhUzxtlKKa", 348 | "colab_type": "text" 349 | }, 350 | "cell_type": "markdown", 351 | "source": [ 352 | " Now you need to implement the function *sentence_to_ids* that does the described job. 
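For instance, with the vocabulary defined above the expected behaviour is as follows (a worked example of the specification, assuming *padded_len=8*):\n", "\n", "```python\n", "sentence_to_ids('123+4', word2id, padded_len=8)\n", "# -> ([5, 6, 7, 3, 8, 2, 0, 0], 6): the ids of '1', '2', '3', '+', '4',\n", "#    then the end symbol '$' (counted in the length), then '#' padding.\n", "```\n", "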
" 353 | ] 354 | }, 355 | { 356 | "metadata": { 357 | "id": "1n_NXv2TlKKa", 358 | "colab_type": "code", 359 | "colab": {} 360 | }, 361 | "cell_type": "code", 362 | "source": [ 363 | "def sentence_to_ids(sentence, word2id, padded_len):\n", 364 | " \"\"\" Converts a sequence of symbols to a padded sequence of their ids.\n", 365 | " \n", 366 | " sentence: a string, input/output sequence of symbols.\n", 367 | " word2id: a dict, a mapping from original symbols to ids.\n", 368 | " padded_len: an integer, a desirable length of the sequence.\n", 369 | "\n", 370 | " result: a tuple of (a list of ids, an actual length of sentence).\n", 371 | " \"\"\"\n", 372 | " \n", 373 | " sent_ids = [word2id[end_symbol] for i in range(padded_len)]\n", 374 | " length = padded_len-1 if padded_len-1 < len(sentence) else len(sentence)\n", 375 | " for i in range(length):\n", 376 | " sent_ids[i] = word2id[sentence[i]]\n", 377 | " for i in range(length+1,padded_len):\n", 378 | " sent_ids[i] = word2id[padding_symbol]\n", 379 | " sent_len = length + 1 \n", 380 | " \n", 381 | " return sent_ids, sent_len" 382 | ], 383 | "execution_count": 0, 384 | "outputs": [] 385 | }, 386 | { 387 | "metadata": { 388 | "id": "he-nnQTllKKd", 389 | "colab_type": "text" 390 | }, 391 | "cell_type": "markdown", 392 | "source": [ 393 | "Check that your implementation is correct:" 394 | ] 395 | }, 396 | { 397 | "metadata": { 398 | "id": "cQDfVYqVlKKd", 399 | "colab_type": "code", 400 | "colab": {} 401 | }, 402 | "cell_type": "code", 403 | "source": [ 404 | "def test_sentence_to_ids():\n", 405 | " sentences = [(\"123+123\", 7), (\"123+123\", 8), (\"123+123\", 10)]\n", 406 | " expected_output = [([5, 6, 7, 3, 5, 6, 2], 7), \n", 407 | " ([5, 6, 7, 3, 5, 6, 7, 2], 8), \n", 408 | " ([5, 6, 7, 3, 5, 6, 7, 2, 0, 0], 8)] \n", 409 | " for (sentence, padded_len), (sentence_ids, expected_length) in zip(sentences, expected_output):\n", 410 | " output, length = sentence_to_ids(sentence, word2id, padded_len)\n", 411 | " if output != sentence_ids:\n", 412 | " return(\"Convertion of '{}' for padded_len={} to {} is incorrect.\".format(\n", 413 | " sentence, padded_len, output))\n", 414 | " if length != expected_length:\n", 415 | " return(\"Convertion of '{}' for padded_len={} has incorrect actual length {}.\".format(\n", 416 | " sentence, padded_len, length))\n", 417 | " return(\"Tests passed.\")" 418 | ], 419 | "execution_count": 0, 420 | "outputs": [] 421 | }, 422 | { 423 | "metadata": { 424 | "id": "GHonr45rlKKg", 425 | "colab_type": "code", 426 | "colab": { 427 | "base_uri": "https://localhost:8080/", 428 | "height": 35 429 | }, 430 | "outputId": "659eea68-b0de-4047-cc5d-2b08ef21c7ef" 431 | }, 432 | "cell_type": "code", 433 | "source": [ 434 | "print(test_sentence_to_ids())" 435 | ], 436 | "execution_count": 12, 437 | "outputs": [ 438 | { 439 | "output_type": "stream", 440 | "text": [ 441 | "Tests passed.\n" 442 | ], 443 | "name": "stdout" 444 | } 445 | ] 446 | }, 447 | { 448 | "metadata": { 449 | "id": "z6Qo-iA7lKKj", 450 | "colab_type": "text" 451 | }, 452 | "cell_type": "markdown", 453 | "source": [ 454 | "We also need to be able to get back from indices to symbols:" 455 | ] 456 | }, 457 | { 458 | "metadata": { 459 | "id": "ludwyXkTlKKj", 460 | "colab_type": "code", 461 | "colab": {} 462 | }, 463 | "cell_type": "code", 464 | "source": [ 465 | "def ids_to_sentence(ids, id2word):\n", 466 | " \"\"\" Converts a sequence of ids to a sequence of symbols.\n", 467 | " \n", 468 | " ids: a list, indices for the padded sequence.\n", 469 | " id2word: a dict, a mapping 
from ids to original symbols.\n", 470 | "\n", 471 | " result: a list of symbols.\n", 472 | " \"\"\"\n", 473 | " \n", 474 | " return [id2word[i] for i in ids] " 475 | ], 476 | "execution_count": 0, 477 | "outputs": [] 478 | }, 479 | { 480 | "metadata": { 481 | "id": "dMctuqHulKKn", 482 | "colab_type": "text" 483 | }, 484 | "cell_type": "markdown", 485 | "source": [ 486 | "#### Generating batches" 487 | ] 488 | }, 489 | { 490 | "metadata": { 491 | "id": "Dlw0k9URlKKo", 492 | "colab_type": "text" 493 | }, 494 | "cell_type": "markdown", 495 | "source": [ 496 | "The final step of data preparation is a function that transforms a batch of sentences to a list of lists of indices. " 497 | ] 498 | }, 499 | { 500 | "metadata": { 501 | "id": "CFSXasWKlKKp", 502 | "colab_type": "code", 503 | "colab": {} 504 | }, 505 | "cell_type": "code", 506 | "source": [ 507 | "def batch_to_ids(sentences, word2id, max_len):\n", 508 | " \"\"\"Prepares batches of indices. \n", 509 | " \n", 510 | " Sequences are padded to match the longest sequence in the batch,\n", 511 | " if it's longer than max_len, then max_len is used instead.\n", 512 | "\n", 513 | " sentences: a list of strings, original sequences.\n", 514 | " word2id: a dict, a mapping from original symbols to ids.\n", 515 | " max_len: an integer, max len of sequences allowed.\n", 516 | "\n", 517 | " result: a list of lists of ids, a list of actual lengths.\n", 518 | " \"\"\"\n", 519 | " \n", 520 | " max_len_in_batch = min(max(len(s) for s in sentences) + 1, max_len)\n", 521 | " batch_ids, batch_ids_len = [], []\n", 522 | " for sentence in sentences:\n", 523 | " ids, ids_len = sentence_to_ids(sentence, word2id, max_len_in_batch)\n", 524 | " batch_ids.append(ids)\n", 525 | " batch_ids_len.append(ids_len)\n", 526 | " return batch_ids, batch_ids_len" 527 | ], 528 | "execution_count": 0, 529 | "outputs": [] 530 | }, 531 | { 532 | "metadata": { 533 | "id": "gK-1V0mXlKKr", 534 | "colab_type": "text" 535 | }, 536 | "cell_type": "markdown", 537 | "source": [ 538 | "The function *generate_batches* will help to generate batches with defined size from given samples." 
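, "\n", "Together with *batch_to_ids*, it will be used in the training loop roughly like this (a forward-looking sketch; *generate_batches* itself is defined in the next cell):\n", "\n", "```python\n", "for X_batch, Y_batch in generate_batches(train_set, batch_size=64):\n", "    X_ids, X_lens = batch_to_ids(X_batch, word2id, max_len=20)\n", "    Y_ids, Y_lens = batch_to_ids(Y_batch, word2id, max_len=20)\n", "    # feed X_ids/X_lens and Y_ids/Y_lens into the model here\n", "    break  # just peek at the first batch\n", "```"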
539 | ] 540 | }, 541 | { 542 | "metadata": { 543 | "id": "Ty3FxrtZlKKt", 544 | "colab_type": "code", 545 | "colab": {} 546 | }, 547 | "cell_type": "code", 548 | "source": [ 549 | "def generate_batches(samples, batch_size=64):\n", 550 | " X, Y = [], []\n", 551 | " for i, (x, y) in enumerate(samples, 1):\n", 552 | " X.append(x)\n", 553 | " Y.append(y)\n", 554 | " if i % batch_size == 0:\n", 555 | " yield X, Y\n", 556 | " X, Y = [], []\n", 557 | " if X and Y:\n", 558 | " yield X, Y" 559 | ], 560 | "execution_count": 0, 561 | "outputs": [] 562 | }, 563 | { 564 | "metadata": { 565 | "id": "rsFtKKinlKKw", 566 | "colab_type": "text" 567 | }, 568 | "cell_type": "markdown", 569 | "source": [ 570 | "To illustrate the result of the implemented functions, run the following cell:" 571 | ] 572 | }, 573 | { 574 | "metadata": { 575 | "id": "-iAz4RaTlKKz", 576 | "colab_type": "code", 577 | "colab": { 578 | "base_uri": "https://localhost:8080/", 579 | "height": 72 580 | }, 581 | "outputId": "aeb87041-887a-482e-ae84-c972a6ae35e0" 582 | }, 583 | "cell_type": "code", 584 | "source": [ 585 | "sentences = train_set[0]\n", 586 | "ids, sent_lens = batch_to_ids(sentences, word2id, max_len=10)\n", 587 | "print('Input:', sentences)\n", 588 | "print('Ids: {}\\nSentences lengths: {}'.format(ids, sent_lens))" 589 | ], 590 | "execution_count": 16, 591 | "outputs": [ 592 | { 593 | "output_type": "stream", 594 | "text": [ 595 | "Input: ('3889+8943', '12832')\n", 596 | "Ids: [[7, 12, 12, 13, 3, 12, 13, 8, 7, 2], [5, 6, 12, 7, 6, 2, 0, 0, 0, 0]]\n", 597 | "Sentences lengths: [10, 6]\n" 598 | ], 599 | "name": "stdout" 600 | } 601 | ] 602 | }, 603 | { 604 | "metadata": { 605 | "id": "IiMK5fIelKK2", 606 | "colab_type": "text" 607 | }, 608 | "cell_type": "markdown", 609 | "source": [ 610 | "## Encoder-Decoder architecture\n", 611 | "\n", 612 | "Encoder-Decoder is a successful architecture for Seq2Seq tasks with different lengths of input and output sequences. The main idea is to use two recurrent neural networks, where the first neural network *encodes* the input sequence into a real-valued vector and then the second neural network *decodes* this vector into the output sequence. While building the neural network, we will specify some particular characteristics of this architecture." 613 | ] 614 | }, 615 | { 616 | "metadata": { 617 | "id": "88kWO9-DlKK3", 618 | "colab_type": "code", 619 | "colab": {} 620 | }, 621 | "cell_type": "code", 622 | "source": [ 623 | "import tensorflow as tf" 624 | ], 625 | "execution_count": 0, 626 | "outputs": [] 627 | }, 628 | { 629 | "metadata": { 630 | "id": "xewXr9HklKK6", 631 | "colab_type": "text" 632 | }, 633 | "cell_type": "markdown", 634 | "source": [ 635 | "Let us use TensorFlow building blocks to specify the network architecture." 636 | ] 637 | }, 638 | { 639 | "metadata": { 640 | "id": "O9tdBTarlKK6", 641 | "colab_type": "code", 642 | "colab": {} 643 | }, 644 | "cell_type": "code", 645 | "source": [ 646 | "class Seq2SeqModel(object):\n", 647 | " pass" 648 | ], 649 | "execution_count": 0, 650 | "outputs": [] 651 | }, 652 | { 653 | "metadata": { 654 | "id": "aMoZvRV5lKK9", 655 | "colab_type": "text" 656 | }, 657 | "cell_type": "markdown", 658 | "source": [ 659 | "First, we need to create [placeholders](https://www.tensorflow.org/api_guides/python/io_ops#Placeholders) to specify what data we are going to feed into the network during the execution time. 
For this task we will need:\n", 660 | " - *input_batch* — sequences of sentences (the shape will equal [batch_size, max_sequence_len_in_batch]);\n", 661 | " - *input_batch_lengths* — lengths of the non-padded sequences (the shape equals [batch_size]);\n", 662 | " - *ground_truth* — ground truth sequences (the shape will equal [batch_size, max_sequence_len_in_batch]);\n", 663 | " - *ground_truth_lengths* — lengths of the non-padded ground truth sequences (the shape equals [batch_size]);\n", 664 | " - *dropout_ph* — dropout keep probability; this placeholder has a predefined value of 1;\n", 665 | " - *learning_rate_ph* — learning rate." 666 | ] 667 | }, 668 | { 669 | "metadata": { 670 | "id": "zJjZ0MDulKK-", 671 | "colab_type": "code", 672 | "colab": {} 673 | }, 674 | "cell_type": "code", 675 | "source": [ 676 | "def declare_placeholders(self):\n", 677 | "    \"\"\"Specifies placeholders for the model.\"\"\"\n", 678 | "    \n", 679 | "    # Placeholders for input and its actual lengths.\n", 680 | "    self.input_batch = tf.placeholder(shape=(None, None), dtype=tf.int32, name='input_batch')\n", 681 | "    self.input_batch_lengths = tf.placeholder(shape=(None, ), dtype=tf.int32, name='input_batch_lengths')\n", 682 | "    \n", 683 | "    # Placeholders for groundtruth and its actual lengths.\n", 684 | "    self.ground_truth = tf.placeholder(shape=(None, None), dtype=tf.int32)\n", 685 | "    ######### YOUR CODE HERE #############\n", 686 | "    self.ground_truth_lengths = tf.placeholder(shape=(None, ), dtype=tf.int32)\n", 687 | "    ######### YOUR CODE HERE #############\n", 688 | "    \n", 689 | "    self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])\n", 690 | "    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=())\n", 691 | "    ######### YOUR CODE HERE #############" 692 | ], 693 | "execution_count": 0, 694 | "outputs": [] 695 | }, 696 | { 697 | "metadata": { 698 | "id": "FKPRCciHlKLA", 699 | "colab_type": "code", 700 | "colab": {} 701 | }, 702 | "cell_type": "code", 703 | "source": [ 704 | "Seq2SeqModel.__declare_placeholders = classmethod(declare_placeholders)" 705 | ], 706 | "execution_count": 0, 707 | "outputs": [] 708 | }, 709 | { 710 | "metadata": { 711 | "id": "fxeuGtx3lKLD", 712 | "colab_type": "text" 713 | }, 714 | "cell_type": "markdown", 715 | "source": [ 716 | "Now, let us specify the layers of the neural network. First, we need to prepare an embedding matrix. Since we use the same vocabulary for input and output, we need only one such matrix. For tasks with different vocabularies there would be multiple embedding layers.\n", 717 | "- Create embeddings matrix with [tf.Variable](https://www.tensorflow.org/api_docs/python/tf/Variable). Specify its name, type (tf.float32), and initialize with random values.\n", 718 | "- Perform [embeddings lookup](https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup) for a given input batch." 719 | ] 720 | }, 721 | { 722 | "metadata": { 723 | "id": "Ped4IWwrlKLD", 724 | "colab_type": "code", 725 | "colab": {} 726 | }, 727 | "cell_type": "code", 728 | "source": [ 729 | "def create_embeddings(self, vocab_size, embeddings_size):\n", 730 | "    \"\"\"Specifies embeddings layer and embeds an input batch.\"\"\"\n", 731 | "    \n", 732 | "    random_initializer = tf.random_uniform((vocab_size, embeddings_size), -1.0, 1.0)\n", 733 | "    self.embeddings = tf.Variable(random_initializer)\n", 734 | "    ######### YOUR CODE HERE ############# \n", 735 | "    \n", 736 | "    # Perform embeddings lookup for self.input_batch. 
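Shape: [batch_size, max_sequence_len_in_batch, embeddings_size].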
\n", 737 | " self.input_batch_embedded = tf.nn.embedding_lookup(self.embeddings, self.input_batch)\n", 738 | " ######### YOUR CODE HERE #############\n", 739 | " " 740 | ], 741 | "execution_count": 0, 742 | "outputs": [] 743 | }, 744 | { 745 | "metadata": { 746 | "id": "-jW_2-KBlKLF", 747 | "colab_type": "code", 748 | "colab": {} 749 | }, 750 | "cell_type": "code", 751 | "source": [ 752 | "Seq2SeqModel.__create_embeddings = classmethod(create_embeddings)" 753 | ], 754 | "execution_count": 0, 755 | "outputs": [] 756 | }, 757 | { 758 | "metadata": { 759 | "id": "OHIvBuqjlKLH", 760 | "colab_type": "text" 761 | }, 762 | "cell_type": "markdown", 763 | "source": [ 764 | "#### Encoder\n", 765 | "\n", 766 | "The first RNN of the current architecture is called an *encoder* and serves for encoding an input sequence to a real-valued vector. Input of this RNN is an embedded input batch. Since sentences in the same batch could have different actual lengths, we also provide input lengths to avoid unnecessary computations. The final encoder state will be passed to the second RNN (decoder), which we will create soon. \n", 767 | "\n", 768 | "- TensorFlow provides a number of [RNN cells](https://www.tensorflow.org/api_guides/python/contrib.rnn#Core_RNN_Cells_for_use_with_TensorFlow_s_core_RNN_methods) ready for use. We suggest that you use [GRU cell](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/GRUCell), but you can also experiment with other types. \n", 769 | "- Wrap your cells with [DropoutWrapper](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/DropoutWrapper). Dropout is an important regularization technique for neural networks. Specify input keep probability using the dropout placeholder that we created before.\n", 770 | "- Combine the defined encoder cells with [Dynamic RNN](https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn). Use the embedded input batches and their lengths here.\n", 771 | "- Use *dtype=tf.float32* everywhere." 772 | ] 773 | }, 774 | { 775 | "metadata": { 776 | "id": "KaHB3R_AlKLI", 777 | "colab_type": "code", 778 | "colab": {} 779 | }, 780 | "cell_type": "code", 781 | "source": [ 782 | "def build_encoder(self, hidden_size):\n", 783 | " \"\"\"Specifies encoder architecture and computes its output.\"\"\"\n", 784 | " \n", 785 | " # Create GRUCell with dropout.\n", 786 | " encoder_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.GRUCell(hidden_size),\n", 787 | " input_keep_prob=self.dropout_ph,\n", 788 | " output_keep_prob=self.dropout_ph) ######### YOUR CODE HERE #############\n", 789 | " \n", 790 | " # Create RNN with the predefined cell.\n", 791 | " _, self.final_encoder_state = tf.nn.dynamic_rnn(encoder_cell,\n", 792 | " self.input_batch_embedded,\n", 793 | " dtype=tf.float32,\n", 794 | " sequence_length=self.input_batch_lengths) ######### YOUR CODE HERE #############" 795 | ], 796 | "execution_count": 0, 797 | "outputs": [] 798 | }, 799 | { 800 | "metadata": { 801 | "id": "0jsy6E7LlKLK", 802 | "colab_type": "code", 803 | "colab": {} 804 | }, 805 | "cell_type": "code", 806 | "source": [ 807 | "Seq2SeqModel.__build_encoder = classmethod(build_encoder)" 808 | ], 809 | "execution_count": 0, 810 | "outputs": [] 811 | }, 812 | { 813 | "metadata": { 814 | "id": "4sD8yS_3lKLM", 815 | "colab_type": "text" 816 | }, 817 | "cell_type": "markdown", 818 | "source": [ 819 | "#### Decoder\n", 820 | "\n", 821 | "The second RNN is called a *decoder* and serves for generating the output sequence. 
In the simple seq2seq architecture, the input sequence is provided to the decoder only as the final state of the encoder. Obviously, it is a bottleneck and [Attention techniques](https://www.tensorflow.org/tutorials/seq2seq#background_on_the_attention_mechanism) can help to overcome it. So far, we do not need them to make our calculator work, but this would be a necessary ingredient for more advanced tasks. \n", 822 | "\n", 823 | "During training, the decoder also uses information about the true output. It is fed in as input symbol by symbol. However, during the prediction stage (which is called *inference* in this architecture), the decoder can only use its own generated output from the previous step to feed it in at the next step. Because of this difference (*training* vs *inference*), we will create two distinct instances, one for each of the described scenarios.\n", 824 | "\n", 825 | "The picture below illustrates the point. It also shows our work with the special characters, e.g. look how the start symbol `^` is used. The transparent parts are ignored. In the decoder, it is masked out in the loss computation. In the encoder, the green state is considered final and passed to the decoder. " 826 | ] 827 | }, 828 | { 829 | "metadata": { 830 | "id": "aCjZ8a-qlKLM", 831 | "colab_type": "text" 832 | }, 833 | "cell_type": "markdown", 834 | "source": [ 835 | "*Figure: the encoder-decoder scheme with the special symbols during training and inference.*" 836 | ] 837 | }, 838 | { 839 | "metadata": { 840 | "id": "zbJZuZsElKLN", 841 | "colab_type": "text" 842 | }, 843 | "cell_type": "markdown", 844 | "source": [ 845 | "Now, it's time to implement the decoder:\n", 846 | " - First, we should create two [helpers](https://www.tensorflow.org/api_guides/python/contrib.seq2seq#Dynamic_Decoding). These classes help to determine the behaviour of the decoder. During training, we will use [TrainingHelper](https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/TrainingHelper). For inference, we recommend using [GreedyEmbeddingHelper](https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/GreedyEmbeddingHelper).\n", 847 | " - To share all parameters during training and inference, we use one scope and set the flag 'reuse' to True at inference time (see the small sketch after this list). You might be interested to know more about how [variable scopes](https://www.tensorflow.org/programmers_guide/variables) work in TF. \n", 848 | " - To create the decoder itself, we will use the [BasicDecoder](https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/BasicDecoder) class. As previously, you should choose some RNN cell, e.g. a GRU cell. To turn hidden states into logits, we will need a projection layer. One simple solution is to use [OutputProjectionWrapper](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/OutputProjectionWrapper).\n", 849 | " - For getting the predictions, it will be convenient to use [dynamic_decode](https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/dynamic_decode). This function uses the provided decoder to perform decoding."
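, "\n", "The scope-sharing trick in isolation looks like this (a minimal standalone sketch of the reuse mechanism, separate from the assignment code):\n", "\n", "```python\n", "with tf.variable_scope('demo'):                # creates demo/w\n", "    w = tf.get_variable('w', shape=[2, 2])\n", "with tf.variable_scope('demo', reuse=True):    # returns the same demo/w\n", "    w_again = tf.get_variable('w', shape=[2, 2])\n", "print(w is w_again)  # True\n", "```"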
852 | { 853 | "metadata": { 854 | "id": "8lNEi621lKLO", 855 | "colab_type": "code", 856 | "colab": {} 857 | }, 858 | "cell_type": "code", 859 | "source": [ 860 | "def build_decoder(self, hidden_size, vocab_size, max_iter, start_symbol_id, end_symbol_id):\n", 861 | " \"\"\"Specifies decoder architecture and computes the output.\n", 862 | " \n", 863 | " Uses different helpers:\n", 864 | " - for train: feeding ground truth\n", 865 | " - for inference: feeding generated output\n", 866 | "\n", 867 | " As a result, self.train_outputs and self.infer_outputs are created. \n", 868 | " Each of them contains two fields:\n", 869 | " rnn_output (predicted logits)\n", 870 | " sample_id (predictions).\n", 871 | "\n", 872 | " \"\"\"\n", 873 | " \n", 874 | " # Use start symbols as the decoder inputs at the first time step.\n", 875 | " batch_size = tf.shape(self.input_batch)[0]\n", 876 | " start_tokens = tf.fill([batch_size], start_symbol_id)\n", 877 | " ground_truth_as_input = tf.concat([tf.expand_dims(start_tokens, 1), self.ground_truth], 1)\n", 878 | " \n", 879 | " # Use the embedding layer defined before to look up embeddings for ground_truth_as_input. \n", 880 | " self.ground_truth_embedded = tf.nn.embedding_lookup(self.embeddings, ground_truth_as_input) ######### YOUR CODE HERE #############\n", 881 | " \n", 882 | " # Create TrainingHelper for the train stage.\n", 883 | " train_helper = tf.contrib.seq2seq.TrainingHelper(self.ground_truth_embedded, \n", 884 | " self.ground_truth_lengths)\n", 885 | " \n", 886 | " # Create GreedyEmbeddingHelper for the inference stage.\n", 887 | " # You should provide the embedding layer, start_tokens and the index of the end symbol.\n", 888 | " infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(self.embeddings,\n", 889 | " start_tokens, end_symbol_id) ######### YOUR CODE HERE #############\n", 890 | " \n", 891 | " \n", 892 | " def decode(helper, scope, reuse=None):\n", 893 | " \"\"\"Creates the decoder and returns the results of decoding with a given helper.\"\"\"\n", 894 | " \n", 895 | " with tf.variable_scope(scope, reuse=reuse):\n",
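" # Note: decode() is called twice below with the same scope name ('decode'):\n", " # the first call (training) creates the decoder variables, and the second\n", " # call (inference) passes reuse=True so the same weights are shared\n", " # rather than re-created.\n",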
896 | " # Create GRUCell with dropout. Do not forget to set the reuse flag properly.\n", 897 | " decoder_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.GRUCell(hidden_size, reuse=reuse),\n", 898 | " input_keep_prob=self.dropout_ph,\n", 899 | " output_keep_prob=self.dropout_ph) ######### YOUR CODE HERE #############\n", 900 | " \n", 901 | " # Create a projection wrapper.\n", 902 | " decoder_cell = tf.contrib.rnn.OutputProjectionWrapper(decoder_cell, vocab_size, reuse=reuse)\n", 903 | " \n", 904 | " # Create BasicDecoder, pass the defined cell, a helper, and initial state.\n", 905 | " # The initial state should be equal to the final state of the encoder!\n", 906 | " decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper,\n", 907 | " self.final_encoder_state) ######### YOUR CODE HERE #############\n", 908 | " \n", 909 | " # The first returned argument of dynamic_decode contains two fields:\n", 910 | " # rnn_output (predicted logits)\n", 911 | " # sample_id (predictions)\n", 912 | " outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=decoder, maximum_iterations=max_iter, \n", 913 | " output_time_major=False, impute_finished=True)\n", 914 | "\n", 915 | " return outputs\n", 916 | " \n", 917 | " self.train_outputs = decode(train_helper, 'decode')\n", 918 | " self.infer_outputs = decode(infer_helper, 'decode', reuse=True)" 919 | ], 920 | "execution_count": 0, 921 | "outputs": [] 922 | }, 923 | { 924 | "metadata": { 925 | "id": "mwH-OAQslKLQ", 926 | "colab_type": "code", 927 | "colab": {} 928 | }, 929 | "cell_type": "code", 930 | "source": [ 931 | "Seq2SeqModel.__build_decoder = classmethod(build_decoder)" 932 | ], 933 | "execution_count": 0, 934 | "outputs": [] 935 | }, 936 | { 937 | "metadata": { 938 | "id": "RFfZtXPclKLT", 939 | "colab_type": "text" 940 | }, 941 | "cell_type": "markdown", 942 | "source": [ 943 | "In this task we will use [sequence_loss](https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/sequence_loss), which is a weighted cross-entropy loss for a sequence of logits. Take a moment to understand what your train logits and targets are. Also note that we do not want to take into account loss terms coming from padding symbols, so we will mask them out using weights (a small worked example of this mask appears a couple of cells below). " 944 | ] 945 | }, 946 | { 947 | "metadata": { 948 | "id": "SVN5siT-lKLU", 949 | "colab_type": "code", 950 | "colab": {} 951 | }, 952 | "cell_type": "code", 953 | "source": [ 954 | "def compute_loss(self):\n", 955 | " \"\"\"Computes sequence loss (masked cross-entropy loss with logits).\"\"\"\n", 956 | " \n", 957 | " weights = tf.cast(tf.sequence_mask(self.ground_truth_lengths), dtype=tf.float32)\n", 958 | " \n", 959 | " self.loss = tf.contrib.seq2seq.sequence_loss(self.train_outputs.rnn_output, self.ground_truth, weights) ######### YOUR CODE HERE #############" 960 | ], 961 | "execution_count": 0, 962 | "outputs": [] 963 | }, 964 | { 965 | "metadata": { 966 | "id": "0TrgkkRBlKLW", 967 | "colab_type": "code", 968 | "colab": {} 969 | }, 970 | "cell_type": "code", 971 | "source": [ 972 | "Seq2SeqModel.__compute_loss = classmethod(compute_loss)" 973 | ], 974 | "execution_count": 0, 975 | "outputs": [] 976 | },
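{ "metadata": {}, "cell_type": "markdown", "source": [ "As a quick illustration of the weights defined above (our addition, assuming TensorFlow 1.x): `tf.sequence_mask` turns the ground-truth lengths into a boolean mask, and the cast turns it into per-position weights, so padded positions contribute nothing to `sequence_loss`:\n", "\n", "```python\n", "import tensorflow as tf\n", "\n", "with tf.Session() as mask_sess:\n", "    toy_weights = tf.cast(tf.sequence_mask([2, 4], maxlen=5), tf.float32)\n", "    print(mask_sess.run(toy_weights))\n", "# [[1. 1. 0. 0. 0.]\n", "#  [1. 1. 1. 1. 0.]]\n", "```" ] },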
977 | { 978 | "metadata": { 979 | "id": "9Q5yTBmYlKLZ", 980 | "colab_type": "text" 981 | }, 982 | "cell_type": "markdown", 983 | "source": [ 984 | "The last thing to specify is the optimization of the defined loss. \n", 985 | "We suggest that you use [optimize_loss](https://www.tensorflow.org/api_docs/python/tf/contrib/layers/optimize_loss) with the Adam optimizer and a learning rate from the corresponding placeholder. You might also need to pass the global step (e.g. as tf.train.get_global_step()) and clip gradients by 1.0." 986 | ] 987 | }, 988 | { 989 | "metadata": { 990 | "id": "1oDBqF4ulKLZ", 991 | "colab_type": "code", 992 | "colab": {} 993 | }, 994 | "cell_type": "code", 995 | "source": [ 996 | "def perform_optimization(self):\n", 997 | " \"\"\"Specifies train_op that optimizes self.loss.\"\"\"\n", 998 | " \n", 999 | " self.train_op = tf.contrib.layers.optimize_loss(self.loss, tf.train.get_global_step(),\n", 1000 | " self.learning_rate_ph, 'Adam', clip_gradients=1.0) ######### YOUR CODE HERE #############" 1001 | ], 1002 | "execution_count": 0, 1003 | "outputs": [] 1004 | }, 1005 | { 1006 | "metadata": { 1007 | "id": "iT4_3E0tlKLb", 1008 | "colab_type": "code", 1009 | "colab": {} 1010 | }, 1011 | "cell_type": "code", 1012 | "source": [ 1013 | "Seq2SeqModel.__perform_optimization = classmethod(perform_optimization)" 1014 | ], 1015 | "execution_count": 0, 1016 | "outputs": [] 1017 | }, 1018 | { 1019 | "metadata": { 1020 | "id": "-hed8hjolKLd", 1021 | "colab_type": "text" 1022 | }, 1023 | "cell_type": "markdown", 1024 | "source": [ 1025 | "Congratulations! You have specified all the parts of your network. You may have noticed that we didn't deal with any real data yet, so what you have written so far is just a recipe for how the network should function.\n", 1026 | "Now we will put these pieces into the constructor of our Seq2SeqModel class to use it in the next section. " 1027 | ] 1028 | }, 1029 | { 1030 | "metadata": { 1031 | "id": "ude2cnuxlKLe", 1032 | "colab_type": "code", 1033 | "colab": {} 1034 | }, 1035 | "cell_type": "code", 1036 | "source": [ 1037 | "def init_model(self, vocab_size, embeddings_size, hidden_size, \n", 1038 | " max_iter, start_symbol_id, end_symbol_id, padding_symbol_id):\n", 1039 | " \n", 1040 | " self.__declare_placeholders()\n", 1041 | " self.__create_embeddings(vocab_size, embeddings_size)\n", 1042 | " self.__build_encoder(hidden_size)\n", 1043 | " self.__build_decoder(hidden_size, vocab_size, max_iter, start_symbol_id, end_symbol_id)\n", 1044 | " \n", 1045 | " # Compute loss and back-propagate.\n", 1046 | " self.__compute_loss()\n", 1047 | " self.__perform_optimization()\n", 1048 | " \n", 1049 | " # Get predictions for evaluation.\n", 1050 | " self.train_predictions = self.train_outputs.sample_id\n", 1051 | " self.infer_predictions = self.infer_outputs.sample_id" 1052 | ], 1053 | "execution_count": 0, 1054 | "outputs": [] 1055 | }, 1056 | { 1057 | "metadata": { 1058 | "id": "tXidyExClKLg", 1059 | "colab_type": "code", 1060 | "colab": {} 1061 | }, 1062 | "cell_type": "code", 1063 | "source": [ 1064 | "Seq2SeqModel.__init__ = classmethod(init_model)" 1065 | ], 1066 | "execution_count": 0, 1067 | "outputs": [] 1068 | }, 1069 | { 1070 | "metadata": { 1071 | "id": "Q4CrUPF6lKLi", 1072 | "colab_type": "text" 1073 | }, 1074 | "cell_type": "markdown", 1075 | "source": [ 1076 | "## Train the network and predict output\n", 1077 | "\n", 1078 | "[Session.run](https://www.tensorflow.org/api_docs/python/tf/Session#run) is the point that initiates computations in the graph that we have defined. To train the network, we need to compute *self.train_op*. To predict output, we just need to compute *self.infer_predictions*. In either case, we need to feed actual data through the placeholders that we defined above." 1079 | ] 1080 | },
" 1079 | ] 1080 | }, 1081 | { 1082 | "metadata": { 1083 | "id": "7AAFJglDlKLi", 1084 | "colab_type": "code", 1085 | "colab": {} 1086 | }, 1087 | "cell_type": "code", 1088 | "source": [ 1089 | "def train_on_batch(self, session, X, X_seq_len, Y, Y_seq_len, learning_rate, dropout_keep_probability):\n", 1090 | " feed_dict = {\n", 1091 | " self.input_batch: X,\n", 1092 | " self.input_batch_lengths: X_seq_len,\n", 1093 | " self.ground_truth: Y,\n", 1094 | " self.ground_truth_lengths: Y_seq_len,\n", 1095 | " self.learning_rate_ph: learning_rate,\n", 1096 | " self.dropout_ph: dropout_keep_probability\n", 1097 | " }\n", 1098 | " pred, loss, _ = session.run([\n", 1099 | " self.train_predictions,\n", 1100 | " self.loss,\n", 1101 | " self.train_op], feed_dict=feed_dict)\n", 1102 | " return pred, loss" 1103 | ], 1104 | "execution_count": 0, 1105 | "outputs": [] 1106 | }, 1107 | { 1108 | "metadata": { 1109 | "id": "5Rf9yPvqlKLn", 1110 | "colab_type": "code", 1111 | "colab": {} 1112 | }, 1113 | "cell_type": "code", 1114 | "source": [ 1115 | "Seq2SeqModel.train_on_batch = classmethod(train_on_batch)" 1116 | ], 1117 | "execution_count": 0, 1118 | "outputs": [] 1119 | }, 1120 | { 1121 | "metadata": { 1122 | "id": "0fQLEFntlKLr", 1123 | "colab_type": "text" 1124 | }, 1125 | "cell_type": "markdown", 1126 | "source": [ 1127 | "We implemented two prediction functions: *predict_for_batch* and *predict_for_batch_with_loss*. The first one allows only to predict output for some input sequence, while the second one could compute loss because we provide also ground truth values. Both these functions might be useful since the first one could be used for predicting only, and the second one is helpful for validating results on not-training data during the training." 1128 | ] 1129 | }, 1130 | { 1131 | "metadata": { 1132 | "id": "hXQHwiGDlKLs", 1133 | "colab_type": "code", 1134 | "colab": {} 1135 | }, 1136 | "cell_type": "code", 1137 | "source": [ 1138 | "def predict_for_batch(self, session, X, X_seq_len):\n", 1139 | " feed_dict = {\n", 1140 | " self.input_batch: X,\n", 1141 | " self.input_batch_lengths: X_seq_len,\n", 1142 | " }\n", 1143 | " ######### YOUR CODE HERE #############\n", 1144 | " pred = session.run([\n", 1145 | " self.infer_predictions\n", 1146 | " ], feed_dict=feed_dict)[0]\n", 1147 | " return pred\n", 1148 | "\n", 1149 | "def predict_for_batch_with_loss(self, session, X, X_seq_len, Y, Y_seq_len):\n", 1150 | " feed_dict = {\n", 1151 | " self.input_batch: X,\n", 1152 | " self.input_batch_lengths: X_seq_len,\n", 1153 | " self.ground_truth: Y,\n", 1154 | " self.ground_truth_lengths: Y_seq_len,\n", 1155 | " }\n", 1156 | " ######### YOUR CODE HERE #############\n", 1157 | " pred, loss = session.run([\n", 1158 | " self.infer_predictions,\n", 1159 | " self.loss,\n", 1160 | " ], feed_dict=feed_dict)\n", 1161 | " return pred, loss" 1162 | ], 1163 | "execution_count": 0, 1164 | "outputs": [] 1165 | }, 1166 | { 1167 | "metadata": { 1168 | "id": "dqKah63LlKLv", 1169 | "colab_type": "code", 1170 | "colab": {} 1171 | }, 1172 | "cell_type": "code", 1173 | "source": [ 1174 | "Seq2SeqModel.predict_for_batch = classmethod(predict_for_batch)\n", 1175 | "Seq2SeqModel.predict_for_batch_with_loss = classmethod(predict_for_batch_with_loss)" 1176 | ], 1177 | "execution_count": 0, 1178 | "outputs": [] 1179 | }, 1180 | { 1181 | "metadata": { 1182 | "id": "U1A2TpB7lKL0", 1183 | "colab_type": "text" 1184 | }, 1185 | "cell_type": "markdown", 1186 | "source": [ 1187 | "## Run your experiment\n", 1188 | "\n", 1189 | "Create 
1180 | { 1181 | "metadata": { 1182 | "id": "U1A2TpB7lKL0", 1183 | "colab_type": "text" 1184 | }, 1185 | "cell_type": "markdown", 1186 | "source": [ 1187 | "## Run your experiment\n", 1188 | "\n", 1189 | "Create a *Seq2SeqModel* model with the following parameters:\n", 1190 | " - *vocab_size* — number of tokens;\n", 1191 | " - *embeddings_size* — dimension of embeddings, recommended value: 20;\n", 1192 | " - *max_iter* — maximum number of steps in decoder, recommended value: 7;\n", 1193 | " - *hidden_size* — size of hidden layers for RNN, recommended value: 512;\n", 1194 | " - *start_symbol_id* — the index of the start token (`^`);\n", 1195 | " - *end_symbol_id* — the index of the end token (`$`);\n", 1196 | " - *padding_symbol_id* — the index of the padding token (`#`).\n", 1197 | "\n", 1198 | "Set the hyperparameters. You might want to start with the following values and see how it works:\n", 1199 | "- *batch_size*: 128;\n", 1200 | "- at least 10 epochs;\n", 1201 | "- *learning_rate*: 0.001;\n", 1202 | "- *dropout_keep_probability*: 0.5 for training (typical keep probabilities range from 0.1 to 1.0; larger values correspond to a smaller number of dropped units);\n", 1203 | "- *max_len*: 20." 1204 | ] 1205 | }, 1206 | { 1207 | "metadata": { 1208 | "id": "BqgNe0pRlKL2", 1209 | "colab_type": "code", 1210 | "colab": {} 1211 | }, 1212 | "cell_type": "code", 1213 | "source": [ 1214 | "tf.reset_default_graph()\n", 1215 | "\n", 1216 | "model = Seq2SeqModel(len(word2id), 20, 512, 7,\n", 1217 | " word2id[start_symbol], word2id[end_symbol], word2id[padding_symbol])######### YOUR CODE HERE #############\n", 1218 | "\n", 1219 | "batch_size = 128 ######### YOUR CODE HERE #############\n", 1220 | "n_epochs = 10 ######### YOUR CODE HERE #############\n", 1221 | "learning_rate = 0.001 ######### YOUR CODE HERE #############\n", 1222 | "dropout_keep_probability = 0.5 ######### YOUR CODE HERE #############\n", 1223 | "max_len = 20 ######### YOUR CODE HERE #############\n", 1224 | "\n", 1225 | "n_step = int(len(train_set) / batch_size)" 1226 | ], 1227 | "execution_count": 0, 1228 | "outputs": [] 1229 | },
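{ "metadata": {}, "cell_type": "markdown", "source": [ "A quick sanity check on *n_step* (our addition; the training-set size of 80,000 examples is inferred from the `step: [601/625]` lines in the log below, so treat it as an assumption):\n", "\n", "```python\n", "# 80,000 training pairs split into batches of 128 -> 625 steps per epoch\n", "print(80000 // 128)  # 625\n", "```" ] },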
1230 | { 1231 | "metadata": { 1232 | "id": "gqrPjt-ulKL4", 1233 | "colab_type": "text" 1234 | }, 1235 | "cell_type": "markdown", 1236 | "source": [ 1237 | "Finally, we are ready to run the training! A good indicator that everything works fine is a decreasing loss during training. You should expect a loss value of approximately 2.7 at the beginning of training and near 1 after the 10th epoch." 1238 | ] 1239 | }, 1240 | { 1241 | "metadata": { 1242 | "id": "ifgVIS0elKL5", 1243 | "colab_type": "code", 1244 | "colab": { 1245 | "base_uri": "https://localhost:8080/", 1246 | "height": 3363 1247 | }, 1248 | "outputId": "8932b50d-185c-4100-94b6-cc8c523b2c37" 1249 | }, 1250 | "cell_type": "code", 1251 | "source": [ 1252 | "session = tf.Session()\n", 1253 | "session.run(tf.global_variables_initializer())\n", 1254 | " \n", 1255 | "invalid_number_prediction_counts = []\n", 1256 | "all_model_predictions = []\n", 1257 | "all_ground_truth = []\n", 1258 | "\n", 1259 | "print('Start training... \n')\n", 1260 | "for epoch in range(n_epochs): \n", 1261 | " random.shuffle(train_set)\n", 1262 | " random.shuffle(test_set)\n", 1263 | " \n", 1264 | " print('Train: epoch', epoch + 1)\n", 1265 | " for n_iter, (X_batch, Y_batch) in enumerate(generate_batches(train_set, batch_size=batch_size)):\n", 1266 | " ######################################\n", 1267 | " ######### YOUR CODE HERE #############\n", 1268 | " ######################################\n", 1269 | " # prepare the data (X_batch and Y_batch) for training\n", 1270 | " # using the function batch_to_ids\n", 1271 | " X, X_seq_len = batch_to_ids(X_batch, word2id, max_len)\n", 1272 | " Y, Y_seq_len = batch_to_ids(Y_batch, word2id, max_len)\n", 1273 | " predictions, loss = model.train_on_batch(session,\n", 1274 | " X,\n", 1275 | " X_seq_len,\n", 1276 | " Y,\n", 1277 | " Y_seq_len,\n", 1278 | " learning_rate,\n", 1279 | " dropout_keep_probability) ######### YOUR CODE HERE #############\n", 1280 | " \n", 1281 | " if n_iter % 200 == 0:\n", 1282 | " print(\"Epoch: [%d/%d], step: [%d/%d], loss: %f\" % (epoch + 1, n_epochs, n_iter + 1, n_step, loss))\n", 1283 | " \n", 1284 | " X_sent, Y_sent = next(generate_batches(test_set, batch_size=batch_size))\n", 1285 | " ######################################\n", 1286 | " ######### YOUR CODE HERE #############\n", 1287 | " ######################################\n", 1288 | " # prepare the test data (X_sent and Y_sent) for checking prediction\n", 1289 | " # quality and computing the value of the loss function,\n", 1290 | " # using the function batch_to_ids\n", 1291 | " \n", 1292 | " X_test, X_test_len = batch_to_ids(X_sent, word2id, max_len)\n", 1293 | " Y_test, Y_test_len = batch_to_ids(Y_sent, word2id, max_len)\n", 1294 | " \n", 1295 | " predictions, loss = model.predict_for_batch_with_loss(session, X_test, X_test_len, Y_test, Y_test_len) ######### YOUR CODE HERE #############\n", 1296 | " print('Test: epoch', epoch + 1, 'loss:', loss,)\n", 1297 | " for x, y, p in list(zip(X_test, Y_test, predictions))[:3]: # show a few test examples with their predictions\n", 1298 | " print('X:',''.join(ids_to_sentence(x, id2word)))\n", 1299 | " print('Y:',''.join(ids_to_sentence(y, id2word)))\n", 1300 | " print('O:',''.join(ids_to_sentence(p, id2word)))\n", 1301 | " print('')\n", 1302 | "\n", 1303 | " model_predictions = []\n", 1304 | " ground_truth = []\n", 1305 | " invalid_number_prediction_count = 0\n", 1306 | " # For the whole test set, calculate the ground-truth values (as integers)\n", 1307 | " # and the predicted values (also as integers) to compute the metrics.\n", 1308 | " # If the number generated by the model is not correct (e.g. 
'1-1'), \n", 1309 | " # increase invalid_number_prediction_count and don't append this and corresponding\n", 1310 | " # ground-truth value to the arrays.\n", 1311 | " for X_batch, Y_batch in generate_batches(test_set, batch_size=batch_size):\n", 1312 | " ######################################\n", 1313 | " ######### YOUR CODE HERE #############\n", 1314 | " ######################################\n", 1315 | " X, X_seq_len = batch_to_ids(X_batch, word2id, max_len)\n", 1316 | " Y, Y_seq_len = batch_to_ids(Y_batch, word2id, max_len)\n", 1317 | " predictions = model.predict_for_batch(session, X, X_seq_len)\n", 1318 | " for y, p in zip(Y, predictions):\n", 1319 | " valid_y = ''.join(ids_to_sentence(y, id2word))\n", 1320 | " valid_y = valid_y[:valid_y.find('$')]\n", 1321 | " valid_p = ''.join(ids_to_sentence(p, id2word))\n", 1322 | " valid_p = valid_p if -1 == valid_p.find('$') else valid_p[:valid_p.find('$')]\n", 1323 | " try:\n", 1324 | " po = int(valid_p)\n", 1325 | " py = int(valid_y)\n", 1326 | " model_predictions.append(po)\n", 1327 | " ground_truth.append(py)\n", 1328 | " except:\n", 1329 | " print(valid_y, valid_p)\n", 1330 | " invalid_number_prediction_count += 1\n", 1331 | " \n", 1332 | " all_model_predictions.append(model_predictions)\n", 1333 | " all_ground_truth.append(ground_truth)\n", 1334 | " invalid_number_prediction_counts.append(invalid_number_prediction_count)\n", 1335 | " \n", 1336 | "print('\\n...training finished.')" 1337 | ], 1338 | "execution_count": 40, 1339 | "outputs": [ 1340 | { 1341 | "output_type": "stream", 1342 | "text": [ 1343 | "Start training... \n", 1344 | "\n", 1345 | "Train: epoch 1\n", 1346 | "Epoch: [1/10], step: [1/625], loss: 2.700891\n", 1347 | "Epoch: [1/10], step: [201/625], loss: 1.845106\n", 1348 | "Epoch: [1/10], step: [401/625], loss: 1.771681\n", 1349 | "Epoch: [1/10], step: [601/625], loss: 1.726979\n", 1350 | "Test: epoch 1 loss: 1.6190765\n", 1351 | "X: 7688-6989$\n", 1352 | "Y: 699$##\n", 1353 | "O: -111$#\n", 1354 | "\n", 1355 | "X: 6527-5674$\n", 1356 | "Y: 853$##\n", 1357 | "O: 8001$#\n", 1358 | "\n", 1359 | "X: 8480+2575$\n", 1360 | "Y: 11055$\n", 1361 | "O: -2118$\n", 1362 | "\n", 1363 | "Train: epoch 2\n", 1364 | "Epoch: [2/10], step: [1/625], loss: 1.684533\n", 1365 | "Epoch: [2/10], step: [201/625], loss: 1.599683\n", 1366 | "Epoch: [2/10], step: [401/625], loss: 1.602870\n", 1367 | "Epoch: [2/10], step: [601/625], loss: 1.557222\n", 1368 | "Test: epoch 2 loss: 1.4373295\n", 1369 | "X: 8573-6146$\n", 1370 | "Y: 2427$#\n", 1371 | "O: -8246$\n", 1372 | "\n", 1373 | "X: 6113+1101$\n", 1374 | "Y: 7214$#\n", 1375 | "O: -1880$\n", 1376 | "\n", 1377 | "X: 6733-8807$\n", 1378 | "Y: -2074$\n", 1379 | "O: 8846$#\n", 1380 | "\n", 1381 | "Train: epoch 3\n", 1382 | "Epoch: [3/10], step: [1/625], loss: 1.514549\n", 1383 | "Epoch: [3/10], step: [201/625], loss: 1.456522\n", 1384 | "Epoch: [3/10], step: [401/625], loss: 1.458606\n", 1385 | "Epoch: [3/10], step: [601/625], loss: 1.395552\n", 1386 | "Test: epoch 3 loss: 1.3948303\n", 1387 | "X: 6292-1597$\n", 1388 | "Y: 4695$#\n", 1389 | "O: 4081$#\n", 1390 | "\n", 1391 | "X: 2564-2567$\n", 1392 | "Y: -3$###\n", 1393 | "O: 9337$#\n", 1394 | "\n", 1395 | "X: 9095-1052$\n", 1396 | "Y: 8043$#\n", 1397 | "O: 5083$#\n", 1398 | "\n", 1399 | "Train: epoch 4\n", 1400 | "Epoch: [4/10], step: [1/625], loss: 1.440212\n", 1401 | "Epoch: [4/10], step: [201/625], loss: 1.416748\n", 1402 | "Epoch: [4/10], step: [401/625], loss: 1.397778\n", 1403 | "Epoch: [4/10], step: [601/625], loss: 1.389731\n", 1404 | "Test: epoch 
4 loss: 1.3075192\n", 1405 | "X: 5235-8939$\n", 1406 | "Y: -3704$\n", 1407 | "O: 3188$#\n", 1408 | "\n", 1409 | "X: 3416+2913$\n", 1410 | "Y: 6329$#\n", 1411 | "O: -2085$\n", 1412 | "\n", 1413 | "X: 3844+1521$\n", 1414 | "Y: 5365$#\n", 1415 | "O: 16515$\n", 1416 | "\n", 1417 | "Train: epoch 5\n", 1418 | "Epoch: [5/10], step: [1/625], loss: 1.381194\n", 1419 | "Epoch: [5/10], step: [201/625], loss: 1.357408\n", 1420 | "Epoch: [5/10], step: [401/625], loss: 1.312566\n", 1421 | "Epoch: [5/10], step: [601/625], loss: 1.329302\n", 1422 | "Test: epoch 5 loss: 1.2988263\n", 1423 | "X: 627+2906$#\n", 1424 | "Y: 3533$#\n", 1425 | "O: 10194$\n", 1426 | "\n", 1427 | "X: 9711+5424$\n", 1428 | "Y: 15135$\n", 1429 | "O: -1884$\n", 1430 | "\n", 1431 | "X: 1973-4850$\n", 1432 | "Y: -2877$\n", 1433 | "O: 12814$\n", 1434 | "\n", 1435 | "Train: epoch 6\n", 1436 | "Epoch: [6/10], step: [1/625], loss: 1.364512\n", 1437 | "Epoch: [6/10], step: [201/625], loss: 1.359489\n", 1438 | "Epoch: [6/10], step: [401/625], loss: 1.324831\n", 1439 | "Epoch: [6/10], step: [601/625], loss: 1.299981\n", 1440 | "Test: epoch 6 loss: 1.2658614\n", 1441 | "X: 6714-6489$\n", 1442 | "Y: 225$##\n", 1443 | "O: -1027$\n", 1444 | "\n", 1445 | "X: 3902-479$#\n", 1446 | "Y: 3423$#\n", 1447 | "O: -5400$\n", 1448 | "\n", 1449 | "X: 8244+457$#\n", 1450 | "Y: 8701$#\n", 1451 | "O: 14989$\n", 1452 | "\n", 1453 | "Train: epoch 7\n", 1454 | "Epoch: [7/10], step: [1/625], loss: 1.270696\n", 1455 | "Epoch: [7/10], step: [201/625], loss: 1.320629\n", 1456 | "Epoch: [7/10], step: [401/625], loss: 1.303759\n", 1457 | "Epoch: [7/10], step: [601/625], loss: 1.255467\n", 1458 | "Test: epoch 7 loss: 1.272886\n", 1459 | "X: 3337-6513$\n", 1460 | "Y: -3176$\n", 1461 | "O: 7369$#\n", 1462 | "\n", 1463 | "X: 9511+8519$\n", 1464 | "Y: 18030$\n", 1465 | "O: 9899$#\n", 1466 | "\n", 1467 | "X: 6120+5440$\n", 1468 | "Y: 11560$\n", 1469 | "O: -669$#\n", 1470 | "\n", 1471 | "Train: epoch 8\n", 1472 | "Epoch: [8/10], step: [1/625], loss: 1.253978\n", 1473 | "Epoch: [8/10], step: [201/625], loss: 1.245318\n", 1474 | "Epoch: [8/10], step: [401/625], loss: 1.267780\n", 1475 | "Epoch: [8/10], step: [601/625], loss: 1.253992\n", 1476 | "Test: epoch 8 loss: 1.246848\n", 1477 | "X: 8751+8550$\n", 1478 | "Y: 17301$\n", 1479 | "O: -6908$\n", 1480 | "\n", 1481 | "X: 9439-9712$\n", 1482 | "Y: -273$#\n", 1483 | "O: 5791$#\n", 1484 | "\n", 1485 | "X: 601-5466$#\n", 1486 | "Y: -4865$\n", 1487 | "O: 15300$\n", 1488 | "\n", 1489 | "Train: epoch 9\n", 1490 | "Epoch: [9/10], step: [1/625], loss: 1.255143\n", 1491 | "Epoch: [9/10], step: [201/625], loss: 1.246980\n", 1492 | "Epoch: [9/10], step: [401/625], loss: 1.235558\n", 1493 | "Epoch: [9/10], step: [601/625], loss: 1.243173\n", 1494 | "Test: epoch 9 loss: 1.2540202\n", 1495 | "X: 2250-902$#\n", 1496 | "Y: 1348$#\n", 1497 | "O: 17572$\n", 1498 | "\n", 1499 | "X: 2049-3746$\n", 1500 | "Y: -1697$\n", 1501 | "O: -6475$\n", 1502 | "\n", 1503 | "X: 7747-4065$\n", 1504 | "Y: 3682$#\n", 1505 | "O: 2727$#\n", 1506 | "\n", 1507 | "Train: epoch 10\n", 1508 | "Epoch: [10/10], step: [1/625], loss: 1.256141\n", 1509 | "Epoch: [10/10], step: [201/625], loss: 1.248966\n", 1510 | "Epoch: [10/10], step: [401/625], loss: 1.252261\n", 1511 | "Epoch: [10/10], step: [601/625], loss: 1.197563\n", 1512 | "Test: epoch 10 loss: 1.2317717\n", 1513 | "X: 5440-4574$\n", 1514 | "Y: 866$##\n", 1515 | "O: 1772$#\n", 1516 | "\n", 1517 | "X: 1114-5423$\n", 1518 | "Y: -4309$\n", 1519 | "O: 10974$\n", 1520 | "\n", 1521 | "X: 8921-9641$\n", 1522 | "Y: -720$#\n", 
1523 | "O: 8172$#\n", 1524 | "\n", 1525 | "\n", 1526 | "...training finished.\n" 1527 | ], 1528 | "name": "stdout" 1529 | } 1530 | ] 1531 | }, 1532 | { 1533 | "metadata": { 1534 | "id": "iaB6ti1ylKL8", 1535 | "colab_type": "text" 1536 | }, 1537 | "cell_type": "markdown", 1538 | "source": [ 1539 | "## Evaluate results\n", 1540 | "\n", 1541 | "Because our task is simple and the output is straight-forward, we will use [MAE](https://en.wikipedia.org/wiki/Mean_absolute_error) metric to evaluate the trained model during the epochs. Compute the value of the metric for the output from each epoch." 1542 | ] 1543 | }, 1544 | { 1545 | "metadata": { 1546 | "id": "o5QQPaTmlKL8", 1547 | "colab_type": "code", 1548 | "colab": {} 1549 | }, 1550 | "cell_type": "code", 1551 | "source": [ 1552 | "from sklearn.metrics import mean_absolute_error" 1553 | ], 1554 | "execution_count": 0, 1555 | "outputs": [] 1556 | }, 1557 | { 1558 | "metadata": { 1559 | "id": "7OMyNk6nlKL-", 1560 | "colab_type": "code", 1561 | "colab": { 1562 | "base_uri": "https://localhost:8080/", 1563 | "height": 199 1564 | }, 1565 | "outputId": "1b4c76fe-2742-4e9d-c77d-4a3ecd620dde" 1566 | }, 1567 | "cell_type": "code", 1568 | "source": [ 1569 | "for i, (gts, predictions, invalid_number_prediction_count) in enumerate(zip(all_ground_truth,\n", 1570 | " all_model_predictions,\n", 1571 | " invalid_number_prediction_counts), 1):\n", 1572 | " mae = mean_absolute_error(gts, predictions) ######### YOUR CODE HERE #############\n", 1573 | " print(\"Epoch: %i, MAE: %f, Invalid numbers: %i\" % (i, mae, invalid_number_prediction_count))" 1574 | ], 1575 | "execution_count": 42, 1576 | "outputs": [ 1577 | { 1578 | "output_type": "stream", 1579 | "text": [ 1580 | "Epoch: 1, MAE: 1241.154800, Invalid numbers: 0\n", 1581 | "Epoch: 2, MAE: 417.877900, Invalid numbers: 0\n", 1582 | "Epoch: 3, MAE: 393.824650, Invalid numbers: 0\n", 1583 | "Epoch: 4, MAE: 209.569350, Invalid numbers: 0\n", 1584 | "Epoch: 5, MAE: 185.149000, Invalid numbers: 0\n", 1585 | "Epoch: 6, MAE: 191.029700, Invalid numbers: 0\n", 1586 | "Epoch: 7, MAE: 180.715400, Invalid numbers: 0\n", 1587 | "Epoch: 8, MAE: 170.048550, Invalid numbers: 0\n", 1588 | "Epoch: 9, MAE: 161.782400, Invalid numbers: 0\n", 1589 | "Epoch: 10, MAE: 155.556250, Invalid numbers: 0\n" 1590 | ], 1591 | "name": "stdout" 1592 | } 1593 | ] 1594 | } 1595 | ] 1596 | } --------------------------------------------------------------------------------