├── README.md ├── utils.py ├── dialogue_manager.py ├── main_bot.py ├── week2-NER.ipynb └── week4-seq2seq.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Natural-Language-Processing---Coursera-Advanced-Machine-Learning 2 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import pickle 3 | import re 4 | import numpy as np 5 | 6 | nltk.download('stopwords') 7 | from nltk.corpus import stopwords 8 | 9 | # Paths for all resources for the bot. 10 | RESOURCE_PATH = { 11 | 'INTENT_RECOGNIZER': 'intent_recognizer.pkl', 12 | 'TAG_CLASSIFIER': 'tag_classifier.pkl', 13 | 'TFIDF_VECTORIZER': 'tfidf_vectorizer.pkl', 14 | 'THREAD_EMBEDDINGS_FOLDER': 'thread_embeddings_by_tags', 15 | 'WORD_EMBEDDINGS': 'word_embeddings.tsv', 16 | } 17 | 18 | 19 | def text_prepare(text): 20 | """Performs tokenization and simple preprocessing.""" 21 | 22 | replace_by_space_re = re.compile('[/(){}\[\]\|@,;]') 23 | bad_symbols_re = re.compile('[^0-9a-z #+_]') 24 | stopwords_set = set(stopwords.words('english')) 25 | 26 | text = text.lower() 27 | text = replace_by_space_re.sub(' ', text) 28 | text = bad_symbols_re.sub('', text) 29 | text = ' '.join([x for x in text.split() if x and x not in stopwords_set]) 30 | 31 | return text.strip() 32 | 33 | 34 | def load_embeddings(embeddings_path): 35 | """Loads pre-trained word embeddings from tsv file. 36 | 37 | Args: 38 | embeddings_path - path to the embeddings file. 39 | 40 | Returns: 41 | embeddings - dict mapping words to vectors; 42 | embeddings_dim - dimension of the vectors. 43 | """ 44 | 45 | # Hint: you have already implemented a similar routine in the 3rd assignment. 46 | # Note that here you also need to know the dimension of the loaded embeddings. 47 | # When you load the embeddings, use numpy.float32 type as dtype 48 | 49 | ######################## 50 | #### YOUR CODE HERE #### 51 | ######################## 52 | 53 | embeddings = {} 54 | for line in open(embeddings_path, encoding='utf-8'): 55 | word, *vec = line.strip().split('\t') 56 | embeddings_dim = len(vec) 57 | embeddings[word] = np.array(vec, dtype=np.float32) 58 | 59 | return embeddings, embeddings_dim 60 | 61 | def question_to_vec(question, embeddings, dim): 62 | """Transforms a string to an embedding by averaging word embeddings.""" 63 | 64 | # Hint: you have already implemented exactly this function in the 3rd assignment. 
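    # Illustrative expectation (an assumption based on the docstring above, not graded code):
    # with 2-dimensional embeddings {'deep': [1.0, 0.0], 'learning': [0.0, 1.0]},
    # question_to_vec('deep learning', embeddings, 2) should return the element-wise
    # average [0.5, 0.5]; words that are missing from `embeddings` are skipped, and an
    # all-zero vector is returned when no known words are found in the question.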
65 | 66 | ######################## 67 | #### YOUR CODE HERE #### 68 | ######################## 69 | 70 | result = np.zeros(dim, dtype=np.float32) 71 | count = 0 72 | for word in question.split(): 73 | if word in embeddings: 74 | result += embeddings[word] 75 | count += 1 76 | return result / count if count != 0 else result 77 | 78 | 79 | def unpickle_file(filename): 80 | """Returns the result of unpickling the file content.""" 81 | with open(filename, 'rb') as f: 82 | return pickle.load(f) 83 | -------------------------------------------------------------------------------- /dialogue_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sklearn.metrics.pairwise import pairwise_distances_argmin 3 | 4 | from chatterbot import ChatBot 5 | from chatterbot.trainers import ChatterBotCorpusTrainer 6 | from utils import * 7 | from sklearn.metrics.pairwise import cosine_similarity 8 | 9 | class ThreadRanker(object): 10 | def __init__(self, paths): 11 | self.word_embeddings, self.embeddings_dim = load_embeddings(paths['WORD_EMBEDDINGS']) 12 | self.thread_embeddings_folder = paths['THREAD_EMBEDDINGS_FOLDER'] 13 | 14 | def __load_embeddings_by_tag(self, tag_name): 15 | embeddings_path = os.path.join(self.thread_embeddings_folder, tag_name + ".pkl") 16 | thread_ids, thread_embeddings = unpickle_file(embeddings_path) 17 | return thread_ids, thread_embeddings 18 | 19 | def get_best_thread(self, question, tag_name): 20 | """ Returns id of the most similar thread for the question. 21 | The search is performed across the threads with a given tag. 22 | """ 23 | thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name) 24 | 25 | # HINT: you have already implemented a similar routine in the 3rd assignment. 26 | 27 | question_vec = question_to_vec(question, self.word_embeddings, self.embeddings_dim)[np.newaxis, :] 28 | best_thread = pairwise_distances_argmin(question_vec, thread_embeddings, metric='cosine')[0] 29 | 30 | return thread_ids[best_thread] 31 | 32 | 33 | class DialogueManager(object): 34 | def __init__(self, paths): 35 | print("Loading resources...") 36 | 37 | # Intent recognition: 38 | self.intent_recognizer = unpickle_file(paths['INTENT_RECOGNIZER']) 39 | self.tfidf_vectorizer = unpickle_file(paths['TFIDF_VECTORIZER']) 40 | 41 | self.ANSWER_TEMPLATE = 'I think its about %s\nThis thread might help you: https://stackoverflow.com/questions/%s' 42 | 43 | # Goal-oriented part: 44 | self.tag_classifier = unpickle_file(paths['TAG_CLASSIFIER']) 45 | self.thread_ranker = ThreadRanker(paths) 46 | 47 | self.create_chitchat_bot() 48 | 49 | def create_chitchat_bot(self): 50 | """Initializes self.chitchat_bot with some conversational model.""" 51 | 52 | # Hint: you might want to create and train chatterbot.ChatBot here. 53 | # It could be done by creating ChatBot with the *trainer* parameter equals 54 | # "chatterbot.trainers.ChatterBotCorpusTrainer" 55 | # and then calling *train* function with "chatterbot.corpus.english" param 56 | 57 | ######################## 58 | #### YOUR CODE HERE #### 59 | ######################## 60 | chatbot = ChatBot('Nannan') 61 | trainer = ChatterBotCorpusTrainer(chatbot) 62 | trainer.train('chatterbot.corpus.english') 63 | 64 | self.chitchat_bot = chatbot 65 | 66 | def generate_answer(self, question): 67 | """Combines stackoverflow and chitchat parts using intent recognition.""" 68 | 69 | # Recognize intent of the question using `intent_recognizer`. 
70 | # Don't forget to prepare question and calculate features for the question. 71 | 72 | prepared_question = text_prepare(question) 73 | features = self.tfidf_vectorizer.transform([prepared_question]) 74 | intent = self.intent_recognizer.predict(features)[0] 75 | 76 | # Chit-chat part: 77 | if intent == 'dialogue': 78 | # Pass question to chitchat_bot to generate a response. 79 | response = self.chitchat_bot.get_response(question) 80 | return response 81 | 82 | # Goal-oriented part: 83 | else: 84 | # Pass features to tag_classifier to get predictions. 85 | tag = self.tag_classifier.predict(features)[0] 86 | 87 | # Pass prepared_question to thread_ranker to get predictions. 88 | thread_id = self.thread_ranker.get_best_thread(prepared_question, tag) 89 | 90 | return self.ANSWER_TEMPLATE % (tag, thread_id) 91 | 92 | -------------------------------------------------------------------------------- /main_bot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import requests 4 | import time 5 | import argparse 6 | import os 7 | import json 8 | 9 | from requests.compat import urljoin 10 | from dialogue_manager import DialogueManager 11 | from utils import RESOURCE_PATH 12 | 13 | class BotHandler(object): 14 | """ 15 | BotHandler is a class which implements all back-end of the bot. 16 | It has tree main functions: 17 | 'get_updates' — checks for new messages 18 | 'send_message' – posts new message to user 19 | 'get_answer' — computes the most relevant on a user's question 20 | """ 21 | 22 | def __init__(self, token, dialogue_manager): 23 | self.token = token 24 | self.api_url = "https://api.telegram.org/bot{}/".format(token) 25 | self.dialogue_manager = dialogue_manager 26 | 27 | def get_updates(self, offset=None, timeout=30): 28 | params = {"timeout": timeout, "offset": offset} 29 | raw_resp = requests.get(urljoin(self.api_url, "getUpdates"), params) 30 | try: 31 | resp = raw_resp.json() 32 | except json.decoder.JSONDecodeError as e: 33 | print("Failed to parse response {}: {}.".format(raw_resp.content, e)) 34 | return [] 35 | 36 | if "result" not in resp: 37 | return [] 38 | return resp["result"] 39 | 40 | def send_message(self, chat_id, text): 41 | params = {"chat_id": chat_id, "text": text} 42 | return requests.post(urljoin(self.api_url, "sendMessage"), params) 43 | 44 | def get_answer(self, question): 45 | if question == '/start': 46 | return "Hi, I am your project bot. How can I help you today?" 47 | return self.dialogue_manager.generate_answer(question) 48 | 49 | 50 | def parse_args(): 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('--token', type=str, default='') 53 | return parser.parse_args() 54 | 55 | 56 | def is_unicode(text): 57 | return len(text) == len(text.encode()) 58 | 59 | 60 | class SimpleDialogueManager(object): 61 | """ 62 | This is the simplest dialogue manager to test the telegram bot. 63 | Your task is to create a more advanced one in dialogue_manager.py." 64 | """ 65 | 66 | def generate_answer(self, question): 67 | return "Hello, world!" 
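# A minimal offline smoke test (an illustrative sketch, not part of the assignment):
# BotHandler.get_answer never calls the Telegram API, so a dialogue manager can be
# exercised locally before a real token is configured, e.g.:
#
#   manager = SimpleDialogueManager()
#   bot = BotHandler(token="dummy-token", dialogue_manager=manager)
#   print(bot.get_answer("/start"))    # "Hi, I am your project bot. How can I help you today?"
#   print(bot.get_answer("anything"))  # "Hello, world!"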
68 | 69 | 70 | def main(): 71 | args = parse_args() 72 | token = args.token 73 | 74 | if not token: 75 | if not "TELEGRAM_TOKEN" in os.environ: 76 | print("Please, set bot token through --token or TELEGRAM_TOKEN env variable") 77 | return 78 | token = os.environ["TELEGRAM_TOKEN"] 79 | 80 | ################################################################# 81 | 82 | # Your task is to complete dialogue_manager.py and use your 83 | # advanced DialogueManager instead of SimpleDialogueManager. 84 | 85 | # This is the point where you plug it into the Telegram bot. 86 | # Do not forget to import all needed dependencies when you do so. 87 | 88 | #simple_manager = SimpleDialogueManager() 89 | #bot = BotHandler(token, simple_manager) 90 | 91 | dialogue_manager = DialogueManager(RESOURCE_PATH) 92 | bot = BotHandler(token, dialogue_manager) 93 | 94 | ############################################################### 95 | 96 | print("Ready to talk!") 97 | offset = 0 98 | while True: 99 | updates = bot.get_updates(offset=offset) 100 | for update in updates: 101 | print("An update received.") 102 | if "message" in update: 103 | chat_id = update["message"]["chat"]["id"] 104 | if "text" in update["message"]: 105 | text = update["message"]["text"] 106 | if is_unicode(text): 107 | print("Update content: {}".format(update)) 108 | bot.send_message(chat_id, bot.get_answer(update["message"]["text"])) 109 | else: 110 | bot.send_message(chat_id, "Hmm, you are sending some weird characters to me...") 111 | offset = max(offset, update['update_id'] + 1) 112 | time.sleep(1) 113 | 114 | if __name__ == "__main__": 115 | main() 116 | -------------------------------------------------------------------------------- /week2-NER.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Recognize named entities on Twitter with LSTMs\n", 10 | "\n", 11 | "In this assignment, you will use a recurrent neural network to solve Named Entity Recognition (NER) problem. NER is a common task in natural language processing systems. It serves for extraction such entities from the text as persons, organizations, locations, etc. In this task you will experiment to recognize named entities from Twitter.\n", 12 | "\n", 13 | "For example, we want to extract persons' and organizations' names from the text. Than for the input text:\n", 14 | "\n", 15 | " Ian Goodfellow works for Google Brain\n", 16 | "\n", 17 | "a NER model needs to provide the following sequence of tags:\n", 18 | "\n", 19 | " B-PER I-PER O O B-ORG I-ORG\n", 20 | "\n", 21 | "Where *B-* and *I-* prefixes stand for the beginning and inside of the entity, while *O* stands for out of tag or no tag. Markup with the prefix scheme is called *BIO markup*. 
This markup is introduced to distinguish consecutive entities of the same type.\n", 22 | "\n", 23 | "The solution will be based on neural networks, in particular on Bi-Directional Long Short-Term Memory Networks (Bi-LSTMs).\n", 24 | "\n", 25 | "### Libraries\n", 26 | "\n", 27 | "For this task you will need the following libraries:\n", 28 | " - [Tensorflow](https://www.tensorflow.org) — an open-source software library for Machine Intelligence.\n", 29 | " - [Numpy](http://www.numpy.org) — a package for scientific computing.\n", 30 | " \n", 31 | "If you have never worked with Tensorflow, you will probably need to read some tutorials while working on this assignment, e.g. [this one](https://www.tensorflow.org/tutorials/recurrent) could be a good starting point. " 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "### Data\n", 39 | "\n", 40 | "The following cell will download all data required for this assignment into the folder `week2/data`." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 1, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "File data\\train.txt is already downloaded.\n", 53 | "File data\\validation.txt is already downloaded.\n", 54 | "File data\\test.txt is already downloaded.\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "import sys\n", 60 | "sys.path.append(\"..\")\n", 61 | "from common.download_utils import download_week2_resources\n", 62 | "\n", 63 | "download_week2_resources()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "### Load the Twitter Named Entity Recognition corpus\n", 71 | "\n", 72 | "We will work with a corpus that contains tweets with NE tags. Every line of a file contains a pair of a token (word/punctuation symbol) and a tag, separated by whitespace. Different tweets are separated by an empty line.\n", 73 | "\n", 74 | "The function *read_data* reads a corpus from the *file_path* and returns two lists: one with tokens and one with the corresponding tags. You need to complete this function by adding code that replaces any user's nickname with the `<USR>` token and any URL with the `<URL>` token. You can assume that a URL is any string that starts with *http://* or *https://*, and a nickname is any string that starts with the *@* symbol."
75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 2, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "def read_data(file_path):\n", 84 | " tokens = []\n", 85 | " tags = []\n", 86 | " \n", 87 | " tweet_tokens = []\n", 88 | " tweet_tags = []\n", 89 | " for line in open(file_path, encoding='utf-8'):\n", 90 | " line = line.strip()\n", 91 | " if not line:\n", 92 | " if tweet_tokens:\n", 93 | " tokens.append(tweet_tokens)\n", 94 | " tags.append(tweet_tags)\n", 95 | " tweet_tokens = []\n", 96 | " tweet_tags = []\n", 97 | " else:\n", 98 | " token, tag = line.split()\n", 99 | " # Replace all urls with <URL> token\n", 100 | " # Replace all users with <USR> token\n", 101 | "\n", 102 | " ######################################\n", 103 | " ######### YOUR CODE HERE #############\n", 104 | " ######################################\n", 105 | " if \"http://\" in token or \"https://\" in token:\n", 106 | " token = \"<URL>\"\n", 107 | " if \"@\" in token:\n", 108 | " token = \"<USR>\"\n", 109 | " tweet_tokens.append(token)\n", 110 | " tweet_tags.append(tag)\n", 111 | " \n", 112 | " return tokens, tags" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "And now we can load three separate parts of the dataset:\n", 120 | " - *train* data for training the model;\n", 121 | " - *validation* data for evaluation and hyperparameter tuning;\n", 122 | " - *test* data for the final evaluation of the model." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 3, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "train_tokens, train_tags = read_data('data/train.txt')\n", 132 | "validation_tokens, validation_tags = read_data('data/validation.txt')\n", 133 | "test_tokens, test_tags = read_data('data/test.txt')" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "You should always understand what kind of data you deal with. 
For this purpose, you can print the data running the following cell:" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 4, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "RT\tO\n", 153 | "\tO\n", 154 | ":\tO\n", 155 | "Online\tO\n", 156 | "ticket\tO\n", 157 | "sales\tO\n", 158 | "for\tO\n", 159 | "Ghostland\tB-musicartist\n", 160 | "Observatory\tI-musicartist\n", 161 | "extended\tO\n", 162 | "until\tO\n", 163 | "6\tO\n", 164 | "PM\tO\n", 165 | "EST\tO\n", 166 | "due\tO\n", 167 | "to\tO\n", 168 | "high\tO\n", 169 | "demand\tO\n", 170 | ".\tO\n", 171 | "Get\tO\n", 172 | "them\tO\n", 173 | "before\tO\n", 174 | "they\tO\n", 175 | "sell\tO\n", 176 | "out\tO\n", 177 | "...\tO\n", 178 | "\n", 179 | "Apple\tB-product\n", 180 | "MacBook\tI-product\n", 181 | "Pro\tI-product\n", 182 | "A1278\tI-product\n", 183 | "13.3\tI-product\n", 184 | "\"\tI-product\n", 185 | "Laptop\tI-product\n", 186 | "-\tI-product\n", 187 | "MD101LL/A\tI-product\n", 188 | "(\tO\n", 189 | "June\tO\n", 190 | ",\tO\n", 191 | "2012\tO\n", 192 | ")\tO\n", 193 | "-\tO\n", 194 | "Full\tO\n", 195 | "read\tO\n", 196 | "by\tO\n", 197 | "eBay\tB-company\n", 198 | "\tO\n", 199 | "\tO\n", 200 | "\n", 201 | "Happy\tO\n", 202 | "Birthday\tO\n", 203 | "\tO\n", 204 | "!\tO\n", 205 | "May\tO\n", 206 | "Allah\tB-person\n", 207 | "s.w.t\tO\n", 208 | "bless\tO\n", 209 | "you\tO\n", 210 | "with\tO\n", 211 | "goodness\tO\n", 212 | "and\tO\n", 213 | "happiness\tO\n", 214 | ".\tO\n", 215 | "\n" 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "for i in range(3):\n", 221 | " for token, tag in zip(train_tokens[i], train_tags[i]):\n", 222 | " print('%s\\t%s' % (token, tag))\n", 223 | " print()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "### Prepare dictionaries\n", 231 | "\n", 232 | "To train a neural network, we will use two mappings: \n", 233 | "- {token}$\\to${token id}: address the row in embeddings matrix for the current token;\n", 234 | "- {tag}$\\to${tag id}: one-hot ground truth probability distribution vectors for computing the loss at the output of the network.\n", 235 | "\n", 236 | "Now you need to implement the function *build_dict* which will return {token or tag}$\\to${index} and vice versa. 
" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 5, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "from collections import defaultdict" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 6, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "from functools import reduce\n", 255 | "def build_dict(tokens_or_tags, special_tokens):\n", 256 | " \"\"\"\n", 257 | " tokens_or_tags: a list of lists of tokens or tags\n", 258 | " special_tokens: some special tokens\n", 259 | " \"\"\"\n", 260 | " # Create a dictionary with default value 0\n", 261 | " tok2idx = defaultdict(lambda: 0)\n", 262 | " tokens_or_tags = reduce(lambda x,y: x+y,tokens_or_tags)\n", 263 | " idx2tok = []\n", 264 | " for i, token in enumerate(special_tokens):\n", 265 | " tok2idx[token] = i\n", 266 | " idx2tok.append(token)\n", 267 | " token_set = set(tokens_or_tags)\n", 268 | " unique_tokens_or_tags = list(token_set)\n", 269 | " for i, token in enumerate((unique_tokens_or_tags)):\n", 270 | " tok2idx[token] = i+len(special_tokens)\n", 271 | " idx2tok.append(token)\n", 272 | " # Create mappings from tokens (or tags) to indices and vice versa.\n", 273 | " # At first, add special tokens (or tags) to the dictionaries.\n", 274 | " # The first special token must have index 0.\n", 275 | " \n", 276 | " # Mapping tok2idx should contain each token or tag only once. \n", 277 | " # To do so, you should:\n", 278 | " # 1. extract unique tokens/tags from the tokens_or_tags variable, which is not\n", 279 | " # occur in special_tokens (because they could have non-empty intersection)\n", 280 | " # 2. index them (for example, you can add them into the list idx2tok\n", 281 | " # 3. for each token/tag save the index into tok2idx).\n", 282 | " \n", 283 | " ######################################\n", 284 | " ######### YOUR CODE HERE #############\n", 285 | " ######################################\n", 286 | " \n", 287 | " return tok2idx, idx2tok" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "After implementing the function *build_dict* you can make dictionaries for tokens and tags. Special tokens in our case will be:\n", 295 | " - `` token for out of vocabulary tokens;\n", 296 | " - `` token for padding sentence to the same length when we create batches of sentences." 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 7, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "special_tokens = ['', '']\n", 306 | "special_tags = ['O']\n", 307 | "\n", 308 | "# Create dictionaries \n", 309 | "token2idx, idx2token = build_dict(train_tokens + validation_tokens, special_tokens)\n", 310 | "tag2idx, idx2tag = build_dict(train_tags, special_tags)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "The next additional functions will help you to create the mapping between tokens and ids for a sentence. 
" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 8, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "def words2idxs(tokens_list):\n", 327 | " return [token2idx[word] for word in tokens_list]\n", 328 | "\n", 329 | "def tags2idxs(tags_list):\n", 330 | " return [tag2idx[tag] for tag in tags_list]\n", 331 | "\n", 332 | "def idxs2words(idxs):\n", 333 | " return [idx2token[idx] for idx in idxs]\n", 334 | "\n", 335 | "def idxs2tags(idxs):\n", 336 | " return [idx2tag[idx] for idx in idxs]" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "### Generate batches\n", 344 | "\n", 345 | "Neural Networks are usually trained with batches. It means that weight updates of the network are based on several sequences at every single time. The tricky part is that all sequences within a batch need to have the same length. So we will pad them with a special `` token. It is also a good practice to provide RNN with sequence lengths, so it can skip computations for padding parts. We provide the batching function *batches_generator* readily available for you to save time. " 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 9, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "def batches_generator(batch_size, tokens, tags,\n", 355 | " shuffle=True, allow_smaller_last_batch=True):\n", 356 | " \"\"\"Generates padded batches of tokens and tags.\"\"\"\n", 357 | " \n", 358 | " n_samples = len(tokens)\n", 359 | " if shuffle:\n", 360 | " order = np.random.permutation(n_samples)\n", 361 | " else:\n", 362 | " order = np.arange(n_samples)\n", 363 | "\n", 364 | " n_batches = n_samples // batch_size\n", 365 | " if allow_smaller_last_batch and n_samples % batch_size:\n", 366 | " n_batches += 1\n", 367 | "\n", 368 | " for k in range(n_batches):\n", 369 | " batch_start = k * batch_size\n", 370 | " batch_end = min((k + 1) * batch_size, n_samples)\n", 371 | " current_batch_size = batch_end - batch_start\n", 372 | " x_list = []\n", 373 | " y_list = []\n", 374 | " max_len_token = 0\n", 375 | " for idx in order[batch_start: batch_end]:\n", 376 | " x_list.append(words2idxs(tokens[idx]))\n", 377 | " y_list.append(tags2idxs(tags[idx]))\n", 378 | " max_len_token = max(max_len_token, len(tags[idx]))\n", 379 | " \n", 380 | " # Fill in the data into numpy nd-arrays filled with padding indices.\n", 381 | " x = np.ones([current_batch_size, max_len_token], dtype=np.int32) * token2idx['']\n", 382 | " y = np.ones([current_batch_size, max_len_token], dtype=np.int32) * tag2idx['O']\n", 383 | " lengths = np.zeros(current_batch_size, dtype=np.int32)\n", 384 | " for n in range(current_batch_size):\n", 385 | " utt_len = len(x_list[n])\n", 386 | " x[n, :utt_len] = x_list[n]\n", 387 | " lengths[n] = utt_len\n", 388 | " y[n, :utt_len] = y_list[n]\n", 389 | " yield x, y, lengths" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "## Build a recurrent neural network\n", 397 | "\n", 398 | "This is the most important part of the assignment. Here we will specify the network architecture based on TensorFlow building blocks. It's fun and easy as a lego constructor! We will create an LSTM network which will produce probability distribution over tags for each token in a sentence. To take into account both right and left contexts of the token, we will use Bi-Directional LSTM (Bi-LSTM). Dense layer will be used on top to perform tag classification. 
" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 10, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "import tensorflow as tf\n", 408 | "import numpy as np" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 11, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "class BiLSTMModel():\n", 418 | " pass" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": { 424 | "collapsed": true 425 | }, 426 | "source": [ 427 | "First, we need to create [placeholders](https://www.tensorflow.org/versions/master/api_docs/python/tf/placeholder) to specify what data we are going to feed into the network during the execution time. For this task we will need the following placeholders:\n", 428 | " - *input_batch* — sequences of words (the shape equals to [batch_size, sequence_len]);\n", 429 | " - *ground_truth_tags* — sequences of tags (the shape equals to [batch_size, sequence_len]);\n", 430 | " - *lengths* — lengths of not padded sequences (the shape equals to [batch_size]);\n", 431 | " - *dropout_ph* — dropout keep probability; this placeholder has a predefined value 1;\n", 432 | " - *learning_rate_ph* — learning rate; we need this placeholder because we want to change the value during training.\n", 433 | "\n", 434 | "It could be noticed that we use *None* in the shapes in the declaration, which means that data of any size can be feeded. \n", 435 | "\n", 436 | "You need to complete the function *declare_placeholders*." 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 12, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "def declare_placeholders(self):\n", 446 | " \"\"\"Specifies placeholders for the model.\"\"\"\n", 447 | "\n", 448 | " # Placeholders for input and ground truth output.\n", 449 | " self.input_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch') \n", 450 | " self.ground_truth_tags = tf.placeholder(dtype=tf.int32, shape=[None, None], name='ground_truth_tags') \n", 451 | " \n", 452 | " # Placeholder for lengths of the sequences.\n", 453 | " self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths') \n", 454 | " \n", 455 | " # Placeholder for a dropout keep probability. If we don't feed\n", 456 | " # a value for this placeholder, it will be equal to 1.0.\n", 457 | " self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])\n", 458 | " \n", 459 | " # Placeholder for a learning rate (tf.float32).\n", 460 | " self.learning_rate_ph =tf.placeholder(dtype=tf.float32, shape=[])" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 13, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | "BiLSTMModel.__declare_placeholders = classmethod(declare_placeholders)" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": { 475 | "collapsed": true 476 | }, 477 | "source": [ 478 | "Now, let us specify the layers of the neural network. First, we need to perform some preparatory steps: \n", 479 | " \n", 480 | "- Create embeddings matrix with [tf.Variable](https://www.tensorflow.org/api_docs/python/tf/Variable). Specify its name (*embeddings_matrix*), type (*tf.float32*), and initialize with random values.\n", 481 | "- Create forward and backward LSTM cells. TensorFlow provides a number of RNN cells ready for you. We suggest that you use *LSTMCell*, but you can also experiment with other types, e.g. GRU cells. 
[This](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) blogpost could be interesting if you want to learn more about the differences.\n", 482 | "- Wrap your cells with [DropoutWrapper](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/DropoutWrapper). Dropout is an important regularization technique for neural networks. Specify all keep probabilities using the dropout placeholder that we created before.\n", 483 | " \n", 484 | "After that, you can build the computation graph that transforms an input_batch:\n", 485 | "\n", 486 | "- [Look up](https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup) embeddings for an *input_batch* in the prepared *embedding_matrix*.\n", 487 | "- Pass the embeddings through [Bidirectional Dynamic RNN](https://www.tensorflow.org/api_docs/python/tf/nn/bidirectional_dynamic_rnn) with the specified forward and backward cells. Use the lengths placeholder here to avoid computations for padding tokens inside the RNN.\n", 488 | "- Create a dense layer on top. Its output will be used directly in loss function. \n", 489 | " \n", 490 | "Fill in the code below. In case you need to debug something, the easiest way is to check that tensor shapes of each step match the expected ones. \n", 491 | " " 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 14, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "def build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):\n", 501 | " \"\"\"Specifies bi-LSTM architecture and computes logits for inputs.\"\"\"\n", 502 | " \n", 503 | " # Create embedding variable (tf.Variable) with dtype tf.float32\n", 504 | " initial_embedding_matrix = (np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim))\n", 505 | " embedding_matrix_variable = tf.Variable(initial_embedding_matrix, dtype=tf.float32)\n", 506 | " \n", 507 | " # Create RNN cells (for example, tf.nn.rnn_cell.BasicLSTMCell) with n_hidden_rnn number of units \n", 508 | " # and dropout (tf.nn.rnn_cell.DropoutWrapper), initializing all *_keep_prob with dropout placeholder.\n", 509 | " forward_cell = lstm_cell = tf.contrib.rnn.LSTMCell(n_hidden_rnn) \n", 510 | " forward_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=n_hidden_rnn, forget_bias=1.0),\n", 511 | " input_keep_prob=self.dropout_ph,\n", 512 | " output_keep_prob=self.dropout_ph,\n", 513 | " state_keep_prob=self.dropout_ph)\n", 514 | " \n", 515 | " backward_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=n_hidden_rnn, forget_bias=1.0),\n", 516 | " input_keep_prob=self.dropout_ph,\n", 517 | " output_keep_prob=self.dropout_ph,\n", 518 | " state_keep_prob=self.dropout_ph)\n", 519 | "\n", 520 | " # Look up embeddings for self.input_batch (tf.nn.embedding_lookup).\n", 521 | " # Shape: [batch_size, sequence_len, embedding_dim].\n", 522 | " embeddings = tf.nn.embedding_lookup(embedding_matrix_variable, self.input_batch)\n", 523 | " \n", 524 | " # Pass them through Bidirectional Dynamic RNN (tf.nn.bidirectional_dynamic_rnn).\n", 525 | " # Shape: [batch_size, sequence_len, 2 * n_hidden_rnn]. 
\n", 526 | " # Also don't forget to initialize sequence_length as self.lengths and dtype as tf.float32.\n", 527 | " (rnn_output_fw, rnn_output_bw), _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=forward_cell,\n", 528 | " cell_bw=backward_cell,\n", 529 | " inputs=embeddings,\n", 530 | " dtype=tf.float32,\n", 531 | " sequence_length=self.lengths)\n", 532 | " rnn_output = tf.concat([rnn_output_fw, rnn_output_bw], axis=2)\n", 533 | "\n", 534 | " # Dense layer on top.\n", 535 | " # Shape: [batch_size, sequence_len, n_tags]. \n", 536 | " self.logits = tf.layers.dense(rnn_output, n_tags, activation=None)" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 15, 542 | "metadata": {}, 543 | "outputs": [], 544 | "source": [ 545 | "BiLSTMModel.__build_layers = classmethod(build_layers)" 546 | ] 547 | }, 548 | { 549 | "cell_type": "markdown", 550 | "metadata": {}, 551 | "source": [ 552 | "To compute the actual predictions of the neural network, you need to apply [softmax](https://www.tensorflow.org/api_docs/python/tf/nn/softmax) to the last layer and find the most probable tags with [argmax](https://www.tensorflow.org/api_docs/python/tf/argmax)." 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": 16, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "def compute_predictions(self):\n", 562 | " \"\"\"Transforms logits to probabilities and finds the most probable tags.\"\"\"\n", 563 | " \n", 564 | " # Create softmax (tf.nn.softmax) function\n", 565 | " softmax_output = tf.nn.softmax(logits=self.logits)\n", 566 | " \n", 567 | " # Use argmax (tf.argmax) to get the most probable tags\n", 568 | " # Don't forget to set axis=-1\n", 569 | " # otherwise argmax will be calculated in a wrong way\n", 570 | " self.predictions = tf.argmax(softmax_output, axis=-1)" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 17, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "BiLSTMModel.__compute_predictions = classmethod(compute_predictions)" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": { 585 | "collapsed": true 586 | }, 587 | "source": [ 588 | "During training we do not need predictions of the network, but we need a loss function. We will use [cross-entropy loss](http://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html#cross-entropy), efficiently implemented in TF as \n", 589 | "[cross entropy with logits](https://www.tensorflow.org/api_docs/python/tf/nn/softmax_cross_entropy_with_logits_v2). Note that it should be applied to logits of the model (not to softmax probabilities!). Also note, that we do not want to take into account loss terms coming from `` tokens. So we need to mask them out, before computing [mean](https://www.tensorflow.org/api_docs/python/tf/reduce_mean)." 
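To make the masking idea concrete, here is a plain-NumPy illustration of a masked average (an illustrative sketch only, not the graded TensorFlow code): positions that correspond to padding contribute nothing to the loss.

```python
import numpy as np

per_token_loss = np.array([[0.2, 0.9, 0.4],   # cross-entropy value for each token
                           [0.5, 0.1, 0.3]])
mask = np.array([[1.0, 1.0, 0.0],             # 1 for real tokens, 0 for padding
                 [1.0, 1.0, 1.0]])

masked_mean = np.sum(per_token_loss * mask) / np.sum(mask)
print(masked_mean)  # 0.4, averaged over the five real tokens only
```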
590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 18, 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "def compute_loss(self, n_tags, PAD_index):\n", 599 | " \"\"\"Computes masked cross-entopy loss with logits.\"\"\"\n", 600 | " \n", 601 | " # Create cross entropy function function (tf.nn.softmax_cross_entropy_with_logits_v2)\n", 602 | " ground_truth_tags_one_hot = tf.one_hot(self.ground_truth_tags, n_tags)\n", 603 | " loss_tensor = tf.nn.softmax_cross_entropy_with_logits_v2(labels=ground_truth_tags_one_hot,\n", 604 | " logits=self.logits)\n", 605 | " \n", 606 | " mask = tf.cast(tf.not_equal(self.input_batch, PAD_index), tf.float32)\n", 607 | " # Create loss function which doesn't operate with tokens (tf.reduce_mean)\n", 608 | " # Be careful that the argument of tf.reduce_mean should be\n", 609 | " # multiplication of mask and loss_tensor.\n", 610 | " loss = tf.boolean_mask(loss_tensor, mask) \n", 611 | " self.loss = tf.reduce_mean(tf.cast(loss, tf.float32), keepdims=False)" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 19, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [ 620 | "BiLSTMModel.__compute_loss = classmethod(compute_loss)" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": {}, 626 | "source": [ 627 | "The last thing to specify is how we want to optimize the loss. \n", 628 | "We suggest that you use [Adam](https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer) optimizer with a learning rate from the corresponding placeholder. \n", 629 | "You will also need to apply clipping to eliminate exploding gradients. It can be easily done with [clip_by_norm](https://www.tensorflow.org/api_docs/python/tf/clip_by_norm) function. " 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 20, 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "def perform_optimization(self):\n", 639 | " \"\"\"Specifies the optimizer and train_op for the model.\"\"\"\n", 640 | " \n", 641 | " # Create an optimizer (tf.train.AdamOptimizer)\n", 642 | " self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)\n", 643 | " self.grads_and_vars = self.optimizer.compute_gradients(self.loss)\n", 644 | " \n", 645 | " # Gradient clipping (tf.clip_by_norm) for self.grads_and_vars\n", 646 | " # Pay attention that you need to apply this operation only for gradients \n", 647 | " # because self.grads_and_vars also contains variables.\n", 648 | " # list comprehension might be useful in this case.\n", 649 | " clip_norm = tf.cast(1.0, tf.float32)\n", 650 | " grads_and_vars = []\n", 651 | " for gradient, variable in self.grads_and_vars:\n", 652 | " grads_and_vars.append((tf.clip_by_norm(gradient, clip_norm), variable))\n", 653 | " \n", 654 | " self.grads_and_vars = grads_and_vars \n", 655 | " \n", 656 | " self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 21, 662 | "metadata": {}, 663 | "outputs": [], 664 | "source": [ 665 | "BiLSTMModel.__perform_optimization = classmethod(perform_optimization)" 666 | ] 667 | }, 668 | { 669 | "cell_type": "markdown", 670 | "metadata": { 671 | "collapsed": true 672 | }, 673 | "source": [ 674 | "Congratulations! You have specified all the parts of your network. 
You may have noticed, that we didn't deal with any real data yet, so what you have written is just recipes on how the network should function.\n", 675 | "Now we will put them to the constructor of our Bi-LSTM class to use it in the next section. " 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": 22, 681 | "metadata": {}, 682 | "outputs": [], 683 | "source": [ 684 | "def init_model(self, vocabulary_size, n_tags, embedding_dim, n_hidden_rnn, PAD_index):\n", 685 | " self.__declare_placeholders()\n", 686 | " self.__build_layers(vocabulary_size, embedding_dim, n_hidden_rnn, n_tags)\n", 687 | " self.__compute_predictions()\n", 688 | " self.__compute_loss(n_tags, PAD_index)\n", 689 | " self.__perform_optimization()" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": 23, 695 | "metadata": {}, 696 | "outputs": [], 697 | "source": [ 698 | "BiLSTMModel.__init__ = classmethod(init_model)" 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "metadata": {}, 704 | "source": [ 705 | "## Train the network and predict tags" 706 | ] 707 | }, 708 | { 709 | "cell_type": "markdown", 710 | "metadata": { 711 | "collapsed": true 712 | }, 713 | "source": [ 714 | "[Session.run](https://www.tensorflow.org/api_docs/python/tf/Session#run) is a point which initiates computations in the graph that we have defined. To train the network, we need to compute *self.train_op*, which was declared in *perform_optimization*. To predict tags, we just need to compute *self.predictions*. Anyway, we need to feed actual data through the placeholders that we defined before. " 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": 24, 720 | "metadata": {}, 721 | "outputs": [], 722 | "source": [ 723 | "def train_on_batch(self, session, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability):\n", 724 | " feed_dict = {self.input_batch: x_batch,\n", 725 | " self.ground_truth_tags: y_batch,\n", 726 | " self.learning_rate_ph: learning_rate,\n", 727 | " self.dropout_ph: dropout_keep_probability,\n", 728 | " self.lengths: lengths}\n", 729 | " \n", 730 | " session.run(self.train_op, feed_dict=feed_dict)" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": 25, 736 | "metadata": {}, 737 | "outputs": [], 738 | "source": [ 739 | "BiLSTMModel.train_on_batch = classmethod(train_on_batch)" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": {}, 745 | "source": [ 746 | "Implement the function *predict_for_batch* by initializing *feed_dict* with input *x_batch* and *lengths* and running the *session* for *self.predictions*." 
747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": 26, 752 | "metadata": {}, 753 | "outputs": [], 754 | "source": [ 755 | "def predict_for_batch(self, session, x_batch, lengths):\n", 756 | " ######################################\n", 757 | " ######### YOUR CODE HERE #############\n", 758 | " ######################################\n", 759 | " predictions = session.run(self.predictions,\n", 760 | " feed_dict={self.input_batch: x_batch, self.lengths: lengths})\n", 761 | " return predictions" 762 | ] 763 | }, 764 | { 765 | "cell_type": "code", 766 | "execution_count": 27, 767 | "metadata": {}, 768 | "outputs": [], 769 | "source": [ 770 | "BiLSTMModel.predict_for_batch = classmethod(predict_for_batch)" 771 | ] 772 | }, 773 | { 774 | "cell_type": "markdown", 775 | "metadata": {}, 776 | "source": [ 777 | "We finished with necessary methods of our BiLSTMModel model and almost ready to start experimenting.\n", 778 | "\n", 779 | "### Evaluation \n", 780 | "To simplify the evaluation process we provide two functions for you:\n", 781 | " - *predict_tags*: uses a model to get predictions and transforms indices to tokens and tags;\n", 782 | " - *eval_conll*: calculates precision, recall and F1 for the results." 783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": 28, 788 | "metadata": {}, 789 | "outputs": [], 790 | "source": [ 791 | "from evaluation import precision_recall_f1" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": 29, 797 | "metadata": {}, 798 | "outputs": [], 799 | "source": [ 800 | "def predict_tags(model, session, token_idxs_batch, lengths):\n", 801 | " \"\"\"Performs predictions and transforms indices to tokens and tags.\"\"\"\n", 802 | " \n", 803 | " tag_idxs_batch = model.predict_for_batch(session, token_idxs_batch, lengths)\n", 804 | " \n", 805 | " tags_batch, tokens_batch = [], []\n", 806 | " for tag_idxs, token_idxs in zip(tag_idxs_batch, token_idxs_batch):\n", 807 | " tags, tokens = [], []\n", 808 | " for tag_idx, token_idx in zip(tag_idxs, token_idxs):\n", 809 | " tags.append(idx2tag[tag_idx])\n", 810 | " tokens.append(idx2token[token_idx])\n", 811 | " tags_batch.append(tags)\n", 812 | " tokens_batch.append(tokens)\n", 813 | " return tags_batch, tokens_batch\n", 814 | " \n", 815 | " \n", 816 | "def eval_conll(model, session, tokens, tags, short_report=True):\n", 817 | " \"\"\"Computes NER quality measures using CONLL shared task script.\"\"\"\n", 818 | " \n", 819 | " y_true, y_pred = [], []\n", 820 | " for x_batch, y_batch, lengths in batches_generator(1, tokens, tags):\n", 821 | " tags_batch, tokens_batch = predict_tags(model, session, x_batch, lengths)\n", 822 | " if len(x_batch[0]) != len(tags_batch[0]):\n", 823 | " raise Exception(\"Incorrect length of prediction for the input, \"\n", 824 | " \"expected length: %i, got: %i\" % (len(x_batch[0]), len(tags_batch[0])))\n", 825 | " predicted_tags = []\n", 826 | " ground_truth_tags = []\n", 827 | " for gt_tag_idx, pred_tag, token in zip(y_batch[0], tags_batch[0], tokens_batch[0]): \n", 828 | " if token != '':\n", 829 | " ground_truth_tags.append(idx2tag[gt_tag_idx])\n", 830 | " predicted_tags.append(pred_tag)\n", 831 | "\n", 832 | " # We extend every prediction and ground truth sequence with 'O' tag\n", 833 | " # to indicate a possible end of entity.\n", 834 | " y_true.extend(ground_truth_tags + ['O'])\n", 835 | " y_pred.extend(predicted_tags + ['O'])\n", 836 | " \n", 837 | " results = precision_recall_f1(y_true, y_pred, print_results=True, 
short_report=short_report)\n", 838 | " return results" 839 | ] 840 | }, 841 | { 842 | "cell_type": "markdown", 843 | "metadata": {}, 844 | "source": [ 845 | "## Run your experiment" 846 | ] 847 | }, 848 | { 849 | "cell_type": "markdown", 850 | "metadata": {}, 851 | "source": [ 852 | "Create *BiLSTMModel* model with the following parameters:\n", 853 | " - *vocabulary_size* — number of tokens;\n", 854 | " - *n_tags* — number of tags;\n", 855 | " - *embedding_dim* — dimension of embeddings, recommended value: 200;\n", 856 | " - *n_hidden_rnn* — size of hidden layers for RNN, recommended value: 200;\n", 857 | " - *PAD_index* — an index of the padding token (``).\n", 858 | "\n", 859 | "Set hyperparameters. You might want to start with the following recommended values:\n", 860 | "- *batch_size*: 32;\n", 861 | "- 4 epochs;\n", 862 | "- starting value of *learning_rate*: 0.005\n", 863 | "- *learning_rate_decay*: a square root of 2;\n", 864 | "- *dropout_keep_probability*: try several values: 0.1, 0.5, 0.9.\n", 865 | "\n", 866 | "However, feel free to conduct more experiments to tune hyperparameters and earn extra points for the assignment." 867 | ] 868 | }, 869 | { 870 | "cell_type": "code", 871 | "execution_count": 30, 872 | "metadata": {}, 873 | "outputs": [ 874 | { 875 | "name": "stderr", 876 | "output_type": "stream", 877 | "text": [ 878 | "c:\\users\\matcha.11\\appdata\\local\\continuum\\anaconda3\\envs\\tf_gpu\\lib\\site-packages\\tensorflow\\python\\ops\\gradients_impl.py:112: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n", 879 | " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n" 880 | ] 881 | } 882 | ], 883 | "source": [ 884 | "tf.reset_default_graph()\n", 885 | "\n", 886 | "model = BiLSTMModel(vocabulary_size=len(token2idx)+2,\n", 887 | " n_tags=len(tag2idx),\n", 888 | " embedding_dim=400, \n", 889 | " n_hidden_rnn=400,\n", 890 | " PAD_index=token2idx[''])\n", 891 | "\n", 892 | "batch_size = 32\n", 893 | "n_epochs = 6\n", 894 | "learning_rate = 0.005\n", 895 | "learning_rate_decay = np.sqrt(2)\n", 896 | "dropout_keep_probability = 0.55" 897 | ] 898 | }, 899 | { 900 | "cell_type": "markdown", 901 | "metadata": {}, 902 | "source": [ 903 | "If you got an error *\"Tensor conversion requested dtype float64 for Tensor with dtype float32\"* in this point, check if there are variables without dtype initialised. Set the value of dtype equals to *tf.float32* for such variables." 904 | ] 905 | }, 906 | { 907 | "cell_type": "markdown", 908 | "metadata": {}, 909 | "source": [ 910 | "Finally, we are ready to run the training!" 911 | ] 912 | }, 913 | { 914 | "cell_type": "code", 915 | "execution_count": 31, 916 | "metadata": {}, 917 | "outputs": [ 918 | { 919 | "name": "stdout", 920 | "output_type": "stream", 921 | "text": [ 922 | "Start training... 
\n", 923 | "\n", 924 | "-------------------- Epoch 1 of 6 --------------------\n", 925 | "Train data evaluation:\n", 926 | "processed 105778 tokens with 4489 phrases; found: 76467 phrases; correct: 197.\n", 927 | "\n", 928 | "precision: 0.26%; recall: 4.39%; F1: 0.49\n", 929 | "\n", 930 | "Validation data evaluation:\n", 931 | "processed 12836 tokens with 537 phrases; found: 9464 phrases; correct: 17.\n", 932 | "\n", 933 | "precision: 0.18%; recall: 3.17%; F1: 0.34\n", 934 | "\n", 935 | "-------------------- Epoch 2 of 6 --------------------\n", 936 | "Train data evaluation:\n", 937 | "processed 105778 tokens with 4489 phrases; found: 2964 phrases; correct: 1360.\n", 938 | "\n", 939 | "precision: 45.88%; recall: 30.30%; F1: 36.50\n", 940 | "\n", 941 | "Validation data evaluation:\n", 942 | "processed 12836 tokens with 537 phrases; found: 225 phrases; correct: 103.\n", 943 | "\n", 944 | "precision: 45.78%; recall: 19.18%; F1: 27.03\n", 945 | "\n", 946 | "-------------------- Epoch 3 of 6 --------------------\n", 947 | "Train data evaluation:\n", 948 | "processed 105778 tokens with 4489 phrases; found: 4368 phrases; correct: 3000.\n", 949 | "\n", 950 | "precision: 68.68%; recall: 66.83%; F1: 67.74\n", 951 | "\n", 952 | "Validation data evaluation:\n", 953 | "processed 12836 tokens with 537 phrases; found: 347 phrases; correct: 170.\n", 954 | "\n", 955 | "precision: 48.99%; recall: 31.66%; F1: 38.46\n", 956 | "\n", 957 | "-------------------- Epoch 4 of 6 --------------------\n", 958 | "Train data evaluation:\n", 959 | "processed 105778 tokens with 4489 phrases; found: 4656 phrases; correct: 3897.\n", 960 | "\n", 961 | "precision: 83.70%; recall: 86.81%; F1: 85.23\n", 962 | "\n", 963 | "Validation data evaluation:\n", 964 | "processed 12836 tokens with 537 phrases; found: 435 phrases; correct: 199.\n", 965 | "\n", 966 | "precision: 45.75%; recall: 37.06%; F1: 40.95\n", 967 | "\n", 968 | "-------------------- Epoch 5 of 6 --------------------\n", 969 | "Train data evaluation:\n", 970 | "processed 105778 tokens with 4489 phrases; found: 4607 phrases; correct: 4233.\n", 971 | "\n", 972 | "precision: 91.88%; recall: 94.30%; F1: 93.07\n", 973 | "\n", 974 | "Validation data evaluation:\n", 975 | "processed 12836 tokens with 537 phrases; found: 444 phrases; correct: 204.\n", 976 | "\n", 977 | "precision: 45.95%; recall: 37.99%; F1: 41.59\n", 978 | "\n", 979 | "-------------------- Epoch 6 of 6 --------------------\n", 980 | "Train data evaluation:\n", 981 | "processed 105778 tokens with 4489 phrases; found: 4521 phrases; correct: 4327.\n", 982 | "\n", 983 | "precision: 95.71%; recall: 96.39%; F1: 96.05\n", 984 | "\n", 985 | "Validation data evaluation:\n", 986 | "processed 12836 tokens with 537 phrases; found: 382 phrases; correct: 198.\n", 987 | "\n", 988 | "precision: 51.83%; recall: 36.87%; F1: 43.09\n", 989 | "\n", 990 | "...training finished.\n" 991 | ] 992 | } 993 | ], 994 | "source": [ 995 | "sess = tf.Session()\n", 996 | "sess.run(tf.global_variables_initializer())\n", 997 | "\n", 998 | "print('Start training... 
\\n')\n", 999 | "for epoch in range(n_epochs):\n", 1000 | " # For each epoch evaluate the model on train and validation data\n", 1001 | " print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(n_epochs) + '-' * 20)\n", 1002 | " print('Train data evaluation:')\n", 1003 | " eval_conll(model, sess, train_tokens, train_tags, short_report=True)\n", 1004 | " print('Validation data evaluation:')\n", 1005 | " eval_conll(model, sess, validation_tokens, validation_tags, short_report=True)\n", 1006 | " \n", 1007 | " # Train the model\n", 1008 | " for x_batch, y_batch, lengths in batches_generator(batch_size, train_tokens, train_tags):\n", 1009 | " model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability)\n", 1010 | " \n", 1011 | " # Decaying the learning rate\n", 1012 | " learning_rate = learning_rate / learning_rate_decay\n", 1013 | " \n", 1014 | "print('...training finished.')" 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "markdown", 1019 | "metadata": {}, 1020 | "source": [ 1021 | "Now let us see full quality reports for the final model on train, validation, and test sets. To give you a hint whether you have implemented everything correctly, you might expect F-score about 40% on the validation set.\n", 1022 | "\n", 1023 | "**The output of the cell below (as well as the output of all the other cells) should be present in the notebook for peer2peer review!**" 1024 | ] 1025 | }, 1026 | { 1027 | "cell_type": "code", 1028 | "execution_count": 32, 1029 | "metadata": {}, 1030 | "outputs": [ 1031 | { 1032 | "name": "stdout", 1033 | "output_type": "stream", 1034 | "text": [ 1035 | "-------------------- Train set quality: --------------------\n", 1036 | "processed 105778 tokens with 4489 phrases; found: 4543 phrases; correct: 4405.\n", 1037 | "\n", 1038 | "precision: 96.96%; recall: 98.13%; F1: 97.54\n", 1039 | "\n", 1040 | "\t company: precision: 97.70%; recall: 98.91%; F1: 98.30; predicted: 651\n", 1041 | "\n", 1042 | "\t facility: precision: 92.73%; recall: 97.45%; F1: 95.03; predicted: 330\n", 1043 | "\n", 1044 | "\t geo-loc: precision: 98.12%; recall: 99.30%; F1: 98.70; predicted: 1008\n", 1045 | "\n", 1046 | "\t movie: precision: 100.00%; recall: 100.00%; F1: 100.00; predicted: 68\n", 1047 | "\n", 1048 | "\t musicartist: precision: 96.20%; recall: 98.28%; F1: 97.23; predicted: 237\n", 1049 | "\n", 1050 | "\t other: precision: 96.62%; recall: 98.28%; F1: 97.45; predicted: 770\n", 1051 | "\n", 1052 | "\t person: precision: 99.33%; recall: 99.77%; F1: 99.55; predicted: 890\n", 1053 | "\n", 1054 | "\t product: precision: 98.75%; recall: 99.37%; F1: 99.06; predicted: 320\n", 1055 | "\n", 1056 | "\t sportsteam: precision: 99.06%; recall: 96.77%; F1: 97.90; predicted: 212\n", 1057 | "\n", 1058 | "\t tvshow: precision: 42.11%; recall: 41.38%; F1: 41.74; predicted: 57\n", 1059 | "\n", 1060 | "-------------------- Validation set quality: --------------------\n", 1061 | "processed 12836 tokens with 537 phrases; found: 432 phrases; correct: 200.\n", 1062 | "\n", 1063 | "precision: 46.30%; recall: 37.24%; F1: 41.28\n", 1064 | "\n", 1065 | "\t company: precision: 64.04%; recall: 54.81%; F1: 59.07; predicted: 89\n", 1066 | "\n", 1067 | "\t facility: precision: 46.43%; recall: 38.24%; F1: 41.94; predicted: 28\n", 1068 | "\n", 1069 | "\t geo-loc: precision: 67.50%; recall: 47.79%; F1: 55.96; predicted: 80\n", 1070 | "\n", 1071 | "\t movie: precision: 0.00%; recall: 0.00%; F1: 0.00; predicted: 19\n", 1072 | "\n", 1073 | "\t musicartist: precision: 25.00%; recall: 17.86%; 
F1: 20.83; predicted: 20\n", 1074 | "\n", 1075 | "\t other: precision: 38.81%; recall: 32.10%; F1: 35.14; predicted: 67\n", 1076 | "\n", 1077 | "\t person: precision: 50.00%; recall: 30.36%; F1: 37.78; predicted: 68\n", 1078 | "\n", 1079 | "\t product: precision: 11.76%; recall: 11.76%; F1: 11.76; predicted: 34\n", 1080 | "\n", 1081 | "\t sportsteam: precision: 25.93%; recall: 35.00%; F1: 29.79; predicted: 27\n", 1082 | "\n", 1083 | "\t tvshow: precision: 0.00%; recall: 0.00%; F1: 0.00; predicted: 0\n", 1084 | "\n", 1085 | "-------------------- Test set quality: --------------------\n", 1086 | "processed 13258 tokens with 604 phrases; found: 462 phrases; correct: 242.\n", 1087 | "\n", 1088 | "precision: 52.38%; recall: 40.07%; F1: 45.40\n", 1089 | "\n", 1090 | "\t company: precision: 64.91%; recall: 44.05%; F1: 52.48; predicted: 57\n", 1091 | "\n", 1092 | "\t facility: precision: 51.43%; recall: 38.30%; F1: 43.90; predicted: 35\n", 1093 | "\n", 1094 | "\t geo-loc: precision: 74.17%; recall: 53.94%; F1: 62.46; predicted: 120\n", 1095 | "\n", 1096 | "\t movie: precision: 0.00%; recall: 0.00%; F1: 0.00; predicted: 7\n", 1097 | "\n", 1098 | "\t musicartist: precision: 16.67%; recall: 7.41%; F1: 10.26; predicted: 12\n", 1099 | "\n", 1100 | "\t other: precision: 34.51%; recall: 37.86%; F1: 36.11; predicted: 113\n", 1101 | "\n", 1102 | "\t person: precision: 58.44%; recall: 43.27%; F1: 49.72; predicted: 77\n", 1103 | "\n", 1104 | "\t product: precision: 28.57%; recall: 14.29%; F1: 19.05; predicted: 14\n", 1105 | "\n", 1106 | "\t sportsteam: precision: 33.33%; recall: 25.81%; F1: 29.09; predicted: 24\n", 1107 | "\n", 1108 | "\t tvshow: precision: 0.00%; recall: 0.00%; F1: 0.00; predicted: 3\n", 1109 | "\n" 1110 | ] 1111 | } 1112 | ], 1113 | "source": [ 1114 | "print('-' * 20 + ' Train set quality: ' + '-' * 20)\n", 1115 | "train_results = eval_conll(model, sess, train_tokens, train_tags, short_report=False)\n", 1116 | "\n", 1117 | "print('-' * 20 + ' Validation set quality: ' + '-' * 20)\n", 1118 | "validation_results = eval_conll(model, sess, validation_tokens, validation_tags, short_report=False)\n", 1119 | "\n", 1120 | "print('-' * 20 + ' Test set quality: ' + '-' * 20)\n", 1121 | "test_results = eval_conll(model, sess, test_tokens, test_tags, short_report=False)" 1122 | ] 1123 | }, 1124 | { 1125 | "cell_type": "code", 1126 | "execution_count": null, 1127 | "metadata": {}, 1128 | "outputs": [], 1129 | "source": [] 1130 | }, 1131 | { 1132 | "cell_type": "markdown", 1133 | "metadata": {}, 1134 | "source": [ 1135 | "### Conclusions\n", 1136 | "\n", 1137 | "Could we say that our model is state of the art and the results are acceptable for the task? Definately, we can say so. Nowadays, Bi-LSTM is one of the state of the art approaches for solving NER problem and it outperforms other classical methods. Despite the fact that we used small training corpora (in comparison with usual sizes of corpora in Deep Learning), our results are quite good. In addition, in this task there are many possible named entities and for some of them we have only several dozens of trainig examples, which is definately small. However, the implemented model outperforms classical CRFs for this task. Even better results could be obtained by some combinations of several types of methods, e.g. see [this](https://arxiv.org/abs/1603.01354) paper if you are interested." 
1138 | ] 1139 | } 1140 | ], 1141 | "metadata": { 1142 | "kernelspec": { 1143 | "display_name": "nlplstm", 1144 | "language": "python", 1145 | "name": "nlplstm" 1146 | }, 1147 | "language_info": { 1148 | "codemirror_mode": { 1149 | "name": "ipython", 1150 | "version": 3 1151 | }, 1152 | "file_extension": ".py", 1153 | "mimetype": "text/x-python", 1154 | "name": "python", 1155 | "nbconvert_exporter": "python", 1156 | "pygments_lexer": "ipython3", 1157 | "version": "3.6.8" 1158 | } 1159 | }, 1160 | "nbformat": 4, 1161 | "nbformat_minor": 1 1162 | } 1163 | -------------------------------------------------------------------------------- /week4-seq2seq.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Learn to calculate with seq2seq model\n", 8 | "\n", 9 | "In this assignment, you will learn how to use neural networks to solve sequence-to-sequence prediction tasks. Seq2Seq models are very popular these days because they achieve great results in Machine Translation, Text Summarization, Conversational Modeling and more.\n", 10 | "\n", 11 | "Using sequence-to-sequence modeling, you are going to build a calculator for evaluating arithmetic expressions: the neural network takes an equation as its input and produces the answer as its output.\n", 12 | "\n", 13 | "The resulting solution for this problem will be based on state-of-the-art approaches for sequence-to-sequence learning, and you should be able to adapt it easily to other tasks. However, if you want to train your own machine translation system or intelligent chatbot, it would be useful to have access to compute resources such as a GPU, and to be patient, because training such systems is usually time-consuming. \n", 14 | "\n", 15 | "### Libraries\n", 16 | "\n", 17 | "For this task you will need the following libraries:\n", 18 | " - [TensorFlow](https://www.tensorflow.org) — an open-source software library for Machine Intelligence.\n", 19 | " - [scikit-learn](http://scikit-learn.org/stable/index.html) — a tool for data mining and data analysis.\n", 20 | " \n", 21 | "If you have never worked with TensorFlow, you will probably want to read some tutorials during your work on this assignment; e.g. the [Neural Machine Translation](https://www.tensorflow.org/tutorials/seq2seq) tutorial deals with a very similar task and can explain some concepts to you. " 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "### Data\n", 29 | "\n", 30 | "One benefit of this task is that you don't need to download any data — you will generate it on your own! We will use two operators (addition and subtraction) and work with non-negative integers in some range. Here are examples of correct inputs and outputs:\n", 31 | "\n", 32 | " Input: '1+2'\n", 33 | " Output: '3'\n", 34 | " \n", 35 | " Input: '0-99'\n", 36 | " Output: '-99'\n", 37 | "\n", 38 | "*Note that there are no spaces between operators and operands.*\n", 39 | "\n", 40 | "\n", 41 | "Now you need to implement the function *generate_equations*, which will be used to generate the data."
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 1, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import random" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "def generate_equations(allowed_operators, dataset_size, min_value, max_value):\n", 60 | " \"\"\"Generates pairs of equations and solutions to them.\n", 61 | " \n", 62 | " Each equation has a form of two integers with an operator in between.\n", 63 | " Each solution is an integer with the result of the operaion.\n", 64 | " \n", 65 | " allowed_operators: list of strings, allowed operators.\n", 66 | " dataset_size: an integer, number of equations to be generated.\n", 67 | " min_value: an integer, min value of each operand.\n", 68 | " max_value: an integer, max value of each operand.\n", 69 | "\n", 70 | " result: a list of tuples of strings (equation, solution).\n", 71 | " \"\"\"\n", 72 | " sample = []\n", 73 | " for _ in range(dataset_size):\n", 74 | " a1 = random.randint(min_value, max_value)\n", 75 | " a2 = random.randint(min_value, max_value) \n", 76 | " val = random.randint(0, 10)\n", 77 | " if(val%2==0):\n", 78 | " stro = str(a1)+\"+\"+str(a2)\n", 79 | " o = a1+a2\n", 80 | " else: \n", 81 | " stro = str(a1)+\"-\"+str(a2)\n", 82 | " o = a1-a2\n", 83 | " sample.append((stro, str(o))) \n", 84 | " return sample" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "To check the correctness of your implementation, use *test_generate_equations* function:" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 6, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "def test_generate_equations():\n", 101 | " allowed_operators = ['+', '-']\n", 102 | " dataset_size = 10\n", 103 | " for (input_, output_) in generate_equations(allowed_operators, dataset_size, 0, 100):\n", 104 | " if not (type(input_) is str and type(output_) is str):\n", 105 | " return \"Both parts should be strings.\"\n", 106 | " if eval(input_) != int(output_):\n", 107 | " return \"The (equation: {!r}, solution: {!r}) pair is incorrect.\".format(input_, output_)\n", 108 | " return \"Tests passed.\"" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 7, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "Tests passed.\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "print(test_generate_equations())" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "Finally, we are ready to generate the train and test data for the neural network:" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 8, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "from sklearn.model_selection import train_test_split" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 9, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "allowed_operators = ['+', '-']\n", 151 | "dataset_size = 100000\n", 152 | "data = generate_equations(allowed_operators, dataset_size, min_value=0, max_value=9999)\n", 153 | "\n", 154 | "train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "## Prepare data for the neural network\n", 162 | "\n", 163 | "The next stage of data 
preparation is creating mappings of the characters to their indices in some vocabulary. Since in our task we already know which symbols will appear in the inputs and outputs, generating the vocabulary is a simple step.\n", 164 | "\n", 165 | "#### How to create dictionaries for other tasks\n", 166 | "\n", 167 | "First of all, you need to understand what the basic unit of the sequence is in your task. In our case, we operate on symbols and the basic unit is a symbol. The number of symbols is small, so we don't need to think about filtering/normalization steps. However, in other tasks, the basic unit is often a word, and in this case the mapping would be *word $\to$ integer*. The number of words might be huge, so it would be reasonable to filter them, for example, by frequency and leave only the frequent ones. Other strategies that you should consider are: data normalization (lowercasing, tokenization, how to handle punctuation marks), separate vocabularies for input and for output (e.g. for machine translation), and other specifics of the task." 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 10, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "word2id = {symbol:i for i, symbol in enumerate('#^$+-1234567890')}\n", 177 | "id2word = {i:symbol for symbol, i in word2id.items()}" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "#### Special symbols" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 11, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "start_symbol = '^'\n", 194 | "end_symbol = '$'\n", 195 | "padding_symbol = '#'" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "You may have noticed that we have added 3 special symbols: '^', '\$' and '#':\n", 203 | "- The '^' symbol will be passed to the network to indicate the beginning of the decoding procedure. We will discuss this one in more detail later.\n", 204 | "- The '\$' symbol will be used to indicate the *end of a string*, both for input and output sequences. \n", 205 | "- The '#' symbol will be used as a *padding* character to make the lengths of all strings equal within one training batch.\n", 206 | "\n", 207 | "People have slightly different conventions when it comes to special symbols in encoder-decoder networks, so don't be confused if you come across other variants in the tutorials you read. " 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "#### Padding" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "When the vocabularies are ready, we need to be able to convert a sentence to a list of vocabulary word indices and back. At the same time, let's take care of padding. We are going to preprocess each sequence from the input (and the output ground truth) in such a way that:\n", 222 | "- it has a predefined length *padded_len*\n", 223 | "- it is cut off if it is too long, or padded with the *padding symbol* '#' if it is too short\n", 224 | "- it *always* ends with the *end symbol* '$'\n", 225 | "\n", 226 | "We will treat the original characters of the sequence **and the end symbol** as the valid part of the input. We will store *the actual length* of the sequence, which includes the end symbol, but does not include the padding symbols. 
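Before implementing it, here is a minimal sketch of the padding rules described above (an illustration only, not the graded solution; it reuses the `word2id` vocabulary defined earlier):

```python
word2id = {symbol: i for i, symbol in enumerate('#^$+-1234567890')}  # as defined above

def pad_example(sentence, word2id, padded_len):
    # Truncate to leave room for the end symbol, then always append '$'.
    ids = [word2id[c] for c in sentence[:padded_len - 1]]
    ids.append(word2id['$'])
    actual_len = len(ids)  # counts the end symbol, but not the padding
    ids += [word2id['#']] * (padded_len - actual_len)
    return ids, actual_len

print(pad_example('123+123', word2id, 7))   # truncated to '123+12' + '$', length 7
print(pad_example('123+123', word2id, 10))  # '123+123' + '$' + two '#', length 8
```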
" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | " Now you need to implement the function *sentence_to_ids* that does the described job. " 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 24, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "def sentence_to_ids(sentence, word2id, padded_len):\n", 243 | " \"\"\" Converts a sequence of symbols to a padded sequence of their ids.\n", 244 | " \n", 245 | " sentence: a string, input/output sequence of symbols.\n", 246 | " word2id: a dict, a mapping from original symbols to ids.\n", 247 | " padded_len: an integer, a desirable length of the sequence.\n", 248 | "\n", 249 | " result: a tuple of (a list of ids, an actual length of sentence).\n", 250 | " \"\"\"\n", 251 | " sent_ids = []\n", 252 | " for char in sentence:\n", 253 | " idv = word2id[char]\n", 254 | " sent_ids.append(idv)\n", 255 | " if len(sent_ids)==padded_len: \n", 256 | " sent_ids[-1] = word2id[\"$\"]\n", 257 | " else:\n", 258 | " sent_ids.append(word2id[\"$\"])\n", 259 | " sent_len = len(sent_ids) \n", 260 | " if len(sent_ids)" 605 | ] 606 | }, 607 | { 608 | "cell_type": "markdown", 609 | "metadata": {}, 610 | "source": [ 611 | "Now, it's time to implement the decoder:\n", 612 | " - First, we should create two [helpers](https://www.tensorflow.org/api_guides/python/contrib.seq2seq#Dynamic_Decoding). These classes help to determine the behaviour of the decoder. During the training time, we will use [TrainingHelper](https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/TrainingHelper). For the inference we recommend to use [GreedyEmbeddingHelper](https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/GreedyEmbeddingHelper).\n", 613 | " - To share all parameters during training and inference, we use one scope and set the flag 'reuse' to True at inference time. You might be interested to know more about how [variable scopes](https://www.tensorflow.org/programmers_guide/variables) work in TF. \n", 614 | " - To create the decoder itself, we will use [BasicDecoder](https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/BasicDecoder) class. As previously, you should choose some RNN cell, e.g. GRU cell. To turn hidden states into logits, we will need a projection layer. One of the simple solutions is using [OutputProjectionWrapper](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/OutputProjectionWrapper).\n", 615 | " - For getting the predictions, it will be convinient to use [dynamic_decode](https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/dynamic_decode). This function uses the provided decoder to perform decoding." 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 43, 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "def build_decoder(self, hidden_size, vocab_size, max_iter, start_symbol_id, end_symbol_id):\n", 625 | " \"\"\"Specifies decoder architecture and computes the output.\n", 626 | " \n", 627 | " Uses different helpers:\n", 628 | " - for train: feeding ground truth\n", 629 | " - for inference: feeding generated output\n", 630 | "\n", 631 | " As a result, self.train_outputs and self.infer_outputs are created. 
\n", 632 | " Each of them contains two fields:\n", 633 | " rnn_output (predicted logits)\n", 634 | " sample_id (predictions).\n", 635 | "\n", 636 | " \"\"\"\n", 637 | " \n", 638 | " # Use start symbols as the decoder inputs at the first time step.\n", 639 | " batch_size = tf.shape(self.input_batch)[0]\n", 640 | " start_tokens = tf.fill([batch_size], start_symbol_id)\n", 641 | " ground_truth_as_input = tf.concat([tf.expand_dims(start_tokens, 1), self.ground_truth], 1)\n", 642 | " \n", 643 | " # Use the embedding layer defined before to lookup embedings for ground_truth_as_input. \n", 644 | " self.ground_truth_embedded = tf.nn.embedding_lookup(params=self.embeddings,\n", 645 | " ids=ground_truth_as_input)\n", 646 | " \n", 647 | " # Create TrainingHelper for the train stage.\n", 648 | " train_helper = tf.contrib.seq2seq.TrainingHelper(self.ground_truth_embedded, \n", 649 | " self.ground_truth_lengths)\n", 650 | " \n", 651 | " # Create GreedyEmbeddingHelper for the inference stage.\n", 652 | " # You should provide the embedding layer, start_tokens and index of the end symbol.\n", 653 | " infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding=self.embeddings,\n", 654 | " start_tokens=start_tokens,\n", 655 | " end_token=end_symbol_id)\n", 656 | " \n", 657 | " \n", 658 | " def decode(helper, scope, reuse=None):\n", 659 | " \"\"\"Creates decoder and return the results of the decoding with a given helper.\"\"\"\n", 660 | " \n", 661 | " with tf.variable_scope(scope, reuse=reuse):\n", 662 | " # Create GRUCell with dropout. Do not forget to set the reuse flag properly.\n", 663 | " decoder_cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(hidden_size, reuse=reuse), self.dropout_ph)\n", 664 | " \n", 665 | " # Create a projection wrapper.\n", 666 | " decoder_cell = tf.contrib.rnn.OutputProjectionWrapper(decoder_cell, vocab_size, reuse=reuse)\n", 667 | " \n", 668 | " # Create BasicDecoder, pass the defined cell, a helper, and initial state.\n", 669 | " # The initial state should be equal to the final state of the encoder!\n", 670 | " decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper=helper,\n", 671 | " initial_state=self.final_encoder_state)\n", 672 | " \n", 673 | " # The first returning argument of dynamic_decode contains two fields:\n", 674 | " # rnn_output (predicted logits)\n", 675 | " # sample_id (predictions)\n", 676 | " outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=decoder, maximum_iterations=max_iter, \n", 677 | " output_time_major=False, impute_finished=True)\n", 678 | "\n", 679 | " return outputs\n", 680 | " \n", 681 | " self.train_outputs = decode(train_helper, 'decode')\n", 682 | " self.infer_outputs = decode(infer_helper, 'decode', reuse=True)" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": 44, 688 | "metadata": {}, 689 | "outputs": [], 690 | "source": [ 691 | "Seq2SeqModel.__build_decoder = classmethod(build_decoder)" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": {}, 697 | "source": [ 698 | "In this task we will use [sequence_loss](https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/sequence_loss), which is a weighted cross-entropy loss for a sequence of logits. Take a moment to understand, what is your train logits and targets. Also note, that we do not want to take into account loss terms coming from padding symbols, so we will mask them out using weights. 
" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 45, 704 | "metadata": {}, 705 | "outputs": [], 706 | "source": [ 707 | "def compute_loss(self):\n", 708 | " \"\"\"Computes sequence loss (masked cross-entopy loss with logits).\"\"\"\n", 709 | " \n", 710 | " weights = tf.cast(tf.sequence_mask(self.ground_truth_lengths), dtype=tf.float32)\n", 711 | " \n", 712 | " self.loss = tf.contrib.seq2seq.sequence_loss(logits=self.train_outputs.rnn_output,\n", 713 | " targets=self.ground_truth,\n", 714 | " weights=weights)" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": 46, 720 | "metadata": {}, 721 | "outputs": [], 722 | "source": [ 723 | "Seq2SeqModel.__compute_loss = classmethod(compute_loss)" 724 | ] 725 | }, 726 | { 727 | "cell_type": "markdown", 728 | "metadata": {}, 729 | "source": [ 730 | "The last thing to specify is the optimization of the defined loss. \n", 731 | "We suggest that you use [optimize_loss](https://www.tensorflow.org/api_docs/python/tf/contrib/layers/optimize_loss) with Adam optimizer and a learning rate from the corresponding placeholder. You might also need to pass global step (e.g. as tf.train.get_global_step()) and clip gradients by 1.0." 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": 47, 737 | "metadata": {}, 738 | "outputs": [], 739 | "source": [ 740 | "def perform_optimization(self):\n", 741 | " \"\"\"Specifies train_op that optimizes self.loss.\"\"\"\n", 742 | " \n", 743 | " self.train_op = tf.contrib.layers.optimize_loss(self.loss, optimizer='Adam',\n", 744 | " learning_rate=self.learning_rate_ph,\n", 745 | " clip_gradients=1.0,\n", 746 | " global_step=tf.train.get_global_step())" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": 48, 752 | "metadata": {}, 753 | "outputs": [], 754 | "source": [ 755 | "Seq2SeqModel.__perform_optimization = classmethod(perform_optimization)" 756 | ] 757 | }, 758 | { 759 | "cell_type": "markdown", 760 | "metadata": {}, 761 | "source": [ 762 | "Congratulations! You have specified all the parts of your network. You may have noticed, that we didn't deal with any real data yet, so what you have written is just recipies on how the network should function.\n", 763 | "Now we will put them to the constructor of our Seq2SeqModel class to use it in the next section. 
" 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": 49, 769 | "metadata": {}, 770 | "outputs": [], 771 | "source": [ 772 | "def init_model(self, vocab_size, embeddings_size, hidden_size, \n", 773 | " max_iter, start_symbol_id, end_symbol_id, padding_symbol_id):\n", 774 | " \n", 775 | " self.__declare_placeholders()\n", 776 | " self.__create_embeddings(vocab_size, embeddings_size)\n", 777 | " self.__build_encoder(hidden_size)\n", 778 | " self.__build_decoder(hidden_size, vocab_size, max_iter, start_symbol_id, end_symbol_id)\n", 779 | " \n", 780 | " # Compute loss and back-propagate.\n", 781 | " self.__compute_loss()\n", 782 | " self.__perform_optimization()\n", 783 | " \n", 784 | " # Get predictions for evaluation.\n", 785 | " self.train_predictions = self.train_outputs.sample_id\n", 786 | " self.infer_predictions = self.infer_outputs.sample_id" 787 | ] 788 | }, 789 | { 790 | "cell_type": "code", 791 | "execution_count": 50, 792 | "metadata": {}, 793 | "outputs": [], 794 | "source": [ 795 | "Seq2SeqModel.__init__ = classmethod(init_model)" 796 | ] 797 | }, 798 | { 799 | "cell_type": "markdown", 800 | "metadata": {}, 801 | "source": [ 802 | "## Train the network and predict output\n", 803 | "\n", 804 | "[Session.run](https://www.tensorflow.org/api_docs/python/tf/Session#run) is a point which initiates computations in the graph that we have defined. To train the network, we need to compute *self.train_op*. To predict output, we just need to compute *self.infer_predictions*. In any case, we need to feed actual data through the placeholders that we defined above. " 805 | ] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": 51, 810 | "metadata": {}, 811 | "outputs": [], 812 | "source": [ 813 | "def train_on_batch(self, session, X, X_seq_len, Y, Y_seq_len, learning_rate, dropout_keep_probability):\n", 814 | " feed_dict = {\n", 815 | " self.input_batch: X,\n", 816 | " self.input_batch_lengths: X_seq_len,\n", 817 | " self.ground_truth: Y,\n", 818 | " self.ground_truth_lengths: Y_seq_len,\n", 819 | " self.learning_rate_ph: learning_rate,\n", 820 | " self.dropout_ph: dropout_keep_probability\n", 821 | " }\n", 822 | " pred, loss, _ = session.run([\n", 823 | " self.train_predictions,\n", 824 | " self.loss,\n", 825 | " self.train_op], feed_dict=feed_dict)\n", 826 | " return pred, loss" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": 52, 832 | "metadata": {}, 833 | "outputs": [], 834 | "source": [ 835 | "Seq2SeqModel.train_on_batch = classmethod(train_on_batch)" 836 | ] 837 | }, 838 | { 839 | "cell_type": "markdown", 840 | "metadata": {}, 841 | "source": [ 842 | "We implemented two prediction functions: *predict_for_batch* and *predict_for_batch_with_loss*. The first one allows only to predict output for some input sequence, while the second one could compute loss because we provide also ground truth values. Both these functions might be useful since the first one could be used for predicting only, and the second one is helpful for validating results on not-training data during the training." 
843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": 53, 848 | "metadata": {}, 849 | "outputs": [], 850 | "source": [ 851 | "def predict_for_batch(self, session, X, X_seq_len):\n", 852 | "    feed_dict = {self.input_batch: X, self.input_batch_lengths: X_seq_len}\n", 853 | "    pred = session.run([\n", 854 | "        self.infer_predictions\n", 855 | "        ], feed_dict=feed_dict)[0]\n", 856 | "    return pred\n", 857 | "\n", 858 | "def predict_for_batch_with_loss(self, session, X, X_seq_len, Y, Y_seq_len):\n", 859 | "    feed_dict = {self.input_batch: X, self.input_batch_lengths: X_seq_len,\n", 860 | "                 self.ground_truth: Y, self.ground_truth_lengths: Y_seq_len}\n", 861 | "    pred, loss = session.run([\n", 862 | "        self.infer_predictions,\n", 863 | "        self.loss,\n", 864 | "        ], feed_dict=feed_dict)\n", 865 | "    return pred, loss" 866 | ] 867 | }, 868 | { 869 | "cell_type": "code", 870 | "execution_count": 54, 871 | "metadata": {}, 872 | "outputs": [], 873 | "source": [ 874 | "Seq2SeqModel.predict_for_batch = classmethod(predict_for_batch)\n", 875 | "Seq2SeqModel.predict_for_batch_with_loss = classmethod(predict_for_batch_with_loss)" 876 | ] 877 | }, 878 | { 879 | "cell_type": "markdown", 880 | "metadata": {}, 881 | "source": [ 882 | "## Run your experiment\n", 883 | "\n", 884 | "Create a *Seq2SeqModel* with the following parameters:\n", 885 | " - *vocab_size* — number of tokens;\n", 886 | " - *embeddings_size* — dimension of embeddings, recommended value: 20;\n", 887 | " - *max_iter* — maximum number of steps in the decoder, recommended value: 7;\n", 888 | " - *hidden_size* — size of hidden layers for the RNN, recommended value: 512;\n", 889 | " - *start_symbol_id* — the index of the start token (`^`);\n", 890 | " - *end_symbol_id* — the index of the end token (`$`);\n", 891 | " - *padding_symbol_id* — the index of the padding token (`#`).\n", 892 | "\n", 893 | "Set the hyperparameters. You might want to start with the following values and see how it works:\n", 894 | "- *batch_size*: 128;\n", 895 | "- at least 10 epochs;\n", 896 | "- *learning_rate*: 0.001;\n", 897 | "- *dropout_keep_probability* equal to 0.5 for training (typical values for the keep probability range from 0.1 to 1.0); larger values correspond to a smaller number of dropped-out units;\n", 898 | "- *max_len*: 20.
899 | ] 900 | }, 901 | { 902 | "cell_type": "code", 903 | "execution_count": 55, 904 | "metadata": {}, 905 | "outputs": [ 906 | { 907 | "name": "stdout", 908 | "output_type": "stream", 909 | "text": [ 910 | "WARNING:tensorflow:From c:\\users\\matcha.11\\documents\\cpuenv\\lib\\site-packages\\tensorflow\\python\\framework\\op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", 911 | "Instructions for updating:\n", 912 | "Colocations handled automatically by placer.\n", 913 | "\n", 914 | "WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.\n", 915 | "For more information, please see:\n", 916 | " * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n", 917 | " * https://github.com/tensorflow/addons\n", 918 | "If you depend on functionality not listed there, please file an issue.\n", 919 | "\n", 920 | "WARNING:tensorflow:From :5: GRUCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.\n", 921 | "Instructions for updating:\n", 922 | "This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.\n", 923 | "WARNING:tensorflow:From :11: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.\n", 924 | "Instructions for updating:\n", 925 | "Please use `keras.layers.RNN(cell)`, which is equivalent to this API\n", 926 | "WARNING:tensorflow:From c:\\users\\matcha.11\\documents\\cpuenv\\lib\\site-packages\\tensorflow\\python\\ops\\rnn.py:626: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", 927 | "Instructions for updating:\n", 928 | "Use tf.cast instead.\n", 929 | "WARNING:tensorflow:From c:\\users\\matcha.11\\documents\\cpuenv\\lib\\site-packages\\tensorflow\\python\\ops\\rnn_cell_impl.py:1259: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", 930 | "Instructions for updating:\n", 931 | "Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n" 932 | ] 933 | } 934 | ], 935 | "source": [ 936 | "tf.reset_default_graph()\n", 937 | "\n", 938 | "model = Seq2SeqModel(vocab_size=len(word2id),\n", 939 | " embeddings_size=20,\n", 940 | " max_iter=7,\n", 941 | " hidden_size=512,\n", 942 | " start_symbol_id=word2id['^'],\n", 943 | " end_symbol_id=word2id['$'],\n", 944 | " padding_symbol_id=word2id['#'])\n", 945 | "\n", 946 | "batch_size = 128\n", 947 | "n_epochs = 10\n", 948 | "learning_rate = 0.001\n", 949 | "dropout_keep_probability = 0.5\n", 950 | "max_len = 20\n", 951 | "\n", 952 | "n_step = int(len(train_set) / batch_size)" 953 | ] 954 | }, 955 | { 956 | "cell_type": "markdown", 957 | "metadata": {}, 958 | "source": [ 959 | "Finally, we are ready to run the training! A good indicator that everything works fine is decreasing loss during the training. You should account on the loss value equal to approximately 2.7 at the beginning of the training and near 1 after the 10th epoch." 960 | ] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": 57, 965 | "metadata": { 966 | "scrolled": true 967 | }, 968 | "outputs": [ 969 | { 970 | "name": "stdout", 971 | "output_type": "stream", 972 | "text": [ 973 | "Start training... 
\n", 974 | "\n", 975 | "Train: epoch 1\n", 976 | "Epoch: [1/10], step: [1/625], loss: 2.702646\n", 977 | "Epoch: [1/10], step: [201/625], loss: 1.786058\n", 978 | "Epoch: [1/10], step: [401/625], loss: 1.755579\n", 979 | "Epoch: [1/10], step: [601/625], loss: 1.634027\n", 980 | "Test: epoch 1 loss: 1.6066998\n", 981 | "X: 2519+1128$\n", 982 | "Y: 3647$#\n", 983 | "O: 3499$#\n", 984 | "\n", 985 | "X: 1082+1439$\n", 986 | "Y: 2521$#\n", 987 | "O: 3411$#\n", 988 | "\n", 989 | "X: 5606+3692$\n", 990 | "Y: 9298$#\n", 991 | "O: 9499$#\n", 992 | "\n", 993 | "Train: epoch 2\n", 994 | "Epoch: [2/10], step: [1/625], loss: 1.647852\n", 995 | "Epoch: [2/10], step: [201/625], loss: 1.560007\n", 996 | "Epoch: [2/10], step: [401/625], loss: 1.502402\n", 997 | "Epoch: [2/10], step: [601/625], loss: 1.470002\n", 998 | "Test: epoch 2 loss: 1.4363232\n", 999 | "X: 5697-5382$\n", 1000 | "Y: 315$##\n", 1001 | "O: -113$#\n", 1002 | "\n", 1003 | "X: 2829-4995$\n", 1004 | "Y: -2166$\n", 1005 | "O: -1723$\n", 1006 | "\n", 1007 | "X: 6080-9262$\n", 1008 | "Y: -3182$\n", 1009 | "O: -2777$\n", 1010 | "\n", 1011 | "Train: epoch 3\n", 1012 | "Epoch: [3/10], step: [1/625], loss: 1.449630\n", 1013 | "Epoch: [3/10], step: [201/625], loss: 1.412287\n", 1014 | "Epoch: [3/10], step: [401/625], loss: 1.432986\n", 1015 | "Epoch: [3/10], step: [601/625], loss: 1.395352\n", 1016 | "Test: epoch 3 loss: 1.3397688\n", 1017 | "X: 8206+4281$\n", 1018 | "Y: 12487$\n", 1019 | "O: 12188$\n", 1020 | "\n", 1021 | "X: 7389+7349$\n", 1022 | "Y: 14738$\n", 1023 | "O: 14958$\n", 1024 | "\n", 1025 | "X: 7033+8423$\n", 1026 | "Y: 15456$\n", 1027 | "O: 15588$\n", 1028 | "\n", 1029 | "Train: epoch 4\n", 1030 | "Epoch: [4/10], step: [1/625], loss: 1.370272\n", 1031 | "Epoch: [4/10], step: [201/625], loss: 1.367970\n", 1032 | "Epoch: [4/10], step: [401/625], loss: 1.334640\n", 1033 | "Epoch: [4/10], step: [601/625], loss: 1.308336\n", 1034 | "Test: epoch 4 loss: 1.2765975\n", 1035 | "X: 5392-4178$\n", 1036 | "Y: 1214$#\n", 1037 | "O: 1069$#\n", 1038 | "\n", 1039 | "X: 4755-3479$\n", 1040 | "Y: 1276$#\n", 1041 | "O: 1466$#\n", 1042 | "\n", 1043 | "X: 1057+6822$\n", 1044 | "Y: 7879$#\n", 1045 | "O: 8099$#\n", 1046 | "\n", 1047 | "Train: epoch 5\n", 1048 | "Epoch: [5/10], step: [1/625], loss: 1.333535\n", 1049 | "Epoch: [5/10], step: [201/625], loss: 1.263995\n", 1050 | "Epoch: [5/10], step: [401/625], loss: 1.261926\n", 1051 | "Epoch: [5/10], step: [601/625], loss: 1.195226\n", 1052 | "Test: epoch 5 loss: 1.12511\n", 1053 | "X: 4756-1640$\n", 1054 | "Y: 3116$#\n", 1055 | "O: 2931$#\n", 1056 | "\n", 1057 | "X: 5969+1768$\n", 1058 | "Y: 7737$#\n", 1059 | "O: 7691$#\n", 1060 | "\n", 1061 | "X: 5275-8804$\n", 1062 | "Y: -3529$\n", 1063 | "O: -3610$\n", 1064 | "\n", 1065 | "Train: epoch 6\n", 1066 | "Epoch: [6/10], step: [1/625], loss: 1.166049\n", 1067 | "Epoch: [6/10], step: [201/625], loss: 1.131641\n", 1068 | "Epoch: [6/10], step: [401/625], loss: 1.115376\n", 1069 | "Epoch: [6/10], step: [601/625], loss: 1.064710\n", 1070 | "Test: epoch 6 loss: 0.99838525\n", 1071 | "X: 5271-5790$\n", 1072 | "Y: -519$#\n", 1073 | "O: -510$#\n", 1074 | "\n", 1075 | "X: 600-1947$#\n", 1076 | "Y: -1347$\n", 1077 | "O: -1211$\n", 1078 | "\n", 1079 | "X: 1439+5299$\n", 1080 | "Y: 6738$#\n", 1081 | "O: 6769$#\n", 1082 | "\n", 1083 | "Train: epoch 7\n", 1084 | "Epoch: [7/10], step: [1/625], loss: 1.054645\n", 1085 | "Epoch: [7/10], step: [201/625], loss: 1.039298\n", 1086 | "Epoch: [7/10], step: [401/625], loss: 1.014447\n", 1087 | "Epoch: [7/10], step: [601/625], loss: 
0.981347\n", 1088 | "Test: epoch 7 loss: 0.97818017\n", 1089 | "X: 2877-7070$\n", 1090 | "Y: -4193$\n", 1091 | "O: -4209$\n", 1092 | "\n", 1093 | "X: 9736-7442$\n", 1094 | "Y: 2294$#\n", 1095 | "O: 2298$#\n", 1096 | "\n", 1097 | "X: 680+3719$#\n", 1098 | "Y: 4399$#\n", 1099 | "O: 4328$#\n", 1100 | "\n", 1101 | "Train: epoch 8\n", 1102 | "Epoch: [8/10], step: [1/625], loss: 1.018787\n", 1103 | "Epoch: [8/10], step: [201/625], loss: 0.989767\n", 1104 | "Epoch: [8/10], step: [401/625], loss: 0.982673\n", 1105 | "Epoch: [8/10], step: [601/625], loss: 0.932108\n", 1106 | "Test: epoch 8 loss: 0.9346564\n", 1107 | "X: 6383+9676$\n", 1108 | "Y: 16059$\n", 1109 | "O: 16069$\n", 1110 | "\n", 1111 | "X: 5640-8704$\n", 1112 | "Y: -3064$\n", 1113 | "O: -3009$\n", 1114 | "\n", 1115 | "X: 8108-4381$\n", 1116 | "Y: 3727$#\n", 1117 | "O: 3785$#\n", 1118 | "\n", 1119 | "Train: epoch 9\n", 1120 | "Epoch: [9/10], step: [1/625], loss: 0.968657\n", 1121 | "Epoch: [9/10], step: [201/625], loss: 0.947608\n", 1122 | "Epoch: [9/10], step: [401/625], loss: 0.963065\n", 1123 | "Epoch: [9/10], step: [601/625], loss: 0.940374\n", 1124 | "Test: epoch 9 loss: 0.92420053\n", 1125 | "X: 6955+8754$\n", 1126 | "Y: 15709$\n", 1127 | "O: 15700$\n", 1128 | "\n", 1129 | "X: 1129+3809$\n", 1130 | "Y: 4938$#\n", 1131 | "O: 4944$#\n", 1132 | "\n", 1133 | "X: 5108+6665$\n", 1134 | "Y: 11773$\n", 1135 | "O: 11794$\n", 1136 | "\n", 1137 | "Train: epoch 10\n", 1138 | "Epoch: [10/10], step: [1/625], loss: 0.918517\n", 1139 | "Epoch: [10/10], step: [201/625], loss: 0.884715\n", 1140 | "Epoch: [10/10], step: [401/625], loss: 0.923777\n", 1141 | "Epoch: [10/10], step: [601/625], loss: 0.882094\n", 1142 | "Test: epoch 10 loss: 0.9394735\n", 1143 | "X: 9604+2097$\n", 1144 | "Y: 11701$\n", 1145 | "O: 11716$\n", 1146 | "\n", 1147 | "X: 3888+2271$\n", 1148 | "Y: 6159$#\n", 1149 | "O: 6131$#\n", 1150 | "\n", 1151 | "X: 5608+8922$\n", 1152 | "Y: 14530$\n", 1153 | "O: 14526$\n", 1154 | "\n", 1155 | "\n", 1156 | "...training finished.\n" 1157 | ] 1158 | } 1159 | ], 1160 | "source": [ 1161 | "session = tf.Session()\n", 1162 | "session.run(tf.global_variables_initializer())\n", 1163 | " \n", 1164 | "invalid_number_prediction_counts = []\n", 1165 | "all_model_predictions = []\n", 1166 | "all_ground_truth = []\n", 1167 | "\n", 1168 | "print('Start training... 
\\n')\n", 1169 | "for epoch in range(n_epochs): \n", 1170 | " random.shuffle(train_set)\n", 1171 | " random.shuffle(test_set)\n", 1172 | " \n", 1173 | " print('Train: epoch', epoch + 1)\n", 1174 | " for n_iter, (X_batch, Y_batch) in enumerate(generate_batches(train_set, batch_size=batch_size)):\n", 1175 | " ######################################\n", 1176 | " ######### YOUR CODE HERE #############\n", 1177 | " ######################################\n", 1178 | " # prepare the data (X_batch and Y_batch) for training\n", 1179 | " # using function batch_to_ids\n", 1180 | " x, x_len = batch_to_ids(X_batch, word2id, max_len=max_len)\n", 1181 | " y, y_len = batch_to_ids(Y_batch, word2id, max_len=max_len)\n", 1182 | " predictions, loss = model.train_on_batch(session=session,\n", 1183 | " X=x, Y=y,\n", 1184 | " X_seq_len=x_len,\n", 1185 | " Y_seq_len=y_len,\n", 1186 | " learning_rate=learning_rate,\n", 1187 | " dropout_keep_probability=dropout_keep_probability)\n", 1188 | " \n", 1189 | " if n_iter % 200 == 0:\n", 1190 | " print(\"Epoch: [%d/%d], step: [%d/%d], loss: %f\" % (epoch + 1, n_epochs, n_iter + 1, n_step, loss))\n", 1191 | " \n", 1192 | " X_sent, Y_sent = next(generate_batches(test_set, batch_size=batch_size))\n", 1193 | " X, X_sent_lens = batch_to_ids(X_sent, word2id, max_len=max_len)\n", 1194 | " Y, Y_sent_lens = batch_to_ids(Y_sent, word2id, max_len=max_len)\n", 1195 | " ######################################\n", 1196 | " ######### YOUR CODE HERE #############\n", 1197 | " ######################################\n", 1198 | " # prepare test data (X_sent and Y_sent) for predicting \n", 1199 | " # quality and computing value of the loss function\n", 1200 | " # using function batch_to_ids\n", 1201 | " \n", 1202 | " predictions, loss = model.predict_for_batch_with_loss(session, X, X_sent_lens, Y, Y_sent_lens)\n", 1203 | " print('Test: epoch', epoch + 1, 'loss:', loss,)\n", 1204 | " for x, y, p in list(zip(X, Y, predictions))[:3]:\n", 1205 | " print('X:',''.join(ids_to_sentence(x, id2word)))\n", 1206 | " print('Y:',''.join(ids_to_sentence(y, id2word)))\n", 1207 | " print('O:',''.join(ids_to_sentence(p, id2word)))\n", 1208 | " print('')\n", 1209 | "\n", 1210 | " model_predictions = []\n", 1211 | " ground_truth = []\n", 1212 | " invalid_number_prediction_count = 0\n", 1213 | " # For the whole test set calculate ground-truth values (as integer numbers)\n", 1214 | " # and prediction values (also as integers) to calculate metrics.\n", 1215 | " # If generated by model number is not correct (e.g. 
'1-1'), \n", 1216 | " # increase invalid_number_prediction_count and don't append this and corresponding\n", 1217 | " # ground-truth value to the arrays.\n", 1218 | " for X_batch, Y_batch in generate_batches(test_set, batch_size=batch_size):\n", 1219 | " X, X_len = batch_to_ids(X_batch, word2id, max_len=max_len)\n", 1220 | " Y, Y_len = batch_to_ids(Y_batch, word2id, max_len=max_len)\n", 1221 | " predictions = model.predict_for_batch(session, X, X_len)\n", 1222 | " \n", 1223 | " for Y_true, Y_pred in zip(Y, predictions):\n", 1224 | " \n", 1225 | " try:\n", 1226 | " end_token = '$'\n", 1227 | " Y_true, Y_pred = ''.join(ids_to_sentence(Y_true, id2word)), ''.join(ids_to_sentence(Y_pred, id2word))\n", 1228 | " Y_true, Y_pred = Y_true[:Y_true.find(end_token)], Y_pred[:Y_pred.find(end_token)]\n", 1229 | " model_predictions.append(int(Y_pred))\n", 1230 | " ground_truth.append(int(Y_true))\n", 1231 | " \n", 1232 | " except:\n", 1233 | " invalid_number_prediction_count = invalid_number_prediction_count + 1\n", 1234 | " \n", 1235 | " all_model_predictions.append(model_predictions)\n", 1236 | " all_ground_truth.append(ground_truth)\n", 1237 | " invalid_number_prediction_counts.append(invalid_number_prediction_count)\n", 1238 | " \n", 1239 | "print('\\n...training finished.')" 1240 | ] 1241 | }, 1242 | { 1243 | "cell_type": "markdown", 1244 | "metadata": {}, 1245 | "source": [ 1246 | "## Evaluate results\n", 1247 | "\n", 1248 | "Because our task is simple and the output is straight-forward, we will use [MAE](https://en.wikipedia.org/wiki/Mean_absolute_error) metric to evaluate the trained model during the epochs. Compute the value of the metric for the output from each epoch." 1249 | ] 1250 | }, 1251 | { 1252 | "cell_type": "code", 1253 | "execution_count": 58, 1254 | "metadata": {}, 1255 | "outputs": [], 1256 | "source": [ 1257 | "from sklearn.metrics import mean_absolute_error" 1258 | ] 1259 | }, 1260 | { 1261 | "cell_type": "code", 1262 | "execution_count": 59, 1263 | "metadata": {}, 1264 | "outputs": [ 1265 | { 1266 | "name": "stdout", 1267 | "output_type": "stream", 1268 | "text": [ 1269 | "Epoch: 1, MAE: 891.170600, Invalid numbers: 0\n", 1270 | "Epoch: 2, MAE: 390.128050, Invalid numbers: 0\n", 1271 | "Epoch: 3, MAE: 239.674250, Invalid numbers: 0\n", 1272 | "Epoch: 4, MAE: 181.889700, Invalid numbers: 0\n", 1273 | "Epoch: 5, MAE: 119.460900, Invalid numbers: 0\n", 1274 | "Epoch: 6, MAE: 53.872400, Invalid numbers: 0\n", 1275 | "Epoch: 7, MAE: 60.479250, Invalid numbers: 0\n", 1276 | "Epoch: 8, MAE: 50.086000, Invalid numbers: 0\n", 1277 | "Epoch: 9, MAE: 43.830000, Invalid numbers: 0\n", 1278 | "Epoch: 10, MAE: 35.136950, Invalid numbers: 0\n" 1279 | ] 1280 | } 1281 | ], 1282 | "source": [ 1283 | "for i, (gts, predictions, invalid_number_prediction_count) in enumerate(zip(all_ground_truth,\n", 1284 | " all_model_predictions,\n", 1285 | " invalid_number_prediction_counts), 1):\n", 1286 | " mae = mean_absolute_error(gts, predictions)\n", 1287 | " print(\"Epoch: %i, MAE: %f, Invalid numbers: %i\" % (i, mae, invalid_number_prediction_count))" 1288 | ] 1289 | }, 1290 | { 1291 | "cell_type": "code", 1292 | "execution_count": null, 1293 | "metadata": {}, 1294 | "outputs": [], 1295 | "source": [] 1296 | } 1297 | ], 1298 | "metadata": { 1299 | "kernelspec": { 1300 | "display_name": "cpuenv", 1301 | "language": "python", 1302 | "name": "cpuenv" 1303 | }, 1304 | "language_info": { 1305 | "codemirror_mode": { 1306 | "name": "ipython", 1307 | "version": 3 1308 | }, 1309 | "file_extension": ".py", 1310 | 
"mimetype": "text/x-python", 1311 | "name": "python", 1312 | "nbconvert_exporter": "python", 1313 | "pygments_lexer": "ipython3", 1314 | "version": "3.7.1" 1315 | } 1316 | }, 1317 | "nbformat": 4, 1318 | "nbformat_minor": 2 1319 | } 1320 | --------------------------------------------------------------------------------