├── README.md ├── Text_Summarization.ipynb ├── dataset_handler.py ├── eval_classification.py ├── eval_msrp.py ├── eval_rank.py ├── eval_sick.py ├── eval_trec.py ├── nbsvm.py ├── skipthoughts.py └── skipthoughts.pyc /README.md: -------------------------------------------------------------------------------- 1 | # text_summarization 2 | Notebook which provides an overview to several text summarization techniques 3 | 4 | Before running the skipthoughts module, download the dependencies from the source repository: 5 | https://github.com/ryankiros/skip-thoughts/ 6 | -------------------------------------------------------------------------------- /Text_Summarization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import nltk\n", 14 | "from nltk.corpus import stopwords\n", 15 | "from nltk.tokenize import word_tokenize, sent_tokenize\n", 16 | "from nltk.stem.porter import *\n", 17 | "import re" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "passage = \"\"\"\n", 29 | "If Cristiano Ronaldo didn't exist, would Lionel Messi have to invent him?\n", 30 | "\n", 31 | "The question of how much these two other-worldly players inspire each other is an interesting one,\n", 32 | "and it's tempting to imagine Messi sitting at home on Tuesday night, watching Ronaldo destroying Atletico, \n", 33 | "angrily glaring at the TV screen and growling: \"Right, I'll show him!\"\n", 34 | "\n", 35 | "As appealing as that picture might be, however, it is probably a false one - from Messi's perspective, at least.\n", 36 | "\n", 37 | "He might show it in a different way, but Messi is just as competitive as Ronaldo. Rather than goals and \n", 38 | "personal glory, however, the Argentine's personal drug is trophies.\n", 39 | "\n", 40 | "Ronaldo, it can be said, never looks happy on the field of play unless he's just scored a goal - and even \n", 41 | "then he's not happy for long, because he just wants to score another one. And that relentless obsession with \n", 42 | "finding the back of the net has undoubtedly played a major role in his stunning career achievements.\n", 43 | "\n", 44 | "Messi, though, is a different animal, shown by the generosity with which he sets up team-mates even if he has \n", 45 | "a chance to shoot, regularly hands over penalty-taking duties to others and invariably celebrates a goal by turning \n", 46 | "straight to the player who passed him the ball with an appreciative smile.\n", 47 | "\n", 48 | "Rather than being a better player than Ronaldo, Messi's main motivations - according to the people who are close to\n", 49 | "him - are being the best possible version of Lionel Messi, and winning as many trophies as possible.\n", 50 | "\n", 51 | "That theory was supported by Leicester boss Brendan Rodgers when I interviewed him for a book I recently wrote about Messi.\n", 52 | "\n", 53 | "Do Messi and Ronaldo inspire each other? \"Maybe subconsciously in some way they've driven each other on,\" said Rodgers.\n", 54 | "\"But I think both those players inherently have that hunger to be the best players they can be. 
With the very elite \n", 55 | "performers, that drive comes from within.\"\n", 56 | "\n", 57 | "Messi and Ronaldo ferociously competing with each other for everyone else's acclaim is a nice story for fans to debate \n", 58 | "and the media to spread, but it's probably not particularly true.\n", 59 | "\"\"\"" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "### Text Standardization" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "contractions = { \n", 78 | "\"ain't\": \"am not / are not / is not / has not / have not\",\n", 79 | "\"aren't\": \"are not / am not\",\n", 80 | "\"can't\": \"cannot\",\n", 81 | "\"can't've\": \"cannot have\",\n", 82 | "\"'cause\": \"because\",\n", 83 | "\"could've\": \"could have\",\n", 84 | "\"couldn't\": \"could not\",\n", 85 | "\"couldn't've\": \"could not have\",\n", 86 | "\"didn't\": \"did not\",\n", 87 | "\"doesn't\": \"does not\",\n", 88 | "\"don't\": \"do not\",\n", 89 | "\"hadn't\": \"had not\",\n", 90 | "\"hadn't've\": \"had not have\",\n", 91 | "\"hasn't\": \"has not\",\n", 92 | "\"haven't\": \"have not\",\n", 93 | "\"he'd\": \"he had / he would\",\n", 94 | "\"he'd've\": \"he would have\",\n", 95 | "\"he'll\": \"he shall / he will\",\n", 96 | "\"he'll've\": \"he shall have / he will have\",\n", 97 | "\"he's\": \"he has / he is\",\n", 98 | "\"how'd\": \"how did\",\n", 99 | "\"how'd'y\": \"how do you\",\n", 100 | "\"how'll\": \"how will\",\n", 101 | "\"how's\": \"how has / how is / how does\",\n", 102 | "\"I'd\": \"I had / I would\",\n", 103 | "\"I'd've\": \"I would have\",\n", 104 | "\"I'll\": \"I shall / I will\",\n", 105 | "\"I'll've\": \"I shall have / I will have\",\n", 106 | "\"I'm\": \"I am\",\n", 107 | "\"I've\": \"I have\",\n", 108 | "\"isn't\": \"is not\",\n", 109 | "\"it'd\": \"it had / it would\",\n", 110 | "\"it'd've\": \"it would have\",\n", 111 | "\"it'll\": \"it shall / it will\",\n", 112 | "\"it'll've\": \"it shall have / it will have\",\n", 113 | "\"it's\": \"it has / it is\",\n", 114 | "\"let's\": \"let us\",\n", 115 | "\"ma'am\": \"madam\",\n", 116 | "\"mayn't\": \"may not\",\n", 117 | "\"might've\": \"might have\",\n", 118 | "\"mightn't\": \"might not\",\n", 119 | "\"mightn't've\": \"might not have\",\n", 120 | "\"must've\": \"must have\",\n", 121 | "\"mustn't\": \"must not\",\n", 122 | "\"mustn't've\": \"must not have\",\n", 123 | "\"needn't\": \"need not\",\n", 124 | "\"needn't've\": \"need not have\",\n", 125 | "\"o'clock\": \"of the clock\",\n", 126 | "\"oughtn't\": \"ought not\",\n", 127 | "\"oughtn't've\": \"ought not have\",\n", 128 | "\"shan't\": \"shall not\",\n", 129 | "\"sha'n't\": \"shall not\",\n", 130 | "\"shan't've\": \"shall not have\",\n", 131 | "\"she'd\": \"she had / she would\",\n", 132 | "\"she'd've\": \"she would have\",\n", 133 | "\"she'll\": \"she shall / she will\",\n", 134 | "\"she'll've\": \"she shall have / she will have\",\n", 135 | "\"she's\": \"she has / she is\",\n", 136 | "\"should've\": \"should have\",\n", 137 | "\"shouldn't\": \"should not\",\n", 138 | "\"shouldn't've\": \"should not have\",\n", 139 | "\"so've\": \"so have\",\n", 140 | "\"so's\": \"so as / so is\",\n", 141 | "\"that'd\": \"that would / that had\",\n", 142 | "\"that'd've\": \"that would have\",\n", 143 | "\"that's\": \"that has / that is\",\n", 144 | "\"there'd\": \"there had / there would\",\n", 145 | "\"there'd've\": \"there would have\",\n", 146 | "\"there's\": 
\"there has / there is\",\n", 147 | "\"they'd\": \"they had / they would\",\n", 148 | "\"they'd've\": \"they would have\",\n", 149 | "\"they'll\": \"they shall / they will\",\n", 150 | "\"they'll've\": \"they shall have / they will have\",\n", 151 | "\"they're\": \"they are\",\n", 152 | "\"they've\": \"they have\",\n", 153 | "\"to've\": \"to have\",\n", 154 | "\"wasn't\": \"was not\",\n", 155 | "\"we'd\": \"we had / we would\",\n", 156 | "\"we'd've\": \"we would have\",\n", 157 | "\"we'll\": \"we will\",\n", 158 | "\"we'll've\": \"we will have\",\n", 159 | "\"we're\": \"we are\",\n", 160 | "\"we've\": \"we have\",\n", 161 | "\"weren't\": \"were not\",\n", 162 | "\"what'll\": \"what shall / what will\",\n", 163 | "\"what'll've\": \"what shall have / what will have\",\n", 164 | "\"what're\": \"what are\",\n", 165 | "\"what's\": \"what has / what is\",\n", 166 | "\"what've\": \"what have\",\n", 167 | "\"when's\": \"when has / when is\",\n", 168 | "\"when've\": \"when have\",\n", 169 | "\"where'd\": \"where did\",\n", 170 | "\"where's\": \"where has / where is\",\n", 171 | "\"where've\": \"where have\",\n", 172 | "\"who'll\": \"who shall / who will\",\n", 173 | "\"who'll've\": \"who shall have / who will have\",\n", 174 | "\"who's\": \"who has / who is\",\n", 175 | "\"who've\": \"who have\",\n", 176 | "\"why's\": \"why has / why is\",\n", 177 | "\"why've\": \"why have\",\n", 178 | "\"will've\": \"will have\",\n", 179 | "\"won't\": \"will not\",\n", 180 | "\"won't've\": \"will not have\",\n", 181 | "\"would've\": \"would have\",\n", 182 | "\"wouldn't\": \"would not\",\n", 183 | "\"wouldn't've\": \"would not have\",\n", 184 | "\"y'all\": \"you all\",\n", 185 | "\"y'all'd\": \"you all would\",\n", 186 | "\"y'all'd've\": \"you all would have\",\n", 187 | "\"y'all're\": \"you all are\",\n", 188 | "\"y'all've\": \"you all have\",\n", 189 | "\"you'd\": \"you had / you would\",\n", 190 | "\"you'd've\": \"you would have\",\n", 191 | "\"you'll\": \"you shall / you will\",\n", 192 | "\"you'll've\": \"you shall have / you will have\",\n", 193 | "\"you're\": \"you are\",\n", 194 | "\"you've\": \"you have\"\n", 195 | "}" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 4, 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "contractions_re = re.compile('(%s)' % '|'.join(contractions.keys()))\n", 207 | "def expand_contractions(s, contractions_dict=contractions):\n", 208 | " def replace(match):\n", 209 | " return contractions_dict[match.group(0)]\n", 210 | " return contractions_re.sub(replace, s)\n", 211 | " \n", 212 | "sentences = sent_tokenize(passage) \n", 213 | "sentences = [expand_contractions(i) for i in sentences]\n", 214 | "sentences = [re.sub('\\n', '', i) for i in sentences]" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "Text Summarization is an increasingly popular area within NLP and with the advancements in moderns deep learning, we are consistently seeing newer, more novel approaches. The goal of this article is to compare the results of a few approaches that I found interesting:\n", 222 | "1. Sentence Scoring based on Word Frequency\n", 223 | "2. TextRank using Universal Sentence Encoder\n", 224 | "3. 
Unsupervised Learning using Skip-Thought Vectors\n", 225 | "\n", 226 | "Before moving forward, I wanted to give credit to the outstanding Medium authors/articles who are the foundation for this post and helped me learn/implement the Text Summarization techniques below:\n", 227 | "1. https://becominghuman.ai/text-summarization-in-5-steps-using-nltk-65b21e352b65\n", 228 | "2. https://medium.com/jatana/unsupervised-text-summarization-using-sentence-embeddings-adb15ce83db1\n", 229 | "3. https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/\n", 230 | "\n", 231 | "Some of the code snippets they've provided will be shown here as well, but I encourage you to read through their posts too!" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "### Sentence Scoring based on Word Frequency (Python 2.7/3.5)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "The first approach we will explore is the simplest of the three. Here we assign a weight to each word based on its frequency in the passage. For example, if \"Soccer\" occurs 4 times within the passage, it will have a weight of 4. " 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 39, 251 | "metadata": { 252 | "collapsed": true 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "def create_freq_table(text_string):\n", 257 | "    stopwords_list = set(stopwords.words('english'))\n", 258 | "    \n", 259 | "    words = word_tokenize(text_string)\n", 260 | "    \n", 261 | "    ps = PorterStemmer()\n", 262 | "    \n", 263 | "    freq_table = {}\n", 264 | "    \n", 265 | "    for word in words:\n", 266 | "        #stem word\n", 267 | "        word = ps.stem(word)\n", 268 | "        \n", 269 | "        #remove stopwords\n", 270 | "        if word in stopwords_list:\n", 271 | "            continue\n", 272 | "        elif word in freq_table:\n", 273 | "            freq_table[word] += 1\n", 274 | "        else:\n", 275 | "            freq_table[word] = 1\n", 276 | "    \n", 277 | "    return freq_table\n", 278 | "\n", 279 | "freq_table = create_freq_table(\" \".join(sentences))" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "Using the weights assigned to each word above, we will create a score for each sentence and then take the top `N` sentences by score for the summary. As you'd imagine, if we rely on the raw score alone, longer sentences will skew the results simply because they contain more words. This is why we will normalize the scores by dividing each one by the length of its sentence. 
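To make the scoring and normalization concrete, here is a minimal, self-contained sketch on a toy passage. The sentences and numbers below are made up purely for illustration, and stemming/stopword removal are skipped so the arithmetic is easy to follow; the real implementation is the `create_freq_table`/`score_sentences` pair in the surrounding cells.

```python
from collections import Counter

# Toy passage: three made-up sentences (not from the article above)
toy_sentences = [
    "Soccer fans love a great soccer match",
    "The match ended late",
    "Fans celebrated the win all night long after the match",
]

# Word-frequency table over the whole toy passage
freq = Counter(w.lower() for s in toy_sentences for w in s.split())

# Score each sentence as the sum of its word frequencies,
# then normalize by the number of words in the sentence
for s in toy_sentences:
    words = s.lower().split()
    raw = sum(freq[w] for w in words)
    print("{} -> raw {}, normalized {:.2f}".format(s, raw, raw / float(len(words))))
```

Without the division, the longest sentence wins simply because it contains more words; with it, the short sentence packed with frequent terms comes out on top.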
" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 5, 292 | "metadata": { 293 | "collapsed": true 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "def score_sentences(sentences, freq_table):\n", 298 | " \n", 299 | " sentence_value = {}\n", 300 | " \n", 301 | " for sentence in sentences:\n", 302 | " word_count_in_sentence = len(word_tokenize(sentence))\n", 303 | " \n", 304 | " for wordValue in freq_table:\n", 305 | " \n", 306 | " if wordValue.lower() in sentence.lower(): \n", 307 | " if sentence in sentence_value:\n", 308 | " sentence_value[sentence] += freq_table[wordValue]\n", 309 | " else:\n", 310 | " sentence_value[sentence] = freq_table[wordValue]\n", 311 | "\n", 312 | " sentence_value[sentence] = sentence_value[sentence] // word_count_in_sentence\n", 313 | " return sentence_value\n", 314 | "\n", 315 | "def find_average_score(sentence_value):\n", 316 | " sum_values = 0\n", 317 | " \n", 318 | " for entry in sentence_value:\n", 319 | " sum_values += sentence_value[entry]\n", 320 | " \n", 321 | " average = int(sum_values/len(sentence_value))\n", 322 | " \n", 323 | " return average" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "Now, to create the summary, we will take any sentence that has a score that exceeds a threshold. In this case, the threshold will be the average score for for all of the sentences. " 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 7, 336 | "metadata": { 337 | "collapsed": true 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "def generate_summary(sentences, sentence_value, threshold):\n", 342 | " sentence_count = 0\n", 343 | " \n", 344 | " summary = ''\n", 345 | " \n", 346 | " for sentence in sentences:\n", 347 | " if sentence in sentence_value and sentence_value[sentence] > threshold:\n", 348 | " summary += \" \" + sentence\n", 349 | " sentence_count += 1\n", 350 | " \n", 351 | " return summary \n", 352 | " \n", 353 | " " 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 41, 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | " If Cristiano Ronaldo didn't exist, would Lionel Messi have to invent him? As appealing as that picture might be, however, it is probably a false one - from Messi's perspective, at least. He might show it in a different way, but Messi is just as competitive as Ronaldo. Rather than goals and personal glory, however, the Argentine's personal drug is trophies. Do Messi and Ronaldo inspire each other? \"Maybe subconsciously in some way they've driven each other on,\" said Rodgers. 
With the very elite performers, that drive comes from within.\"\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "#End to End Run\n", 371 | "freq_table = create_freq_table(\" \".join(sentences))\n", 372 | "\n", 373 | "sentence_scores = score_sentences(sentences, freq_table)\n", 374 | "\n", 375 | "threshold = find_average_score(sentence_scores)\n", 376 | "\n", 377 | "summary = generate_summary(sentences, sentence_scores, 1.0 * threshold)\n", 378 | "\n", 379 | "print(re.sub('\\n','',summary))" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": {}, 385 | "source": [ 386 | "### TextRank using Universal Sentence Embeddings (Python 3.7)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "Next we look at the summaries generated using universal sentence embeddings and TextRank. Before we jump into the code, let's discuss a few concepts that are critical.\n", 394 | "\n", 395 | "**TextRank**\n", 396 | "This may sound familiar: it is essentially a derivative of the famous PageRank algorithm created by the Google cofounders. In PageRank, a matrix is built that captures the probability of a user moving from one page to another. In the case of TextRank, we instead build a cosine similarity matrix that holds the similarity between every pair of sentences.\n", 397 | "\n", 398 | "A graph is then built from this cosine similarity matrix, the PageRank algorithm is applied to it, and a score is calculated for each sentence (a toy sketch of this step follows below). For more information on the PageRank algorithm, please use the following resource [pagerank link]\n", 399 | "\n", 400 | "**Universal Sentence Embeddings**\n", 401 | "Without going into too much detail, universal sentence embeddings encode words, sentences and paragraphs into semantic vectors. The encoder comes in two variants, one based on a Transformer and one based on a Deep Averaging Network (DAN). 
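As promised above, here is a small illustrative sketch of the graph-and-rank step. The 3x3 similarity matrix is entirely made up; in the notebook the real matrix is computed from the Universal Sentence Encoder embeddings a couple of cells below.

```python
import numpy as np
import networkx as nx

# Hypothetical cosine-similarity matrix for three sentences:
# sentences 0 and 1 are very similar, sentence 2 is an outlier
sim = np.array([
    [1.00, 0.80, 0.10],
    [0.80, 1.00, 0.15],
    [0.10, 0.15, 1.00],
])

# Nodes are sentences, edge weights are similarities; PageRank then favours
# sentences that are strongly connected to (i.e. similar to) many others
graph = nx.from_numpy_array(sim)
scores = nx.pagerank(graph)
print(scores)  # the outlier sentence (index 2) should receive the lowest score
```

The highest-scoring sentences are then taken as the summary, exactly as in the TextRank cell further down.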
More details can be found here:\n", 402 | "\n", 403 | "https://tfhub.dev/google/universal-sentence-encoder/1" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 12, 409 | "metadata": { 410 | "scrolled": true 411 | }, 412 | "outputs": [ 413 | { 414 | "name": "stdout", 415 | "output_type": "stream", 416 | "text": [ 417 | "INFO:tensorflow:Using C:\\Temp\\tfhub_modules to cache modules.\n", 418 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_0:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_0\n", 419 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_1:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_1\n", 420 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_10:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_10\n", 421 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_11:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_11\n", 422 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_12:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_12\n", 423 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_13:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_13\n", 424 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_14:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_14\n", 425 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_15:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_15\n", 426 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_16:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_16\n", 427 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_2:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_2\n", 428 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_3:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_3\n", 429 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_4:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_4\n", 430 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_5:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_5\n", 431 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_6:0 from checkpoint 
b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_6\n", 432 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_7:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_7\n", 433 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_8:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_8\n", 434 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_9:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_9\n", 435 | "INFO:tensorflow:Initialize variable module/Encoder_en/DNN/ResidualHidden_0/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Encoder_en/DNN/ResidualHidden_0/weights\n", 436 | "INFO:tensorflow:Initialize variable module/Encoder_en/DNN/ResidualHidden_1/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Encoder_en/DNN/ResidualHidden_1/weights\n", 437 | "INFO:tensorflow:Initialize variable module/Encoder_en/DNN/ResidualHidden_2/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Encoder_en/DNN/ResidualHidden_2/weights\n", 438 | "INFO:tensorflow:Initialize variable module/Encoder_en/DNN/ResidualHidden_3/projection:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Encoder_en/DNN/ResidualHidden_3/projection\n", 439 | "INFO:tensorflow:Initialize variable module/Encoder_en/DNN/ResidualHidden_3/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Encoder_en/DNN/ResidualHidden_3/weights\n", 440 | "INFO:tensorflow:Initialize variable module/SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_0/bias:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_0/bias\n", 441 | "INFO:tensorflow:Initialize variable module/SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_0/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_0/weights\n", 442 | "INFO:tensorflow:Initialize variable module/SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_1/bias:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_1/bias\n", 443 | "INFO:tensorflow:Initialize variable module/SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_1/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_1/weights\n", 444 | "INFO:tensorflow:Initialize variable module/SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_2/bias:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with 
SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_2/bias\n", 445 | "INFO:tensorflow:Initialize variable module/SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_2/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_2/weights\n", 446 | "INFO:tensorflow:Initialize variable module/SNLI/Classifier/LinearLayer/bias:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SNLI/Classifier/LinearLayer/bias\n", 447 | "INFO:tensorflow:Initialize variable module/SNLI/Classifier/LinearLayer/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SNLI/Classifier/LinearLayer/weights\n", 448 | "INFO:tensorflow:Initialize variable module/SNLI/Classifier/tanh_layer_0/bias:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SNLI/Classifier/tanh_layer_0/bias\n", 449 | "INFO:tensorflow:Initialize variable module/SNLI/Classifier/tanh_layer_0/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SNLI/Classifier/tanh_layer_0/weights\n", 450 | "INFO:tensorflow:Initialize variable module/global_step:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with global_step\n" 451 | ] 452 | } 453 | ], 454 | "source": [ 455 | "import tensorflow_hub as hub\n", 456 | "import tensorflow as tf\n", 457 | "\n", 458 | "module_url = \"https://tfhub.dev/google/universal-sentence-encoder/2\"\n", 459 | "\n", 460 | "embed = hub.Module(module_url)\n", 461 | "\n", 462 | "# Reduce logging output.\n", 463 | "tf.logging.set_verbosity(tf.logging.ERROR)\n", 464 | "\n", 465 | "with tf.Session() as session:\n", 466 | " session.run([tf.global_variables_initializer(), tf.tables_initializer()])\n", 467 | " message_embeddings = session.run(embed(sentences))" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 16, 473 | "metadata": {}, 474 | "outputs": [ 475 | { 476 | "data": { 477 | "text/plain": [ 478 | "\"Rather than being a better player than Ronaldo, Messi's main motivations - according to the people who are close tohim - are being the best possible version of Lionel Messi, and winning as many trophies as possible. He might show it in a different way, but Messi is just as competitive as Ronaldo. Messi and Ronaldo ferociously competing with each other for everyone else's acclaim is a nice story for fans to debate and the media to spread, but it has / it is probably not particularly true. Do Messi and Ronaldo inspire each other? 
Ronaldo, it can be said, never looks happy on the field of play unless he has / he is just scored a goal - and even then he has / he is not happy for long, because he just wants to score another one.\"" 479 | ] 480 | }, 481 | "execution_count": 16, 482 | "metadata": {}, 483 | "output_type": "execute_result" 484 | } 485 | ], 486 | "source": [ 487 | "from sklearn.metrics.pairwise import cosine_similarity\n", 488 | "import networkx as nx\n", 489 | "\n", 490 | "#generate cosine similarity matrix\n", 491 | "sim_matrix = cosine_similarity(message_embeddings)\n", 492 | "\n", 493 | "#create graph and generate scores from the pagerank algorithm\n", 494 | "nx_graph = nx.from_numpy_array(sim_matrix)\n", 495 | "scores = nx.pagerank(nx_graph)\n", 496 | "\n", 497 | "ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)\n", 498 | "\n", 499 | "num_of_sentences = 5\n", 500 | "\n", 501 | "summary = \" \".join([i[1] for i in ranked_sentences[:num_of_sentences]])\n", 502 | "summary" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": {}, 508 | "source": [ 509 | "### Unsupervised Learning using Skip-Thought Vectors (Python 2.7)" 510 | ] 511 | }, 512 | { 513 | "cell_type": "markdown", 514 | "metadata": {}, 515 | "source": [ 516 | "Now this, in my opinion, is the newest and most novel approach we've discussed here. The high-level approach is as follows:\n", 517 | "\n", 518 | "Text Cleaning -> Encoder/Decoder -> K-Means Clustering -> Extract Sentences Closest to Cluster Centers\n", 519 | "\n", 520 | "Again, there are two main concepts I want to discuss before jumping into the solution:\n", 521 | "\n", 522 | "**Skip-Thought Vectors**\n", 523 | "\n", 524 | "Here, we use an encoder/decoder framework to generate feature vectors. Taking it from Kushal Chauhan's post, here is how the encoder and decoder layers are defined:\n", 525 | "1. Encoder Network: The encoder is typically a GRU-RNN which generates a fixed-length vector representation h(i) for each sentence S(i) in the input. The encoded representation h(i) is obtained by passing the final hidden state of the GRU cell (i.e. after it has seen the entire sentence) to multiple dense layers.\n", 526 | "2. Decoder Network: The decoder network takes this vector representation h(i) as input and tries to generate two sentences - S(i-1) and S(i+1), which could occur before and after the input sentence respectively. Separate decoders are implemented for the generation of the previous and next sentences, both being GRU-RNNs. The vector representation h(i) acts as the initial hidden state for the GRUs of the decoder networks.\n", 527 | "\n", 528 | "Similar to how Word2Vec embeddings are trained by predicting the surrounding words, skip-thought vectors are trained by predicting the sentences at time t-1 and t+1. As the model is trained, the learned representation (hidden layer) places similar sentences closer together, which enables higher-quality clustering.\n", 529 | "\n", 530 | "I encourage you to review the skip-thoughts paper for more clarity.\n", 531 | "\n", 532 | "**K-Means Clustering**\n", 533 | "\n", 534 | "Most of you will be familiar with this form of unsupervised learning, but I want to elaborate on how it is used here and why it is interesting.\n", 535 | "\n", 536 | "As we are aware, each cluster has a center point which, in the vector space, closely represents the theme of that cluster. With this in mind, when trying to create a summary, we should only need the sentence that is closest to the center of each cluster; a toy sketch of this extraction step follows below. 
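Everything in the sketch below (the 2-D points, the sentence labels, `n_clusters=2`) is made up purely for illustration; the real cell further down runs the same two calls on the high-dimensional skip-thought vectors.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

# Toy stand-ins for sentence vectors: two obvious groups of 2-D points
toy_vectors = np.array([[0.0, 0.1], [0.2, 0.0], [5.0, 5.1], [5.2, 4.9]])
toy_sents = ["sentence A", "sentence B", "sentence C", "sentence D"]

kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(toy_vectors)

# For each cluster center, find the index of the nearest vector;
# that sentence becomes the cluster's representative in the summary
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, toy_vectors)
print([toy_sents[i] for i in closest])  # one sentence from each group
```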
The key here is choosing the correct number of clusters so that the summary covers the content well. Kushal's post recommends calculating the number of clusters as roughly 30% of the number of sentences." 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 5, 542 | "metadata": { 543 | "scrolled": true 544 | }, 545 | "outputs": [ 546 | { 547 | "name": "stdout", 548 | "output_type": "stream", 549 | "text": [ 550 | "Loading model parameters...\n", 551 | "Compiling encoders...\n", 552 | "Loading tables...\n", 553 | "Packing up...\n", 554 | "38\n", 555 | "8\n", 556 | "41\n", 557 | "13\n", 558 | "15\n", 559 | "48\n", 560 | "17\n", 561 | "18\n", 562 | "20\n", 563 | "22\n", 564 | "23\n", 565 | "56\n", 566 | "25\n", 567 | "60\n" 568 | ] 569 | } 570 | ], 571 | "source": [ 572 | "import skipthoughts\n", 573 | "\n", 574 | "# You would need to download the pre-trained models first (see the README)\n", 575 | "model = skipthoughts.load_model()\n", 576 | "\n", 577 | "encoder = skipthoughts.Encoder(model)\n", 578 | "encoded = encoder.encode(sentences)" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "metadata": {}, 584 | "source": [ 585 | "All of the skipthoughts dependencies can be found in the source repository linked in the README (https://github.com/ryankiros/skip-thoughts/).\n", 586 | "As mentioned above, the number of clusters is also the number of sentences that will be included in the summary. For this example, the cluster count is computed from the number of sentences in the cell below." 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 36, 592 | "metadata": { 593 | "collapsed": true 594 | }, 595 | "outputs": [], 596 | "source": [ 597 | "from sklearn.metrics import pairwise_distances_argmin_min\n", 598 | "import numpy as np\n", 599 | "from sklearn.cluster import KMeans\n", 600 | "n_clusters = int(np.ceil(len(encoded)**0.6))  # must be defined before fitting KMeans\n", 601 | "print(n_clusters)\n", 602 | "\n", 603 | "kmeans = KMeans(n_clusters=n_clusters)\n", 604 | "kmeans = kmeans.fit(encoded)\n", 605 | "\n", 606 | "avg = []\n", 607 | "for j in range(n_clusters):\n", 608 | "    idx = np.where(kmeans.labels_ == j)[0]\n", 609 | "    avg.append(np.mean(idx))  # mean position of this cluster's sentences, used to order the summary\n", 610 | "closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, encoded)  # sentence nearest each cluster center\n", 611 | "ordering = sorted(range(n_clusters), key=lambda k: avg[k])\n", 612 | "summary = ' '.join([sentences[closest[idx]] for idx in ordering])" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": 37, 618 | "metadata": { 619 | "scrolled": true 620 | }, 621 | "outputs": [ 622 | { 623 | "data": { 624 | "text/plain": [ 625 | "'Do Messi and Ronaldo inspire each other? Ronaldo, it can be said, never looks happy on the field of play unless he has / he is just scored a goal - and even then he has / he is not happy for long, because he just wants to score another one. Rather than being a better player than Ronaldo, Messi\'s main motivations - according to the people who are close tohim - are being the best possible version of Lionel Messi, and winning as many trophies as possible. That theory was supported by Leicester boss Brendan Rodgers when I interviewed him for a book I recently wrote about Messi. 
With the very elite performers, that drive comes from within.\" \"But I think both those players inherently have that hunger to be the best players they can be.'" 626 | ] 627 | }, 628 | "execution_count": 37, 629 | "metadata": {}, 630 | "output_type": "execute_result" 631 | } 632 | ], 633 | "source": [ 634 | "summary" 635 | ] 636 | } 637 | ], 638 | "metadata": { 639 | "kernelspec": { 640 | "display_name": "Python 2", 641 | "language": "python", 642 | "name": "python2" 643 | }, 644 | "language_info": { 645 | "codemirror_mode": { 646 | "name": "ipython", 647 | "version": 2 648 | }, 649 | "file_extension": ".py", 650 | "mimetype": "text/x-python", 651 | "name": "python", 652 | "nbconvert_exporter": "python", 653 | "pygments_lexer": "ipython2", 654 | "version": "2.7.14" 655 | } 656 | }, 657 | "nbformat": 4, 658 | "nbformat_minor": 2 659 | } 660 | -------------------------------------------------------------------------------- /dataset_handler.py: -------------------------------------------------------------------------------- 1 | # Dataset handler for binary classification tasks (MR, CR, SUBJ, MQPA) 2 | 3 | import numpy as np 4 | from numpy.random import RandomState 5 | import os.path 6 | 7 | 8 | def load_data(encoder, name, loc='./data/', seed=1234): 9 | """ 10 | Load one of MR, CR, SUBJ or MPQA 11 | """ 12 | z = {} 13 | if name == 'MR': 14 | pos, neg = load_rt(loc=loc) 15 | elif name == 'SUBJ': 16 | pos, neg = load_subj(loc=loc) 17 | elif name == 'CR': 18 | pos, neg = load_cr(loc=loc) 19 | elif name == 'MPQA': 20 | pos, neg = load_mpqa(loc=loc) 21 | 22 | labels = compute_labels(pos, neg) 23 | text, labels = shuffle_data(pos+neg, labels, seed=seed) 24 | z['text'] = text 25 | z['labels'] = labels 26 | print 'Computing skip-thought vectors...' 27 | features = encoder.encode(text, verbose=False) 28 | return z, features 29 | 30 | 31 | def load_rt(loc='./data/'): 32 | """ 33 | Load the MR dataset 34 | """ 35 | pos, neg = [], [] 36 | with open(os.path.join(loc, 'rt-polarity.pos'), 'rb') as f: 37 | for line in f: 38 | pos.append(line.decode('latin-1').strip()) 39 | with open(os.path.join(loc, 'rt-polarity.neg'), 'rb') as f: 40 | for line in f: 41 | neg.append(line.decode('latin-1').strip()) 42 | return pos, neg 43 | 44 | 45 | def load_subj(loc='./data/'): 46 | """ 47 | Load the SUBJ dataset 48 | """ 49 | pos, neg = [], [] 50 | with open(os.path.join(loc, 'plot.tok.gt9.5000'), 'rb') as f: 51 | for line in f: 52 | pos.append(line.decode('latin-1').strip()) 53 | with open(os.path.join(loc, 'quote.tok.gt9.5000'), 'rb') as f: 54 | for line in f: 55 | neg.append(line.decode('latin-1').strip()) 56 | return pos, neg 57 | 58 | 59 | def load_cr(loc='./data/'): 60 | """ 61 | Load the CR dataset 62 | """ 63 | pos, neg = [], [] 64 | with open(os.path.join(loc, 'custrev.pos'), 'rb') as f: 65 | for line in f: 66 | text = line.strip() 67 | if len(text) > 0: 68 | pos.append(text) 69 | with open(os.path.join(loc, 'custrev.neg'), 'rb') as f: 70 | for line in f: 71 | text = line.strip() 72 | if len(text) > 0: 73 | neg.append(text) 74 | return pos, neg 75 | 76 | 77 | def load_mpqa(loc='./data/'): 78 | """ 79 | Load the MPQA dataset 80 | """ 81 | pos, neg = [], [] 82 | with open(os.path.join(loc, 'mpqa.pos'), 'rb') as f: 83 | for line in f: 84 | text = line.strip() 85 | if len(text) > 0: 86 | pos.append(text) 87 | with open(os.path.join(loc, 'mpqa.neg'), 'rb') as f: 88 | for line in f: 89 | text = line.strip() 90 | if len(text) > 0: 91 | neg.append(text) 92 | return pos, neg 93 | 94 | 95 | def compute_labels(pos, neg): 
96 | """ 97 | Construct list of labels 98 | """ 99 | labels = np.zeros(len(pos) + len(neg)) 100 | labels[:len(pos)] = 1.0 101 | labels[len(pos):] = 0.0 102 | return labels 103 | 104 | 105 | def shuffle_data(X, L, seed=1234): 106 | """ 107 | Shuffle the data 108 | """ 109 | prng = RandomState(seed) 110 | inds = np.arange(len(X)) 111 | prng.shuffle(inds) 112 | X = [X[i] for i in inds] 113 | L = L[inds] 114 | return (X, L) 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /eval_classification.py: -------------------------------------------------------------------------------- 1 | # Experiment scripts for binary classification benchmarks (e.g. MR, CR, MPQA, SUBJ) 2 | 3 | import numpy as np 4 | import sys 5 | import nbsvm 6 | import dataset_handler 7 | 8 | from scipy.sparse import hstack 9 | 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.cross_validation import KFold 12 | 13 | 14 | def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=False): 15 | """ 16 | Evaluate features with nested K-fold cross validation 17 | Outer loop: Held-out evaluation 18 | Inner loop: Hyperparameter tuning 19 | 20 | Datasets can be found at http://nlp.stanford.edu/~sidaw/home/projects:nbsvm 21 | Options for name are 'MR', 'CR', 'SUBJ' and 'MPQA' 22 | """ 23 | # Load the dataset and extract features 24 | z, features = dataset_handler.load_data(encoder, name, loc=loc, seed=seed) 25 | 26 | scan = [2**t for t in range(0,9,1)] 27 | npts = len(z['text']) 28 | kf = KFold(npts, n_folds=k, random_state=seed) 29 | scores = [] 30 | for train, test in kf: 31 | 32 | # Split data 33 | X_train = features[train] 34 | y_train = z['labels'][train] 35 | X_test = features[test] 36 | y_test = z['labels'][test] 37 | 38 | Xraw = [z['text'][i] for i in train] 39 | Xraw_test = [z['text'][i] for i in test] 40 | 41 | scanscores = [] 42 | for s in scan: 43 | 44 | # Inner KFold 45 | innerkf = KFold(len(X_train), n_folds=k, random_state=seed+1) 46 | innerscores = [] 47 | for innertrain, innertest in innerkf: 48 | 49 | # Split data 50 | X_innertrain = X_train[innertrain] 51 | y_innertrain = y_train[innertrain] 52 | X_innertest = X_train[innertest] 53 | y_innertest = y_train[innertest] 54 | 55 | Xraw_innertrain = [Xraw[i] for i in innertrain] 56 | Xraw_innertest = [Xraw[i] for i in innertest] 57 | 58 | # NB (if applicable) 59 | if use_nb: 60 | NBtrain, NBtest = compute_nb(Xraw_innertrain, y_innertrain, Xraw_innertest) 61 | X_innertrain = hstack((X_innertrain, NBtrain)) 62 | X_innertest = hstack((X_innertest, NBtest)) 63 | 64 | # Train classifier 65 | clf = LogisticRegression(C=s) 66 | clf.fit(X_innertrain, y_innertrain) 67 | acc = clf.score(X_innertest, y_innertest) 68 | innerscores.append(acc) 69 | print (s, acc) 70 | 71 | # Append mean score 72 | scanscores.append(np.mean(innerscores)) 73 | 74 | # Get the index of the best score 75 | s_ind = np.argmax(scanscores) 76 | s = scan[s_ind] 77 | print scanscores 78 | print s 79 | 80 | # NB (if applicable) 81 | if use_nb: 82 | NBtrain, NBtest = compute_nb(Xraw, y_train, Xraw_test) 83 | X_train = hstack((X_train, NBtrain)) 84 | X_test = hstack((X_test, NBtest)) 85 | 86 | # Train classifier 87 | clf = LogisticRegression(C=s) 88 | clf.fit(X_train, y_train) 89 | 90 | # Evaluate 91 | acc = clf.score(X_test, y_test) 92 | scores.append(acc) 93 | print scores 94 | 95 | return scores 96 | 97 | 98 | def compute_nb(X, y, Z): 99 | """ 100 | Compute NB features 101 | """ 102 | labels = [int(t) for t in y] 
103 | ptrain = [X[i] for i in range(len(labels)) if labels[i] == 0] 104 | ntrain = [X[i] for i in range(len(labels)) if labels[i] == 1] 105 | poscounts = nbsvm.build_dict(ptrain, [1,2]) 106 | negcounts = nbsvm.build_dict(ntrain, [1,2]) 107 | dic, r = nbsvm.compute_ratio(poscounts, negcounts) 108 | trainX = nbsvm.process_text(X, dic, r, [1,2]) 109 | devX = nbsvm.process_text(Z, dic, r, [1,2]) 110 | return trainX, devX 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /eval_msrp.py: -------------------------------------------------------------------------------- 1 | # Evaluation for MSRP 2 | 3 | import numpy as np 4 | 5 | from collections import defaultdict 6 | from nltk.tokenize import word_tokenize 7 | from numpy.random import RandomState 8 | import os.path 9 | from sklearn.cross_validation import KFold 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.metrics import f1_score as f1 12 | 13 | 14 | def evaluate(encoder, k=10, seed=1234, evalcv=True, evaltest=False, use_feats=True, loc='./data/'): 15 | """ 16 | Run experiment 17 | k: number of CV folds 18 | test: whether to evaluate on test set 19 | """ 20 | print 'Preparing data...' 21 | traintext, testtext, labels = load_data(loc) 22 | 23 | print 'Computing training skipthoughts...' 24 | trainA = encoder.encode(traintext[0], verbose=False) 25 | trainB = encoder.encode(traintext[1], verbose=False) 26 | 27 | if evalcv: 28 | print 'Running cross-validation...' 29 | C = eval_kfold(trainA, trainB, traintext, labels[0], shuffle=True, k=10, seed=1234, use_feats=use_feats) 30 | 31 | if evaltest: 32 | if not evalcv: 33 | C = 4 # Best parameter found from CV (combine-skip with use_feats=True) 34 | 35 | print 'Computing testing skipthoughts...' 36 | testA = encoder.encode(testtext[0], verbose=False) 37 | testB = encoder.encode(testtext[1], verbose=False) 38 | 39 | if use_feats: 40 | train_features = np.c_[np.abs(trainA - trainB), trainA * trainB, feats(traintext[0], traintext[1])] 41 | test_features = np.c_[np.abs(testA - testB), testA * testB, feats(testtext[0], testtext[1])] 42 | else: 43 | train_features = np.c_[np.abs(trainA - trainB), trainA * trainB] 44 | test_features = np.c_[np.abs(testA - testB), testA * testB] 45 | 46 | print 'Evaluating...' 
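        # C was either tuned by the cross-validation above (evalcv=True) or taken from the
        # hard-coded fallback; the classifier is refit on the full training features and then
        # scored on the held-out MSRP test split (accuracy plus F1, the usual MSRP metrics).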
47 | clf = LogisticRegression(C=C) 48 | clf.fit(train_features, labels[0]) 49 | yhat = clf.predict(test_features) 50 | print 'Test accuracy: ' + str(clf.score(test_features, labels[1])) 51 | print 'Test F1: ' + str(f1(labels[1], yhat)) 52 | 53 | 54 | def load_data(loc='./data/'): 55 | """ 56 | Load MSRP dataset 57 | """ 58 | trainloc = os.path.join(loc, 'msr_paraphrase_train.txt') 59 | testloc = os.path.join(loc, 'msr_paraphrase_test.txt') 60 | 61 | trainA, trainB, testA, testB = [],[],[],[] 62 | trainS, devS, testS = [],[],[] 63 | 64 | f = open(trainloc, 'rb') 65 | for line in f: 66 | text = line.strip().split('\t') 67 | trainA.append(' '.join(word_tokenize(text[3]))) 68 | trainB.append(' '.join(word_tokenize(text[4]))) 69 | trainS.append(text[0]) 70 | f.close() 71 | f = open(testloc, 'rb') 72 | for line in f: 73 | text = line.strip().split('\t') 74 | testA.append(' '.join(word_tokenize(text[3]))) 75 | testB.append(' '.join(word_tokenize(text[4]))) 76 | testS.append(text[0]) 77 | f.close() 78 | 79 | trainS = [int(s) for s in trainS[1:]] 80 | testS = [int(s) for s in testS[1:]] 81 | 82 | return [trainA[1:], trainB[1:]], [testA[1:], testB[1:]], [trainS, testS] 83 | 84 | 85 | def is_number(s): 86 | try: 87 | float(s) 88 | return True 89 | except ValueError: 90 | return False 91 | 92 | 93 | def feats(A, B): 94 | """ 95 | Compute additional features (similar to Socher et al.) 96 | These alone should give the same result from their paper (~73.2 Acc) 97 | """ 98 | tA = [t.split() for t in A] 99 | tB = [t.split() for t in B] 100 | 101 | nA = [[w for w in t if is_number(w)] for t in tA] 102 | nB = [[w for w in t if is_number(w)] for t in tB] 103 | 104 | features = np.zeros((len(A), 6)) 105 | 106 | # n1 107 | for i in range(len(A)): 108 | if set(nA[i]) == set(nB[i]): 109 | features[i,0] = 1. 110 | 111 | # n2 112 | for i in range(len(A)): 113 | if set(nA[i]) == set(nB[i]) and len(nA[i]) > 0: 114 | features[i,1] = 1. 115 | 116 | # n3 117 | for i in range(len(A)): 118 | if set(nA[i]) <= set(nB[i]) or set(nB[i]) <= set(nA[i]): 119 | features[i,2] = 1. 
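    # Recap of the binary number features above (n1-n3) and the overlap/length features below (n4-n6):
    #   n1: both sentences contain exactly the same set of numbers
    #   n2: same as n1, but only fires when at least one number is present
    #   n3: the numbers in one sentence are a subset of the numbers in the other
    #   n4 / n5: fraction of sentence A's (resp. B's) word types that also appear in the other sentence
    #   n6: symmetric sentence-length ratio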
120 | 121 | # n4 122 | for i in range(len(A)): 123 | features[i,3] = 1.0 * len(set(tA[i]) & set(tB[i])) / len(set(tA[i])) 124 | 125 | # n5 126 | for i in range(len(A)): 127 | features[i,4] = 1.0 * len(set(tA[i]) & set(tB[i])) / len(set(tB[i])) 128 | 129 | # n6 130 | for i in range(len(A)): 131 | features[i,5] = 0.5 * ((1.0*len(tA[i]) / len(tB[i])) + (1.0*len(tB[i]) / len(tA[i]))) 132 | 133 | return features 134 | 135 | 136 | def eval_kfold(A, B, train, labels, shuffle=True, k=10, seed=1234, use_feats=False): 137 | """ 138 | Perform k-fold cross validation 139 | """ 140 | # features 141 | labels = np.array(labels) 142 | if use_feats: 143 | features = np.c_[np.abs(A - B), A * B, feats(train[0], train[1])] 144 | else: 145 | features = np.c_[np.abs(A - B), A * B] 146 | 147 | scan = [2**t for t in range(0,9,1)] 148 | npts = len(features) 149 | kf = KFold(npts, n_folds=k, shuffle=shuffle, random_state=seed) 150 | scores = [] 151 | 152 | for s in scan: 153 | 154 | scanscores = [] 155 | 156 | for train, test in kf: 157 | 158 | # Split data 159 | X_train = features[train] 160 | y_train = labels[train] 161 | X_test = features[test] 162 | y_test = labels[test] 163 | 164 | # Train classifier 165 | clf = LogisticRegression(C=s) 166 | clf.fit(X_train, y_train) 167 | yhat = clf.predict(X_test) 168 | fscore = f1(y_test, yhat) 169 | scanscores.append(fscore) 170 | print (s, fscore) 171 | 172 | # Append mean score 173 | scores.append(np.mean(scanscores)) 174 | print scores 175 | 176 | # Get the index of the best score 177 | s_ind = np.argmax(scores) 178 | s = scan[s_ind] 179 | print scores 180 | print s 181 | return s 182 | 183 | 184 | -------------------------------------------------------------------------------- /eval_rank.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Evaluation code for image-sentence ranking 3 | ''' 4 | import numpy as np 5 | 6 | import theano 7 | import theano.tensor as tensor 8 | 9 | import cPickle as pkl 10 | import numpy 11 | import copy 12 | import os 13 | import time 14 | 15 | from scipy import optimize, stats 16 | from scipy.linalg import norm 17 | from collections import OrderedDict 18 | from sklearn.cross_validation import KFold 19 | from numpy.random import RandomState 20 | 21 | import warnings 22 | 23 | 24 | # push parameters to Theano shared variables 25 | def zipp(params, tparams): 26 | for kk, vv in params.iteritems(): 27 | tparams[kk].set_value(vv) 28 | 29 | # pull parameters from Theano shared variables 30 | def unzip(zipped): 31 | new_params = OrderedDict() 32 | for kk, vv in zipped.iteritems(): 33 | new_params[kk] = vv.get_value() 34 | return new_params 35 | 36 | # get the list of parameters: Note that tparams must be OrderedDict 37 | def itemlist(tparams): 38 | return [vv for kk, vv in tparams.iteritems()] 39 | 40 | # make prefix-appended name 41 | def _p(pp, name): 42 | return '%s_%s'%(pp, name) 43 | 44 | # all parameters 45 | def init_params(options): 46 | """ 47 | Initalize all model parameters here 48 | """ 49 | params = OrderedDict() 50 | 51 | # Image embedding, sentence embedding 52 | params = get_layer('ff')[0](options, params, prefix='ff_im', nin=options['dim_im'], nout=options['dim']) 53 | params = get_layer('ff')[0](options, params, prefix='ff_s', nin=options['dim_s'], nout=options['dim']) 54 | 55 | return params 56 | 57 | # initialize Theano shared variables according to the initial parameters 58 | def init_tparams(params): 59 | tparams = OrderedDict() 60 | for kk, pp in params.iteritems(): 61 | 
tparams[kk] = theano.shared(params[kk], name=kk) 62 | return tparams 63 | 64 | # load parameters 65 | def load_params(path, params): 66 | pp = numpy.load(path) 67 | for kk, vv in params.iteritems(): 68 | if kk not in pp: 69 | raise Warning('%s is not in the archive'%kk) 70 | params[kk] = pp[kk] 71 | return params 72 | 73 | # layers: 'name': ('parameter initializer', 'feedforward') 74 | layers = {'ff': ('param_init_fflayer', 'fflayer')} 75 | 76 | def get_layer(name): 77 | """ 78 | Part of the reason the init is very slow is because, 79 | the layer's constructor is called even when it isn't needed 80 | """ 81 | fns = layers[name] 82 | return (eval(fns[0]), eval(fns[1])) 83 | 84 | def norm_weight(nin,nout=None): 85 | """ 86 | Weight initialization 87 | """ 88 | if nout == None: 89 | nout = nin 90 | else: 91 | r = numpy.sqrt( 2. / nin) 92 | W = numpy.random.rand(nin, nout) * 2 * r - r 93 | return W.astype('float32') 94 | 95 | def linear(x): 96 | return x 97 | 98 | # feedforward layer: affine transformation + point-wise nonlinearity 99 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None): 100 | if nin == None: 101 | nin = options['dim_proj'] 102 | if nout == None: 103 | nout = options['dim_proj'] 104 | params[_p(prefix,'W')] = norm_weight(nin, nout) 105 | params[_p(prefix,'b')] = numpy.zeros((nout,)).astype('float32') 106 | 107 | return params 108 | 109 | def fflayer(tparams, state_below, options, prefix='rconv', activ='lambda x: tensor.tanh(x)', **kwargs): 110 | return eval(activ)(tensor.dot(state_below, tparams[_p(prefix,'W')])+tparams[_p(prefix,'b')]) 111 | 112 | # L2norm, row-wise 113 | def l2norm(X): 114 | norm = tensor.sqrt(tensor.pow(X, 2).sum(1)) 115 | X /= norm[:, None] 116 | return X 117 | 118 | # build a training model 119 | def build_model(tparams, options): 120 | """ 121 | Construct computation graph for the whole model 122 | """ 123 | # inputs (image, sentence, contrast images, constrast sentences) 124 | im = tensor.matrix('im', dtype='float32') 125 | s = tensor.matrix('s', dtype='float32') 126 | cim = tensor.matrix('cim', dtype='float32') 127 | cs = tensor.matrix('cs', dtype='float32') 128 | 129 | # image embedding 130 | lim = get_layer('ff')[1](tparams, im, options, prefix='ff_im', activ='linear') 131 | lcim = get_layer('ff')[1](tparams, cim, options, prefix='ff_im', activ='linear') 132 | 133 | # sentence embedding 134 | ls = get_layer('ff')[1](tparams, s, options, prefix='ff_s', activ='linear') 135 | lcs = get_layer('ff')[1](tparams, cs, options, prefix='ff_s', activ='linear') 136 | 137 | # L2 norm for sentences 138 | ls = l2norm(ls) 139 | lcs = l2norm(lcs) 140 | 141 | # Tile by number of contrast terms 142 | lim = tensor.tile(lim, (options['ncon'], 1)) 143 | ls = tensor.tile(ls, (options['ncon'], 1)) 144 | 145 | # pairwise ranking loss 146 | cost_im = options['margin'] - (lim * ls).sum(axis=1) + (lim * lcs).sum(axis=1) 147 | cost_im = cost_im * (cost_im > 0.) 148 | cost_im = cost_im.sum(0) 149 | 150 | cost_s = options['margin'] - (ls * lim).sum(axis=1) + (ls * lcim).sum(axis=1) 151 | cost_s = cost_s * (cost_s > 0.) 
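    # Both directions use the same hinge-style pairwise ranking objective:
    #   max(0, margin - score(anchor, matching item) + score(anchor, contrastive item))
    # where score(.,.) is a dot product (the sentence embeddings are L2-normalised above);
    # multiplying by (cost > 0.) zeroes out pairs that already satisfy the margin.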
152 | cost_s = cost_s.sum(0) 153 | 154 | cost = cost_im + cost_s 155 | return [im, s, cim, cs], cost 156 | 157 | # build an encoder 158 | def build_encoder(tparams, options): 159 | """ 160 | Construct encoder 161 | """ 162 | # inputs (image, sentence) 163 | im = tensor.matrix('im', dtype='float32') 164 | s = tensor.matrix('s', dtype='float32') 165 | 166 | # embeddings 167 | eim = get_layer('ff')[1](tparams, im, options, prefix='ff_im', activ='linear') 168 | es = get_layer('ff')[1](tparams, s, options, prefix='ff_s', activ='linear') 169 | 170 | # L2 norm of rows 171 | lim = l2norm(eim) 172 | ls = l2norm(es) 173 | 174 | return [im, s], lim, ls 175 | 176 | # optimizers 177 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update 178 | def adam(lr, tparams, grads, inp, cost): 179 | gshared = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad'%k) for k, p in tparams.iteritems()] 180 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 181 | 182 | f_grad_shared = theano.function(inp, cost, updates=gsup) 183 | 184 | lr0 = 0.0002 185 | b1 = 0.1 186 | b2 = 0.001 187 | e = 1e-8 188 | 189 | updates = [] 190 | 191 | i = theano.shared(numpy.float32(0.)) 192 | i_t = i + 1. 193 | fix1 = 1. - b1**(i_t) 194 | fix2 = 1. - b2**(i_t) 195 | lr_t = lr0 * (tensor.sqrt(fix2) / fix1) 196 | 197 | for p, g in zip(tparams.values(), gshared): 198 | m = theano.shared(p.get_value() * numpy.float32(0.)) 199 | v = theano.shared(p.get_value() * numpy.float32(0.)) 200 | m_t = (b1 * g) + ((1. - b1) * m) 201 | v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v) 202 | g_t = m_t / (tensor.sqrt(v_t) + e) 203 | p_t = p - (lr_t * g_t) 204 | updates.append((m, m_t)) 205 | updates.append((v, v_t)) 206 | updates.append((p, p_t)) 207 | updates.append((i, i_t)) 208 | 209 | f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore') 210 | 211 | return f_grad_shared, f_update 212 | 213 | # things to avoid doing 214 | def validate_options(options): 215 | 216 | if options['dim'] > options['dim_im']: 217 | warnings.warn('dim should not be bigger than image dimension') 218 | if options['dim'] > options['dim_s']: 219 | warnings.warn('dim should not be bigger than sentence dimension') 220 | if options['margin'] > 1: 221 | warnings.warn('margin should not be bigger than 1') 222 | return options 223 | 224 | # Load a saved model and evaluate the results 225 | def evaluate(X, saveto, evaluate=False, out=False): 226 | print "Loading model..." 227 | with open('%s.pkl'%saveto, 'rb') as f: 228 | model_options = pkl.load(f) 229 | 230 | params = init_params(model_options) 231 | params = load_params(saveto, params) 232 | tparams = init_tparams(params) 233 | 234 | print 'Building encoder' 235 | inps_e, lim, ls = build_encoder(tparams, model_options) 236 | f_emb = theano.function(inps_e, [lim, ls], profile=False) 237 | 238 | print 'Compute embeddings...' 
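    # X is expected to follow the same layout as the tuples used in trainer():
    # X[1] holds the image features and X[2] the sentence (skip-thought) vectors.
    # i2t/t2i below report Recall@1/5/10 and the median rank for both retrieval directions.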
239 | lim, ls = f_emb(X[1], X[2]) 240 | 241 | if evaluate: 242 | (r1, r5, r10, medr) = i2t(lim, ls) 243 | print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr) 244 | (r1i, r5i, r10i, medri) = t2i(lim, ls) 245 | print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri) 246 | if out: 247 | return lim, ls 248 | 249 | # trainer 250 | def trainer(train, dev, # training and development tuples 251 | dim=1000, # embedding dimensionality 252 | dim_im=4096, # image dimensionality 253 | dim_s=4800, # sentence dimensionality 254 | margin=0.2, # margin for pairwise ranking 255 | ncon=50, # number of contrastive terms 256 | max_epochs=15, 257 | lrate=0.01, # not needed with Adam 258 | dispFreq=10, 259 | optimizer='adam', 260 | batch_size = 100, 261 | valid_batch_size = 100, 262 | saveto='/ais/gobi3/u/rkiros/ssg/models/cocorank1000_combine.npz', 263 | validFreq=500, 264 | saveFreq=500, 265 | reload_=False): 266 | 267 | # Model options 268 | model_options = {} 269 | model_options['dim'] = dim 270 | model_options['dim_im'] = dim_im 271 | model_options['dim_s'] = dim_s 272 | model_options['margin'] = margin 273 | model_options['ncon'] = ncon 274 | model_options['max_epochs'] = max_epochs 275 | model_options['lrate'] = lrate 276 | model_options['dispFreq'] = dispFreq 277 | model_options['optimizer'] = optimizer 278 | model_options['batch_size'] = batch_size 279 | model_options['valid_batch_size'] = valid_batch_size 280 | model_options['saveto'] = saveto 281 | model_options['validFreq'] = validFreq 282 | model_options['saveFreq'] = saveFreq 283 | model_options['reload_'] = reload_ 284 | 285 | model_options = validate_options(model_options) 286 | print model_options 287 | 288 | # reload options 289 | if reload_ and os.path.exists(saveto): 290 | print "Reloading options" 291 | with open('%s.pkl'%saveto, 'rb') as f: 292 | model_options = pkl.load(f) 293 | 294 | print 'Building model' 295 | params = init_params(model_options) 296 | # reload parameters 297 | if reload_ and os.path.exists(saveto): 298 | print "Reloading model" 299 | params = load_params(saveto, params) 300 | 301 | tparams = init_tparams(params) 302 | 303 | inps, cost = build_model(tparams, model_options) 304 | 305 | print 'Building encoder' 306 | inps_e, lim, ls = build_encoder(tparams, model_options) 307 | 308 | print 'Building functions' 309 | f_cost = theano.function(inps, -cost, profile=False) 310 | f_emb = theano.function(inps_e, [lim, ls], profile=False) 311 | 312 | # gradient computation 313 | print 'Computing gradients' 314 | grads = tensor.grad(cost, wrt=itemlist(tparams)) 315 | lr = tensor.scalar(name='lr') 316 | f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) 317 | 318 | print 'Optimization' 319 | 320 | uidx = 0 321 | estop = False 322 | start = 1234 323 | seed = 1234 324 | inds = numpy.arange(len(train[0])) 325 | numbatches = len(inds) / batch_size 326 | curr = 0 327 | counter = 0 328 | target=None 329 | history_errs = [] 330 | 331 | # Main loop 332 | for eidx in range(max_epochs): 333 | tic = time.time() 334 | prng = RandomState(seed - eidx - 1) 335 | prng.shuffle(inds) 336 | 337 | for minibatch in range(numbatches): 338 | 339 | uidx += 1 340 | conprng_im = RandomState(seed + uidx + 1) 341 | conprng_s = RandomState(2*seed + uidx + 1) 342 | 343 | im = train[1][inds[minibatch::numbatches]] 344 | s = train[2][inds[minibatch::numbatches]] 345 | 346 | cinds_im = conprng_im.random_integers(low=0, high=len(train[0])-1, size=ncon * len(im)) 347 | cinds_s = conprng_s.random_integers(low=0, 
high=len(train[0])-1, size=ncon * len(s)) 348 | cim = train[1][cinds_im] 349 | cs = train[2][cinds_s] 350 | 351 | ud_start = time.time() 352 | cost = f_grad_shared(im, s, cim, cs) 353 | f_update(lrate) 354 | ud_duration = time.time() - ud_start 355 | 356 | if numpy.mod(uidx, dispFreq) == 0: 357 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud_duration 358 | 359 | if numpy.mod(uidx, validFreq) == 0: 360 | 361 | print 'Computing ranks...' 362 | lim, ls = f_emb(dev[1], dev[2]) 363 | (r1, r5, r10, medr) = i2t(lim, ls) 364 | print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr) 365 | (r1i, r5i, r10i, medri) = t2i(lim, ls) 366 | print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri) 367 | 368 | currscore = r1 + r5 + r10 + r1i + r5i + r10i 369 | if currscore > curr: 370 | curr = currscore 371 | 372 | # Save model 373 | print 'Saving...', 374 | params = unzip(tparams) 375 | numpy.savez(saveto, history_errs=history_errs, **params) 376 | pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) 377 | print 'Done' 378 | 379 | 380 | def i2t(images, captions, npts=None): 381 | """ 382 | Images: (5N, K) matrix of images 383 | Captions: (5N, K) matrix of captions 384 | """ 385 | if npts == None: 386 | npts = images.shape[0] / 5 387 | index_list = [] 388 | 389 | # Project captions 390 | for i in range(len(captions)): 391 | captions[i] /= norm(captions[i]) 392 | 393 | ranks = numpy.zeros(npts) 394 | for index in range(npts): 395 | 396 | # Get query image 397 | im = images[5 * index].reshape(1, images.shape[1]) 398 | im /= norm(im) 399 | 400 | # Compute scores 401 | d = numpy.dot(im, captions.T).flatten() 402 | inds = numpy.argsort(d)[::-1] 403 | index_list.append(inds[0]) 404 | 405 | # Score 406 | rank = 1e20 407 | for i in range(5*index, 5*index + 5, 1): 408 | tmp = numpy.where(inds == i)[0][0] 409 | if tmp < rank: 410 | rank = tmp 411 | ranks[index] = rank 412 | 413 | # Compute metrics 414 | r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks) 415 | r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks) 416 | r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks) 417 | medr = numpy.floor(numpy.median(ranks)) + 1 418 | return (r1, r5, r10, medr) 419 | 420 | 421 | def t2i(images, captions, npts=None): 422 | """ 423 | Images: (5N, K) matrix of images 424 | Captions: (5N, K) matrix of captions 425 | """ 426 | if npts == None: 427 | npts = images.shape[0] / 5 428 | ims = numpy.array([images[i] for i in range(0, len(images), 5)]) 429 | 430 | # Project images 431 | for i in range(len(ims)): 432 | ims[i] /= norm(ims[i]) 433 | 434 | # Project captions 435 | for i in range(len(captions)): 436 | captions[i] /= norm(captions[i]) 437 | 438 | ranks = numpy.zeros(5 * npts) 439 | for index in range(npts): 440 | 441 | # Get query captions 442 | queries = captions[5*index : 5*index + 5] 443 | 444 | # Compute scores 445 | d = numpy.dot(queries, ims.T) 446 | inds = numpy.zeros(d.shape) 447 | for i in range(len(inds)): 448 | inds[i] = numpy.argsort(d[i])[::-1] 449 | ranks[5 * index + i] = numpy.where(inds[i] == index)[0][0] 450 | 451 | # Compute metrics 452 | r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks) 453 | r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks) 454 | r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks) 455 | medr = numpy.floor(numpy.median(ranks)) + 1 456 | return (r1, r5, r10, medr) 457 | 458 | 459 | -------------------------------------------------------------------------------- /eval_sick.py:
-------------------------------------------------------------------------------- 1 | ''' 2 | Evaluation code for the SICK dataset (SemEval 2014 Task 1) 3 | ''' 4 | import numpy as np 5 | import os.path 6 | from sklearn.metrics import mean_squared_error as mse 7 | from scipy.stats import pearsonr 8 | from scipy.stats import spearmanr 9 | from sklearn.utils import shuffle 10 | 11 | from keras.models import Sequential 12 | from keras.layers.core import Dense, Activation 13 | from keras.optimizers import Adam 14 | 15 | 16 | def evaluate(encoder, seed=1234, evaltest=False, loc='./data/'): 17 | """ 18 | Run experiment 19 | """ 20 | print 'Preparing data...' 21 | train, dev, test, scores = load_data(loc) 22 | train[0], train[1], scores[0] = shuffle(train[0], train[1], scores[0], random_state=seed) 23 | 24 | print 'Computing training skipthoughts...' 25 | trainA = encoder.encode(train[0], verbose=False, use_eos=True) 26 | trainB = encoder.encode(train[1], verbose=False, use_eos=True) 27 | 28 | print 'Computing development skipthoughts...' 29 | devA = encoder.encode(dev[0], verbose=False, use_eos=True) 30 | devB = encoder.encode(dev[1], verbose=False, use_eos=True) 31 | 32 | print 'Computing feature combinations...' 33 | trainF = np.c_[np.abs(trainA - trainB), trainA * trainB] 34 | devF = np.c_[np.abs(devA - devB), devA * devB] 35 | 36 | print 'Encoding labels...' 37 | trainY = encode_labels(scores[0]) 38 | devY = encode_labels(scores[1]) 39 | 40 | print 'Compiling model...' 41 | lrmodel = prepare_model(ninputs=trainF.shape[1]) 42 | 43 | print 'Training...' 44 | bestlrmodel = train_model(lrmodel, trainF, trainY, devF, devY, scores[1]) 45 | 46 | if evaltest: 47 | print 'Computing test skipthoughts...' 48 | testA = encoder.encode(test[0], verbose=False, use_eos=True) 49 | testB = encoder.encode(test[1], verbose=False, use_eos=True) 50 | 51 | print 'Computing feature combinations...' 52 | testF = np.c_[np.abs(testA - testB), testA * testB] 53 | 54 | print 'Evaluating...' 
55 | r = np.arange(1,6) 56 | yhat = np.dot(bestlrmodel.predict_proba(testF, verbose=2), r) 57 | pr = pearsonr(yhat, scores[2])[0] 58 | sr = spearmanr(yhat, scores[2])[0] 59 | se = mse(yhat, scores[2]) 60 | print 'Test Pearson: ' + str(pr) 61 | print 'Test Spearman: ' + str(sr) 62 | print 'Test MSE: ' + str(se) 63 | 64 | return yhat 65 | 66 | 67 | def prepare_model(ninputs=9600, nclass=5): 68 | """ 69 | Set up and compile the model architecture (Logistic regression) 70 | """ 71 | lrmodel = Sequential() 72 | lrmodel.add(Dense(input_dim=ninputs, output_dim=nclass)) 73 | lrmodel.add(Activation('softmax')) 74 | lrmodel.compile(loss='categorical_crossentropy', optimizer='adam') 75 | return lrmodel 76 | 77 | 78 | def train_model(lrmodel, X, Y, devX, devY, devscores): 79 | """ 80 | Train model, using pearsonr on dev for early stopping 81 | """ 82 | done = False 83 | best = -1.0 84 | r = np.arange(1,6) 85 | 86 | while not done: 87 | # Every 100 epochs, check Pearson on development set 88 | lrmodel.fit(X, Y, verbose=2, shuffle=False, validation_data=(devX, devY)) 89 | yhat = np.dot(lrmodel.predict_proba(devX, verbose=2), r) 90 | score = pearsonr(yhat, devscores)[0] 91 | if score > best: 92 | print score 93 | best = score 94 | bestlrmodel = prepare_model(ninputs=X.shape[1]) 95 | bestlrmodel.set_weights(lrmodel.get_weights()) 96 | else: 97 | done = True 98 | 99 | yhat = np.dot(bestlrmodel.predict_proba(devX, verbose=2), r) 100 | score = pearsonr(yhat, devscores)[0] 101 | print 'Dev Pearson: ' + str(score) 102 | return bestlrmodel 103 | 104 | 105 | def encode_labels(labels, nclass=5): 106 | """ 107 | Label encoding from Tree LSTM paper (Tai, Socher, Manning) 108 | """ 109 | Y = np.zeros((len(labels), nclass)).astype('float32') 110 | for j, y in enumerate(labels): 111 | for i in range(nclass): 112 | if i+1 == np.floor(y) + 1: 113 | Y[j,i] = y - np.floor(y) 114 | if i+1 == np.floor(y): 115 | Y[j,i] = np.floor(y) - y + 1 116 | return Y 117 | 118 | 119 | def load_data(loc='./data/'): 120 | """ 121 | Load the SICK semantic-relatedness dataset 122 | """ 123 | trainA, trainB, devA, devB, testA, testB = [],[],[],[],[],[] 124 | trainS, devS, testS = [],[],[] 125 | 126 | with open(os.path.join(loc, 'SICK_train.txt'), 'rb') as f: 127 | for line in f: 128 | text = line.strip().split('\t') 129 | trainA.append(text[1]) 130 | trainB.append(text[2]) 131 | trainS.append(text[3]) 132 | with open(os.path.join(loc, 'SICK_trial.txt'), 'rb') as f: 133 | for line in f: 134 | text = line.strip().split('\t') 135 | devA.append(text[1]) 136 | devB.append(text[2]) 137 | devS.append(text[3]) 138 | with open(os.path.join(loc, 'SICK_test_annotated.txt'), 'rb') as f: 139 | for line in f: 140 | text = line.strip().split('\t') 141 | testA.append(text[1]) 142 | testB.append(text[2]) 143 | testS.append(text[3]) 144 | 145 | trainS = [float(s) for s in trainS[1:]] 146 | devS = [float(s) for s in devS[1:]] 147 | testS = [float(s) for s in testS[1:]] 148 | 149 | return [trainA[1:], trainB[1:]], [devA[1:], devB[1:]], [testA[1:], testB[1:]], [trainS, devS, testS] 150 | 151 | 152 | -------------------------------------------------------------------------------- /eval_trec.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Evaluation code for the TREC dataset 3 | ''' 4 | import numpy as np 5 | import os.path 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.cross_validation import KFold 8 | from sklearn.utils import shuffle 9 | 10 | 11 | def evaluate(encoder, k=10, seed=1234, 
evalcv=True, evaltest=False, loc='./data/'): 12 | """ 13 | Run experiment 14 | k: number of CV folds 15 | test: whether to evaluate on test set 16 | """ 17 | print 'Preparing data...' 18 | traintext, testtext = load_data(loc) 19 | train, train_labels = prepare_data(traintext) 20 | test, test_labels = prepare_data(testtext) 21 | train_labels = prepare_labels(train_labels) 22 | test_labels = prepare_labels(test_labels) 23 | train, train_labels = shuffle(train, train_labels, random_state=seed) 24 | 25 | print 'Computing training skipthoughts...' 26 | trainF = encoder.encode(train, verbose=False, use_eos=False) 27 | 28 | if evalcv: 29 | print 'Running cross-validation...' 30 | interval = [2**t for t in range(0,9,1)] # coarse-grained 31 | C = eval_kfold(trainF, train_labels, k=k, scan=interval, seed=seed) 32 | 33 | if evaltest: 34 | if not evalcv: 35 | C = 128 # Best parameter found from CV 36 | 37 | print 'Computing testing skipthoughts...' 38 | testF = encoder.encode(test, verbose=False, use_eos=False) 39 | 40 | print 'Evaluating...' 41 | clf = LogisticRegression(C=C) 42 | clf.fit(trainF, train_labels) 43 | yhat = clf.predict(testF) 44 | print 'Test accuracy: ' + str(clf.score(testF, test_labels)) 45 | 46 | 47 | def load_data(loc='./data/'): 48 | """ 49 | Load the TREC question-type dataset 50 | """ 51 | train, test = [], [] 52 | with open(os.path.join(loc, 'train_5500.label'), 'rb') as f: 53 | for line in f: 54 | train.append(line.strip()) 55 | with open(os.path.join(loc, 'TREC_10.label'), 'rb') as f: 56 | for line in f: 57 | test.append(line.strip()) 58 | return train, test 59 | 60 | 61 | def prepare_data(text): 62 | """ 63 | Prepare data 64 | """ 65 | labels = [t.split()[0] for t in text] 66 | labels = [l.split(':')[0] for l in labels] 67 | X = [t.split()[1:] for t in text] 68 | X = [' '.join(t) for t in X] 69 | return X, labels 70 | 71 | 72 | def prepare_labels(labels): 73 | """ 74 | Process labels to numerical values 75 | """ 76 | d = {} 77 | count = 0 78 | setlabels = set(labels) 79 | for w in setlabels: 80 | d[w] = count 81 | count += 1 82 | idxlabels = np.array([d[w] for w in labels]) 83 | return idxlabels 84 | 85 | 86 | def eval_kfold(features, labels, k=10, scan=[2**t for t in range(0,9,1)], seed=1234): 87 | """ 88 | Perform k-fold cross validation 89 | """ 90 | npts = len(features) 91 | kf = KFold(npts, n_folds=k, random_state=seed) 92 | scores = [] 93 | 94 | for s in scan: 95 | 96 | scanscores = [] 97 | 98 | for train, test in kf: 99 | 100 | # Split data 101 | X_train = features[train] 102 | y_train = labels[train] 103 | X_test = features[test] 104 | y_test = labels[test] 105 | 106 | # Train classifier 107 | clf = LogisticRegression(C=s) 108 | clf.fit(X_train, y_train) 109 | score = clf.score(X_test, y_test) 110 | scanscores.append(score) 111 | print (s, score) 112 | 113 | # Append mean score 114 | scores.append(np.mean(scanscores)) 115 | print scores 116 | 117 | # Get the index of the best score 118 | s_ind = np.argmax(scores) 119 | s = scan[s_ind] 120 | print (s_ind, s) 121 | return s 122 | 123 | -------------------------------------------------------------------------------- /nbsvm.py: -------------------------------------------------------------------------------- 1 | # Naive-Bayes features 2 | # Derived from https://github.com/mesnilgr/nbsvm 3 | 4 | import os 5 | import pdb 6 | import numpy as np 7 | from collections import Counter 8 | from scipy.sparse import lil_matrix 9 | from scipy.sparse import csr_matrix 10 | 11 | 12 | def tokenize(sentence, grams): 13 | words = 
sentence.split() 14 | tokens = [] 15 | for gram in grams: 16 | for i in range(len(words) - gram + 1): 17 | tokens += ["_*_".join(words[i:i+gram])] 18 | return tokens 19 | 20 | 21 | def build_dict(X, grams): 22 | dic = Counter() 23 | for sentence in X: 24 | dic.update(tokenize(sentence, grams)) 25 | return dic 26 | 27 | 28 | def compute_ratio(poscounts, negcounts, alpha=1): 29 | alltokens = list(set(poscounts.keys() + negcounts.keys())) 30 | dic = dict((t, i) for i, t in enumerate(alltokens)) 31 | d = len(dic) 32 | p, q = np.ones(d) * alpha , np.ones(d) * alpha 33 | for t in alltokens: 34 | p[dic[t]] += poscounts[t] 35 | q[dic[t]] += negcounts[t] 36 | p /= abs(p).sum() 37 | q /= abs(q).sum() 38 | r = np.log(p/q) 39 | return dic, r 40 | 41 | 42 | def process_text(text, dic, r, grams): 43 | """ 44 | Return sparse feature matrix 45 | """ 46 | X = lil_matrix((len(text), len(dic))) 47 | for i, l in enumerate(text): 48 | tokens = tokenize(l, grams) 49 | indexes = [] 50 | for t in tokens: 51 | try: 52 | indexes += [dic[t]] 53 | except KeyError: 54 | pass 55 | indexes = list(set(indexes)) 56 | indexes.sort() 57 | for j in indexes: 58 | X[i,j] = r[j] 59 | return csr_matrix(X) 60 | 61 | -------------------------------------------------------------------------------- /skipthoughts.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Skip-thought vectors 3 | ''' 4 | import os 5 | 6 | import theano 7 | import theano.tensor as tensor 8 | 9 | import cPickle as pkl 10 | import numpy 11 | import copy 12 | import nltk 13 | 14 | from collections import OrderedDict, defaultdict 15 | from scipy.linalg import norm 16 | from nltk.tokenize import word_tokenize 17 | 18 | profile = False 19 | 20 | #-----------------------------------------------------------------------------# 21 | # Specify model and table locations here 22 | #-----------------------------------------------------------------------------# 23 | path_to_models = 'C:/Users/mt16558/Documents/Project/NLP Experiments/Text Summarization/' 24 | path_to_tables = 'C:/Users/mt16558/Documents/Project/NLP Experiments/Text Summarization/' 25 | #-----------------------------------------------------------------------------# 26 | 27 | path_to_umodel = path_to_models + 'uni_skip.npz' 28 | path_to_bmodel = path_to_models + 'bi_skip.npz' 29 | 30 | 31 | def load_model(): 32 | """ 33 | Load the model with saved tables 34 | """ 35 | # Load model options 36 | print('Loading model parameters...') 37 | with open('%s.pkl'%path_to_umodel, 'rb') as f: 38 | uoptions = pkl.load(f) 39 | with open('%s.pkl'%path_to_bmodel, 'rb') as f: 40 | boptions = pkl.load(f) 41 | 42 | # Load parameters 43 | uparams = init_params(uoptions) 44 | uparams = load_params(path_to_umodel, uparams) 45 | utparams = init_tparams(uparams) 46 | bparams = init_params_bi(boptions) 47 | bparams = load_params(path_to_bmodel, bparams) 48 | btparams = init_tparams(bparams) 49 | 50 | # Extractor functions 51 | print('Compiling encoders...') 52 | embedding, x_mask, ctxw2v = build_encoder(utparams, uoptions) 53 | f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v') 54 | embedding, x_mask, ctxw2v = build_encoder_bi(btparams, boptions) 55 | f_w2v2 = theano.function([embedding, x_mask], ctxw2v, name='f_w2v2') 56 | 57 | # Tables 58 | print('Loading tables...') 59 | utable, btable = load_tables() 60 | 61 | # Store everything we need in a dictionary 62 | print('Packing up...') 63 | model = {} 64 | model['uoptions'] = uoptions 65 | model['boptions'] = boptions 66 | 
model['utable'] = utable 67 | model['btable'] = btable 68 | model['f_w2v'] = f_w2v 69 | model['f_w2v2'] = f_w2v2 70 | 71 | return model 72 | 73 | 74 | def load_tables(): 75 | """ 76 | Load the tables 77 | """ 78 | words = [] 79 | utable = numpy.load(path_to_tables + 'utable.npy') 80 | btable = numpy.load(path_to_tables + 'btable.npy') 81 | f = open(path_to_tables + 'dictionary.txt', 'rb') 82 | for line in f: 83 | words.append(line.decode('utf-8').strip()) 84 | f.close() 85 | utable = OrderedDict(zip(words, utable)) 86 | btable = OrderedDict(zip(words, btable)) 87 | return utable, btable 88 | 89 | 90 | class Encoder(object): 91 | """ 92 | Sentence encoder. 93 | """ 94 | 95 | def __init__(self, model): 96 | self._model = model 97 | 98 | def encode(self, X, use_norm=True, verbose=True, batch_size=128, use_eos=False): 99 | """ 100 | Encode sentences in the list X. Each entry will return a vector 101 | """ 102 | return encode(self._model, X, use_norm, verbose, batch_size, use_eos) 103 | 104 | 105 | def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False): 106 | """ 107 | Encode sentences in the list X. Each entry will return a vector 108 | """ 109 | # first, do preprocessing 110 | X = preprocess(X) 111 | 112 | # word dictionary and init 113 | d = defaultdict(lambda : 0) 114 | for w in model['utable'].keys(): 115 | d[w] = 1 116 | ufeatures = numpy.zeros((len(X), model['uoptions']['dim']), dtype='float32') 117 | bfeatures = numpy.zeros((len(X), 2 * model['boptions']['dim']), dtype='float32') 118 | 119 | # length dictionary 120 | ds = defaultdict(list) 121 | captions = [s.split() for s in X] 122 | for i,s in enumerate(captions): 123 | ds[len(s)].append(i) 124 | 125 | # Get features. This encodes by length, in order to avoid wasting computation 126 | for k in ds.keys(): 127 | if verbose: 128 | print(k) 129 | numbatches = len(ds[k]) / batch_size + 1 130 | for minibatch in range(numbatches): 131 | caps = ds[k][minibatch::numbatches] 132 | 133 | if use_eos: 134 | uembedding = numpy.zeros((k+1, len(caps), model['uoptions']['dim_word']), dtype='float32') 135 | bembedding = numpy.zeros((k+1, len(caps), model['boptions']['dim_word']), dtype='float32') 136 | else: 137 | uembedding = numpy.zeros((k, len(caps), model['uoptions']['dim_word']), dtype='float32') 138 | bembedding = numpy.zeros((k, len(caps), model['boptions']['dim_word']), dtype='float32') 139 | for ind, c in enumerate(caps): 140 | caption = captions[c] 141 | for j in range(len(caption)): 142 | if d[caption[j]] > 0: 143 | uembedding[j,ind] = model['utable'][caption[j]] 144 | bembedding[j,ind] = model['btable'][caption[j]] 145 | else: 146 | uembedding[j,ind] = model['utable']['UNK'] 147 | bembedding[j,ind] = model['btable']['UNK'] 148 | if use_eos: 149 | uembedding[-1,ind] = model['utable']['<eos>'] 150 | bembedding[-1,ind] = model['btable']['<eos>'] 151 | if use_eos: 152 | uff = model['f_w2v'](uembedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32')) 153 | bff = model['f_w2v2'](bembedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32')) 154 | else: 155 | uff = model['f_w2v'](uembedding, numpy.ones((len(caption),len(caps)), dtype='float32')) 156 | bff = model['f_w2v2'](bembedding, numpy.ones((len(caption),len(caps)), dtype='float32')) 157 | if use_norm: 158 | for j in range(len(uff)): 159 | uff[j] /= norm(uff[j]) 160 | bff[j] /= norm(bff[j]) 161 | for ind, c in enumerate(caps): 162 | ufeatures[c] = uff[ind] 163 | bfeatures[c] = bff[ind] 164 | 165 | features = numpy.c_[ufeatures, bfeatures] 166 | return features
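# --- Illustrative usage sketch, not part of the original skipthoughts.py ---
# It only demonstrates the contract of encode(): one row per input sentence,
# the uni-skip vector concatenated with the bidirectional vector (4800
# dimensions total for the released combine-skip models), and, when
# use_norm=True, each half normalized to unit length, so the dot product of
# two rows is the sum of the uni-skip and bi-skip cosine similarities.
# The helper name _encode_demo, the example sentences, and the 2400/2400
# split are assumptions based on the released models.
def _encode_demo():
    # Requires the pretrained model and table files to be downloaded first.
    model = load_model()
    vecs = encode(model, ['The cat sat on the mat .', 'A dog slept on the rug .'], verbose=False)
    print(vecs.shape)                   # (2, 4800) for the released combine-skip models
    print(norm(vecs[0][:2400]))         # ~1.0: the uni-skip half is unit length when use_norm=True
    print(norm(vecs[0][2400:]))         # ~1.0: so is the bi-skip half
    print(numpy.dot(vecs[0], vecs[1]))  # sum of the two halves' cosine similarities, a usable similarity score
# --- end of illustrative sketch ---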
167 | 168 | 169 | def preprocess(text): 170 | """ 171 | Preprocess text for encoder 172 | """ 173 | X = [] 174 | sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') 175 | for t in text: 176 | sents = sent_detector.tokenize(t) 177 | result = '' 178 | for s in sents: 179 | tokens = word_tokenize(s) 180 | result += ' ' + ' '.join(tokens) 181 | X.append(result) 182 | return X 183 | 184 | 185 | def nn(model, text, vectors, query, k=5): 186 | """ 187 | Return the nearest neighbour sentences to query 188 | text: list of sentences 189 | vectors: the corresponding representations for text 190 | query: a string to search 191 | """ 192 | qf = encode(model, [query]) 193 | qf /= norm(qf) 194 | scores = numpy.dot(qf, vectors.T).flatten() 195 | sorted_args = numpy.argsort(scores)[::-1] 196 | sentences = [text[a] for a in sorted_args[:k]] 197 | print('QUERY: ' + query) 198 | print('NEAREST: ') 199 | for i, s in enumerate(sentences): 200 | print(s, sorted_args[i]) 201 | 202 | 203 | def word_features(table): 204 | """ 205 | Extract word features into a normalized matrix 206 | """ 207 | features = numpy.zeros((len(table), 620), dtype='float32') 208 | keys = table.keys() 209 | for i in range(len(table)): 210 | f = table[keys[i]] 211 | features[i] = f / norm(f) 212 | return features 213 | 214 | 215 | def nn_words(table, wordvecs, query, k=10): 216 | """ 217 | Get the nearest neighbour words 218 | """ 219 | keys = table.keys() 220 | qf = table[query] 221 | scores = numpy.dot(qf, wordvecs.T).flatten() 222 | sorted_args = numpy.argsort(scores)[::-1] 223 | words = [keys[a] for a in sorted_args[:k]] 224 | print('QUERY: ' + query) 225 | print('NEAREST: ') 226 | for i, w in enumerate(words): 227 | print(w) 228 | 229 | 230 | def _p(pp, name): 231 | """ 232 | make prefix-appended name 233 | """ 234 | return '%s_%s'%(pp, name) 235 | 236 | 237 | def init_tparams(params): 238 | """ 239 | initialize Theano shared variables according to the initial parameters 240 | """ 241 | tparams = OrderedDict() 242 | for kk, pp in params.iteritems(): 243 | tparams[kk] = theano.shared(params[kk], name=kk) 244 | return tparams 245 | 246 | 247 | def load_params(path, params): 248 | """ 249 | load parameters 250 | """ 251 | pp = numpy.load(path) 252 | for kk, vv in params.iteritems(): 253 | if kk not in pp: 254 | warnings.warn('%s is not in the archive'%kk) 255 | continue 256 | params[kk] = pp[kk] 257 | return params 258 | 259 | 260 | # layers: 'name': ('parameter initializer', 'feedforward') 261 | layers = {'gru': ('param_init_gru', 'gru_layer')} 262 | 263 | def get_layer(name): 264 | fns = layers[name] 265 | return (eval(fns[0]), eval(fns[1])) 266 | 267 | 268 | def init_params(options): 269 | """ 270 | initialize all parameters needed for the encoder 271 | """ 272 | params = OrderedDict() 273 | 274 | # embedding 275 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 276 | 277 | # encoder: GRU 278 | params = get_layer(options['encoder'])[0](options, params, prefix='encoder', 279 | nin=options['dim_word'], dim=options['dim']) 280 | return params 281 | 282 | 283 | def init_params_bi(options): 284 | """ 285 | initialize all paramters needed for bidirectional encoder 286 | """ 287 | params = OrderedDict() 288 | 289 | # embedding 290 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 291 | 292 | # encoder: GRU 293 | params = get_layer(options['encoder'])[0](options, params, prefix='encoder', 294 | nin=options['dim_word'], dim=options['dim']) 295 | params = 
get_layer(options['encoder'])[0](options, params, prefix='encoder_r', 296 | nin=options['dim_word'], dim=options['dim']) 297 | return params 298 | 299 | 300 | def build_encoder(tparams, options): 301 | """ 302 | build an encoder, given pre-computed word embeddings 303 | """ 304 | # word embedding (source) 305 | embedding = tensor.tensor3('embedding', dtype='float32') 306 | x_mask = tensor.matrix('x_mask', dtype='float32') 307 | 308 | # encoder 309 | proj = get_layer(options['encoder'])[1](tparams, embedding, options, 310 | prefix='encoder', 311 | mask=x_mask) 312 | ctx = proj[0][-1] 313 | 314 | return embedding, x_mask, ctx 315 | 316 | 317 | def build_encoder_bi(tparams, options): 318 | """ 319 | build bidirectional encoder, given pre-computed word embeddings 320 | """ 321 | # word embedding (source) 322 | embedding = tensor.tensor3('embedding', dtype='float32') 323 | embeddingr = embedding[::-1] 324 | x_mask = tensor.matrix('x_mask', dtype='float32') 325 | xr_mask = x_mask[::-1] 326 | 327 | # encoder 328 | proj = get_layer(options['encoder'])[1](tparams, embedding, options, 329 | prefix='encoder', 330 | mask=x_mask) 331 | projr = get_layer(options['encoder'])[1](tparams, embeddingr, options, 332 | prefix='encoder_r', 333 | mask=xr_mask) 334 | 335 | ctx = tensor.concatenate([proj[0][-1], projr[0][-1]], axis=1) 336 | 337 | return embedding, x_mask, ctx 338 | 339 | 340 | # some utilities 341 | def ortho_weight(ndim): 342 | W = numpy.random.randn(ndim, ndim) 343 | u, s, v = numpy.linalg.svd(W) 344 | return u.astype('float32') 345 | 346 | 347 | def norm_weight(nin,nout=None, scale=0.1, ortho=True): 348 | if nout == None: 349 | nout = nin 350 | if nout == nin and ortho: 351 | W = ortho_weight(nin) 352 | else: 353 | W = numpy.random.uniform(low=-scale, high=scale, size=(nin, nout)) 354 | return W.astype('float32') 355 | 356 | 357 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None): 358 | """ 359 | parameter init for GRU 360 | """ 361 | if nin == None: 362 | nin = options['dim_proj'] 363 | if dim == None: 364 | dim = options['dim_proj'] 365 | W = numpy.concatenate([norm_weight(nin,dim), 366 | norm_weight(nin,dim)], axis=1) 367 | params[_p(prefix,'W')] = W 368 | params[_p(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32') 369 | U = numpy.concatenate([ortho_weight(dim), 370 | ortho_weight(dim)], axis=1) 371 | params[_p(prefix,'U')] = U 372 | 373 | Wx = norm_weight(nin, dim) 374 | params[_p(prefix,'Wx')] = Wx 375 | Ux = ortho_weight(dim) 376 | params[_p(prefix,'Ux')] = Ux 377 | params[_p(prefix,'bx')] = numpy.zeros((dim,)).astype('float32') 378 | 379 | return params 380 | 381 | 382 | def gru_layer(tparams, state_below, options, prefix='gru', mask=None, **kwargs): 383 | """ 384 | Forward pass through GRU layer 385 | """ 386 | nsteps = state_below.shape[0] 387 | if state_below.ndim == 3: 388 | n_samples = state_below.shape[1] 389 | else: 390 | n_samples = 1 391 | 392 | dim = tparams[_p(prefix,'Ux')].shape[1] 393 | 394 | if mask == None: 395 | mask = tensor.alloc(1., state_below.shape[0], 1) 396 | 397 | def _slice(_x, n, dim): 398 | if _x.ndim == 3: 399 | return _x[:, :, n*dim:(n+1)*dim] 400 | return _x[:, n*dim:(n+1)*dim] 401 | 402 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')] 403 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')] 404 | U = tparams[_p(prefix, 'U')] 405 | Ux = tparams[_p(prefix, 'Ux')] 406 | 407 | def _step_slice(m_, x_, xx_, h_, U, Ux): 408 | preact = tensor.dot(h_, 
U) 409 | preact += x_ 410 | 411 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 412 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 413 | 414 | preactx = tensor.dot(h_, Ux) 415 | preactx = preactx * r 416 | preactx = preactx + xx_ 417 | 418 | h = tensor.tanh(preactx) 419 | 420 | h = u * h_ + (1. - u) * h 421 | h = m_[:,None] * h + (1. - m_)[:,None] * h_ 422 | 423 | return h 424 | 425 | seqs = [mask, state_below_, state_belowx] 426 | _step = _step_slice 427 | 428 | rval, updates = theano.scan(_step, 429 | sequences=seqs, 430 | outputs_info = [tensor.alloc(0., n_samples, dim)], 431 | non_sequences = [tparams[_p(prefix, 'U')], 432 | tparams[_p(prefix, 'Ux')]], 433 | name=_p(prefix, '_layers'), 434 | n_steps=nsteps, 435 | profile=profile, 436 | strict=True) 437 | rval = [rval] 438 | return rval 439 | 440 | 441 | -------------------------------------------------------------------------------- /skipthoughts.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/madhavthaker/text_summarization/7e0c7cc83502a4ce463a975a6e769b68f9b35bfd/skipthoughts.pyc --------------------------------------------------------------------------------
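A note on wiring the modules together: the short sketch below is one plausible way to use skipthoughts.py for the extractive summarization explored in the notebook. It assumes the pretrained model and table files from https://github.com/ryankiros/skip-thoughts/ have been downloaded to the paths configured at the top of skipthoughts.py; the variable document_text (the raw article to summarize) and the top-3 sentence selection are illustrative assumptions, not code from this repository.

import nltk
import numpy
import skipthoughts

model = skipthoughts.load_model()                    # load uni-skip and bi-skip parameters plus lookup tables
encoder = skipthoughts.Encoder(model)

sentences = nltk.sent_tokenize(document_text)        # document_text: raw article text, assumed to be defined
vectors = encoder.encode(sentences, verbose=False)   # one 4800-d vector per sentence (uni-skip + bi-skip halves)

# With use_norm=True each half is unit length, so the dot product of two rows is the
# sum of the uni-skip and bi-skip cosine similarities; rank sentences by how similar
# they are, on average, to the rest of the document and keep the most central ones.
centrality = numpy.dot(vectors, vectors.T).mean(axis=1)
top = sorted(numpy.argsort(centrality)[::-1][:3])    # three most central sentences, in original order
print(' '.join(sentences[i] for i in top))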