├── README.md ├── Text_Summarization.ipynb ├── dataset_handler.py ├── eval_classification.py ├── eval_msrp.py ├── eval_rank.py ├── eval_sick.py ├── eval_trec.py ├── nbsvm.py ├── skipthoughts.py └── skipthoughts.pyc /README.md: -------------------------------------------------------------------------------- 1 | # text_summarization 2 | Notebook which provides an overview to several text summarization techniques 3 | 4 | Before running the skipthoughts module, download the dependencies from the source repository: 5 | https://github.com/ryankiros/skip-thoughts/ 6 | -------------------------------------------------------------------------------- /Text_Summarization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import nltk\n", 14 | "from nltk.corpus import stopwords\n", 15 | "from nltk.tokenize import word_tokenize, sent_tokenize\n", 16 | "from nltk.stem.porter import *\n", 17 | "import re" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "passage = \"\"\"\n", 29 | "If Cristiano Ronaldo didn't exist, would Lionel Messi have to invent him?\n", 30 | "\n", 31 | "The question of how much these two other-worldly players inspire each other is an interesting one,\n", 32 | "and it's tempting to imagine Messi sitting at home on Tuesday night, watching Ronaldo destroying Atletico, \n", 33 | "angrily glaring at the TV screen and growling: \"Right, I'll show him!\"\n", 34 | "\n", 35 | "As appealing as that picture might be, however, it is probably a false one - from Messi's perspective, at least.\n", 36 | "\n", 37 | "He might show it in a different way, but Messi is just as competitive as Ronaldo. Rather than goals and \n", 38 | "personal glory, however, the Argentine's personal drug is trophies.\n", 39 | "\n", 40 | "Ronaldo, it can be said, never looks happy on the field of play unless he's just scored a goal - and even \n", 41 | "then he's not happy for long, because he just wants to score another one. And that relentless obsession with \n", 42 | "finding the back of the net has undoubtedly played a major role in his stunning career achievements.\n", 43 | "\n", 44 | "Messi, though, is a different animal, shown by the generosity with which he sets up team-mates even if he has \n", 45 | "a chance to shoot, regularly hands over penalty-taking duties to others and invariably celebrates a goal by turning \n", 46 | "straight to the player who passed him the ball with an appreciative smile.\n", 47 | "\n", 48 | "Rather than being a better player than Ronaldo, Messi's main motivations - according to the people who are close to\n", 49 | "him - are being the best possible version of Lionel Messi, and winning as many trophies as possible.\n", 50 | "\n", 51 | "That theory was supported by Leicester boss Brendan Rodgers when I interviewed him for a book I recently wrote about Messi.\n", 52 | "\n", 53 | "Do Messi and Ronaldo inspire each other? \"Maybe subconsciously in some way they've driven each other on,\" said Rodgers.\n", 54 | "\"But I think both those players inherently have that hunger to be the best players they can be. 
With the very elite \n", 55 | "performers, that drive comes from within.\"\n", 56 | "\n", 57 | "Messi and Ronaldo ferociously competing with each other for everyone else's acclaim is a nice story for fans to debate \n", 58 | "and the media to spread, but it's probably not particularly true.\n", 59 | "\"\"\"" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "### Text Standardization" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "contractions = { \n", 78 | "\"ain't\": \"am not / are not / is not / has not / have not\",\n", 79 | "\"aren't\": \"are not / am not\",\n", 80 | "\"can't\": \"cannot\",\n", 81 | "\"can't've\": \"cannot have\",\n", 82 | "\"'cause\": \"because\",\n", 83 | "\"could've\": \"could have\",\n", 84 | "\"couldn't\": \"could not\",\n", 85 | "\"couldn't've\": \"could not have\",\n", 86 | "\"didn't\": \"did not\",\n", 87 | "\"doesn't\": \"does not\",\n", 88 | "\"don't\": \"do not\",\n", 89 | "\"hadn't\": \"had not\",\n", 90 | "\"hadn't've\": \"had not have\",\n", 91 | "\"hasn't\": \"has not\",\n", 92 | "\"haven't\": \"have not\",\n", 93 | "\"he'd\": \"he had / he would\",\n", 94 | "\"he'd've\": \"he would have\",\n", 95 | "\"he'll\": \"he shall / he will\",\n", 96 | "\"he'll've\": \"he shall have / he will have\",\n", 97 | "\"he's\": \"he has / he is\",\n", 98 | "\"how'd\": \"how did\",\n", 99 | "\"how'd'y\": \"how do you\",\n", 100 | "\"how'll\": \"how will\",\n", 101 | "\"how's\": \"how has / how is / how does\",\n", 102 | "\"I'd\": \"I had / I would\",\n", 103 | "\"I'd've\": \"I would have\",\n", 104 | "\"I'll\": \"I shall / I will\",\n", 105 | "\"I'll've\": \"I shall have / I will have\",\n", 106 | "\"I'm\": \"I am\",\n", 107 | "\"I've\": \"I have\",\n", 108 | "\"isn't\": \"is not\",\n", 109 | "\"it'd\": \"it had / it would\",\n", 110 | "\"it'd've\": \"it would have\",\n", 111 | "\"it'll\": \"it shall / it will\",\n", 112 | "\"it'll've\": \"it shall have / it will have\",\n", 113 | "\"it's\": \"it has / it is\",\n", 114 | "\"let's\": \"let us\",\n", 115 | "\"ma'am\": \"madam\",\n", 116 | "\"mayn't\": \"may not\",\n", 117 | "\"might've\": \"might have\",\n", 118 | "\"mightn't\": \"might not\",\n", 119 | "\"mightn't've\": \"might not have\",\n", 120 | "\"must've\": \"must have\",\n", 121 | "\"mustn't\": \"must not\",\n", 122 | "\"mustn't've\": \"must not have\",\n", 123 | "\"needn't\": \"need not\",\n", 124 | "\"needn't've\": \"need not have\",\n", 125 | "\"o'clock\": \"of the clock\",\n", 126 | "\"oughtn't\": \"ought not\",\n", 127 | "\"oughtn't've\": \"ought not have\",\n", 128 | "\"shan't\": \"shall not\",\n", 129 | "\"sha'n't\": \"shall not\",\n", 130 | "\"shan't've\": \"shall not have\",\n", 131 | "\"she'd\": \"she had / she would\",\n", 132 | "\"she'd've\": \"she would have\",\n", 133 | "\"she'll\": \"she shall / she will\",\n", 134 | "\"she'll've\": \"she shall have / she will have\",\n", 135 | "\"she's\": \"she has / she is\",\n", 136 | "\"should've\": \"should have\",\n", 137 | "\"shouldn't\": \"should not\",\n", 138 | "\"shouldn't've\": \"should not have\",\n", 139 | "\"so've\": \"so have\",\n", 140 | "\"so's\": \"so as / so is\",\n", 141 | "\"that'd\": \"that would / that had\",\n", 142 | "\"that'd've\": \"that would have\",\n", 143 | "\"that's\": \"that has / that is\",\n", 144 | "\"there'd\": \"there had / there would\",\n", 145 | "\"there'd've\": \"there would have\",\n", 146 | "\"there's\": 
\"there has / there is\",\n", 147 | "\"they'd\": \"they had / they would\",\n", 148 | "\"they'd've\": \"they would have\",\n", 149 | "\"they'll\": \"they shall / they will\",\n", 150 | "\"they'll've\": \"they shall have / they will have\",\n", 151 | "\"they're\": \"they are\",\n", 152 | "\"they've\": \"they have\",\n", 153 | "\"to've\": \"to have\",\n", 154 | "\"wasn't\": \"was not\",\n", 155 | "\"we'd\": \"we had / we would\",\n", 156 | "\"we'd've\": \"we would have\",\n", 157 | "\"we'll\": \"we will\",\n", 158 | "\"we'll've\": \"we will have\",\n", 159 | "\"we're\": \"we are\",\n", 160 | "\"we've\": \"we have\",\n", 161 | "\"weren't\": \"were not\",\n", 162 | "\"what'll\": \"what shall / what will\",\n", 163 | "\"what'll've\": \"what shall have / what will have\",\n", 164 | "\"what're\": \"what are\",\n", 165 | "\"what's\": \"what has / what is\",\n", 166 | "\"what've\": \"what have\",\n", 167 | "\"when's\": \"when has / when is\",\n", 168 | "\"when've\": \"when have\",\n", 169 | "\"where'd\": \"where did\",\n", 170 | "\"where's\": \"where has / where is\",\n", 171 | "\"where've\": \"where have\",\n", 172 | "\"who'll\": \"who shall / who will\",\n", 173 | "\"who'll've\": \"who shall have / who will have\",\n", 174 | "\"who's\": \"who has / who is\",\n", 175 | "\"who've\": \"who have\",\n", 176 | "\"why's\": \"why has / why is\",\n", 177 | "\"why've\": \"why have\",\n", 178 | "\"will've\": \"will have\",\n", 179 | "\"won't\": \"will not\",\n", 180 | "\"won't've\": \"will not have\",\n", 181 | "\"would've\": \"would have\",\n", 182 | "\"wouldn't\": \"would not\",\n", 183 | "\"wouldn't've\": \"would not have\",\n", 184 | "\"y'all\": \"you all\",\n", 185 | "\"y'all'd\": \"you all would\",\n", 186 | "\"y'all'd've\": \"you all would have\",\n", 187 | "\"y'all're\": \"you all are\",\n", 188 | "\"y'all've\": \"you all have\",\n", 189 | "\"you'd\": \"you had / you would\",\n", 190 | "\"you'd've\": \"you would have\",\n", 191 | "\"you'll\": \"you shall / you will\",\n", 192 | "\"you'll've\": \"you shall have / you will have\",\n", 193 | "\"you're\": \"you are\",\n", 194 | "\"you've\": \"you have\"\n", 195 | "}" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 4, 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "contractions_re = re.compile('(%s)' % '|'.join(contractions.keys()))\n", 207 | "def expand_contractions(s, contractions_dict=contractions):\n", 208 | " def replace(match):\n", 209 | " return contractions_dict[match.group(0)]\n", 210 | " return contractions_re.sub(replace, s)\n", 211 | " \n", 212 | "sentences = sent_tokenize(passage) \n", 213 | "sentences = [expand_contractions(i) for i in sentences]\n", 214 | "sentences = [re.sub('\\n', '', i) for i in sentences]" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "Text Summarization is an increasingly popular area within NLP and with the advancements in moderns deep learning, we are consistently seeing newer, more novel approaches. The goal of this article is to compare the results of a few approaches that I found interesting:\n", 222 | "1. Sentence Scoring based on Word Frequency\n", 223 | "2. TextRank using Universal Sentence Encoder\n", 224 | "3. 
Unsupervised Learning using Skip-Thought Vectors\n", 225 | "\n", 226 | "Before moving forward, I wanted to give credit to the outstanding Medium authors/articles who are the foundation for this post and helped me learn/implement the Text Summarization techniques below:\n", 227 | "1. https://becominghuman.ai/text-summarization-in-5-steps-using-nltk-65b21e352b65\n", 228 | "2. https://medium.com/jatana/unsupervised-text-summarization-using-sentence-embeddings-adb15ce83db1\n", 229 | "3. https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/\n", 230 | "\n", 231 | "Some of the code snippets they've provided will be shown here as well, but I encourage you to read through their posts too!" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "### Sentence Scoring based on Word Frequency (Python 2.7/3.5)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "The first approach we will explore is the simplest of the three. Here we assign a weight to each word based on its frequency in the passage. For example, if \"Soccer\" occurs 4 times within the passage, it will have a weight of 4. " 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 39, 251 | "metadata": { 252 | "collapsed": true 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "def create_freq_table(text_string):\n", 257 | "    stopwords_list = set(stopwords.words('english'))\n", 258 | "    \n", 259 | "    words = word_tokenize(text_string)\n", 260 | "    \n", 261 | "    ps = PorterStemmer()\n", 262 | "    \n", 263 | "    freq_table = {}\n", 264 | "    \n", 265 | "    for word in words:\n", 266 | "        #stem word\n", 267 | "        word = ps.stem(word)\n", 268 | "        \n", 269 | "        #remove stopwords\n", 270 | "        if word in stopwords_list:\n", 271 | "            continue\n", 272 | "        elif word in freq_table:\n", 273 | "            freq_table[word] += 1\n", 274 | "        else:\n", 275 | "            freq_table[word] = 1\n", 276 | "    \n", 277 | "    return freq_table\n", 278 | "\n", 279 | "freq_table = create_freq_table(\" \".join(sentences))" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "Using the weights assigned to each word above, we will create a score for each sentence and then take the top `N` sentences by score for the summary. As you'd imagine, if we rely on the raw score alone, longer sentences will skew the results simply because they contain more words. This is why we will normalize the scores by dividing each one by the length of its sentence. 
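To make the scoring and normalization concrete, here is a minimal, self-contained sketch on a toy passage. The sentences and numbers below are made up purely for illustration, and stemming/stopword removal are skipped so the arithmetic is easy to follow; the real implementation is the `create_freq_table`/`score_sentences` pair in the surrounding cells.

```python
from collections import Counter

# Toy passage: three made-up sentences (not from the article above)
toy_sentences = [
    "Soccer fans love a great soccer match",
    "The match ended late",
    "Fans celebrated the win all night long after the match",
]

# Word-frequency table over the whole toy passage
freq = Counter(w.lower() for s in toy_sentences for w in s.split())

# Score each sentence as the sum of its word frequencies,
# then normalize by the number of words in the sentence
for s in toy_sentences:
    words = s.lower().split()
    raw = sum(freq[w] for w in words)
    print("{} -> raw {}, normalized {:.2f}".format(s, raw, raw / float(len(words))))
```

Without the division, the longest sentence wins simply because it contains more words; with it, the short sentence packed with frequent terms comes out on top.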
" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 5, 292 | "metadata": { 293 | "collapsed": true 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "def score_sentences(sentences, freq_table):\n", 298 | " \n", 299 | " sentence_value = {}\n", 300 | " \n", 301 | " for sentence in sentences:\n", 302 | " word_count_in_sentence = len(word_tokenize(sentence))\n", 303 | " \n", 304 | " for wordValue in freq_table:\n", 305 | " \n", 306 | " if wordValue.lower() in sentence.lower(): \n", 307 | " if sentence in sentence_value:\n", 308 | " sentence_value[sentence] += freq_table[wordValue]\n", 309 | " else:\n", 310 | " sentence_value[sentence] = freq_table[wordValue]\n", 311 | "\n", 312 | " sentence_value[sentence] = sentence_value[sentence] // word_count_in_sentence\n", 313 | " return sentence_value\n", 314 | "\n", 315 | "def find_average_score(sentence_value):\n", 316 | " sum_values = 0\n", 317 | " \n", 318 | " for entry in sentence_value:\n", 319 | " sum_values += sentence_value[entry]\n", 320 | " \n", 321 | " average = int(sum_values/len(sentence_value))\n", 322 | " \n", 323 | " return average" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "Now, to create the summary, we will take any sentence that has a score that exceeds a threshold. In this case, the threshold will be the average score for for all of the sentences. " 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 7, 336 | "metadata": { 337 | "collapsed": true 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "def generate_summary(sentences, sentence_value, threshold):\n", 342 | " sentence_count = 0\n", 343 | " \n", 344 | " summary = ''\n", 345 | " \n", 346 | " for sentence in sentences:\n", 347 | " if sentence in sentence_value and sentence_value[sentence] > threshold:\n", 348 | " summary += \" \" + sentence\n", 349 | " sentence_count += 1\n", 350 | " \n", 351 | " return summary \n", 352 | " \n", 353 | " " 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 41, 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | " If Cristiano Ronaldo didn't exist, would Lionel Messi have to invent him? As appealing as that picture might be, however, it is probably a false one - from Messi's perspective, at least. He might show it in a different way, but Messi is just as competitive as Ronaldo. Rather than goals and personal glory, however, the Argentine's personal drug is trophies. Do Messi and Ronaldo inspire each other? \"Maybe subconsciously in some way they've driven each other on,\" said Rodgers. 
With the very elite performers, that drive comes from within.\"\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "#End to End Run\n", 371 | "freq_table = create_freq_table(\" \".join(sentences))\n", 372 | "\n", 373 | "sentence_scores = score_sentences(sentences, freq_table)\n", 374 | "\n", 375 | "threshold = find_average_score(sentence_scores)\n", 376 | "\n", 377 | "summary = generate_summary(sentences, sentence_scores, 1.0 * threshold)\n", 378 | "\n", 379 | "print(re.sub('\\n','',summary))" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": {}, 385 | "source": [ 386 | "### TextRank using Universal Sentence Embeddings (Python 3.7)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "Next we look at the summaries generated using universal sentence embeddings and TextRank. Before we jump into the code, let's discuss a few concepts that are critical.\n", 394 | "\n", 395 | "**TextRank**\n", 396 | "This may sound familiar: it is essentially a derivative of the famous PageRank algorithm created by the Google cofounders. In PageRank, a matrix is built that captures the probability of a user moving from one page to another. In the case of TextRank, we instead build a cosine similarity matrix that holds the similarity between every pair of sentences.\n", 397 | "\n", 398 | "A graph is then built from this cosine similarity matrix, the PageRank algorithm is applied to it, and a score is calculated for each sentence (a toy sketch of this step follows below). For more information on the PageRank algorithm, please use the following resource [pagerank link]\n", 399 | "\n", 400 | "**Universal Sentence Embeddings**\n", 401 | "Without going into too much detail, universal sentence embeddings encode words, sentences and paragraphs into semantic vectors. The encoder comes in two variants, one based on a Transformer and one based on a Deep Averaging Network (DAN). 
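As promised above, here is a small illustrative sketch of the graph-and-rank step. The 3x3 similarity matrix is entirely made up; in the notebook the real matrix is computed from the Universal Sentence Encoder embeddings a couple of cells below.

```python
import numpy as np
import networkx as nx

# Hypothetical cosine-similarity matrix for three sentences:
# sentences 0 and 1 are very similar, sentence 2 is an outlier
sim = np.array([
    [1.00, 0.80, 0.10],
    [0.80, 1.00, 0.15],
    [0.10, 0.15, 1.00],
])

# Nodes are sentences, edge weights are similarities; PageRank then favours
# sentences that are strongly connected to (i.e. similar to) many others
graph = nx.from_numpy_array(sim)
scores = nx.pagerank(graph)
print(scores)  # the outlier sentence (index 2) should receive the lowest score
```

The highest-scoring sentences are then taken as the summary, exactly as in the TextRank cell further down.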
More details can be found here:\n", 402 | "\n", 403 | "https://tfhub.dev/google/universal-sentence-encoder/1" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 12, 409 | "metadata": { 410 | "scrolled": true 411 | }, 412 | "outputs": [ 413 | { 414 | "name": "stdout", 415 | "output_type": "stream", 416 | "text": [ 417 | "INFO:tensorflow:Using C:\\Temp\\tfhub_modules to cache modules.\n", 418 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_0:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_0\n", 419 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_1:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_1\n", 420 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_10:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_10\n", 421 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_11:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_11\n", 422 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_12:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_12\n", 423 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_13:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_13\n", 424 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_14:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_14\n", 425 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_15:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_15\n", 426 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_16:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_16\n", 427 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_2:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_2\n", 428 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_3:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_3\n", 429 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_4:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_4\n", 430 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_5:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_5\n", 431 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_6:0 from checkpoint 
b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_6\n", 432 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_7:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_7\n", 433 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_8:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_8\n", 434 | "INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_9:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Embeddings_en/sharded_9\n", 435 | "INFO:tensorflow:Initialize variable module/Encoder_en/DNN/ResidualHidden_0/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Encoder_en/DNN/ResidualHidden_0/weights\n", 436 | "INFO:tensorflow:Initialize variable module/Encoder_en/DNN/ResidualHidden_1/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Encoder_en/DNN/ResidualHidden_1/weights\n", 437 | "INFO:tensorflow:Initialize variable module/Encoder_en/DNN/ResidualHidden_2/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Encoder_en/DNN/ResidualHidden_2/weights\n", 438 | "INFO:tensorflow:Initialize variable module/Encoder_en/DNN/ResidualHidden_3/projection:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Encoder_en/DNN/ResidualHidden_3/projection\n", 439 | "INFO:tensorflow:Initialize variable module/Encoder_en/DNN/ResidualHidden_3/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with Encoder_en/DNN/ResidualHidden_3/weights\n", 440 | "INFO:tensorflow:Initialize variable module/SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_0/bias:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_0/bias\n", 441 | "INFO:tensorflow:Initialize variable module/SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_0/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_0/weights\n", 442 | "INFO:tensorflow:Initialize variable module/SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_1/bias:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_1/bias\n", 443 | "INFO:tensorflow:Initialize variable module/SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_1/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_1/weights\n", 444 | "INFO:tensorflow:Initialize variable module/SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_2/bias:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with 
SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_2/bias\n", 445 | "INFO:tensorflow:Initialize variable module/SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_2/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SHARED_RANK_ANSWER/response_encoder_0/tanh_layer_2/weights\n", 446 | "INFO:tensorflow:Initialize variable module/SNLI/Classifier/LinearLayer/bias:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SNLI/Classifier/LinearLayer/bias\n", 447 | "INFO:tensorflow:Initialize variable module/SNLI/Classifier/LinearLayer/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SNLI/Classifier/LinearLayer/weights\n", 448 | "INFO:tensorflow:Initialize variable module/SNLI/Classifier/tanh_layer_0/bias:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SNLI/Classifier/tanh_layer_0/bias\n", 449 | "INFO:tensorflow:Initialize variable module/SNLI/Classifier/tanh_layer_0/weights:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with SNLI/Classifier/tanh_layer_0/weights\n", 450 | "INFO:tensorflow:Initialize variable module/global_step:0 from checkpoint b'C:\\\\Temp\\\\tfhub_modules\\\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\\\variables\\\\variables' with global_step\n" 451 | ] 452 | } 453 | ], 454 | "source": [ 455 | "import tensorflow_hub as hub\n", 456 | "import tensorflow as tf\n", 457 | "\n", 458 | "module_url = \"https://tfhub.dev/google/universal-sentence-encoder/2\"\n", 459 | "\n", 460 | "embed = hub.Module(module_url)\n", 461 | "\n", 462 | "# Reduce logging output.\n", 463 | "tf.logging.set_verbosity(tf.logging.ERROR)\n", 464 | "\n", 465 | "with tf.Session() as session:\n", 466 | " session.run([tf.global_variables_initializer(), tf.tables_initializer()])\n", 467 | " message_embeddings = session.run(embed(sentences))" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 16, 473 | "metadata": {}, 474 | "outputs": [ 475 | { 476 | "data": { 477 | "text/plain": [ 478 | "\"Rather than being a better player than Ronaldo, Messi's main motivations - according to the people who are close tohim - are being the best possible version of Lionel Messi, and winning as many trophies as possible. He might show it in a different way, but Messi is just as competitive as Ronaldo. Messi and Ronaldo ferociously competing with each other for everyone else's acclaim is a nice story for fans to debate and the media to spread, but it has / it is probably not particularly true. Do Messi and Ronaldo inspire each other? 
Ronaldo, it can be said, never looks happy on the field of play unless he has / he is just scored a goal - and even then he has / he is not happy for long, because he just wants to score another one.\"" 479 | ] 480 | }, 481 | "execution_count": 16, 482 | "metadata": {}, 483 | "output_type": "execute_result" 484 | } 485 | ], 486 | "source": [ 487 | "from sklearn.metrics.pairwise import cosine_similarity\n", 488 | "import networkx as nx\n", 489 | "\n", 490 | "#generate cosine similarity matrix\n", 491 | "sim_matrix = cosine_similarity(message_embeddings)\n", 492 | "\n", 493 | "#create graph and generate scores from the pagerank algorithm\n", 494 | "nx_graph = nx.from_numpy_array(sim_matrix)\n", 495 | "scores = nx.pagerank(nx_graph)\n", 496 | "\n", 497 | "ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)\n", 498 | "\n", 499 | "num_of_sentences = 5\n", 500 | "\n", 501 | "summary = \" \".join([i[1] for i in ranked_sentences[:num_of_sentences]])\n", 502 | "summary" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": {}, 508 | "source": [ 509 | "### Unsupervised Learning using Skip-Thought Vectors (Python 2.7)" 510 | ] 511 | }, 512 | { 513 | "cell_type": "markdown", 514 | "metadata": {}, 515 | "source": [ 516 | "Now this, in my opinion, is the newest and most novel approach we've discussed here. The high-level approach is as follows:\n", 517 | "\n", 518 | "Text Cleaning -> Encoder/Decoder -> K-Means Clustering -> Extract Sentences Closest to Cluster Centers\n", 519 | "\n", 520 | "Again, there are two main concepts I want to discuss before jumping into the solution:\n", 521 | "\n", 522 | "**Skip-Thought Vectors**\n", 523 | "\n", 524 | "Here, we use an encoder/decoder framework to generate feature vectors. Taking it from Kushal Chauhan's post, here is how the encoder and decoder layers are defined:\n", 525 | "1. Encoder Network: The encoder is typically a GRU-RNN which generates a fixed-length vector representation h(i) for each sentence S(i) in the input. The encoded representation h(i) is obtained by passing the final hidden state of the GRU cell (i.e. after it has seen the entire sentence) to multiple dense layers.\n", 526 | "2. Decoder Network: The decoder network takes this vector representation h(i) as input and tries to generate two sentences - S(i-1) and S(i+1), which could occur before and after the input sentence respectively. Separate decoders are implemented for the generation of the previous and next sentences, both being GRU-RNNs. The vector representation h(i) acts as the initial hidden state for the GRUs of the decoder networks.\n", 527 | "\n", 528 | "Similar to how Word2Vec embeddings are trained by predicting the surrounding words, skip-thought vectors are trained by predicting the sentences at time t-1 and t+1. As the model is trained, the learned representation (hidden layer) places similar sentences closer together, which enables higher-quality clustering.\n", 529 | "\n", 530 | "I encourage you to review the skip-thoughts paper for more clarity.\n", 531 | "\n", 532 | "**K-Means Clustering**\n", 533 | "\n", 534 | "Most of you will be familiar with this form of unsupervised learning, but I want to elaborate on how it is used here and why it is interesting.\n", 535 | "\n", 536 | "As we are aware, each cluster has a center point which, in the vector space, closely represents the theme of that cluster. With this in mind, when trying to create a summary, we should only need the sentence that is closest to the center of each cluster; a toy sketch of this extraction step follows below. 
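Everything in the sketch below (the 2-D points, the sentence labels, `n_clusters=2`) is made up purely for illustration; the real cell further down runs the same two calls on the high-dimensional skip-thought vectors.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

# Toy stand-ins for sentence vectors: two obvious groups of 2-D points
toy_vectors = np.array([[0.0, 0.1], [0.2, 0.0], [5.0, 5.1], [5.2, 4.9]])
toy_sents = ["sentence A", "sentence B", "sentence C", "sentence D"]

kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(toy_vectors)

# For each cluster center, find the index of the nearest vector;
# that sentence becomes the cluster's representative in the summary
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, toy_vectors)
print([toy_sents[i] for i in closest])  # one sentence from each group
```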
The key here is choosing the correct number of clusters so that the summary covers the content well. Kushal's post recommends calculating the number of clusters as roughly 30% of the number of sentences." 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 5, 542 | "metadata": { 543 | "scrolled": true 544 | }, 545 | "outputs": [ 546 | { 547 | "name": "stdout", 548 | "output_type": "stream", 549 | "text": [ 550 | "Loading model parameters...\n", 551 | "Compiling encoders...\n", 552 | "Loading tables...\n", 553 | "Packing up...\n", 554 | "38\n", 555 | "8\n", 556 | "41\n", 557 | "13\n", 558 | "15\n", 559 | "48\n", 560 | "17\n", 561 | "18\n", 562 | "20\n", 563 | "22\n", 564 | "23\n", 565 | "56\n", 566 | "25\n", 567 | "60\n" 568 | ] 569 | } 570 | ], 571 | "source": [ 572 | "import skipthoughts\n", 573 | "\n", 574 | "# You would need to download the pre-trained models first (see the README)\n", 575 | "model = skipthoughts.load_model()\n", 576 | "\n", 577 | "encoder = skipthoughts.Encoder(model)\n", 578 | "encoded = encoder.encode(sentences)" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "metadata": {}, 584 | "source": [ 585 | "All of the skipthoughts dependencies can be found in the source repository linked in the README (https://github.com/ryankiros/skip-thoughts/).\n", 586 | "As mentioned above, the number of clusters is also the number of sentences that will be included in the summary. For this example, the cluster count is computed from the number of sentences in the cell below." 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 36, 592 | "metadata": { 593 | "collapsed": true 594 | }, 595 | "outputs": [], 596 | "source": [ 597 | "from sklearn.metrics import pairwise_distances_argmin_min\n", 598 | "import numpy as np\n", 599 | "from sklearn.cluster import KMeans\n", 600 | "n_clusters = int(np.ceil(len(encoded)**0.6))  # must be defined before fitting KMeans\n", 601 | "print(n_clusters)\n", 602 | "\n", 603 | "kmeans = KMeans(n_clusters=n_clusters)\n", 604 | "kmeans = kmeans.fit(encoded)\n", 605 | "\n", 606 | "avg = []\n", 607 | "for j in range(n_clusters):\n", 608 | "    idx = np.where(kmeans.labels_ == j)[0]\n", 609 | "    avg.append(np.mean(idx))  # mean position of this cluster's sentences, used to order the summary\n", 610 | "closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, encoded)  # sentence nearest each cluster center\n", 611 | "ordering = sorted(range(n_clusters), key=lambda k: avg[k])\n", 612 | "summary = ' '.join([sentences[closest[idx]] for idx in ordering])" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": 37, 618 | "metadata": { 619 | "scrolled": true 620 | }, 621 | "outputs": [ 622 | { 623 | "data": { 624 | "text/plain": [ 625 | "'Do Messi and Ronaldo inspire each other? Ronaldo, it can be said, never looks happy on the field of play unless he has / he is just scored a goal - and even then he has / he is not happy for long, because he just wants to score another one. Rather than being a better player than Ronaldo, Messi\'s main motivations - according to the people who are close tohim - are being the best possible version of Lionel Messi, and winning as many trophies as possible. That theory was supported by Leicester boss Brendan Rodgers when I interviewed him for a book I recently wrote about Messi. 
With the very elite performers, that drive comes from within.\" \"But I think both those players inherently have that hunger to be the best players they can be.'" 626 | ] 627 | }, 628 | "execution_count": 37, 629 | "metadata": {}, 630 | "output_type": "execute_result" 631 | } 632 | ], 633 | "source": [ 634 | "summary" 635 | ] 636 | } 637 | ], 638 | "metadata": { 639 | "kernelspec": { 640 | "display_name": "Python 2", 641 | "language": "python", 642 | "name": "python2" 643 | }, 644 | "language_info": { 645 | "codemirror_mode": { 646 | "name": "ipython", 647 | "version": 2 648 | }, 649 | "file_extension": ".py", 650 | "mimetype": "text/x-python", 651 | "name": "python", 652 | "nbconvert_exporter": "python", 653 | "pygments_lexer": "ipython2", 654 | "version": "2.7.14" 655 | } 656 | }, 657 | "nbformat": 4, 658 | "nbformat_minor": 2 659 | } 660 | -------------------------------------------------------------------------------- /dataset_handler.py: -------------------------------------------------------------------------------- 1 | # Dataset handler for binary classification tasks (MR, CR, SUBJ, MQPA) 2 | 3 | import numpy as np 4 | from numpy.random import RandomState 5 | import os.path 6 | 7 | 8 | def load_data(encoder, name, loc='./data/', seed=1234): 9 | """ 10 | Load one of MR, CR, SUBJ or MPQA 11 | """ 12 | z = {} 13 | if name == 'MR': 14 | pos, neg = load_rt(loc=loc) 15 | elif name == 'SUBJ': 16 | pos, neg = load_subj(loc=loc) 17 | elif name == 'CR': 18 | pos, neg = load_cr(loc=loc) 19 | elif name == 'MPQA': 20 | pos, neg = load_mpqa(loc=loc) 21 | 22 | labels = compute_labels(pos, neg) 23 | text, labels = shuffle_data(pos+neg, labels, seed=seed) 24 | z['text'] = text 25 | z['labels'] = labels 26 | print 'Computing skip-thought vectors...' 27 | features = encoder.encode(text, verbose=False) 28 | return z, features 29 | 30 | 31 | def load_rt(loc='./data/'): 32 | """ 33 | Load the MR dataset 34 | """ 35 | pos, neg = [], [] 36 | with open(os.path.join(loc, 'rt-polarity.pos'), 'rb') as f: 37 | for line in f: 38 | pos.append(line.decode('latin-1').strip()) 39 | with open(os.path.join(loc, 'rt-polarity.neg'), 'rb') as f: 40 | for line in f: 41 | neg.append(line.decode('latin-1').strip()) 42 | return pos, neg 43 | 44 | 45 | def load_subj(loc='./data/'): 46 | """ 47 | Load the SUBJ dataset 48 | """ 49 | pos, neg = [], [] 50 | with open(os.path.join(loc, 'plot.tok.gt9.5000'), 'rb') as f: 51 | for line in f: 52 | pos.append(line.decode('latin-1').strip()) 53 | with open(os.path.join(loc, 'quote.tok.gt9.5000'), 'rb') as f: 54 | for line in f: 55 | neg.append(line.decode('latin-1').strip()) 56 | return pos, neg 57 | 58 | 59 | def load_cr(loc='./data/'): 60 | """ 61 | Load the CR dataset 62 | """ 63 | pos, neg = [], [] 64 | with open(os.path.join(loc, 'custrev.pos'), 'rb') as f: 65 | for line in f: 66 | text = line.strip() 67 | if len(text) > 0: 68 | pos.append(text) 69 | with open(os.path.join(loc, 'custrev.neg'), 'rb') as f: 70 | for line in f: 71 | text = line.strip() 72 | if len(text) > 0: 73 | neg.append(text) 74 | return pos, neg 75 | 76 | 77 | def load_mpqa(loc='./data/'): 78 | """ 79 | Load the MPQA dataset 80 | """ 81 | pos, neg = [], [] 82 | with open(os.path.join(loc, 'mpqa.pos'), 'rb') as f: 83 | for line in f: 84 | text = line.strip() 85 | if len(text) > 0: 86 | pos.append(text) 87 | with open(os.path.join(loc, 'mpqa.neg'), 'rb') as f: 88 | for line in f: 89 | text = line.strip() 90 | if len(text) > 0: 91 | neg.append(text) 92 | return pos, neg 93 | 94 | 95 | def compute_labels(pos, neg): 
96 | """ 97 | Construct list of labels 98 | """ 99 | labels = np.zeros(len(pos) + len(neg)) 100 | labels[:len(pos)] = 1.0 101 | labels[len(pos):] = 0.0 102 | return labels 103 | 104 | 105 | def shuffle_data(X, L, seed=1234): 106 | """ 107 | Shuffle the data 108 | """ 109 | prng = RandomState(seed) 110 | inds = np.arange(len(X)) 111 | prng.shuffle(inds) 112 | X = [X[i] for i in inds] 113 | L = L[inds] 114 | return (X, L) 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /eval_classification.py: -------------------------------------------------------------------------------- 1 | # Experiment scripts for binary classification benchmarks (e.g. MR, CR, MPQA, SUBJ) 2 | 3 | import numpy as np 4 | import sys 5 | import nbsvm 6 | import dataset_handler 7 | 8 | from scipy.sparse import hstack 9 | 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.cross_validation import KFold 12 | 13 | 14 | def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=False): 15 | """ 16 | Evaluate features with nested K-fold cross validation 17 | Outer loop: Held-out evaluation 18 | Inner loop: Hyperparameter tuning 19 | 20 | Datasets can be found at http://nlp.stanford.edu/~sidaw/home/projects:nbsvm 21 | Options for name are 'MR', 'CR', 'SUBJ' and 'MPQA' 22 | """ 23 | # Load the dataset and extract features 24 | z, features = dataset_handler.load_data(encoder, name, loc=loc, seed=seed) 25 | 26 | scan = [2**t for t in range(0,9,1)] 27 | npts = len(z['text']) 28 | kf = KFold(npts, n_folds=k, random_state=seed) 29 | scores = [] 30 | for train, test in kf: 31 | 32 | # Split data 33 | X_train = features[train] 34 | y_train = z['labels'][train] 35 | X_test = features[test] 36 | y_test = z['labels'][test] 37 | 38 | Xraw = [z['text'][i] for i in train] 39 | Xraw_test = [z['text'][i] for i in test] 40 | 41 | scanscores = [] 42 | for s in scan: 43 | 44 | # Inner KFold 45 | innerkf = KFold(len(X_train), n_folds=k, random_state=seed+1) 46 | innerscores = [] 47 | for innertrain, innertest in innerkf: 48 | 49 | # Split data 50 | X_innertrain = X_train[innertrain] 51 | y_innertrain = y_train[innertrain] 52 | X_innertest = X_train[innertest] 53 | y_innertest = y_train[innertest] 54 | 55 | Xraw_innertrain = [Xraw[i] for i in innertrain] 56 | Xraw_innertest = [Xraw[i] for i in innertest] 57 | 58 | # NB (if applicable) 59 | if use_nb: 60 | NBtrain, NBtest = compute_nb(Xraw_innertrain, y_innertrain, Xraw_innertest) 61 | X_innertrain = hstack((X_innertrain, NBtrain)) 62 | X_innertest = hstack((X_innertest, NBtest)) 63 | 64 | # Train classifier 65 | clf = LogisticRegression(C=s) 66 | clf.fit(X_innertrain, y_innertrain) 67 | acc = clf.score(X_innertest, y_innertest) 68 | innerscores.append(acc) 69 | print (s, acc) 70 | 71 | # Append mean score 72 | scanscores.append(np.mean(innerscores)) 73 | 74 | # Get the index of the best score 75 | s_ind = np.argmax(scanscores) 76 | s = scan[s_ind] 77 | print scanscores 78 | print s 79 | 80 | # NB (if applicable) 81 | if use_nb: 82 | NBtrain, NBtest = compute_nb(Xraw, y_train, Xraw_test) 83 | X_train = hstack((X_train, NBtrain)) 84 | X_test = hstack((X_test, NBtest)) 85 | 86 | # Train classifier 87 | clf = LogisticRegression(C=s) 88 | clf.fit(X_train, y_train) 89 | 90 | # Evaluate 91 | acc = clf.score(X_test, y_test) 92 | scores.append(acc) 93 | print scores 94 | 95 | return scores 96 | 97 | 98 | def compute_nb(X, y, Z): 99 | """ 100 | Compute NB features 101 | """ 102 | labels = [int(t) for t in y] 
103 | ptrain = [X[i] for i in range(len(labels)) if labels[i] == 0] 104 | ntrain = [X[i] for i in range(len(labels)) if labels[i] == 1] 105 | poscounts = nbsvm.build_dict(ptrain, [1,2]) 106 | negcounts = nbsvm.build_dict(ntrain, [1,2]) 107 | dic, r = nbsvm.compute_ratio(poscounts, negcounts) 108 | trainX = nbsvm.process_text(X, dic, r, [1,2]) 109 | devX = nbsvm.process_text(Z, dic, r, [1,2]) 110 | return trainX, devX 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /eval_msrp.py: -------------------------------------------------------------------------------- 1 | # Evaluation for MSRP 2 | 3 | import numpy as np 4 | 5 | from collections import defaultdict 6 | from nltk.tokenize import word_tokenize 7 | from numpy.random import RandomState 8 | import os.path 9 | from sklearn.cross_validation import KFold 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.metrics import f1_score as f1 12 | 13 | 14 | def evaluate(encoder, k=10, seed=1234, evalcv=True, evaltest=False, use_feats=True, loc='./data/'): 15 | """ 16 | Run experiment 17 | k: number of CV folds 18 | test: whether to evaluate on test set 19 | """ 20 | print 'Preparing data...' 21 | traintext, testtext, labels = load_data(loc) 22 | 23 | print 'Computing training skipthoughts...' 24 | trainA = encoder.encode(traintext[0], verbose=False) 25 | trainB = encoder.encode(traintext[1], verbose=False) 26 | 27 | if evalcv: 28 | print 'Running cross-validation...' 29 | C = eval_kfold(trainA, trainB, traintext, labels[0], shuffle=True, k=10, seed=1234, use_feats=use_feats) 30 | 31 | if evaltest: 32 | if not evalcv: 33 | C = 4 # Best parameter found from CV (combine-skip with use_feats=True) 34 | 35 | print 'Computing testing skipthoughts...' 36 | testA = encoder.encode(testtext[0], verbose=False) 37 | testB = encoder.encode(testtext[1], verbose=False) 38 | 39 | if use_feats: 40 | train_features = np.c_[np.abs(trainA - trainB), trainA * trainB, feats(traintext[0], traintext[1])] 41 | test_features = np.c_[np.abs(testA - testB), testA * testB, feats(testtext[0], testtext[1])] 42 | else: 43 | train_features = np.c_[np.abs(trainA - trainB), trainA * trainB] 44 | test_features = np.c_[np.abs(testA - testB), testA * testB] 45 | 46 | print 'Evaluating...' 
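        # C was either tuned by the cross-validation above (evalcv=True) or taken from the
        # hard-coded fallback; the classifier is refit on the full training features and then
        # scored on the held-out MSRP test split (accuracy plus F1, the usual MSRP metrics).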
47 | clf = LogisticRegression(C=C) 48 | clf.fit(train_features, labels[0]) 49 | yhat = clf.predict(test_features) 50 | print 'Test accuracy: ' + str(clf.score(test_features, labels[1])) 51 | print 'Test F1: ' + str(f1(labels[1], yhat)) 52 | 53 | 54 | def load_data(loc='./data/'): 55 | """ 56 | Load MSRP dataset 57 | """ 58 | trainloc = os.path.join(loc, 'msr_paraphrase_train.txt') 59 | testloc = os.path.join(loc, 'msr_paraphrase_test.txt') 60 | 61 | trainA, trainB, testA, testB = [],[],[],[] 62 | trainS, devS, testS = [],[],[] 63 | 64 | f = open(trainloc, 'rb') 65 | for line in f: 66 | text = line.strip().split('\t') 67 | trainA.append(' '.join(word_tokenize(text[3]))) 68 | trainB.append(' '.join(word_tokenize(text[4]))) 69 | trainS.append(text[0]) 70 | f.close() 71 | f = open(testloc, 'rb') 72 | for line in f: 73 | text = line.strip().split('\t') 74 | testA.append(' '.join(word_tokenize(text[3]))) 75 | testB.append(' '.join(word_tokenize(text[4]))) 76 | testS.append(text[0]) 77 | f.close() 78 | 79 | trainS = [int(s) for s in trainS[1:]] 80 | testS = [int(s) for s in testS[1:]] 81 | 82 | return [trainA[1:], trainB[1:]], [testA[1:], testB[1:]], [trainS, testS] 83 | 84 | 85 | def is_number(s): 86 | try: 87 | float(s) 88 | return True 89 | except ValueError: 90 | return False 91 | 92 | 93 | def feats(A, B): 94 | """ 95 | Compute additional features (similar to Socher et al.) 96 | These alone should give the same result from their paper (~73.2 Acc) 97 | """ 98 | tA = [t.split() for t in A] 99 | tB = [t.split() for t in B] 100 | 101 | nA = [[w for w in t if is_number(w)] for t in tA] 102 | nB = [[w for w in t if is_number(w)] for t in tB] 103 | 104 | features = np.zeros((len(A), 6)) 105 | 106 | # n1 107 | for i in range(len(A)): 108 | if set(nA[i]) == set(nB[i]): 109 | features[i,0] = 1. 110 | 111 | # n2 112 | for i in range(len(A)): 113 | if set(nA[i]) == set(nB[i]) and len(nA[i]) > 0: 114 | features[i,1] = 1. 115 | 116 | # n3 117 | for i in range(len(A)): 118 | if set(nA[i]) <= set(nB[i]) or set(nB[i]) <= set(nA[i]): 119 | features[i,2] = 1. 
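    # Recap of the binary number features above (n1-n3) and the overlap/length features below (n4-n6):
    #   n1: both sentences contain exactly the same set of numbers
    #   n2: same as n1, but only fires when at least one number is present
    #   n3: the numbers in one sentence are a subset of the numbers in the other
    #   n4 / n5: fraction of sentence A's (resp. B's) word types that also appear in the other sentence
    #   n6: symmetric sentence-length ratio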
120 | 121 | # n4 122 | for i in range(len(A)): 123 | features[i,3] = 1.0 * len(set(tA[i]) & set(tB[i])) / len(set(tA[i])) 124 | 125 | # n5 126 | for i in range(len(A)): 127 | features[i,4] = 1.0 * len(set(tA[i]) & set(tB[i])) / len(set(tB[i])) 128 | 129 | # n6 130 | for i in range(len(A)): 131 | features[i,5] = 0.5 * ((1.0*len(tA[i]) / len(tB[i])) + (1.0*len(tB[i]) / len(tA[i]))) 132 | 133 | return features 134 | 135 | 136 | def eval_kfold(A, B, train, labels, shuffle=True, k=10, seed=1234, use_feats=False): 137 | """ 138 | Perform k-fold cross validation 139 | """ 140 | # features 141 | labels = np.array(labels) 142 | if use_feats: 143 | features = np.c_[np.abs(A - B), A * B, feats(train[0], train[1])] 144 | else: 145 | features = np.c_[np.abs(A - B), A * B] 146 | 147 | scan = [2**t for t in range(0,9,1)] 148 | npts = len(features) 149 | kf = KFold(npts, n_folds=k, shuffle=shuffle, random_state=seed) 150 | scores = [] 151 | 152 | for s in scan: 153 | 154 | scanscores = [] 155 | 156 | for train, test in kf: 157 | 158 | # Split data 159 | X_train = features[train] 160 | y_train = labels[train] 161 | X_test = features[test] 162 | y_test = labels[test] 163 | 164 | # Train classifier 165 | clf = LogisticRegression(C=s) 166 | clf.fit(X_train, y_train) 167 | yhat = clf.predict(X_test) 168 | fscore = f1(y_test, yhat) 169 | scanscores.append(fscore) 170 | print (s, fscore) 171 | 172 | # Append mean score 173 | scores.append(np.mean(scanscores)) 174 | print scores 175 | 176 | # Get the index of the best score 177 | s_ind = np.argmax(scores) 178 | s = scan[s_ind] 179 | print scores 180 | print s 181 | return s 182 | 183 | 184 | -------------------------------------------------------------------------------- /eval_rank.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Evaluation code for image-sentence ranking 3 | ''' 4 | import numpy as np 5 | 6 | import theano 7 | import theano.tensor as tensor 8 | 9 | import cPickle as pkl 10 | import numpy 11 | import copy 12 | import os 13 | import time 14 | 15 | from scipy import optimize, stats 16 | from scipy.linalg import norm 17 | from collections import OrderedDict 18 | from sklearn.cross_validation import KFold 19 | from numpy.random import RandomState 20 | 21 | import warnings 22 | 23 | 24 | # push parameters to Theano shared variables 25 | def zipp(params, tparams): 26 | for kk, vv in params.iteritems(): 27 | tparams[kk].set_value(vv) 28 | 29 | # pull parameters from Theano shared variables 30 | def unzip(zipped): 31 | new_params = OrderedDict() 32 | for kk, vv in zipped.iteritems(): 33 | new_params[kk] = vv.get_value() 34 | return new_params 35 | 36 | # get the list of parameters: Note that tparams must be OrderedDict 37 | def itemlist(tparams): 38 | return [vv for kk, vv in tparams.iteritems()] 39 | 40 | # make prefix-appended name 41 | def _p(pp, name): 42 | return '%s_%s'%(pp, name) 43 | 44 | # all parameters 45 | def init_params(options): 46 | """ 47 | Initalize all model parameters here 48 | """ 49 | params = OrderedDict() 50 | 51 | # Image embedding, sentence embedding 52 | params = get_layer('ff')[0](options, params, prefix='ff_im', nin=options['dim_im'], nout=options['dim']) 53 | params = get_layer('ff')[0](options, params, prefix='ff_s', nin=options['dim_s'], nout=options['dim']) 54 | 55 | return params 56 | 57 | # initialize Theano shared variables according to the initial parameters 58 | def init_tparams(params): 59 | tparams = OrderedDict() 60 | for kk, pp in params.iteritems(): 61 | 
tparams[kk] = theano.shared(params[kk], name=kk) 62 | return tparams 63 | 64 | # load parameters 65 | def load_params(path, params): 66 | pp = numpy.load(path) 67 | for kk, vv in params.iteritems(): 68 | if kk not in pp: 69 | raise Warning('%s is not in the archive'%kk) 70 | params[kk] = pp[kk] 71 | return params 72 | 73 | # layers: 'name': ('parameter initializer', 'feedforward') 74 | layers = {'ff': ('param_init_fflayer', 'fflayer')} 75 | 76 | def get_layer(name): 77 | """ 78 | Part of the reason the init is very slow is because, 79 | the layer's constructor is called even when it isn't needed 80 | """ 81 | fns = layers[name] 82 | return (eval(fns[0]), eval(fns[1])) 83 | 84 | def norm_weight(nin,nout=None): 85 | """ 86 | Weight initialization 87 | """ 88 | if nout == None: 89 | nout = nin 90 | else: 91 | r = numpy.sqrt( 2. / nin) 92 | W = numpy.random.rand(nin, nout) * 2 * r - r 93 | return W.astype('float32') 94 | 95 | def linear(x): 96 | return x 97 | 98 | # feedforward layer: affine transformation + point-wise nonlinearity 99 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None): 100 | if nin == None: 101 | nin = options['dim_proj'] 102 | if nout == None: 103 | nout = options['dim_proj'] 104 | params[_p(prefix,'W')] = norm_weight(nin, nout) 105 | params[_p(prefix,'b')] = numpy.zeros((nout,)).astype('float32') 106 | 107 | return params 108 | 109 | def fflayer(tparams, state_below, options, prefix='rconv', activ='lambda x: tensor.tanh(x)', **kwargs): 110 | return eval(activ)(tensor.dot(state_below, tparams[_p(prefix,'W')])+tparams[_p(prefix,'b')]) 111 | 112 | # L2norm, row-wise 113 | def l2norm(X): 114 | norm = tensor.sqrt(tensor.pow(X, 2).sum(1)) 115 | X /= norm[:, None] 116 | return X 117 | 118 | # build a training model 119 | def build_model(tparams, options): 120 | """ 121 | Construct computation graph for the whole model 122 | """ 123 | # inputs (image, sentence, contrast images, constrast sentences) 124 | im = tensor.matrix('im', dtype='float32') 125 | s = tensor.matrix('s', dtype='float32') 126 | cim = tensor.matrix('cim', dtype='float32') 127 | cs = tensor.matrix('cs', dtype='float32') 128 | 129 | # image embedding 130 | lim = get_layer('ff')[1](tparams, im, options, prefix='ff_im', activ='linear') 131 | lcim = get_layer('ff')[1](tparams, cim, options, prefix='ff_im', activ='linear') 132 | 133 | # sentence embedding 134 | ls = get_layer('ff')[1](tparams, s, options, prefix='ff_s', activ='linear') 135 | lcs = get_layer('ff')[1](tparams, cs, options, prefix='ff_s', activ='linear') 136 | 137 | # L2 norm for sentences 138 | ls = l2norm(ls) 139 | lcs = l2norm(lcs) 140 | 141 | # Tile by number of contrast terms 142 | lim = tensor.tile(lim, (options['ncon'], 1)) 143 | ls = tensor.tile(ls, (options['ncon'], 1)) 144 | 145 | # pairwise ranking loss 146 | cost_im = options['margin'] - (lim * ls).sum(axis=1) + (lim * lcs).sum(axis=1) 147 | cost_im = cost_im * (cost_im > 0.) 148 | cost_im = cost_im.sum(0) 149 | 150 | cost_s = options['margin'] - (ls * lim).sum(axis=1) + (ls * lcim).sum(axis=1) 151 | cost_s = cost_s * (cost_s > 0.) 
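    # Both directions use the same hinge-style pairwise ranking objective:
    #   max(0, margin - score(anchor, matching item) + score(anchor, contrastive item))
    # where score(.,.) is a dot product (the sentence embeddings are L2-normalised above);
    # multiplying by (cost > 0.) zeroes out pairs that already satisfy the margin.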
152 | cost_s = cost_s.sum(0) 153 | 154 | cost = cost_im + cost_s 155 | return [im, s, cim, cs], cost 156 | 157 | # build an encoder 158 | def build_encoder(tparams, options): 159 | """ 160 | Construct encoder 161 | """ 162 | # inputs (image, sentence) 163 | im = tensor.matrix('im', dtype='float32') 164 | s = tensor.matrix('s', dtype='float32') 165 | 166 | # embeddings 167 | eim = get_layer('ff')[1](tparams, im, options, prefix='ff_im', activ='linear') 168 | es = get_layer('ff')[1](tparams, s, options, prefix='ff_s', activ='linear') 169 | 170 | # L2 norm of rows 171 | lim = l2norm(eim) 172 | ls = l2norm(es) 173 | 174 | return [im, s], lim, ls 175 | 176 | # optimizers 177 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update 178 | def adam(lr, tparams, grads, inp, cost): 179 | gshared = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad'%k) for k, p in tparams.iteritems()] 180 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 181 | 182 | f_grad_shared = theano.function(inp, cost, updates=gsup) 183 | 184 | lr0 = 0.0002 185 | b1 = 0.1 186 | b2 = 0.001 187 | e = 1e-8 188 | 189 | updates = [] 190 | 191 | i = theano.shared(numpy.float32(0.)) 192 | i_t = i + 1. 193 | fix1 = 1. - b1**(i_t) 194 | fix2 = 1. - b2**(i_t) 195 | lr_t = lr0 * (tensor.sqrt(fix2) / fix1) 196 | 197 | for p, g in zip(tparams.values(), gshared): 198 | m = theano.shared(p.get_value() * numpy.float32(0.)) 199 | v = theano.shared(p.get_value() * numpy.float32(0.)) 200 | m_t = (b1 * g) + ((1. - b1) * m) 201 | v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v) 202 | g_t = m_t / (tensor.sqrt(v_t) + e) 203 | p_t = p - (lr_t * g_t) 204 | updates.append((m, m_t)) 205 | updates.append((v, v_t)) 206 | updates.append((p, p_t)) 207 | updates.append((i, i_t)) 208 | 209 | f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore') 210 | 211 | return f_grad_shared, f_update 212 | 213 | # things to avoid doing 214 | def validate_options(options): 215 | 216 | if options['dim'] > options['dim_im']: 217 | warnings.warn('dim should not be bigger than image dimension') 218 | if options['dim'] > options['dim_s']: 219 | warnings.warn('dim should not be bigger than sentence dimension') 220 | if options['margin'] > 1: 221 | warnings.warn('margin should not be bigger than 1') 222 | return options 223 | 224 | # Load a saved model and evaluate the results 225 | def evaluate(X, saveto, evaluate=False, out=False): 226 | print "Loading model..." 227 | with open('%s.pkl'%saveto, 'rb') as f: 228 | model_options = pkl.load(f) 229 | 230 | params = init_params(model_options) 231 | params = load_params(saveto, params) 232 | tparams = init_tparams(params) 233 | 234 | print 'Building encoder' 235 | inps_e, lim, ls = build_encoder(tparams, model_options) 236 | f_emb = theano.function(inps_e, [lim, ls], profile=False) 237 | 238 | print 'Compute embeddings...' 
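    # X is expected to follow the same layout as the tuples used in trainer():
    # X[1] holds the image features and X[2] the sentence (skip-thought) vectors.
    # i2t/t2i below report Recall@1/5/10 and the median rank for both retrieval directions.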
239 | lim, ls = f_emb(X[1], X[2]) 240 | 241 | if evaluate: 242 | (r1, r5, r10, medr) = i2t(lim, ls) 243 | print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr) 244 | (r1i, r5i, r10i, medri) = t2i(lim, ls) 245 | print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri) 246 | if out: 247 | return lim, ls 248 | 249 | # trainer 250 | def trainer(train, dev, # training and development tuples 251 | dim=1000, # embedding dimensionality 252 | dim_im=4096, # image dimensionality 253 | dim_s=4800, # sentence dimensionality 254 | margin=0.2, # margin for pairwise ranking 255 | ncon=50, # number of contrastive terms 256 | max_epochs=15, 257 | lrate=0.01, # not needed with Adam 258 | dispFreq=10, 259 | optimizer='adam', 260 | batch_size = 100, 261 | valid_batch_size = 100, 262 | saveto='/ais/gobi3/u/rkiros/ssg/models/cocorank1000_combine.npz', 263 | validFreq=500, 264 | saveFreq=500, 265 | reload_=False): 266 | 267 | # Model options 268 | model_options = {} 269 | model_options['dim'] = dim 270 | model_options['dim_im'] = dim_im 271 | model_options['dim_s'] = dim_s 272 | model_options['margin'] = margin 273 | model_options['ncon'] = ncon 274 | model_options['max_epochs'] = max_epochs 275 | model_options['lrate'] = lrate 276 | model_options['dispFreq'] = dispFreq 277 | model_options['optimizer'] = optimizer 278 | model_options['batch_size'] = batch_size 279 | model_options['valid_batch_size'] = valid_batch_size 280 | model_options['saveto'] = saveto 281 | model_options['validFreq'] = validFreq 282 | model_options['saveFreq'] = saveFreq 283 | model_options['reload_'] = reload_ 284 | 285 | model_options = validate_options(model_options) 286 | print model_options 287 | 288 | # reload options 289 | if reload_ and os.path.exists(saveto): 290 | print "Reloading options" 291 | with open('%s.pkl'%saveto, 'rb') as f: 292 | model_options = pkl.load(f) 293 | 294 | print 'Building model' 295 | params = init_params(model_options) 296 | # reload parameters 297 | if reload_ and os.path.exists(saveto): 298 | print "Reloading model" 299 | params = load_params(saveto, params) 300 | 301 | tparams = init_tparams(params) 302 | 303 | inps, cost = build_model(tparams, model_options) 304 | 305 | print 'Building encoder' 306 | inps_e, lim, ls = build_encoder(tparams, model_options) 307 | 308 | print 'Building functions' 309 | f_cost = theano.function(inps, -cost, profile=False) 310 | f_emb = theano.function(inps_e, [lim, ls], profile=False) 311 | 312 | # gradient computation 313 | print 'Computing gradients' 314 | grads = tensor.grad(cost, wrt=itemlist(tparams)) 315 | lr = tensor.scalar(name='lr') 316 | f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) 317 | 318 | print 'Optimization' 319 | 320 | uidx = 0 321 | estop = False 322 | start = 1234 323 | seed = 1234 324 | inds = numpy.arange(len(train[0])) 325 | numbatches = len(inds) / batch_size 326 | curr = 0 327 | counter = 0 328 | target=None 329 | history_errs = [] 330 | 331 | # Main loop 332 | for eidx in range(max_epochs): 333 | tic = time.time() 334 | prng = RandomState(seed - eidx - 1) 335 | prng.shuffle(inds) 336 | 337 | for minibatch in range(numbatches): 338 | 339 | uidx += 1 340 | conprng_im = RandomState(seed + uidx + 1) 341 | conprng_s = RandomState(2*seed + uidx + 1) 342 | 343 | im = train[1][inds[minibatch::numbatches]] 344 | s = train[2][inds[minibatch::numbatches]] 345 | 346 | cinds_im = conprng_im.random_integers(low=0, high=len(train[0])-1, size=ncon * len(im)) 347 | cinds_s = conprng_s.random_integers(low=0, 
high=len(train[0])-1, size=ncon * len(s)) 348 | cim = train[1][cinds_im] 349 | cs = train[2][cinds_s] 350 | 351 | ud_start = time.time() 352 | cost = f_grad_shared(im, s, cim, cs) 353 | f_update(lrate) 354 | ud_duration = time.time() - ud_start 355 | 356 | if numpy.mod(uidx, dispFreq) == 0: 357 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud_duration 358 | 359 | if numpy.mod(uidx, validFreq) == 0: 360 | 361 | print 'Computing ranks...' 362 | lim, ls = f_emb(dev[1], dev[2]) 363 | (r1, r5, r10, medr) = i2t(lim, ls) 364 | print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr) 365 | (r1i, r5i, r10i, medri) = t2i(lim, ls) 366 | print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri) 367 | 368 | currscore = r1 + r5 + r10 + r1i + r5i + r10i 369 | if currscore > curr: 370 | curr = currscore 371 | 372 | # Save model 373 | print 'Saving...', 374 | params = unzip(tparams) 375 | numpy.savez(saveto, history_errs=history_errs, **params) 376 | pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) 377 | print 'Done' 378 | 379 | 380 | def i2t(images, captions, npts=None): 381 | """ 382 | Images: (5N, K) matrix of images 383 | Captions: (5N, K) matrix of captions 384 | """ 385 | if npts == None: 386 | npts = images.shape[0] / 5 387 | index_list = [] 388 | 389 | # Project captions 390 | for i in range(len(captions)): 391 | captions[i] /= norm(captions[i]) 392 | 393 | ranks = numpy.zeros(npts) 394 | for index in range(npts): 395 | 396 | # Get query image 397 | im = images[5 * index].reshape(1, images.shape[1]) 398 | im /= norm(im) 399 | 400 | # Compute scores 401 | d = numpy.dot(im, captions.T).flatten() 402 | inds = numpy.argsort(d)[::-1] 403 | index_list.append(inds[0]) 404 | 405 | # Score 406 | rank = 1e20 407 | for i in range(5*index, 5*index + 5, 1): 408 | tmp = numpy.where(inds == i)[0][0] 409 | if tmp < rank: 410 | rank = tmp 411 | ranks[index] = rank 412 | 413 | # Compute metrics 414 | r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks) 415 | r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks) 416 | r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks) 417 | medr = numpy.floor(numpy.median(ranks)) + 1 418 | return (r1, r5, r10, medr) 419 | 420 | 421 | def t2i(images, captions, npts=None): 422 | """ 423 | Images: (5N, K) matrix of images 424 | Captions: (5N, K) matrix of captions 425 | """ 426 | if npts == None: 427 | npts = images.shape[0] / 5 428 | ims = numpy.array([images[i] for i in range(0, len(images), 5)]) 429 | 430 | # Project images 431 | for i in range(len(ims)): 432 | ims[i] /= norm(ims[i]) 433 | 434 | # Project captions 435 | for i in range(len(captions)): 436 | captions[i] /= norm(captions[i]) 437 | 438 | ranks = numpy.zeros(5 * npts) 439 | for index in range(npts): 440 | 441 | # Get query captions 442 | queries = captions[5*index : 5*index + 5] 443 | 444 | # Compute scores 445 | d = numpy.dot(queries, ims.T) 446 | inds = numpy.zeros(d.shape) 447 | for i in range(len(inds)): 448 | inds[i] = numpy.argsort(d[i])[::-1] 449 | ranks[5 * index + i] = numpy.where(inds[i] == index)[0][0] 450 | 451 | # Compute metrics 452 | r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks) 453 | r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks) 454 | r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks) 455 | medr = numpy.floor(numpy.median(ranks)) + 1 456 | return (r1, r5, r10, medr) 457 | 458 | 459 | -------------------------------------------------------------------------------- /eval_sick.py:
-------------------------------------------------------------------------------- 1 | ''' 2 | Evaluation code for the SICK dataset (SemEval 2014 Task 1) 3 | ''' 4 | import numpy as np 5 | import os.path 6 | from sklearn.metrics import mean_squared_error as mse 7 | from scipy.stats import pearsonr 8 | from scipy.stats import spearmanr 9 | from sklearn.utils import shuffle 10 | 11 | from keras.models import Sequential 12 | from keras.layers.core import Dense, Activation 13 | from keras.optimizers import Adam 14 | 15 | 16 | def evaluate(encoder, seed=1234, evaltest=False, loc='./data/'): 17 | """ 18 | Run experiment 19 | """ 20 | print 'Preparing data...' 21 | train, dev, test, scores = load_data(loc) 22 | train[0], train[1], scores[0] = shuffle(train[0], train[1], scores[0], random_state=seed) 23 | 24 | print 'Computing training skipthoughts...' 25 | trainA = encoder.encode(train[0], verbose=False, use_eos=True) 26 | trainB = encoder.encode(train[1], verbose=False, use_eos=True) 27 | 28 | print 'Computing development skipthoughts...' 29 | devA = encoder.encode(dev[0], verbose=False, use_eos=True) 30 | devB = encoder.encode(dev[1], verbose=False, use_eos=True) 31 | 32 | print 'Computing feature combinations...' 33 | trainF = np.c_[np.abs(trainA - trainB), trainA * trainB] 34 | devF = np.c_[np.abs(devA - devB), devA * devB] 35 | 36 | print 'Encoding labels...' 37 | trainY = encode_labels(scores[0]) 38 | devY = encode_labels(scores[1]) 39 | 40 | print 'Compiling model...' 41 | lrmodel = prepare_model(ninputs=trainF.shape[1]) 42 | 43 | print 'Training...' 44 | bestlrmodel = train_model(lrmodel, trainF, trainY, devF, devY, scores[1]) 45 | 46 | if evaltest: 47 | print 'Computing test skipthoughts...' 48 | testA = encoder.encode(test[0], verbose=False, use_eos=True) 49 | testB = encoder.encode(test[1], verbose=False, use_eos=True) 50 | 51 | print 'Computing feature combinations...' 52 | testF = np.c_[np.abs(testA - testB), testA * testB] 53 | 54 | print 'Evaluating...' 
55 | r = np.arange(1,6) 56 | yhat = np.dot(bestlrmodel.predict_proba(testF, verbose=2), r) 57 | pr = pearsonr(yhat, scores[2])[0] 58 | sr = spearmanr(yhat, scores[2])[0] 59 | se = mse(yhat, scores[2]) 60 | print 'Test Pearson: ' + str(pr) 61 | print 'Test Spearman: ' + str(sr) 62 | print 'Test MSE: ' + str(se) 63 | 64 | return yhat 65 | 66 | 67 | def prepare_model(ninputs=9600, nclass=5): 68 | """ 69 | Set up and compile the model architecture (Logistic regression) 70 | """ 71 | lrmodel = Sequential() 72 | lrmodel.add(Dense(input_dim=ninputs, output_dim=nclass)) 73 | lrmodel.add(Activation('softmax')) 74 | lrmodel.compile(loss='categorical_crossentropy', optimizer='adam') 75 | return lrmodel 76 | 77 | 78 | def train_model(lrmodel, X, Y, devX, devY, devscores): 79 | """ 80 | Train model, using pearsonr on dev for early stopping 81 | """ 82 | done = False 83 | best = -1.0 84 | r = np.arange(1,6) 85 | 86 | while not done: 87 | # Every 100 epochs, check Pearson on development set 88 | lrmodel.fit(X, Y, verbose=2, shuffle=False, validation_data=(devX, devY)) 89 | yhat = np.dot(lrmodel.predict_proba(devX, verbose=2), r) 90 | score = pearsonr(yhat, devscores)[0] 91 | if score > best: 92 | print score 93 | best = score 94 | bestlrmodel = prepare_model(ninputs=X.shape[1]) 95 | bestlrmodel.set_weights(lrmodel.get_weights()) 96 | else: 97 | done = True 98 | 99 | yhat = np.dot(bestlrmodel.predict_proba(devX, verbose=2), r) 100 | score = pearsonr(yhat, devscores)[0] 101 | print 'Dev Pearson: ' + str(score) 102 | return bestlrmodel 103 | 104 | 105 | def encode_labels(labels, nclass=5): 106 | """ 107 | Label encoding from Tree LSTM paper (Tai, Socher, Manning) 108 | """ 109 | Y = np.zeros((len(labels), nclass)).astype('float32') 110 | for j, y in enumerate(labels): 111 | for i in range(nclass): 112 | if i+1 == np.floor(y) + 1: 113 | Y[j,i] = y - np.floor(y) 114 | if i+1 == np.floor(y): 115 | Y[j,i] = np.floor(y) - y + 1 116 | return Y 117 | 118 | 119 | def load_data(loc='./data/'): 120 | """ 121 | Load the SICK semantic-relatedness dataset 122 | """ 123 | trainA, trainB, devA, devB, testA, testB = [],[],[],[],[],[] 124 | trainS, devS, testS = [],[],[] 125 | 126 | with open(os.path.join(loc, 'SICK_train.txt'), 'rb') as f: 127 | for line in f: 128 | text = line.strip().split('\t') 129 | trainA.append(text[1]) 130 | trainB.append(text[2]) 131 | trainS.append(text[3]) 132 | with open(os.path.join(loc, 'SICK_trial.txt'), 'rb') as f: 133 | for line in f: 134 | text = line.strip().split('\t') 135 | devA.append(text[1]) 136 | devB.append(text[2]) 137 | devS.append(text[3]) 138 | with open(os.path.join(loc, 'SICK_test_annotated.txt'), 'rb') as f: 139 | for line in f: 140 | text = line.strip().split('\t') 141 | testA.append(text[1]) 142 | testB.append(text[2]) 143 | testS.append(text[3]) 144 | 145 | trainS = [float(s) for s in trainS[1:]] 146 | devS = [float(s) for s in devS[1:]] 147 | testS = [float(s) for s in testS[1:]] 148 | 149 | return [trainA[1:], trainB[1:]], [devA[1:], devB[1:]], [testA[1:], testB[1:]], [trainS, devS, testS] 150 | 151 | 152 | -------------------------------------------------------------------------------- /eval_trec.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Evaluation code for the TREC dataset 3 | ''' 4 | import numpy as np 5 | import os.path 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.cross_validation import KFold 8 | from sklearn.utils import shuffle 9 | 10 | 11 | def evaluate(encoder, k=10, seed=1234, 
evalcv=True, evaltest=False, loc='./data/'): 12 | """ 13 | Run experiment 14 | k: number of CV folds 15 | test: whether to evaluate on test set 16 | """ 17 | print 'Preparing data...' 18 | traintext, testtext = load_data(loc) 19 | train, train_labels = prepare_data(traintext) 20 | test, test_labels = prepare_data(testtext) 21 | train_labels = prepare_labels(train_labels) 22 | test_labels = prepare_labels(test_labels) 23 | train, train_labels = shuffle(train, train_labels, random_state=seed) 24 | 25 | print 'Computing training skipthoughts...' 26 | trainF = encoder.encode(train, verbose=False, use_eos=False) 27 | 28 | if evalcv: 29 | print 'Running cross-validation...' 30 | interval = [2**t for t in range(0,9,1)] # coarse-grained 31 | C = eval_kfold(trainF, train_labels, k=k, scan=interval, seed=seed) 32 | 33 | if evaltest: 34 | if not evalcv: 35 | C = 128 # Best parameter found from CV 36 | 37 | print 'Computing testing skipthoughts...' 38 | testF = encoder.encode(test, verbose=False, use_eos=False) 39 | 40 | print 'Evaluating...' 41 | clf = LogisticRegression(C=C) 42 | clf.fit(trainF, train_labels) 43 | yhat = clf.predict(testF) 44 | print 'Test accuracy: ' + str(clf.score(testF, test_labels)) 45 | 46 | 47 | def load_data(loc='./data/'): 48 | """ 49 | Load the TREC question-type dataset 50 | """ 51 | train, test = [], [] 52 | with open(os.path.join(loc, 'train_5500.label'), 'rb') as f: 53 | for line in f: 54 | train.append(line.strip()) 55 | with open(os.path.join(loc, 'TREC_10.label'), 'rb') as f: 56 | for line in f: 57 | test.append(line.strip()) 58 | return train, test 59 | 60 | 61 | def prepare_data(text): 62 | """ 63 | Prepare data 64 | """ 65 | labels = [t.split()[0] for t in text] 66 | labels = [l.split(':')[0] for l in labels] 67 | X = [t.split()[1:] for t in text] 68 | X = [' '.join(t) for t in X] 69 | return X, labels 70 | 71 | 72 | def prepare_labels(labels): 73 | """ 74 | Process labels to numerical values 75 | """ 76 | d = {} 77 | count = 0 78 | setlabels = set(labels) 79 | for w in setlabels: 80 | d[w] = count 81 | count += 1 82 | idxlabels = np.array([d[w] for w in labels]) 83 | return idxlabels 84 | 85 | 86 | def eval_kfold(features, labels, k=10, scan=[2**t for t in range(0,9,1)], seed=1234): 87 | """ 88 | Perform k-fold cross validation 89 | """ 90 | npts = len(features) 91 | kf = KFold(npts, n_folds=k, random_state=seed) 92 | scores = [] 93 | 94 | for s in scan: 95 | 96 | scanscores = [] 97 | 98 | for train, test in kf: 99 | 100 | # Split data 101 | X_train = features[train] 102 | y_train = labels[train] 103 | X_test = features[test] 104 | y_test = labels[test] 105 | 106 | # Train classifier 107 | clf = LogisticRegression(C=s) 108 | clf.fit(X_train, y_train) 109 | score = clf.score(X_test, y_test) 110 | scanscores.append(score) 111 | print (s, score) 112 | 113 | # Append mean score 114 | scores.append(np.mean(scanscores)) 115 | print scores 116 | 117 | # Get the index of the best score 118 | s_ind = np.argmax(scores) 119 | s = scan[s_ind] 120 | print (s_ind, s) 121 | return s 122 | 123 | -------------------------------------------------------------------------------- /nbsvm.py: -------------------------------------------------------------------------------- 1 | # Naive-Bayes features 2 | # Derived from https://github.com/mesnilgr/nbsvm 3 | 4 | import os 5 | import pdb 6 | import numpy as np 7 | from collections import Counter 8 | from scipy.sparse import lil_matrix 9 | from scipy.sparse import csr_matrix 10 | 11 | 12 | def tokenize(sentence, grams): 13 | words = 
sentence.split() 14 | tokens = [] 15 | for gram in grams: 16 | for i in range(len(words) - gram + 1): 17 | tokens += ["_*_".join(words[i:i+gram])] 18 | return tokens 19 | 20 | 21 | def build_dict(X, grams): 22 | dic = Counter() 23 | for sentence in X: 24 | dic.update(tokenize(sentence, grams)) 25 | return dic 26 | 27 | 28 | def compute_ratio(poscounts, negcounts, alpha=1): 29 | alltokens = list(set(poscounts.keys() + negcounts.keys())) 30 | dic = dict((t, i) for i, t in enumerate(alltokens)) 31 | d = len(dic) 32 | p, q = np.ones(d) * alpha , np.ones(d) * alpha 33 | for t in alltokens: 34 | p[dic[t]] += poscounts[t] 35 | q[dic[t]] += negcounts[t] 36 | p /= abs(p).sum() 37 | q /= abs(q).sum() 38 | r = np.log(p/q) 39 | return dic, r 40 | 41 | 42 | def process_text(text, dic, r, grams): 43 | """ 44 | Return sparse feature matrix 45 | """ 46 | X = lil_matrix((len(text), len(dic))) 47 | for i, l in enumerate(text): 48 | tokens = tokenize(l, grams) 49 | indexes = [] 50 | for t in tokens: 51 | try: 52 | indexes += [dic[t]] 53 | except KeyError: 54 | pass 55 | indexes = list(set(indexes)) 56 | indexes.sort() 57 | for j in indexes: 58 | X[i,j] = r[j] 59 | return csr_matrix(X) 60 | 61 | -------------------------------------------------------------------------------- /skipthoughts.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Skip-thought vectors 3 | ''' 4 | import os 5 | 6 | import theano 7 | import theano.tensor as tensor 8 | 9 | import cPickle as pkl 10 | import numpy 11 | import copy 12 | import nltk 13 | 14 | from collections import OrderedDict, defaultdict 15 | from scipy.linalg import norm 16 | from nltk.tokenize import word_tokenize 17 | 18 | profile = False 19 | 20 | #-----------------------------------------------------------------------------# 21 | # Specify model and table locations here 22 | #-----------------------------------------------------------------------------# 23 | path_to_models = 'C:/Users/mt16558/Documents/Project/NLP Experiments/Text Summarization/' 24 | path_to_tables = 'C:/Users/mt16558/Documents/Project/NLP Experiments/Text Summarization/' 25 | #-----------------------------------------------------------------------------# 26 | 27 | path_to_umodel = path_to_models + 'uni_skip.npz' 28 | path_to_bmodel = path_to_models + 'bi_skip.npz' 29 | 30 | 31 | def load_model(): 32 | """ 33 | Load the model with saved tables 34 | """ 35 | # Load model options 36 | print('Loading model parameters...') 37 | with open('%s.pkl'%path_to_umodel, 'rb') as f: 38 | uoptions = pkl.load(f) 39 | with open('%s.pkl'%path_to_bmodel, 'rb') as f: 40 | boptions = pkl.load(f) 41 | 42 | # Load parameters 43 | uparams = init_params(uoptions) 44 | uparams = load_params(path_to_umodel, uparams) 45 | utparams = init_tparams(uparams) 46 | bparams = init_params_bi(boptions) 47 | bparams = load_params(path_to_bmodel, bparams) 48 | btparams = init_tparams(bparams) 49 | 50 | # Extractor functions 51 | print('Compiling encoders...') 52 | embedding, x_mask, ctxw2v = build_encoder(utparams, uoptions) 53 | f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v') 54 | embedding, x_mask, ctxw2v = build_encoder_bi(btparams, boptions) 55 | f_w2v2 = theano.function([embedding, x_mask], ctxw2v, name='f_w2v2') 56 | 57 | # Tables 58 | print('Loading tables...') 59 | utable, btable = load_tables() 60 | 61 | # Store everything we need in a dictionary 62 | print('Packing up...') 63 | model = {} 64 | model['uoptions'] = uoptions 65 | model['boptions'] = boptions 66 | 
model['utable'] = utable 67 | model['btable'] = btable 68 | model['f_w2v'] = f_w2v 69 | model['f_w2v2'] = f_w2v2 70 | 71 | return model 72 | 73 | 74 | def load_tables(): 75 | """ 76 | Load the tables 77 | """ 78 | words = [] 79 | utable = numpy.load(path_to_tables + 'utable.npy') 80 | btable = numpy.load(path_to_tables + 'btable.npy') 81 | f = open(path_to_tables + 'dictionary.txt', 'rb') 82 | for line in f: 83 | words.append(line.decode('utf-8').strip()) 84 | f.close() 85 | utable = OrderedDict(zip(words, utable)) 86 | btable = OrderedDict(zip(words, btable)) 87 | return utable, btable 88 | 89 | 90 | class Encoder(object): 91 | """ 92 | Sentence encoder. 93 | """ 94 | 95 | def __init__(self, model): 96 | self._model = model 97 | 98 | def encode(self, X, use_norm=True, verbose=True, batch_size=128, use_eos=False): 99 | """ 100 | Encode sentences in the list X. Each entry will return a vector 101 | """ 102 | return encode(self._model, X, use_norm, verbose, batch_size, use_eos) 103 | 104 | 105 | def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False): 106 | """ 107 | Encode sentences in the list X. Each entry will return a vector 108 | """ 109 | # first, do preprocessing 110 | X = preprocess(X) 111 | 112 | # word dictionary and init 113 | d = defaultdict(lambda : 0) 114 | for w in model['utable'].keys(): 115 | d[w] = 1 116 | ufeatures = numpy.zeros((len(X), model['uoptions']['dim']), dtype='float32') 117 | bfeatures = numpy.zeros((len(X), 2 * model['boptions']['dim']), dtype='float32') 118 | 119 | # length dictionary 120 | ds = defaultdict(list) 121 | captions = [s.split() for s in X] 122 | for i,s in enumerate(captions): 123 | ds[len(s)].append(i) 124 | 125 | # Get features. This encodes by length, in order to avoid wasting computation 126 | for k in ds.keys(): 127 | if verbose: 128 | print(k) 129 | numbatches = len(ds[k]) / batch_size + 1 130 | for minibatch in range(numbatches): 131 | caps = ds[k][minibatch::numbatches] 132 | 133 | if use_eos: 134 | uembedding = numpy.zeros((k+1, len(caps), model['uoptions']['dim_word']), dtype='float32') 135 | bembedding = numpy.zeros((k+1, len(caps), model['boptions']['dim_word']), dtype='float32') 136 | else: 137 | uembedding = numpy.zeros((k, len(caps), model['uoptions']['dim_word']), dtype='float32') 138 | bembedding = numpy.zeros((k, len(caps), model['boptions']['dim_word']), dtype='float32') 139 | for ind, c in enumerate(caps): 140 | caption = captions[c] 141 | for j in range(len(caption)): 142 | if d[caption[j]] > 0: 143 | uembedding[j,ind] = model['utable'][caption[j]] 144 | bembedding[j,ind] = model['btable'][caption[j]] 145 | else: 146 | uembedding[j,ind] = model['utable']['UNK'] 147 | bembedding[j,ind] = model['btable']['UNK'] 148 | if use_eos: 149 | uembedding[-1,ind] = model['utable']['<eos>'] 150 | bembedding[-1,ind] = model['btable']['<eos>'] 151 | if use_eos: 152 | uff = model['f_w2v'](uembedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32')) 153 | bff = model['f_w2v2'](bembedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32')) 154 | else: 155 | uff = model['f_w2v'](uembedding, numpy.ones((len(caption),len(caps)), dtype='float32')) 156 | bff = model['f_w2v2'](bembedding, numpy.ones((len(caption),len(caps)), dtype='float32')) 157 | if use_norm: 158 | for j in range(len(uff)): 159 | uff[j] /= norm(uff[j]) 160 | bff[j] /= norm(bff[j]) 161 | for ind, c in enumerate(caps): 162 | ufeatures[c] = uff[ind] 163 | bfeatures[c] = bff[ind] 164 | 165 | features = numpy.c_[ufeatures, bfeatures] 166 | return features
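# --- Illustrative usage sketch, not part of the original skipthoughts.py ---
# It only demonstrates the contract of encode(): one row per input sentence,
# the uni-skip vector concatenated with the bidirectional vector (4800
# dimensions total for the released combine-skip models), and, when
# use_norm=True, each half normalized to unit length, so the dot product of
# two rows is the sum of the uni-skip and bi-skip cosine similarities.
# The helper name _encode_demo, the example sentences, and the 2400/2400
# split are assumptions based on the released models.
def _encode_demo():
    # Requires the pretrained model and table files to be downloaded first.
    model = load_model()
    vecs = encode(model, ['The cat sat on the mat .', 'A dog slept on the rug .'], verbose=False)
    print(vecs.shape)                   # (2, 4800) for the released combine-skip models
    print(norm(vecs[0][:2400]))         # ~1.0: the uni-skip half is unit length when use_norm=True
    print(norm(vecs[0][2400:]))         # ~1.0: so is the bi-skip half
    print(numpy.dot(vecs[0], vecs[1]))  # sum of the two halves' cosine similarities, a usable similarity score
# --- end of illustrative sketch ---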
167 | 168 | 169 | def preprocess(text): 170 | """ 171 | Preprocess text for encoder 172 | """ 173 | X = [] 174 | sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') 175 | for t in text: 176 | sents = sent_detector.tokenize(t) 177 | result = '' 178 | for s in sents: 179 | tokens = word_tokenize(s) 180 | result += ' ' + ' '.join(tokens) 181 | X.append(result) 182 | return X 183 | 184 | 185 | def nn(model, text, vectors, query, k=5): 186 | """ 187 | Return the nearest neighbour sentences to query 188 | text: list of sentences 189 | vectors: the corresponding representations for text 190 | query: a string to search 191 | """ 192 | qf = encode(model, [query]) 193 | qf /= norm(qf) 194 | scores = numpy.dot(qf, vectors.T).flatten() 195 | sorted_args = numpy.argsort(scores)[::-1] 196 | sentences = [text[a] for a in sorted_args[:k]] 197 | print('QUERY: ' + query) 198 | print('NEAREST: ') 199 | for i, s in enumerate(sentences): 200 | print(s, sorted_args[i]) 201 | 202 | 203 | def word_features(table): 204 | """ 205 | Extract word features into a normalized matrix 206 | """ 207 | features = numpy.zeros((len(table), 620), dtype='float32') 208 | keys = table.keys() 209 | for i in range(len(table)): 210 | f = table[keys[i]] 211 | features[i] = f / norm(f) 212 | return features 213 | 214 | 215 | def nn_words(table, wordvecs, query, k=10): 216 | """ 217 | Get the nearest neighbour words 218 | """ 219 | keys = table.keys() 220 | qf = table[query] 221 | scores = numpy.dot(qf, wordvecs.T).flatten() 222 | sorted_args = numpy.argsort(scores)[::-1] 223 | words = [keys[a] for a in sorted_args[:k]] 224 | print('QUERY: ' + query) 225 | print('NEAREST: ') 226 | for i, w in enumerate(words): 227 | print(w) 228 | 229 | 230 | def _p(pp, name): 231 | """ 232 | make prefix-appended name 233 | """ 234 | return '%s_%s'%(pp, name) 235 | 236 | 237 | def init_tparams(params): 238 | """ 239 | initialize Theano shared variables according to the initial parameters 240 | """ 241 | tparams = OrderedDict() 242 | for kk, pp in params.iteritems(): 243 | tparams[kk] = theano.shared(params[kk], name=kk) 244 | return tparams 245 | 246 | 247 | def load_params(path, params): 248 | """ 249 | load parameters 250 | """ 251 | pp = numpy.load(path) 252 | for kk, vv in params.iteritems(): 253 | if kk not in pp: 254 | warnings.warn('%s is not in the archive'%kk) 255 | continue 256 | params[kk] = pp[kk] 257 | return params 258 | 259 | 260 | # layers: 'name': ('parameter initializer', 'feedforward') 261 | layers = {'gru': ('param_init_gru', 'gru_layer')} 262 | 263 | def get_layer(name): 264 | fns = layers[name] 265 | return (eval(fns[0]), eval(fns[1])) 266 | 267 | 268 | def init_params(options): 269 | """ 270 | initialize all parameters needed for the encoder 271 | """ 272 | params = OrderedDict() 273 | 274 | # embedding 275 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 276 | 277 | # encoder: GRU 278 | params = get_layer(options['encoder'])[0](options, params, prefix='encoder', 279 | nin=options['dim_word'], dim=options['dim']) 280 | return params 281 | 282 | 283 | def init_params_bi(options): 284 | """ 285 | initialize all paramters needed for bidirectional encoder 286 | """ 287 | params = OrderedDict() 288 | 289 | # embedding 290 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 291 | 292 | # encoder: GRU 293 | params = get_layer(options['encoder'])[0](options, params, prefix='encoder', 294 | nin=options['dim_word'], dim=options['dim']) 295 | params = 
get_layer(options['encoder'])[0](options, params, prefix='encoder_r', 296 | nin=options['dim_word'], dim=options['dim']) 297 | return params 298 | 299 | 300 | def build_encoder(tparams, options): 301 | """ 302 | build an encoder, given pre-computed word embeddings 303 | """ 304 | # word embedding (source) 305 | embedding = tensor.tensor3('embedding', dtype='float32') 306 | x_mask = tensor.matrix('x_mask', dtype='float32') 307 | 308 | # encoder 309 | proj = get_layer(options['encoder'])[1](tparams, embedding, options, 310 | prefix='encoder', 311 | mask=x_mask) 312 | ctx = proj[0][-1] 313 | 314 | return embedding, x_mask, ctx 315 | 316 | 317 | def build_encoder_bi(tparams, options): 318 | """ 319 | build bidirectional encoder, given pre-computed word embeddings 320 | """ 321 | # word embedding (source) 322 | embedding = tensor.tensor3('embedding', dtype='float32') 323 | embeddingr = embedding[::-1] 324 | x_mask = tensor.matrix('x_mask', dtype='float32') 325 | xr_mask = x_mask[::-1] 326 | 327 | # encoder 328 | proj = get_layer(options['encoder'])[1](tparams, embedding, options, 329 | prefix='encoder', 330 | mask=x_mask) 331 | projr = get_layer(options['encoder'])[1](tparams, embeddingr, options, 332 | prefix='encoder_r', 333 | mask=xr_mask) 334 | 335 | ctx = tensor.concatenate([proj[0][-1], projr[0][-1]], axis=1) 336 | 337 | return embedding, x_mask, ctx 338 | 339 | 340 | # some utilities 341 | def ortho_weight(ndim): 342 | W = numpy.random.randn(ndim, ndim) 343 | u, s, v = numpy.linalg.svd(W) 344 | return u.astype('float32') 345 | 346 | 347 | def norm_weight(nin,nout=None, scale=0.1, ortho=True): 348 | if nout == None: 349 | nout = nin 350 | if nout == nin and ortho: 351 | W = ortho_weight(nin) 352 | else: 353 | W = numpy.random.uniform(low=-scale, high=scale, size=(nin, nout)) 354 | return W.astype('float32') 355 | 356 | 357 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None): 358 | """ 359 | parameter init for GRU 360 | """ 361 | if nin == None: 362 | nin = options['dim_proj'] 363 | if dim == None: 364 | dim = options['dim_proj'] 365 | W = numpy.concatenate([norm_weight(nin,dim), 366 | norm_weight(nin,dim)], axis=1) 367 | params[_p(prefix,'W')] = W 368 | params[_p(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32') 369 | U = numpy.concatenate([ortho_weight(dim), 370 | ortho_weight(dim)], axis=1) 371 | params[_p(prefix,'U')] = U 372 | 373 | Wx = norm_weight(nin, dim) 374 | params[_p(prefix,'Wx')] = Wx 375 | Ux = ortho_weight(dim) 376 | params[_p(prefix,'Ux')] = Ux 377 | params[_p(prefix,'bx')] = numpy.zeros((dim,)).astype('float32') 378 | 379 | return params 380 | 381 | 382 | def gru_layer(tparams, state_below, options, prefix='gru', mask=None, **kwargs): 383 | """ 384 | Forward pass through GRU layer 385 | """ 386 | nsteps = state_below.shape[0] 387 | if state_below.ndim == 3: 388 | n_samples = state_below.shape[1] 389 | else: 390 | n_samples = 1 391 | 392 | dim = tparams[_p(prefix,'Ux')].shape[1] 393 | 394 | if mask == None: 395 | mask = tensor.alloc(1., state_below.shape[0], 1) 396 | 397 | def _slice(_x, n, dim): 398 | if _x.ndim == 3: 399 | return _x[:, :, n*dim:(n+1)*dim] 400 | return _x[:, n*dim:(n+1)*dim] 401 | 402 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')] 403 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')] 404 | U = tparams[_p(prefix, 'U')] 405 | Ux = tparams[_p(prefix, 'Ux')] 406 | 407 | def _step_slice(m_, x_, xx_, h_, U, Ux): 408 | preact = tensor.dot(h_, 
U) 409 | preact += x_ 410 | 411 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 412 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 413 | 414 | preactx = tensor.dot(h_, Ux) 415 | preactx = preactx * r 416 | preactx = preactx + xx_ 417 | 418 | h = tensor.tanh(preactx) 419 | 420 | h = u * h_ + (1. - u) * h 421 | h = m_[:,None] * h + (1. - m_)[:,None] * h_ 422 | 423 | return h 424 | 425 | seqs = [mask, state_below_, state_belowx] 426 | _step = _step_slice 427 | 428 | rval, updates = theano.scan(_step, 429 | sequences=seqs, 430 | outputs_info = [tensor.alloc(0., n_samples, dim)], 431 | non_sequences = [tparams[_p(prefix, 'U')], 432 | tparams[_p(prefix, 'Ux')]], 433 | name=_p(prefix, '_layers'), 434 | n_steps=nsteps, 435 | profile=profile, 436 | strict=True) 437 | rval = [rval] 438 | return rval 439 | 440 | 441 | -------------------------------------------------------------------------------- /skipthoughts.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/madhavthaker/text_summarization/7e0c7cc83502a4ce463a975a6e769b68f9b35bfd/skipthoughts.pyc --------------------------------------------------------------------------------
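A note on wiring the modules together: the short sketch below is one plausible way to use skipthoughts.py for the extractive summarization explored in the notebook. It assumes the pretrained model and table files from https://github.com/ryankiros/skip-thoughts/ have been downloaded to the paths configured at the top of skipthoughts.py; the variable document_text (the raw article to summarize) and the top-3 sentence selection are illustrative assumptions, not code from this repository.

import nltk
import numpy
import skipthoughts

model = skipthoughts.load_model()                    # load uni-skip and bi-skip parameters plus lookup tables
encoder = skipthoughts.Encoder(model)

sentences = nltk.sent_tokenize(document_text)        # document_text: raw article text, assumed to be defined
vectors = encoder.encode(sentences, verbose=False)   # one 4800-d vector per sentence (uni-skip + bi-skip halves)

# With use_norm=True each half is unit length, so the dot product of two rows is the
# sum of the uni-skip and bi-skip cosine similarities; rank sentences by how similar
# they are, on average, to the rest of the document and keep the most central ones.
centrality = numpy.dot(vectors, vectors.T).mean(axis=1)
top = sorted(numpy.argsort(centrality)[::-1][:3])    # three most central sentences, in original order
print(' '.join(sentences[i] for i in top))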