├── EmbeddingAttempt4.ipynb
├── Extractive vs. Abstractive Text Summarization.pdf
├── MICHELLEZHAO_CS141_finalreport.pdf
├── README.md
├── Reduction.ipynb
├── TrainAttempt4.ipynb
├── _config.yml
├── algo1.png
├── cs141_final_poster_toSize.pptx
├── embedding2.ipynb
├── embeddingNotebook.ipynb
├── p1.png
├── p2.png
├── p3.png
├── p4.png
├── p5.png
├── p6.png
├── p7.png
├── predict.py
├── simpleTrain.ipynb
├── simpler.ipynb
├── stopWords.txt
├── testing.ipynb
├── tokenize_recipes.py
├── train.ipynb
├── train2.ipynb
├── train3.ipynb
├── trainNotebook.ipynb
└── train_seq2seq.py


/EmbeddingAttempt4.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "GloVe Loaded.\n"
 13 |      ]
 14 |     }
 15 |    ],
 16 |    "source": [
 17 |     "import numpy as np\n",
 18 |     "from __future__ import division\n",
 19 |     "\n",
 20 |     "filename = 'glove.6B.50d.txt' \n",
 21 |     "# (glove data set from: https://nlp.stanford.edu/projects/glove/)\n",
 22 |     "\n",
 23 |     "\n",
 24 |     "def loadGloVe(filename):\n",
 25 |     "    vocab = []\n",
 26 |     "    embd = []\n",
 27 |     "    file = open(filename,'r')\n",
 28 |     "    for line in file.readlines():\n",
 29 |     "        row = line.strip().split(' ')\n",
 30 |     "        vocab.append(row[0])\n",
 31 |     "        embd.append(row[1:])\n",
 32 |     "    print('GloVe Loaded.')\n",
 33 |     "    file.close()\n",
 34 |     "    return vocab,embd\n",
 35 |     "\n",
 36 |     "# Pre-trained GloVe embedding\n",
 37 |     "vocab,embd = loadGloVe(filename)\n",
 38 |     "\n",
 39 |     "embedding = np.asarray(embd)\n",
 40 |     "embedding = embedding.astype(np.float32)\n",
 41 |     "\n",
 42 |     "word_vec_dim = len(embd[0]) # word_vec_dim = dimension of each word vectors"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 12,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "import csv\n",
 52 |     "import nltk as nlp\n",
 53 |     "from nltk import word_tokenize\n",
 54 |     "import string\n",
 55 |     "\n",
 56 |     "summaries = []\n",
 57 |     "texts = []\n",
 58 |     "\n",
 59 |     "def clean(text):\n",
 60 |     "    text = text.lower()\n",
 61 |     "    printable = set(string.printable)\n",
 62 |     "    return filter(lambda x: x in printable, text) #filter funny characters, if any. \n",
 63 |     "    \n",
 64 |     "import numpy as np\n",
 65 |     "import os\n",
 66 |     "\n",
 67 |     "def split():\n",
 68 |     "    titles = []\n",
 69 |     "    texts = []\n",
 70 |     "    root = 'Part1'\n",
 71 |     "    \n",
 72 |     "    #dirr = 'Part1/awards_1990/awd_1990_00/'\n",
 73 |     "    dirs = os.listdir('Part1/awards_1990/awd_1990_00/')\n",
 74 |     "\n",
 75 |     "    for filename in dirs[1:]:\n",
 76 |     "    #iter = 0\n",
 77 |     "    #print(dirs[1])\n",
 78 |     "            \n",
 79 |     "                #print(iter)\n",
 80 |     "                #iter += 1\n",
 81 |     "                #print(dirs[1:])\n",
 82 |     "                #filename = 'Part1/awards_1990/awd_1990_00/a9000006.txt'\n",
 83 |     "        f = open('Part1/awards_1990/awd_1990_00/' + str(filename))\n",
 84 |     "        addTitle = False\n",
 85 |     "        addTexts = False\n",
 86 |     "        title = []\n",
 87 |     "        text = []\n",
 88 |     "        for word in f.read().split():\n",
 89 |     "            if (word == \"Title\"):\n",
 90 |     "                addTitle = True\n",
 91 |     "                continue\n",
 92 |     "\n",
 93 |     "            if (word == \"Type\"):\n",
 94 |     "                addTitle = False\n",
 95 |     "\n",
 96 |     "#             if (addTexts == True and word == \"\\n\"):\n",
 97 |     "#                 addTexts = False\n",
 98 |     "#                 break\n",
 99 |     "\n",
100 |     "\n",
101 |     "            if (word == \"Abstract\"):\n",
102 |     "                addTexts = True\n",
103 |     "                continue\n",
104 |     "\n",
105 |     "            if(addTitle == True):\n",
106 |     "                title.append(word)\n",
107 |     "\n",
108 |     "            if(addTexts == True):\n",
109 |     "                text.append(word)\n",
110 |     "\n",
111 |     "        for i in range(len(title)):\n",
112 |     "            s = title[i]\n",
113 |     "            table = str.maketrans({key: None for key in string.punctuation})\n",
114 |     "            new_s = s.translate(table)\n",
115 |     "            title[i] = new_s\n",
116 |     "        for i in range(len(text)):\n",
117 |     "            s = text[i]\n",
118 |     "            table = str.maketrans({key: None for key in string.punctuation})\n",
119 |     "            new_s = s.translate(table)\n",
120 |     "            text[i] = new_s\n",
121 |     "\n",
122 |     "        title = ' '.join(title)\n",
123 |     "        text =' '.join(text)\n",
124 |     "        titles.append(word_tokenize(title))\n",
125 |     "        texts.append(word_tokenize(text))\n",
126 |     "\n",
127 |     "    return titles, texts\n",
128 |     "\n",
129 |     "summaries, texts = split()\n",
130 |     "\n"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": 13,
136 |    "metadata": {},
137 |    "outputs": [
138 |     {
139 |      "name": "stdout",
140 |      "output_type": "stream",
141 |      "text": [
142 |       "SAMPLE CLEANED & TOKENIZED TEXT: \n",
143 |       "\n",
144 |       "['Research', 'in', 'Particulate', 'systems', 'Engineering', 'in', 'crucial', 'to', 'advances', 'in', 'combustion', 'atmospheric', 'and', 'environmental', 'sciences', 'nuclear', 'winter', 'studies', 'nuclear', 'reactor', 'safety', 'and', 'materials', 'manufacturing', 'The', 'purpose', 'of', 'this', 'project', 'is', 'to', 'provide', 'research', 'experiences', 'for', 'undergraduate', 'students', 'in', 'this', 'area', 'At', 'least', 'half', 'of', 'the', 'students', 'will', 'be', 'selected', 'from', 'institutions', 'other', 'the', 'UMC', 'with', 'several', 'recruited', 'from', 'Stephens', 'College', 'a', 'womens', 'college', 'Lincoln', 'University', 'HCBU', 'and', 'other', 'undergraduate', 'schools', 'in', 'the', 'State', 'Each', 'student', 'will', 'undertake', 'a', 'specific', 'research', 'project', 'and', 'a', 'faculty', 'advisor', 'participating', 'in', 'the', 'REU', 'program', 'will', 'work', 'with', 'the', 'student', 'to', 'develop', 'hisher', 'research', 'skills', 'An', 'essential', 'element', 'of', 'the', 'program', 'will', 'be', 'its', 'emphasis', 'on', 'professional', 'development', 'of', 'the', 'students', 'eg', 'paper', 'writing', 'and', 'technical', 'presentations', 'The', 'purpose', 'is', 'to', 'infuse', 'bright', 'undergraduate', 'students', 'with', 'an', 'enthusiasm', 'towards', 'research', 'careers', 'including', 'graduate', 'level', 'education', 'The', 'University', 'of', 'MissouriColumbia', 'has', 'made', 'substantial', 'institutional', 'commitments', 'to', 'the', 'program', 'These', 'include', 'support', 'of', 'faculty', 'time', 'waiver', 'of', 'tuition', 'and', 'fees', 'for', 'the', 'student', 'Participants', 'waiver', 'of', 'all', 'overhead', 'costs', 'and', 'stipends', 'for', 'two', 'minority', 'students', 'in', 'addition', 'to', 'those', 'supported', 'by', 'the', 'NSF', 'funds']\n",
145 |       "\n",
146 |       "SAMPLE CLEANED & TOKENIZED SUMMARY: \n",
147 |       "\n",
148 |       "['Research', 'Expereinces', 'for', 'Undergraduate', 'in', 'Particulate', 'Systems', 'Engineering']\n"
149 |      ]
150 |     }
151 |    ],
152 |    "source": [
153 |     "import random\n",
154 |     "\n",
155 |     "index = random.randint(0,len(texts)-1)\n",
156 |     "\n",
157 |     "print (\"SAMPLE CLEANED & TOKENIZED TEXT: \\n\\n\"+str(texts[index]))\n",
158 |     "print (\"\\nSAMPLE CLEANED & TOKENIZED SUMMARY: \\n\\n\"+str(summaries[index]))"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": 14,
164 |    "metadata": {
165 |     "collapsed": true
166 |    },
167 |    "outputs": [],
168 |    "source": [
169 |     "def np_nearest_neighbour(x):\n",
170 |     "    #returns array in embedding that's most similar (in terms of cosine similarity) to x\n",
171 |     "        \n",
172 |     "    xdoty = np.multiply(embedding,x)\n",
173 |     "    xdoty = np.sum(xdoty,1)\n",
174 |     "    xlen = np.square(x)\n",
175 |     "    xlen = np.sum(xlen,0)\n",
176 |     "    xlen = np.sqrt(xlen)\n",
177 |     "    ylen = np.square(embedding)\n",
178 |     "    ylen = np.sum(ylen,1)\n",
179 |     "    ylen = np.sqrt(ylen)\n",
180 |     "    xlenylen = np.multiply(xlen,ylen)\n",
181 |     "    cosine_similarities = np.divide(xdoty,xlenylen)\n",
182 |     "\n",
183 |     "    return embedding[np.argmax(cosine_similarities)]\n",
184 |     "    \n",
185 |     "\n",
186 |     "\n",
187 |     "def word2vec(word):  # converts a given word into its vector representation\n",
188 |     "    if word in vocab:\n",
189 |     "        return embedding[vocab.index(word)]\n",
190 |     "    else:\n",
191 |     "        return embedding[vocab.index('unk')]\n",
192 |     "\n",
193 |     "def vec2word(vec):   # converts a given vector representation into the represented word \n",
194 |     "    for x in xrange(0, len(embedding)):\n",
195 |     "            if np.array_equal(embedding[x],np.asarray(vec)):\n",
196 |     "                return vocab[x]\n",
197 |     "    return vec2word(np_nearest_neighbour(np.asarray(vec)))"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "code",
202 |    "execution_count": 15,
203 |    "metadata": {},
204 |    "outputs": [
205 |     {
206 |      "name": "stdout",
207 |      "output_type": "stream",
208 |      "text": [
209 |       "Vector representation of 'unk':\n",
210 |       "\n",
211 |       "[ -7.91490018e-01   8.66169989e-01   1.19980000e-01   9.22870007e-04\n",
212 |       "   2.77599990e-01  -4.91849989e-01   5.01950026e-01   6.07919996e-04\n",
213 |       "  -2.58450001e-01   1.78650007e-01   2.53500015e-01   7.65720010e-01\n",
214 |       "   5.06640017e-01   4.02500004e-01  -2.13879999e-03  -2.83969998e-01\n",
215 |       "  -5.03239989e-01   3.04490000e-01   5.17790020e-01   1.50899999e-02\n",
216 |       "  -3.50309998e-01  -1.12779999e+00   3.32529992e-01  -3.52499992e-01\n",
217 |       "   4.13260013e-02   1.08630002e+00   3.39099988e-02   3.35640013e-01\n",
218 |       "   4.97449994e-01  -7.01309964e-02  -1.21920002e+00  -4.85119998e-01\n",
219 |       "  -3.85119990e-02  -1.35539994e-01  -1.63800001e-01   5.23209989e-01\n",
220 |       "  -3.13180000e-01  -1.65500000e-01   1.19089998e-01  -1.51150003e-01\n",
221 |       "  -1.56210005e-01  -6.26550019e-01  -6.23359978e-01  -4.21499997e-01\n",
222 |       "   4.18729991e-01  -9.24719989e-01   1.10490000e+00  -2.99959987e-01\n",
223 |       "  -6.30029989e-03   3.95399988e-01]\n"
224 |      ]
225 |     }
226 |    ],
227 |    "source": [
228 |     "word = \"unk\"\n",
229 |     "print (\"Vector representation of '\"+str(word)+\"':\\n\")\n",
230 |     "print (word2vec(word))"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": 16,
236 |    "metadata": {
237 |     "collapsed": true
238 |    },
239 |    "outputs": [],
240 |    "source": [
241 |     "#REDUCE DATA (FOR SPEEDING UP THE NEXT STEPS)\n",
242 |     "\n",
243 |     "MAXIMUM_DATA_NUM = 50000\n",
244 |     "\n",
245 |     "texts = texts[0:MAXIMUM_DATA_NUM]\n",
246 |     "summaries = summaries[0:MAXIMUM_DATA_NUM]"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": 17,
252 |    "metadata": {},
253 |    "outputs": [],
254 |    "source": [
255 |     "vocab_limit = []\n",
256 |     "embd_limit = []\n",
257 |     "\n",
258 |     "i=0\n",
259 |     "for text in texts:\n",
260 |     "    for word in text:\n",
261 |     "        if word not in vocab_limit:\n",
262 |     "            if word in vocab:\n",
263 |     "                vocab_limit.append(word)\n",
264 |     "                embd_limit.append(word2vec(word))"
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": 18,
270 |    "metadata": {
271 |     "collapsed": true
272 |    },
273 |    "outputs": [],
274 |    "source": [
275 |     "for summary in summaries:\n",
276 |     "    for word in summary:\n",
277 |     "        if word not in vocab_limit:\n",
278 |     "            if word in vocab:\n",
279 |     "                vocab_limit.append(word)\n",
280 |     "                embd_limit.append(word2vec(word))"
281 |    ]
282 |   },
283 |   {
284 |    "cell_type": "code",
285 |    "execution_count": 19,
286 |    "metadata": {
287 |     "collapsed": true
288 |    },
289 |    "outputs": [],
290 |    "source": [
291 |     "if 'eos' not in vocab_limit:\n",
292 |     "    vocab_limit.append('eos')\n",
293 |     "    embd_limit.append(word2vec('eos'))\n",
294 |     "if 'unk' not in vocab_limit:\n",
295 |     "    vocab_limit.append('unk')\n",
296 |     "    embd_limit.append(word2vec('unk'))\n",
297 |     "\n",
298 |     "null_vector = np.zeros([word_vec_dim])\n",
299 |     "\n",
300 |     "vocab_limit.append('<PAD>')\n",
301 |     "embd_limit.append(null_vector)"
302 |    ]
303 |   },
304 |   {
305 |    "cell_type": "code",
306 |    "execution_count": 20,
307 |    "metadata": {
308 |     "collapsed": true
309 |    },
310 |    "outputs": [],
311 |    "source": [
312 |     "vec_summaries = []\n",
313 |     "\n",
314 |     "for summary in summaries:\n",
315 |     "    \n",
316 |     "    vec_summary = []\n",
317 |     "    \n",
318 |     "    for word in summary:\n",
319 |     "        vec_summary.append(word2vec(word))\n",
320 |     "            \n",
321 |     "    vec_summary.append(word2vec('eos'))\n",
322 |     "    \n",
323 |     "    vec_summary = np.asarray(vec_summary)\n",
324 |     "    vec_summary = vec_summary.astype(np.float32)\n",
325 |     "    \n",
326 |     "    vec_summaries.append(vec_summary)"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 21,
332 |    "metadata": {
333 |     "collapsed": true
334 |    },
335 |    "outputs": [],
336 |    "source": [
337 |     "vec_texts = []\n",
338 |     "\n",
339 |     "for text in texts:\n",
340 |     "    \n",
341 |     "    vec_text = []\n",
342 |     "    \n",
343 |     "    for word in text:\n",
344 |     "        vec_text.append(word2vec(word))\n",
345 |     "    \n",
346 |     "    vec_text = np.asarray(vec_text)\n",
347 |     "    vec_text = vec_text.astype(np.float32)\n",
348 |     "    \n",
349 |     "    vec_texts.append(vec_text)"
350 |    ]
351 |   },
352 |   {
353 |    "cell_type": "code",
354 |    "execution_count": 22,
355 |    "metadata": {
356 |     "collapsed": true
357 |    },
358 |    "outputs": [],
359 |    "source": [
360 |     "import pickle\n",
361 |     "with open('vocab_limit', 'wb') as fp:\n",
362 |     "    pickle.dump(vocab_limit, fp)\n",
363 |     "with open('embd_limit', 'wb') as fp:\n",
364 |     "    pickle.dump(embd_limit, fp)\n",
365 |     "with open('vec_summaries', 'wb') as fp:\n",
366 |     "    pickle.dump(vec_summaries, fp)\n",
367 |     "with open('vec_texts', 'wb') as fp:\n",
368 |     "    pickle.dump(vec_texts, fp)"
369 |    ]
370 |   },
371 |   {
372 |    "cell_type": "code",
373 |    "execution_count": null,
374 |    "metadata": {
375 |     "collapsed": true
376 |    },
377 |    "outputs": [],
378 |    "source": []
379 |   }
380 |  ],
381 |  "metadata": {
382 |   "kernelspec": {
383 |    "display_name": "Python 3",
384 |    "language": "python",
385 |    "name": "python3"
386 |   },
387 |   "language_info": {
388 |    "codemirror_mode": {
389 |     "name": "ipython",
390 |     "version": 3
391 |    },
392 |    "file_extension": ".py",
393 |    "mimetype": "text/x-python",
394 |    "name": "python",
395 |    "nbconvert_exporter": "python",
396 |    "pygments_lexer": "ipython3",
397 |    "version": "3.6.3"
398 |   }
399 |  },
400 |  "nbformat": 4,
401 |  "nbformat_minor": 2
402 | }
403 | 


--------------------------------------------------------------------------------
/Extractive vs. Abstractive Text Summarization.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/Extractive vs. Abstractive Text Summarization.pdf


--------------------------------------------------------------------------------
/MICHELLEZHAO_CS141_finalreport.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/MICHELLEZHAO_CS141_finalreport.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Extractive vs. Abstractive Text Summarization Methods: An Analysis
  2 | 
  3 | Text summarization solves the problem of condensing information into a more compact form, while maintaining the important information in the text. The methods of automatic text summarization fall into two primary categories: extractive and abstractive. A common approach of extractive summarization involves selecting the most representative sentences that best cover the information expressed by the original text based on a ranking of sentences by relevance. A popular method of abstractive text summarization is using an encoder-decoder structure, which generates a latent factor representation of the data, and decodes it to generate a summary. The goal of the project was to analyze and compare the effectiveness of both methods when applied specifically to scientific texts.
  4 | 
  5 | ## Motivation
  6 | 
  7 | My motivation for this project came from personal experience. As a student in college, I'm often faced with a large number of scientific papers and research articles that pertain to my interests, yet I don't have the time to read them all. I wanted a way to be able to get summaries of the main ideas for the papers, without significant loss of important content. Text summarization is a widely implemented algorithm, but I wanted to explore different text summarization methods applied to scientific writing in particular. 
  8 | 
  9 | ## Introduction
 10 | 
 11 | Automatic text summarization is the process of shortening a text documentation using a system for prioritizing information. Technologies that generate summaries take into account variables such as length, style, and syntax. Text summarization from the perspective of humans is taking a chunk of information and extracting what one deems most important. Automatic text summarization is based on the logical quantification of features of the text including, weighting keywords, and sentence ranking.
 12 | 
 13 | ### Extractive Text Summarization
 14 | Extractive text summarization does not use words aside from the ones already in the text, and selects some combination of the existing words most relevant to the meaning of the source. Techniques of extractive summarization include ranking sentences and phrases in order of importance and selecting the most important components of the document to construct the summary. These methods tend to more robust because they use existing phrases, but lack flexibility since they cannot use new words or paraphrase.
 15 | 
 16 | ### Abstractive Text Summarization
 17 | Abstractive text summarization involves generating entirely new phrases and sentences to capture the meaning of the text. Abstractive methods tend to be more complex, because the machine must read over the text and deem certain concepts to be important, and then learn to construct some cohesive phrasing of the relevant concepts. Abstractive summarization is most similar to how humans summarize, as humans often summarize by paraphrasing. 
 18 | 
 19 | ## Materials and Methods
 20 | Although the primary goal of my project was to be able to summarize entire scientific papers, and essentially create abstracts given papers, a paper was too long of an input text to start with. I decided to first work with generating summaries given abstracts, which are much shorter than entire papers. Essentially, my project can be thought of as generating paper titles, given abstracts. First, I needed a dataset of abstract texts with their corresponding titles.
 21 | 
 22 | I used the NSF Research Award Abstracts 1990-2003 Data Set from the UCI machine learning repository. The dataset consisted of abstracts that had won the NSF research awards from 1990 to 2003, along with the title of the paper. For my abstractive learning, the training input X was the abstract and the title was the training input Y.
 23 | 
 24 | ### Extractive Methods
 25 | For extractive summarization, I used the TextRank algorithm, which is based on Google’s PageRank algorithm. TextRanks works by transforming the text into a graph. It regards words as vertices and the relation between words in phrases or sentences as edges. Each edge also has different weight. When one vertex links to another one, it is basically casting a vote of importance for that vertex. The importance of the vertex also dictates how heavily weighted its votes are. TextRank uses the structure of the text and the known parts of speech for words to assign a score to words that are keywords for the text.
 26 | 
 27 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/algo1.png)
 28 | 
 29 | #### Algorithm: TextRank Algorithm
 30 | 1. Identify filtered text units most representative of the text and add them as vertices to the graph.
 31 | 2. Identify relations that connect such text units, and use these relations to draw edges between vertices in the graph.
 32 | 3. Iterate the graph-based ranking algorithm until convergence.
 33 | 4. Sort vertices based on their final score. Use the values attached to each vertex for ranking/selection decisions.
 34 | 
 35 | 
 36 | First, we take the input text and split the entire text down to individual words. Using a list of stop words, words are filtered so that only nouns and adjectives are considered. Then a graph of words is created where the words are the nodes/vertices. Each vertex’s edges are defined by connections of a word to other words that are close to it in the text. The TextRank algorithm is then run on the graph. Each node is given a weight of 1. Then, we go through the list of nodes and collect the number of edges and connections the word has, which is essentially the influence of the connected vertex. The scores are computed and normalized for every node, and the algorithm takes the top-scoring words that have been identified as important keywords. The algorithm sums up the scores for each of the keywords in all of the sentences, and ranks the sentences in order of score and significance. Finally, the top K sentences are returned to become the TextRank generated summary.
 37 | 
 38 | ### Code for TextRank Reduction
 39 | 
 40 | First, we take the input text and split the entire text down to individual words. Using a list of stop words, words are filtered so that only nouns and adjectives are considered. Then a graph of words is created where the words are the nodes/vertices. Each vertex’s edges are defined by connections of a word to other words that are close to it in the text. The TextRank algorithm is then run on the graph. Each node is given a weight of 1. Then, we go through the list of nodes and collect the number of edges and connections the word has, which is essentially the influence of the connected vertex. The scores are computed and normalized for every node, and the algorithm takes the top-scoring words that have been identified as important keywords. The algorithm sums up the scores for each of the keywords in all of the sentences, and ranks the sentences in order of score and significance. Finally, the top K sentences are returned to become the TextRank generated summary.
 41 | 
 42 | 
 43 | ```python
 44 | def reduce(self, text, reductionRatio):
 45 | 	stopWordsFile = 'stopWords.txt'
 46 | 	stopWords= open(stopWordsFile).read().splitlines()
 47 | 
 48 | 	lines = text.splitlines()
 49 | 	contentLines = filter(lambda w: w.strip() != '', lines)
 50 | 
 51 | 	paragraphs = self.getParagraphs(contentLines, stopWords)
 52 | 	print("paragraphs", paragraphs)
 53 | 
 54 | 	rankedSentences = self.sentenceRank(paragraphs)
 55 | 
 56 | 	orderedSentences = []
 57 | 	for p in paragraphs:
 58 | 		for s in p.Sentences:
 59 | 			orderedSentences.append(s)
 60 | 
 61 | 	reducedSentences = []
 62 | 	i = 0
 63 | 	while i < math.trunc(len(rankedSentences) * reductionRatio):
 64 | 		s = rankedSentences[i][0].Sentence
 65 | 		position = orderedSentences.index(s)
 66 | 		reducedSentences.append((s, position))
 67 | 		i = i + 1
 68 | 	reducedSentences = sorted(reducedSentences, key=lambda x: x[1])
 69 | 
 70 | 	reducedText = []
 71 | 	for s,r in reducedSentences:
 72 | 		reducedText.append(s.getFullSentence())
 73 | 	return reducedText	
 74 | 
 75 | 
 76 | ```
 77 | 
 78 | 
 79 | 
 80 | ### Abstractive Methods
 81 | First, we need to preprocess the data by constructing an embedding of the text. Embedding the input converts the text into numbers, a more interpretable numerical representation of the data for the encoder-decoder network to work with. I experimented with two different embedding methods: Word2Vec and Global-Vectors (GloVe). Word2Vec is algorithm that combines continuous bag of words and the Skip-gram model to generate word vector representations. GloVe is an unsupervised learning algorithm for obtaining vector representations for words, training from a dictionary of common words. 
 82 | 
 83 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/p1.png)
 84 | 
 85 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/p2.png)
 86 | 
 87 | The encoder-decoder model is composed of multiple recurrent neural networks, one of which works as an encoder, and one as a decoder. The encoder converts an input document into a latent representation (a vector), and the decoder reads the latent input, generating a summary as it decodes. With encoder decoder structures, issues to consider include determining how to set the focus on the import sentences and keywords, how to handle novel or rare words in the document, how to handle incredibly long documents, and how to make summaries readable and flexible with a large vocabulary.
 88 | 
 89 | 
 90 | The encoder-decoder recurrent neural network architecture has been shown to be effective when applied to text summarization. The architecture involves two components: an encoder and a decoder. The encoder reads the entire input sequence and encodes it into an internal representation, often a fixed-length vector. The decoder reads the encoded input sequence from the decoder and generates the output sequence, which is the summary. Both the encoder and decoder sub-models are trained jointly, meaning their output feed into the other as input. 
 91 | 
 92 | ```python
 93 | model = Sequential()
 94 | model.add(Embedding(vocab_size, embedding_size,
 95 |                     input_length=maxlen,
 96 |                     W_regularizer=regularizer, weights=[embedding], mask_zero=True,
 97 |                     name='embedding_1'))
 98 | 
 99 | for i in range(rnn_layers):
100 |     lstm = LSTM(40000, return_sequences=True,
101 |                 W_regularizer=regularizer, U_regularizer=regularizer,
102 |                 b_regularizer=regularizer, dropout_W=p_W, dropout_U=p_U,
103 |                 name='lstm_%d'%(i+1)
104 |                   )
105 |     model.add(lstm)
106 | ```
107 | The encoder is a bidirectional LSTM recurrent neural network (RNN). RNNs can use their internal state (memory) to process sequences of inputs. LSTMs are capable of learning long term dependencies by storing long-term states and inputs in gated cell memory. The tokenized words of the text are fed one-by-one into the encoder, a single-layer bidirectional LSTM, producing a sequence of hidden states, which is a latent representation of the input. The decoder is a single-layer unidirectional LSTM, which receives the word embedding of the previous word, and the embedding is transformed into a word representation, which is part of the summary.
108 | 
109 | I used the one-shot encoder-decoder model, where the entire output sequence is generated in a one-shot manner, meaning the decoder uses the latent context vector alone to generate the output summary.
110 | 
111 | Abstractive methods like the encoder-decoder network are capable of generating entirely new phrases and sentences to capture the meaning of the text. They tend to be more complex than extractive methods, since they learn to construct some cohesive phrasing of the relevant concepts. However, this also means they are more susceptible to error.
112 | 
113 | ## Results
114 | ### Extractive Results
115 | The TextRank algorithm generated the following summary. I specified how many sentences to reduce, and generated a 70% reduction summary and a 90% reduction summary which contained the top 3 most important sentences and the top 1 most important sentence, respectively.
116 | 
117 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/p3.png)
118 | 
119 | I summarized this text from one of the scientific paper abstracts. 
120 | #### Text: 
121 | Commercial exploitation over the past two hundred years drove the great Mysticete whales to near extinction. Variation in the sizes of populations prior to exploitation, minimal population size during exploitation and current population sizes permit analyses of the effects of differing levels of exploitation on species with different biogeographical distributions and life-history characteristics. Dr. Stephen Palumbi at the University of Hawaii will study the genetic population structure of three whale species in this context, the Humpback Whale, the Gray Whale and the Bowhead Whale. The effect of demographic history will be determined by comparing the genetic structure of the three species. Additional studies will be carried out on the Humpback Whale. The humpback has a world-wide distribution, but the Atlantic and Pacific populations of the northern hemisphere appear to be discrete populations, as is the population of the southern hemispheric oceans. Each of these oceanic populations may be further subdivided into smaller isolates, each with its own migratory pattern and somewhat distinct gene pool. This study will provide information on the level of genetic isolation among populations and the levels of gene flow and genealogical relationships among populations. This detailed genetic information will facilitate international policy decisions regarding the conservation and management of these magnificent mammals.
122 | 
123 | 
124 | #### Summary:
125 | We can specific how many sentences to output and define what percentage of reduction of the text we will perform. 
126 | 
127 | ##### 70% Reduction:
128 | Variation in the sizes of populations prior to exploitation, minimal population size during exploitation and current population sizes permit analyses of the effects of differing levels of exploitation on species with different biogeographical distributions and life-history characteristics. Stephen Palumbi at the University of Hawaii will study the genetic population structure of three whale species in this context, the Humpback Whale, the Gray Whale and the Bowhead Whale. This study will provide information on the level of genetic isolation among populations and the levels of gene flow and genealogical relationships among populations.
129 | 
130 | ##### 90% Reduction:
131 | Stephen Palumbi at the University of Hawaii will study the genetic population structure of three whale species in this context, the Humpback Whale, the Gray Whale and the Bowhead Whale.
132 | 
133 | ### Abstractive Results
134 | As part of the pre-processing analysis, ranking the words in order of number of appearances, we saw this distribution of keywords and their frequencies in the training data. The distribution of set of text input words is much larger and wider than that of words in the summaries.
135 | 
136 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/p5.png)
137 | 
138 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/p6.png)
139 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/p7.png)
140 | 
141 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/p4.png)
142 | 
143 | The encoder decoder network generated the following two summaries on the testing input.
144 | #### Text:
145 | Proposal seeks to demonstrate a technique for observing ocean currents by electric field measurements using a towed instrument of recent design measurements will be made in conjunction with a cruise across the in which several additional observational techniques will be employed several data types will be to improve the accuracy of the methods.
146 | 
147 | #### Summary:
148 | ##### Summary #1
149 | Drum frame multidisciplinary
150 | 
151 | ##### Summary #2
152 | Extension solver bearing.
153 | 
154 | Experimenting with generated longer summaries to discover whether a longer generated summary would be a better summary for the information, I reran the encoder-decoder network to get a 4-word summary: 
155 | 
156 | #### Summary #3
157 | Exceptional geology goal visited
158 | 
159 | This summary is much better than the 3-word summaries, which poses an interesting question: what is the relationship between length of summary and quality of summary generated by encoder-decoder structures?
160 | 
161 | 
162 | 
163 | ### Discussion
164 | TextRank selected the two most significant sentences in the text. E-D generated two different three-word summaries, using words not present in the text, but the summaries generated were not representative of the text and did not make logical sense.
165 | By analyzing the two summaries generated by the encoder-decoder network, I found qualitatively that the extractive summarizer worked better than the abstractive text summarizer. The summaries generated by the extractive method were more understandable and representatively of the text than the abstractive summaries. The extractive summaries were generated much quicker than the abstractive ones. The TextRank algorithm took about 2 seconds to generate a summary, while the encoder-decoder network took about 15 minutes to train.
166 | The encoder decoder network performed rather poorly in comparison to the extractive method. This may have been because the encoder-decoder network didn’t have enough training. If the encoder-decoder network perhaps have had more epochs of training, it would have performed better. The training input may have also been too small. 
167 | 
168 | 
169 | ### Conclusions
170 | In conclusion, the TextRank summarization method was very effective in choosing important sentences. As a further extension to the TextRank algorithm, it would be worthwhile to experiment with more ways of choosing “connections” from words to other words. Instead of using the proximity of words to other words in sentences, there may be other ways to measure connections between words, such as using proximity to other words with high connection to the word in question.
171 | The encoder-decoder network was found to be less effective than TextRank, likely because abstractive methods in general are less flexible and more susceptible to error than extractive methods, especially since words not in the text can be used. The next steps in improving the encoder-decoder network are to train on a larger training set, experiment with model hyper-parameters, use beam search, and explore different preprocessing methods.
172 | 
173 | 
174 | ### References
175 | 1.Jing, Hongyan. “Sentence Reduction for Automatic Text Summarization.” Proceedings of the Sixth Conference on Applied Natural Language Processing -, 2000, doi:10.3115/974147.974190.
176 | 2.Garg, Sneh, and Sunil Chhillar. “Review of Text Reduction Algorithms and Text Reduction Using Sentence Vectorization.” International Journal of Computer Applications, vol. 107, no. 12, 2014, pp. 39–42., doi:10.5120/18806-0380.
177 | 3.JRC1995. “JRC1995/Abstractive-Summarization.” GitHub, github.com/JRC1995/Abstractive-Summarization/blob/master/Summarization_model.ipynb.
178 | 4.“A Gentle Introduction to Text Summarization.” Machine Learning Mastery, 21 Nov. 2017, machinelearningmastery.com/gentle-introduction-text-summarization/.
179 | 5.“A Survey of Relevant Text Content Summarization Techniques.” International Journal of Science and Research (IJSR), vol. 5, no. 1, 2016, pp. 129–132., doi:10.21275/v5i1.nov152644.
180 | 6.“Text Summarization in Python: Extractive vs. Abstractive Techniques Revisited.” Pragmatic Machine Learning, rare-technologies.com/text-summarization-in-python-extractive-vs-abstractive-techniques-revisited/.
181 | 7.“Neural Machine Translation (seq2seq) Tutorial | TensorFlow.” TensorFlow, www.tensorflow.org/tutorials/seq2seq.
182 | 8.“Encoder-Decoder Long Short-Term Memory Networks.” Machine Learning Mastery, 20 July 2017, machinelearningmastery.com/encoder-decoder-long-short-term-memory-networks/.
183 | 9.Dalalkrish. “Dalalkrish/Text-Summarization-Keras.” GitHub, github.com/dalalkrish/text-summarization-keras/blob/master/Text_Summarization.ipynb.
184 | 10.“Text Summarization with TensorFlow.” Google AI Blog, 24 Aug. 2016, ai.googleblog.com/2016/08/text-summarization-with-tensorflow.html.
185 | 11.llSourcell. “LlSourcell/How_to_make_a_text_summarizer.” GitHub, github.com/llSourcell/How_to_make_a_text_summarizer/blob/master/train.ipynb.
186 | 
187 | 
188 | 
189 | ### Contact
190 | 
191 | Michelle Zhao
192 | mzhao@caltech.edu
193 | 
194 | 


--------------------------------------------------------------------------------
/Reduction.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 2,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import re, pdb, sys, math\n",
 12 |     "from collections import defaultdict"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 3,
 18 |    "metadata": {
 19 |     "collapsed": true
 20 |    },
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "class Graph:\n",
 24 |     "    def __init__(self):\n",
 25 |     "        self.Vertices = []\n",
 26 |     "        self.Edges = []\n",
 27 |     "\n",
 28 |     "    def getRankedVertices(self):\n",
 29 |     "        res = defaultdict(float)\n",
 30 |     "        for e in self.Edges:\n",
 31 |     "            res[e.Vertex1] += e.Weight\n",
 32 |     "        return sorted(res.items(), key=lambda x: x[1], reverse=True)"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 4,
 38 |    "metadata": {
 39 |     "collapsed": true
 40 |    },
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "class Vertex:\n",
 44 |     "    def __init__(self):\n",
 45 |     "        self.Sentence = None"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": 5,
 51 |    "metadata": {
 52 |     "collapsed": true
 53 |    },
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "class Edge:\n",
 57 |     "    def __init__(self):\n",
 58 |     "        self.Vertex1 = None\n",
 59 |     "        self.Vertex2 = None\n",
 60 |     "        self.Weight = 0"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 6,
 66 |    "metadata": {
 67 |     "collapsed": true
 68 |    },
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "class WordType:\n",
 72 |     "    Content=0\n",
 73 |     "    Function=1\n",
 74 |     "    ContentPunctuation=2\n",
 75 |     "    FunctionPunctuation=3"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 7,
 81 |    "metadata": {
 82 |     "collapsed": true
 83 |    },
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "class Word:\n",
 87 |     "    def __init__(self):\n",
 88 |     "        self.Text=''\n",
 89 |     "        self.Type=''"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 8,
 95 |    "metadata": {
 96 |     "collapsed": true
 97 |    },
 98 |    "outputs": [],
 99 |    "source": [
100 |     "class Sentence:\n",
101 |     "    def __init__(self):\n",
102 |     "        self.Words = []\n",
103 |     "\n",
104 |     "    def getFullSentence(self):\n",
105 |     "        text = ''\n",
106 |     "        for w in self.Words:\n",
107 |     "            text += w.Text\n",
108 |     "        return text.strip()\n",
109 |     "\n",
110 |     "    def getReducedSentence(self):\n",
111 |     "        sentenceText = ''\n",
112 |     "        sentenceEnd = self.Words[len(self.Words)-1]\n",
113 |     "        contentWords = filter(lambda w: w.Type == WordType.Content, self.Words)\n",
114 |     "        i = 0\n",
115 |     "        while i < len(contentWords):\n",
116 |     "            w = contentWords[i]\n",
117 |     "            # upper case the first character of the sentence\n",
118 |     "            if i == 0:\n",
119 |     "                li = list(w.Text)\n",
120 |     "                li[0] = li[0].upper()\n",
121 |     "                w.Text = ''.join(li)\n",
122 |     "            sentenceText += w.Text\n",
123 |     "            if i < len(contentWords)-1:\n",
124 |     "                sentenceText += ' '\n",
125 |     "            elif sentenceEnd.Text != w.Text:\n",
126 |     "                sentenceText += sentenceEnd.Text\n",
127 |     "            i = i+1\n",
128 |     "        return sentenceText\n",
129 |     "\n"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 9,
135 |    "metadata": {
136 |     "collapsed": true
137 |    },
138 |    "outputs": [],
139 |    "source": [
140 |     "class Paragraph:\n",
141 |     "    def __init__(self):\n",
142 |     "        self.Sentences = []"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": 16,
148 |    "metadata": {
149 |     "collapsed": true
150 |    },
151 |    "outputs": [],
152 |    "source": [
153 |     "class Reduction:\n",
154 |     "    functionPunctuation = ' ,-'\n",
155 |     "    contentPunctuation = '.?!\\n'\n",
156 |     "    punctuationCharacters = functionPunctuation+contentPunctuation\n",
157 |     "    sentenceEndCharacters = '.?!'\n",
158 |     "\n",
159 |     "    def isContentPunctuation(self, text):\n",
160 |     "        for c in self.contentPunctuation:\n",
161 |     "            if text.lower() == c.lower():\n",
162 |     "                return True\n",
163 |     "        return False\n",
164 |     "\n",
165 |     "    def isFunctionPunctuation(self, text):\n",
166 |     "        for c in self.functionPunctuation:\n",
167 |     "            if text.lower() == c.lower():\n",
168 |     "                return True\n",
169 |     "        return False\n",
170 |     "\n",
171 |     "    def isFunction(self, text, stopWords):\n",
172 |     "        for w in stopWords:\n",
173 |     "            if text.lower() == w.lower():\n",
174 |     "                return True\n",
175 |     "        return False\n",
176 |     "\n",
177 |     "    def tag(self, sampleWords, stopWords):\n",
178 |     "        taggedWords = []\n",
179 |     "        for w in sampleWords:\n",
180 |     "            tw = Word()\n",
181 |     "            tw.Text = w\n",
182 |     "            if self.isContentPunctuation(w):\n",
183 |     "                tw.Type = WordType.ContentPunctuation\n",
184 |     "            elif self.isFunctionPunctuation(w):\n",
185 |     "                tw.Type = WordType.FunctionPunctuation\n",
186 |     "            elif self.isFunction(w, stopWords):\n",
187 |     "                tw.Type = WordType.Function\n",
188 |     "            else:\n",
189 |     "                tw.Type = WordType.Content\n",
190 |     "            taggedWords.append(tw)\n",
191 |     "        return taggedWords\n",
192 |     "\n",
193 |     "    def tokenize(self, text):\n",
194 |     "        return filter(lambda w: w != '', re.split('([{0}])'.format(self.punctuationCharacters), text))\t\n",
195 |     "\n",
196 |     "    def getWords(self, sentenceText, stopWords):\n",
197 |     "        return self.tag(self.tokenize(sentenceText), stopWords) \n",
198 |     "\n",
199 |     "    def getSentences(self, line, stopWords):\n",
200 |     "        sentences = []\n",
201 |     "        sentenceTexts = filter(lambda w: w.strip() != '', re.split('[{0}]'.format(self.sentenceEndCharacters), line))\t\n",
202 |     "        sentenceEnds = re.findall('[{0}]'.format(self.sentenceEndCharacters), line)\n",
203 |     "        sentenceEnds.reverse()\n",
204 |     "        for t in sentenceTexts:\n",
205 |     "            if len(sentenceEnds) > 0:\n",
206 |     "                t += sentenceEnds.pop()\n",
207 |     "            sentence = Sentence()\n",
208 |     "            sentence.Words = self.getWords(t, stopWords)\n",
209 |     "            sentences.append(sentence)\n",
210 |     "        return sentences\n",
211 |     "\n",
212 |     "    def getParagraphs(self, lines, stopWords):\n",
213 |     "        paragraphs = []\n",
214 |     "        for line in lines:\n",
215 |     "            paragraph = Paragraph()\n",
216 |     "            paragraph.Sentences = self.getSentences(line, stopWords)\n",
217 |     "            paragraphs.append(paragraph)\n",
218 |     "        return paragraphs\n",
219 |     "\n",
220 |     "    def findWeight(self, sentence1, sentence2):\n",
221 |     "        length1 = len(list(filter(lambda w: w.Type == WordType.Content, sentence1.Words)))\n",
222 |     "        length2 = len(list(filter(lambda w: w.Type == WordType.Content, sentence2.Words)))\n",
223 |     "        if length1 < 4 or length2 < 4:\n",
224 |     "            return 0\n",
225 |     "        weight = 0\n",
226 |     "        for w1 in filter(lambda w: w.Type == WordType.Content, sentence1.Words):\n",
227 |     "            for w2 in filter(lambda w: w.Type == WordType.Content, sentence2.Words):\n",
228 |     "                if w1.Text.lower() == w2.Text.lower():\n",
229 |     "                    weight = weight + 1\n",
230 |     "        normalised1 = 0\n",
231 |     "        if length1 > 0:\n",
232 |     "            normalised1 = math.log(length1)\n",
233 |     "        normalised2 = 0\n",
234 |     "        if length2 > 0:\n",
235 |     "            normalised2 = math.log(length2)\n",
236 |     "        norm = normalised1 + normalised2\n",
237 |     "        if norm == 0:\n",
238 |     "            return 0\n",
239 |     "        return weight / float(norm)\n",
240 |     "\n",
241 |     "    def buildGraph(self, sentences):\n",
242 |     "        g = Graph()\n",
243 |     "        for s in sentences:\n",
244 |     "            v = Vertex()\n",
245 |     "            v.Sentence = s\n",
246 |     "            g.Vertices.append(v)\n",
247 |     "        for i in g.Vertices:\n",
248 |     "            for j in g.Vertices:\n",
249 |     "                if i != j:\n",
250 |     "                    w = self.findWeight(i.Sentence, j.Sentence)\n",
251 |     "                    e = Edge()\n",
252 |     "                    e.Vertex1 = i\n",
253 |     "                    e.Vertex2 = j\n",
254 |     "                    e.Weight = w\n",
255 |     "                    g.Edges.append(e)\n",
256 |     "        return g\n",
257 |     "\n",
258 |     "    def sentenceRank(self, paragraphs):\n",
259 |     "        sentences = []\n",
260 |     "        for p in paragraphs:\n",
261 |     "            for s in p.Sentences:\n",
262 |     "                sentences.append(s)\n",
263 |     "        g = self.buildGraph(sentences)\n",
264 |     "        return g.getRankedVertices()\n",
265 |     "\n",
266 |     "    def reduce(self, text, reductionRatio):\n",
267 |     "        stopWordsFile = 'stopWords.txt'\n",
268 |     "        stopWords= open(stopWordsFile).read().splitlines()\n",
269 |     "\n",
270 |     "        lines = text.splitlines()\n",
271 |     "        print(\"lines\", lines)\n",
272 |     "        contentLines = filter(lambda w: w.strip() != '', lines)\n",
273 |     "        print(\"contentLines\", contentLines)\n",
274 |     "\n",
275 |     "        paragraphs = self.getParagraphs(contentLines, stopWords)\n",
276 |     "        print(\"paragraphs\", paragraphs)\n",
277 |     "\n",
278 |     "        rankedSentences = self.sentenceRank(paragraphs)\n",
279 |     "\n",
280 |     "        orderedSentences = []\n",
281 |     "        for p in paragraphs:\n",
282 |     "            for s in p.Sentences:\n",
283 |     "                orderedSentences.append(s)\n",
284 |     "\n",
285 |     "        reducedSentences = []\n",
286 |     "        i = 0\n",
287 |     "        while i < math.trunc(len(rankedSentences) * reductionRatio):\n",
288 |     "            s = rankedSentences[i][0].Sentence\n",
289 |     "            position = orderedSentences.index(s)\n",
290 |     "            reducedSentences.append((s, position))\n",
291 |     "            i = i + 1\n",
292 |     "        reducedSentences = sorted(reducedSentences, key=lambda x: x[1])\n",
293 |     "\n",
294 |     "        reducedText = []\n",
295 |     "        for s,r in reducedSentences:\n",
296 |     "            reducedText.append(s.getFullSentence())\n",
297 |     "        return reducedText\t"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "code",
302 |    "execution_count": 17,
303 |    "metadata": {},
304 |    "outputs": [
305 |     {
306 |      "name": "stdout",
307 |      "output_type": "stream",
308 |      "text": [
309 |       "lines [': Commercial exploitation over the past two hundred years drove the great Mysticete whales to near extinction. Variation in the sizes of populations prior to exploitation, minimal population size during exploitation and current population sizes permit analyses of the effects of differing levels of exploitation on species with different biogeographical distributions and life-history characteristics. Dr. Stephen Palumbi at the University of Hawaii will study the genetic population structure of three whale species in this context, the Humpback Whale, the Gray Whale and the Bowhead Whale. The effect of demographic history will be determined by comparing the genetic structure of the three species. Additional studies will be carried out on the Humpback Whale. The humpback has a world-wide distribution, but the Atlantic and Pacific populations of the northern hemisphere appear to be discrete populations, as is the population of the southern hemispheric oceans. Each of these oceanic populations may be further subdivided into smaller isolates, each with its own migratory pattern and somewhat distinct gene pool. This study will provide information on the level of genetic isolation among populations and the levels of gene flow and genealogical relationships among populations. This detailed genetic information will facilitate international policy decisions regarding the conservation and management of these magnificent mammals.']\n",
310 |       "contentLines <filter object at 0x105afef98>\n",
311 |       "paragraphs [<__main__.Paragraph object at 0x105b0e0f0>]\n",
312 |       "['Stephen Palumbi at the University of Hawaii will study the genetic population structure of three whale species in this context, the Humpback Whale, the Gray Whale and the Bowhead Whale.']\n"
313 |      ]
314 |     }
315 |    ],
316 |    "source": [
317 |     "import string\n",
318 |     "reduction = Reduction()\n",
319 |     "filename = 'Part1/awards_1990/awd_1990_00/a9000006.txt'\n",
320 |     "f = open(filename)\n",
321 |     "addTitle = False\n",
322 |     "addTexts = False\n",
323 |     "title = []\n",
324 |     "text = []\n",
325 |     "for word in f.read().split():\n",
326 |     "    if (word == \"Title\"):\n",
327 |     "        addTitle = True\n",
328 |     "        continue\n",
329 |     "\n",
330 |     "    if (word == \"\\n\"):\n",
331 |     "        addTitle = False\n",
332 |     "\n",
333 |     "    if (addTexts == True and word == \"\\n\"):\n",
334 |     "        addTexts = False\n",
335 |     "        break\n",
336 |     "\n",
337 |     "\n",
338 |     "    if (word == \"Abstract\"):\n",
339 |     "        addTexts = True\n",
340 |     "        continue\n",
341 |     "\n",
342 |     "    if(addTitle == True):\n",
343 |     "        title.append(word)\n",
344 |     "\n",
345 |     "    if(addTexts == True):\n",
346 |     "        text.append(word)\n",
347 |     "\n",
348 |     "\n",
349 |     "title = ' '.join(title)\n",
350 |     "text =' '.join(text)\n",
351 |     "\n",
352 |     "reduction_ratio = 0.1\n",
353 |     "reduced_text = reduction.reduce(text, reduction_ratio)\n",
354 |     "\n",
355 |     "#output = open('output.txt')\n",
356 |     "print(reduced_text)\n"
357 |    ]
358 |   },
359 |   {
360 |    "cell_type": "code",
361 |    "execution_count": 12,
362 |    "metadata": {},
363 |    "outputs": [
364 |     {
365 |      "name": "stdout",
366 |      "output_type": "stream",
367 |      "text": [
368 |       ": Commercial exploitation over the past two hundred years drove the great Mysticete whales to near extinction. Variation in the sizes of populations prior to exploitation, minimal population size during exploitation and current population sizes permit analyses of the effects of differing levels of exploitation on species with different biogeographical distributions and life-history characteristics. Dr. Stephen Palumbi at the University of Hawaii will study the genetic population structure of three whale species in this context, the Humpback Whale, the Gray Whale and the Bowhead Whale. The effect of demographic history will be determined by comparing the genetic structure of the three species. Additional studies will be carried out on the Humpback Whale. The humpback has a world-wide distribution, but the Atlantic and Pacific populations of the northern hemisphere appear to be discrete populations, as is the population of the southern hemispheric oceans. Each of these oceanic populations may be further subdivided into smaller isolates, each with its own migratory pattern and somewhat distinct gene pool. This study will provide information on the level of genetic isolation among populations and the levels of gene flow and genealogical relationships among populations. This detailed genetic information will facilitate international policy decisions regarding the conservation and management of these magnificent mammals.\n"
369 |      ]
370 |     }
371 |    ],
372 |    "source": [
373 |     "print(text)"
374 |    ]
375 |   },
376 |   {
377 |    "cell_type": "code",
378 |    "execution_count": null,
379 |    "metadata": {
380 |     "collapsed": true
381 |    },
382 |    "outputs": [],
383 |    "source": []
384 |   }
385 |  ],
386 |  "metadata": {
387 |   "kernelspec": {
388 |    "display_name": "Python 3",
389 |    "language": "python",
390 |    "name": "python3"
391 |   },
392 |   "language_info": {
393 |    "codemirror_mode": {
394 |     "name": "ipython",
395 |     "version": 3
396 |    },
397 |    "file_extension": ".py",
398 |    "mimetype": "text/x-python",
399 |    "name": "python",
400 |    "nbconvert_exporter": "python",
401 |    "pygments_lexer": "ipython3",
402 |    "version": "3.6.3"
403 |   }
404 |  },
405 |  "nbformat": 4,
406 |  "nbformat_minor": 2
407 | }
408 | 


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-time-machine


--------------------------------------------------------------------------------
/algo1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/algo1.png


--------------------------------------------------------------------------------
/cs141_final_poster_toSize.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/cs141_final_poster_toSize.pptx


--------------------------------------------------------------------------------
/embedding2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 3,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "ename": "SyntaxError",
 10 |      "evalue": "invalid syntax (config.py, line 733)",
 11 |      "output_type": "error",
 12 |      "traceback": [
 13 |       "Traceback \u001b[0;36m(most recent call last)\u001b[0m:\n",
 14 |       "  File \u001b[1;32m\"/Users/michellezhao/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py\"\u001b[0m, line \u001b[1;32m2862\u001b[0m, in \u001b[1;35mrun_code\u001b[0m\n    exec(code_obj, self.user_global_ns, self.user_ns)\n",
 15 |       "\u001b[0;36m  File \u001b[0;32m\"<ipython-input-3-e9cb4ada674a>\"\u001b[0;36m, line \u001b[0;32m7\u001b[0;36m, in \u001b[0;35m<module>\u001b[0;36m\u001b[0m\n\u001b[0;31m    import config\u001b[0m\n",
 16 |       "\u001b[0;36m  File \u001b[0;32m\"/Users/michellezhao/anaconda3/lib/python3.6/site-packages/config.py\"\u001b[0;36m, line \u001b[0;32m733\u001b[0m\n\u001b[0;31m    except Exception, e:\u001b[0m\n\u001b[0m                    ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
 17 |      ]
 18 |     }
 19 |    ],
 20 |    "source": [
 21 |     "\"\"\"Generate intial word embedding for headlines and description.\n",
 22 |     "\n",
 23 |     "The embedding is limited to a fixed vocabulary size (`vocab_size`) but\n",
 24 |     "a vocabulary of all the words that appeared in the data is built.\n",
 25 |     "\"\"\"\n",
 26 |     "from os import path\n",
 27 |     "import config\n",
 28 |     "import _pickle as pickle\n",
 29 |     "from collections import Counter\n",
 30 |     "import numpy as np\n",
 31 |     "\n",
 32 |     "from prep_data import plt\n",
 33 |     "\n",
 34 |     "# static vars\n",
 35 |     "FN = 'vocabulary-embedding'\n",
 36 |     "seed = 42\n",
 37 |     "vocab_size = 40000\n",
 38 |     "embedding_dim = 100\n",
 39 |     "lower = False\n",
 40 |     "\n",
 41 |     "# index words\n",
 42 |     "empty = 0  # RNN mask of no data\n",
 43 |     "eos = 1  # end of sentence\n",
 44 |     "start_idx = eos + 1  # first real word\n",
 45 |     "\n",
 46 |     "# set random seed\n",
 47 |     "np.random.seed(seed)\n",
 48 |     "\n",
 49 |     "\n",
 50 |     "def build_vocab(lst):\n",
 51 |     "    \"\"\"Return vocabulary for iterable `lst`.\"\"\"\n",
 52 |     "    vocab_count = Counter(w for txt in lst for w in txt.split())\n",
 53 |     "    vocab = list(map(lambda x: x[0], sorted(vocab_count.items(), key=lambda x: -x[1])))\n",
 54 |     "    return vocab, vocab_count\n",
 55 |     "\n",
 56 |     "\n",
 57 |     "def load_text():\n",
 58 |     "    \"\"\"Return vocabulary for pickled headlines and descriptions.\"\"\"\n",
 59 |     "    # read tokenized headlines and descriptions\n",
 60 |     "    with open(path.join(config.path_data, 'tokens.pkl'), 'rb') as fp:\n",
 61 |     "        headlines, desc = pickle.load(fp)\n",
 62 |     "\n",
 63 |     "    # map headlines and descriptions to lower case\n",
 64 |     "    if lower:\n",
 65 |     "        headlines = [h.lower() for h in headlines]\n",
 66 |     "        desc = [h.lower() for h in desc]\n",
 67 |     "\n",
 68 |     "    return headlines, desc\n",
 69 |     "\n",
 70 |     "\n",
 71 |     "def print_most_popular_tokens(vocab):\n",
 72 |     "    \"\"\"Print th most popular tokens in vocabulary dictionary `vocab`.\"\"\"\n",
 73 |     "    print('Most popular tokens:')\n",
 74 |     "    print(vocab[:50])\n",
 75 |     "    print('Total vocab size: {:,}'.format(len(vocab)))\n",
 76 |     "\n",
 77 |     "\n",
 78 |     "def plot_word_distributions(vocab, vocab_count):\n",
 79 |     "    \"\"\"Plot word distribution in headlines and discription.\"\"\"\n",
 80 |     "    plt.plot([vocab_count[w] for w in vocab])\n",
 81 |     "    plt.gca().set_xscale(\"log\", nonposx='clip')\n",
 82 |     "    plt.gca().set_yscale(\"log\", nonposy='clip')\n",
 83 |     "    title = 'word distribution in headlines and discription'\n",
 84 |     "    plt.title(title)\n",
 85 |     "    plt.xlabel('rank')\n",
 86 |     "    plt.ylabel('total appearances')\n",
 87 |     "    plt.savefig(path.join(config.path_outputs, '{}.png'.format(title)))\n",
 88 |     "\n",
 89 |     "\n",
 90 |     "def get_idx(vocab):\n",
 91 |     "    \"\"\"Add empty and end-of-sentence tokens to vocabulary and return tuple (vocabulary, reverse-vocabulary).\"\"\"\n",
 92 |     "    word2idx = dict((word, idx + start_idx) for idx, word in enumerate(vocab))\n",
 93 |     "    word2idx['<empty>'] = empty\n",
 94 |     "    word2idx['<eos>'] = eos\n",
 95 |     "    idx2word = dict((idx, word) for word, idx in word2idx.items())\n",
 96 |     "    return word2idx, idx2word\n",
 97 |     "\n",
 98 |     "\n",
 99 |     "def get_glove():\n",
100 |     "    \"\"\"Load GloVe embedding weights and indices.\"\"\"\n",
101 |     "    glove_name = path.join(config.path_data, 'glove.6B.{}d.txt'.format(embedding_dim))\n",
102 |     "    glove_n_symbols = sum(1 for line in open(glove_name))\n",
103 |     "    print('{:,} GloVe symbols'.format(glove_n_symbols))\n",
104 |     "\n",
105 |     "    # load embedding weights and index dictionary\n",
106 |     "    glove_index_dict = {}\n",
107 |     "    glove_embedding_weights = np.empty((glove_n_symbols, embedding_dim))\n",
108 |     "    globale_scale = .1\n",
109 |     "    with open(glove_name, 'r') as fp:\n",
110 |     "        i = 0\n",
111 |     "        for l in fp:\n",
112 |     "            l = l.strip().split()\n",
113 |     "            w = l[0]\n",
114 |     "            glove_index_dict[w] = i\n",
115 |     "            glove_embedding_weights[i, :] = list(map(float, l[1:]))\n",
116 |     "            i += 1\n",
117 |     "    glove_embedding_weights *= globale_scale\n",
118 |     "    print('GloVe std dev: {:.4f}'.format(glove_embedding_weights.std()))\n",
119 |     "\n",
120 |     "    # add lower case version of the keys to the dict\n",
121 |     "    for w, i in glove_index_dict.items():\n",
122 |     "        w = w.lower()\n",
123 |     "        if w not in glove_index_dict:\n",
124 |     "            glove_index_dict[w] = i\n",
125 |     "\n",
126 |     "    return glove_embedding_weights, glove_index_dict\n",
127 |     "\n",
128 |     "\n",
129 |     "def initialize_embedding(vocab_size, embedding_dim, glove_embedding_weights):\n",
130 |     "    \"\"\"Use GloVe to initialize random embedding matrix with same scale as glove.\"\"\"\n",
131 |     "    shape = (vocab_size, embedding_dim)\n",
132 |     "    scale = glove_embedding_weights.std() * np.sqrt(12) / 2  # uniform and not normal\n",
133 |     "    embedding = np.random.uniform(low=-scale, high=scale, size=shape)\n",
134 |     "    print('random-embedding/glove scale: {:.4f} std: {:.4f}'.format(scale, embedding.std()))\n",
135 |     "    return embedding\n",
136 |     "\n",
137 |     "\n",
138 |     "def copy_glove_weights(embedding, idx2word, glove_embedding_weights, glove_index_dict):\n",
139 |     "    \"\"\"Copy from glove weights of words that appear in our short vocabulary (idx2word).\"\"\"\n",
140 |     "    c = 0\n",
141 |     "    for i in range(vocab_size):\n",
142 |     "        w = idx2word[i]\n",
143 |     "        g = glove_index_dict.get(w, glove_index_dict.get(w.lower()))\n",
144 |     "        if g is None and w.startswith('#'):  # glove has no hastags (I think...)\n",
145 |     "            w = w[1:]\n",
146 |     "            g = glove_index_dict.get(w, glove_index_dict.get(w.lower()))\n",
147 |     "        if g is not None:\n",
148 |     "            embedding[i, :] = glove_embedding_weights[g, :]\n",
149 |     "            c += 1\n",
150 |     "    print('number of tokens, in small vocab: {:,} found in glove and copied to embedding: {:.4f}'.format(c, c / float(vocab_size)))\n",
151 |     "    return embedding\n",
152 |     "\n",
153 |     "\n",
154 |     "def build_word_to_glove(embedding, word2idx, idx2word, glove_index_dict, glove_embedding_weights):\n",
155 |     "    \"\"\"Map full vocabulary to glove based on cosine distance.\"\"\"\n",
156 |     "    glove_thr = 0.5\n",
157 |     "    word2glove = {}\n",
158 |     "    for w in word2idx:\n",
159 |     "        if w in glove_index_dict:\n",
160 |     "            g = w\n",
161 |     "        elif w.lower() in glove_index_dict:\n",
162 |     "            g = w.lower()\n",
163 |     "        elif w.startswith('#') and w[1:] in glove_index_dict:\n",
164 |     "            g = w[1:]\n",
165 |     "        elif w.startswith('#') and w[1:].lower() in glove_index_dict:\n",
166 |     "            g = w[1:].lower()\n",
167 |     "        else:\n",
168 |     "            continue\n",
169 |     "        word2glove[w] = g\n",
170 |     "\n",
171 |     "    # for every word outside the embedding matrix find the closest word inside the mebedding matrix.\n",
172 |     "    # Use cos distance of GloVe vectors.\n",
173 |     "    # Allow for the last `nb_unknown_words` words inside the embedding matrix to be considered to be outside.\n",
174 |     "    # Dont accept distances below `glove_thr`\n",
175 |     "    normed_embedding = embedding / np.array(\n",
176 |     "        [np.sqrt(np.dot(gweight, gweight)) for gweight in embedding])[:, None]\n",
177 |     "\n",
178 |     "    nb_unknown_words = 100\n",
179 |     "\n",
180 |     "    glove_match = []\n",
181 |     "    for w, idx in word2idx.items():\n",
182 |     "        if idx >= vocab_size - nb_unknown_words and w.isalpha() and w in word2glove:\n",
183 |     "            gidx = glove_index_dict[word2glove[w]]\n",
184 |     "            gweight = glove_embedding_weights[gidx, :].copy()\n",
185 |     "\n",
186 |     "            # find row in embedding that has the highest cos score with gweight\n",
187 |     "            gweight /= np.sqrt(np.dot(gweight, gweight))\n",
188 |     "            score = np.dot(normed_embedding[:vocab_size - nb_unknown_words], gweight)\n",
189 |     "            while True:\n",
190 |     "                embedding_idx = score.argmax()\n",
191 |     "                s = score[embedding_idx]\n",
192 |     "                if s < glove_thr:\n",
193 |     "                    break\n",
194 |     "                if idx2word[embedding_idx] in word2glove:\n",
195 |     "                    glove_match.append((w, embedding_idx, s))\n",
196 |     "                    break\n",
197 |     "                score[embedding_idx] = -1\n",
198 |     "\n",
199 |     "    glove_match.sort(key=lambda x: -x[2])\n",
200 |     "    print()\n",
201 |     "    print('# of GloVe substitutes found: {:,}'.format(len(glove_match)))\n",
202 |     "\n",
203 |     "    # manually check that the worst substitutions we are going to do are good enough\n",
204 |     "    for orig, sub, score in glove_match[-10:]:\n",
205 |     "        print('{:.4f}'.format(score), orig, '=>', idx2word[sub])\n",
206 |     "\n",
207 |     "    # return a lookup table of index of outside words to index of inside words\n",
208 |     "    return dict((word2idx[w], embedding_idx) for w, embedding_idx, _ in glove_match)\n",
209 |     "\n",
210 |     "\n",
211 |     "def to_dense_vector(word2idx, corpus, description, bins=50):\n",
212 |     "    \"\"\"Create a dense vector representation of headlines.\"\"\"\n",
213 |     "    data = [[word2idx[token] for token in txt.split()] for txt in corpus]\n",
214 |     "    plt.hist(list(map(len, data)), bins=bins)\n",
215 |     "    plt.savefig(path.join(config.path_outputs, '{}_distribution.png'.format(description)))\n",
216 |     "    return data\n",
217 |     "\n",
218 |     "\n",
219 |     "def summarize_vocab(vocab, vocab_count):\n",
220 |     "    \"\"\"Print the most popular tokens and plot token distributions.\"\"\"\n",
221 |     "    print_most_popular_tokens(vocab)\n",
222 |     "    plot_word_distributions(vocab, vocab_count)\n"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": 2,
228 |    "metadata": {},
229 |    "outputs": [
230 |     {
231 |      "ename": "NameError",
232 |      "evalue": "name 'load_text' is not defined",
233 |      "output_type": "error",
234 |      "traceback": [
235 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
236 |       "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
237 |       "\u001b[0;32m<ipython-input-2-b11dea9f4beb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     28\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     29\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m__name__\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'__main__'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 30\u001b[0;31m     \u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
238 |       "\u001b[0;32m<ipython-input-2-b11dea9f4beb>\u001b[0m in \u001b[0;36mmain\u001b[0;34m()\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m     \u001b[0;34m\"\"\"Generate intial word embedding for headlines and description.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m     \u001b[0mheadlines\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdesc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# load headlines and descriptions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m     \u001b[0mvocab\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocab_count\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbuild_vocab\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mheadlines\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mdesc\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# build vocabulary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m     \u001b[0msummarize_vocab\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvocab\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocab_count\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# summarize vocabulary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
239 |       "\u001b[0;31mNameError\u001b[0m: name 'load_text' is not defined"
240 |      ]
241 |     }
242 |    ],
243 |    "source": [
244 |     "\n",
245 |     "def main():\n",
246 |     "    \"\"\"Generate intial word embedding for headlines and description.\"\"\"\n",
247 |     "    headlines, desc = load_text()  # load headlines and descriptions\n",
248 |     "    vocab, vocab_count = build_vocab(headlines + desc)  # build vocabulary\n",
249 |     "    summarize_vocab(vocab, vocab_count)  # summarize vocabulary\n",
250 |     "    word2idx, idx2word = get_idx(vocab)  # add special tokens and get reverse vocab lookup\n",
251 |     "    glove_embedding_weights, glove_index_dict = get_glove()  # load GloVe data\n",
252 |     "\n",
253 |     "    # initialize embedding\n",
254 |     "    embedding = initialize_embedding(vocab_size, embedding_dim, glove_embedding_weights)\n",
255 |     "    embedding = copy_glove_weights(embedding, idx2word, glove_embedding_weights, glove_index_dict)\n",
256 |     "\n",
257 |     "    # map vocab to GloVe using cosine similarity\n",
258 |     "    glove_idx2idx = build_word_to_glove(embedding, word2idx, idx2word, glove_index_dict, glove_embedding_weights)\n",
259 |     "\n",
260 |     "    # create a dense vector representation of headlines and descriptions\n",
261 |     "    description_vector = to_dense_vector(word2idx, desc, 'description')\n",
262 |     "    headline_vector = to_dense_vector(word2idx, headlines, 'headline')\n",
263 |     "\n",
264 |     "    # write vocabulary to disk\n",
265 |     "    with open(path.join(config.path_data, '{}.pkl'.format(FN)), 'wb') as fp:\n",
266 |     "        pickle.dump((embedding, idx2word, word2idx, glove_idx2idx), fp, 2)\n",
267 |     "\n",
268 |     "    # write data to disk\n",
269 |     "    with open(path.join(config.path_data, '{}.data.pkl'.format(FN)), 'wb') as fp:\n",
270 |     "        pickle.dump((description_vector, headline_vector), fp, 2)\n",
271 |     "\n",
272 |     "if __name__ == '__main__':\n",
273 |     "    main()\n"
274 |    ]
275 |   },
276 |   {
277 |    "cell_type": "code",
278 |    "execution_count": null,
279 |    "metadata": {
280 |     "collapsed": true
281 |    },
282 |    "outputs": [],
283 |    "source": []
284 |   }
285 |  ],
286 |  "metadata": {
287 |   "kernelspec": {
288 |    "display_name": "Python 3",
289 |    "language": "python",
290 |    "name": "python3"
291 |   },
292 |   "language_info": {
293 |    "codemirror_mode": {
294 |     "name": "ipython",
295 |     "version": 3
296 |    },
297 |    "file_extension": ".py",
298 |    "mimetype": "text/x-python",
299 |    "name": "python",
300 |    "nbconvert_exporter": "python",
301 |    "pygments_lexer": "ipython3",
302 |    "version": "3.6.3"
303 |   }
304 |  },
305 |  "nbformat": 4,
306 |  "nbformat_minor": 2
307 | }
308 | 


--------------------------------------------------------------------------------
/embeddingNotebook.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 54,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pickle\n",
 13 |     "import string "
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": 55,
 19 |    "metadata": {
 20 |     "collapsed": true
 21 |    },
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "def transform_input_text(self, texts):\n",
 25 |     "    temp = []\n",
 26 |     "    for line in texts:\n",
 27 |     "        x = []\n",
 28 |     "        for word in line.lower().split(' '):\n",
 29 |     "            wid = 1\n",
 30 |     "            if word in self.input_word2idx:\n",
 31 |     "                wid = self.input_word2idx[word]\n",
 32 |     "            x.append(wid)\n",
 33 |     "            if len(x) >= self.max_input_seq_length:\n",
 34 |     "                break\n",
 35 |     "        temp.append(x)\n",
 36 |     "    temp = pad_sequences(temp, maxlen=self.max_input_seq_length)\n",
 37 |     "\n",
 38 |     "    print(temp.shape)\n",
 39 |     "    return temp"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 56,
 45 |    "metadata": {
 46 |     "collapsed": true
 47 |    },
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "import numpy as np\n",
 51 |     "import os\n",
 52 |     "\n",
 53 |     "def split():\n",
 54 |     "    titles = []\n",
 55 |     "    texts = []\n",
 56 |     "    root = 'Part1'\n",
 57 |     "    stop = 0\n",
 58 |     "    for p1 in os.listdir(root)[1:]:\n",
 59 |     "        if(stop == 1):\n",
 60 |     "            break\n",
 61 |     "        dir1 = root + '/' + str(p1)\n",
 62 |     "        print(dir1)\n",
 63 |     "        for p2 in os.listdir(dir1)[1:]:\n",
 64 |     "            if(stop == 1):\n",
 65 |     "                break\n",
 66 |     "            dir2 = dir1 + '/' + str(p2)\n",
 67 |     "            print(dir2)\n",
 68 |     "            for filename in os.listdir(dir2)[1:]:\n",
 69 |     "                if (filename == 'a9302502.txt'):\n",
 70 |     "                    stop = 1\n",
 71 |     "                    break\n",
 72 |     "                print(dir2 + '/' + filename)\n",
 73 |     "                #print(iter)\n",
 74 |     "                #iter += 1\n",
 75 |     "                #print(dirs[1:])\n",
 76 |     "                #filename = 'Part1/awards_1990/awd_1990_00/a9000006.txt'\n",
 77 |     "                f = open(dir2 + '/' + filename)\n",
 78 |     "                addTitle = False\n",
 79 |     "                addTexts = False\n",
 80 |     "                title = []\n",
 81 |     "                text = []\n",
 82 |     "                for word in f.read().split():\n",
 83 |     "                    if (word == \"Title\"):\n",
 84 |     "                        addTitle = True\n",
 85 |     "                        continue\n",
 86 |     "\n",
 87 |     "                    if (word == \"\\n\"):\n",
 88 |     "                        addTitle = False\n",
 89 |     "                    if (word == \"Type\"):\n",
 90 |     "                        addTitle = False\n",
 91 |     "                        \n",
 92 |     "#                     if (addTexts == True and word == \"\\n\"):\n",
 93 |     "#                         addTexts = False\n",
 94 |     "#                         break\n",
 95 |     "                        \n",
 96 |     "\n",
 97 |     "                    if (word == \"Abstract\"):\n",
 98 |     "                        addTexts = True\n",
 99 |     "                        continue\n",
100 |     "\n",
101 |     "                    if(addTitle == True):\n",
102 |     "                        title.append(word)\n",
103 |     "\n",
104 |     "                    if(addTexts == True):\n",
105 |     "                        text.append(word)\n",
106 |     "\n",
107 |     "                for i in range(len(title)):\n",
108 |     "                    s = title[i]\n",
109 |     "                    table = str.maketrans({key: None for key in string.punctuation})\n",
110 |     "                    new_s = s.translate(table)\n",
111 |     "                    title[i] = new_s\n",
112 |     "                for i in range(len(text)):\n",
113 |     "                    s = text[i]\n",
114 |     "                    table = str.maketrans({key: None for key in string.punctuation})\n",
115 |     "                    new_s = s.translate(table)\n",
116 |     "                    text[i] = new_s\n",
117 |     "\n",
118 |     "                title = ' '.join(title)\n",
119 |     "                text =' '.join(text)\n",
120 |     "                titles.append(title)\n",
121 |     "                texts.append(text)\n",
122 |     "\n",
123 |     "#     f=open(\"titles.txt\", 'w')\n",
124 |     "#     for i in titles:\n",
125 |     "#         f.write(i)\n",
126 |     "#         f.write(' ')\n",
127 |     "\n",
128 |     "#     t=open(\"texts.txt\", 'w')\n",
129 |     "#     for i in texts:\n",
130 |     "#         t.write(i)\n",
131 |     "#         t.write(' ')\n",
132 |     "\n",
133 |     "#     f.close()\n",
134 |     "#     t.close()\n",
135 |     "    return titles, texts\n"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 57,
141 |    "metadata": {
142 |     "collapsed": true
143 |    },
144 |    "outputs": [],
145 |    "source": [
146 |     "import numpy as np\n",
147 |     "import os\n",
148 |     "\n",
149 |     "def split():\n",
150 |     "    titles = []\n",
151 |     "    texts = []\n",
152 |     "    root = 'Part1'\n",
153 |     "    \n",
154 |     "    #dirr = 'Part1/awards_1990/awd_1990_00/'\n",
155 |     "    dirs = os.listdir('Part1/awards_1990/awd_1990_00/')\n",
156 |     "\n",
157 |     "    for filename in dirs[1:]:\n",
158 |     "    #iter = 0\n",
159 |     "    #print(dirs[1])\n",
160 |     "            \n",
161 |     "                #print(iter)\n",
162 |     "                #iter += 1\n",
163 |     "                #print(dirs[1:])\n",
164 |     "                #filename = 'Part1/awards_1990/awd_1990_00/a9000006.txt'\n",
165 |     "        f = open('Part1/awards_1990/awd_1990_00/' + str(filename))\n",
166 |     "        addTitle = False\n",
167 |     "        addTexts = False\n",
168 |     "        title = []\n",
169 |     "        text = []\n",
170 |     "        for word in f.read().split():\n",
171 |     "            if (word == \"Title\"):\n",
172 |     "                addTitle = True\n",
173 |     "                continue\n",
174 |     "\n",
175 |     "            if (word == \"Type\"):\n",
176 |     "                addTitle = False\n",
177 |     "\n",
178 |     "#             if (addTexts == True and word == \"\\n\"):\n",
179 |     "#                 addTexts = False\n",
180 |     "#                 break\n",
181 |     "\n",
182 |     "\n",
183 |     "            if (word == \"Abstract\"):\n",
184 |     "                addTexts = True\n",
185 |     "                continue\n",
186 |     "\n",
187 |     "            if(addTitle == True):\n",
188 |     "                title.append(word)\n",
189 |     "\n",
190 |     "            if(addTexts == True):\n",
191 |     "                text.append(word)\n",
192 |     "\n",
193 |     "        for i in range(len(title)):\n",
194 |     "            s = title[i]\n",
195 |     "            table = str.maketrans({key: None for key in string.punctuation})\n",
196 |     "            new_s = s.translate(table)\n",
197 |     "            title[i] = new_s\n",
198 |     "        for i in range(len(text)):\n",
199 |     "            s = text[i]\n",
200 |     "            table = str.maketrans({key: None for key in string.punctuation})\n",
201 |     "            new_s = s.translate(table)\n",
202 |     "            text[i] = new_s\n",
203 |     "\n",
204 |     "        title = ' '.join(title)\n",
205 |     "        text =' '.join(text)\n",
206 |     "        titles.append(title)\n",
207 |     "        texts.append(text)\n",
208 |     "\n",
209 |     "#     f=open(\"titles.txt\", 'w')\n",
210 |     "#     for i in titles:\n",
211 |     "#         f.write(i)\n",
212 |     "#         f.write(' ')\n",
213 |     "\n",
214 |     "#     t=open(\"texts.txt\", 'w')\n",
215 |     "#     for i in texts:\n",
216 |     "#         t.write(i)\n",
217 |     "#         t.write(' ')\n",
218 |     "\n",
219 |     "#     f.close()\n",
220 |     "#     t.close()\n",
221 |     "    return titles, texts\n"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": 58,
227 |    "metadata": {},
228 |    "outputs": [],
229 |    "source": [
230 |     "if __name__ == '__main__':\n",
231 |     "    titles, texts = split()\n"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": 59,
237 |    "metadata": {
238 |     "scrolled": true
239 |    },
240 |    "outputs": [
241 |     {
242 |      "name": "stdout",
243 |      "output_type": "stream",
244 |      "text": [
245 |       "379\n"
246 |      ]
247 |     }
248 |    ],
249 |    "source": [
250 |     "print(len(titles))"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": 60,
256 |    "metadata": {
257 |     "collapsed": true
258 |    },
259 |    "outputs": [],
260 |    "source": [
261 |     "from collections import Counter\n",
262 |     "from itertools import chain\n",
263 |     "def get_vocab(lst):\n",
264 |     "    vocabcount = Counter(w for txt in lst for w in txt.split())\n",
265 |     "    vocab = map(lambda x: x[0], sorted(vocabcount.items(), key=lambda x: -x[1]))\n",
266 |     "    return list(vocab), vocabcount"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": 61,
272 |    "metadata": {},
273 |    "outputs": [
274 |     {
275 |      "data": {
276 |       "text/plain": [
277 |        "' RFLP Patterns as a Measure of Diversity in Small Populations'"
278 |       ]
279 |      },
280 |      "execution_count": 61,
281 |      "metadata": {},
282 |      "output_type": "execute_result"
283 |     }
284 |    ],
285 |    "source": [
286 |     "titles[1]"
287 |    ]
288 |   },
289 |   {
290 |    "cell_type": "code",
291 |    "execution_count": 62,
292 |    "metadata": {
293 |     "collapsed": true
294 |    },
295 |    "outputs": [],
296 |    "source": [
297 |     "vocab, vocabcount = get_vocab(titles+texts)"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "code",
302 |    "execution_count": 63,
303 |    "metadata": {},
304 |    "outputs": [
305 |     {
306 |      "name": "stdout",
307 |      "output_type": "stream",
308 |      "text": [
309 |       "['the', 'of', 'and', 'to', 'in']\n",
310 |       "9073\n"
311 |      ]
312 |     }
313 |    ],
314 |    "source": [
315 |     "print(vocab[:5])\n",
316 |     "print(len(vocab))"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "code",
321 |    "execution_count": 64,
322 |    "metadata": {
323 |     "collapsed": true,
324 |     "scrolled": true
325 |    },
326 |    "outputs": [],
327 |    "source": [
328 |     "import string \n",
329 |     "for i in range(len(vocab)):\n",
330 |     "    s = vocab[i]\n",
331 |     "    table = str.maketrans({key: None for key in string.punctuation})\n",
332 |     "    new_s = s.translate(table)\n",
333 |     "    vocab[i] = new_s"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": 65,
339 |    "metadata": {},
340 |    "outputs": [
341 |     {
342 |      "data": {
343 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEaCAYAAAAL7cBuAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3Xl8VOXZ//HPNUlIWMO+yI6AiEoB\nI6jFHSlWcXusa+uutda6VK1a7WO19bH2sa31p3Wta92trRuK+CiLqJVFAQXZQQLIvu9Jrt8f50SH\nOElmkpnMTPJ9v155Zeas19xzZq45933OfZu7IyIiUlEk3QGIiEhmUoIQEZGYlCBERCQmJQgREYlJ\nCUJERGJSghARkZiUILKAmf3WzP6RwPJuZr3Dxw+a2W+SFEc3M9tiZjnh83FmdnEyth1u7y0zOy9Z\n24vabo3LINmvsZp9Rb9vT5jZ78PHh5nZnLqIIRXM7EgzK05g+W/K3MzOMbN3khzPHsdxDdZPekyZ\nKjfdAUhquftl8SxnZouBi9393Sq29RXQLBlxmdlvgd7u/uOo7R+XjG1XFG8ZZCp3nwjsk+440sHd\nnwGeSfI24z6OzawHsAjIc/eSVMWUqXQGkUEskJHviZnpx4RkPR3HicnIL6NsYGYXmNnrUc/nm9mL\nUc+XmtnA8PGhZjbZzDaG/w+NWm6cmd1hZpOAbUAvM+tpZuPNbLOZjQXaVhPL9Wa2wsyWm9mFFeZF\nV1W0NbM3zGyDma0zs4lmFjGzp4FuwOvhqfevzKxHWOVxkZl9BbwXNS36Q7a3mX0SvrZXzax1uK/v\nVCuY2WIzG25mI4FfA2eE+5seVRblVQsRM7vFzJaY2Soze8rMCsN55XGcZ2ZfmdkaM7u5ivKJLoMj\nzazYzK4Nt7vCzC6oqnyB7mY2KXw/3jGzb94PMzvYzD4My3S6mR0ZNe8CM5sdrrfQzH4a7/tWYbk9\nyjIsx+vMbEZY7i+YWUHU/BPM7LMwpg/NbEDUvBvMbFkY0xwzO6aSfR5vZp+a2abwWP5t1Lwqy9/M\nGodlvt7MZgEHVVW4ZnasmX0Zvpb7AIuad76ZfRA+NjP7S/i+bQxf//5R+/xTeLxsNLMPwmnVHsfh\ncXdnrOMYmBD+3xAeq4dExxSuX93n+3eVHT8Zz931V4M/oBewgSDJdgKWAMui5q0P57UOH/+EoErv\nrPB5m3DZccBXwH7h/DzgI+DPQD5wOLAZ+EclcYwEVgL7A02BZwEnqL4BeAL4ffj4TuDBcB95wGGA\nhfMWA8Ojttsj3M5T4XYbR03LjYp9WdS+/1keJ3AkUFwh1m/2Afy24msKt3dx+PhCYH5Yls2AV4Cn\nK8T2SBjX94CdwL6VlFF0GRwJlAC3h2XwQ4LE3KqSdccBC4C+4b7GAX8I53UG1obbiADHhs/bhfOP\nB/Ym+MI7ItzP4Bq8b3uUZViOnwB7ERxfs4HLwnmDgVXAUCAHOC9cPp+gmmopsFdUOe5dyes+Ejgg\nfF0DwlhPjqf8gT8AE8PYugKfU+FYiNpPW2ATcFr4flwTvj/lx8H5wAfh4x8AU4GWYZnuC3QK590f\nvjedw9d9aPiay2Ot6XG8x7IxYorn8x3z+MmGP51B1JC7LyT44h5I8OEfAywzs37h84nuXkbwJTHP\n3Z929xJ3fw74EhgVtbkn3P0LD+o4OxH84vqNu+909wnA61TudOBxd//c3bcSfPFWZne4/e7uvtvd\nJ3p4FFfht+6+1d23VzL/6ah9/wY43WrY+FfBOcCf3X2hu28BbgLOtD3PXm5z9+3uPh2YTvBFFY/d\nwO1hGYwGtlB1Hf/j7j43LIMXCd5zgB8Do919tLuXuftYYApBwsDd33T3BR4YD7xDkJQhsfctlnvd\nfbm7ryM4PspjugR4yN3/4+6l7v4kwZf3wUApwZdmfzPLc/fF7r4g1sbdfZy7zwxf1wzgOYLjOlpl\n5X86cIe7r3P3pcC9VbyOHwKz3P1ld98N3AN8Xcmyu4HmQD+CHzaz3X2FBdWyFwJXufuy8HV/6O47\no9ZN1XEcz+e7suMn4ylB1M54gl9ah4ePxxF8iI4In0PwK29JhfWWEPzSKbc06vFewPrwQI1evjJ7\nVVi/qmX/l+BX+TthlceNVSwbK7bq5i8h+BWYjFPoiuW2hOAXWoeoadFfJNuIvwF9bZiM4123sv10\nB34UVuVsMLMNwDCCJIyZHWdmH1tQnbeB4MuwvGwSed8SjenaCjF1JThrmA9cTZCMVpnZ82a2V6yN\nm9lQM3vfzFab2UbgMr77vlYWQyKvbY9lwx8sMY85d38PuI/gbGGlmT1sZi3CuAoIfqlXJlXHcTyf\n75oep2mnBFE75QnisPDxeL6bIJYTfGijdSM4pS0X/St+BdDKzJpWWL4yKwi+AKpd1t03u/u17t6L\n4BfOL6PqoCs7k6juDKPivncDa4CtQJPyGeGvsXYJbLdiuXUjqHpYWc16dWkpwS/PllF/Td39D2aW\nT1BVcTfQwd1bAqP5tn497vetBjHdUSGmJuEvW9z9WXcfRlC2DtxVyXaeBV4Durp7IUHVpFWybEWJ\nvLY9ljUzq7DuHtz9Xnc/kKBKti9wPcHxtoOgOq/SVauJubLjONHjtHz9ZTGWzTpKELUzHjgKaOzu\nxQT1riOBNsCn4TKjgb5mdraZ5ZrZGUB/4I1YG3T3JQTVFLeZWSMzG8aep6sVvQicb2b9zawJcGtl\nC4aNl73DD+EmgiqH0nD2SoL6/kT9OGrftwMvu3spMBcoCBs784BbCKo3yq0EeljlV209B1xjQYN9\nM+B/gBcq/PJPt38Ao8zsB2aWY2YFFjQodwEaEbze1UCJmR0HjIhaN+73LUGPAJeFZwBmZk3D96C5\nme1jZkeHyWsHsJ1v3/+KmgPr3H2HmQ0Bzk4ghheBm8ysVVgWv6hi2TeB/czs1LD68EqgY6wFzeyg\n8HXlEfwA2QGUhlW5jwF/NrO9wvfikPB1xquy43g1UEbln42EPt/ZRgmiFtx9LkH99cTw+SZgITAp\nPLhw97XACcC1BA2YvwJOcPc1VWz6bIJGxnUEXxxPVRHDWwT1tu8RVB+9V8V2+wDvhjF/BPzN3ceF\n8+4EbgmrJa6rYhsVPU3QoPo1wWn+lWFcG4HLgUcJfk1tBaKvanop/L/WzKbF2O5j4bYnEFyHvoOq\nv2jqXFi/fhLBFVmrCX69Xw9E3H0zQVm8SNBoeTbBL/LydRN53xKJaQpBO8R94X7nEzSqQpCw/kDw\ny/hroH0YeyyXA7eb2Wbgv8PXEa/bCKpZFhG0uzxdRbxrgB+Fca0lOEYnVbJ4C4IEuD7c/lqCMzSA\n64CZwGSCz81dJPb9VtlxvA24A5gUfjYOrhB/TT7fWaP8ChYRkQbJzMYRXLX0aLpjyTQ6gxARkZiU\nIEREJCZVMYmISEw6gxARkZiUIEREJKas7NnQzEYBo5o3b35J37590x2OiEhWmTp16hp3b1fdclnd\nBlFUVORTpkxJdxgiIlnFzKa6e1F1y6mKSUREYlKCEBGRmJQgREQkpqxMEGY2yswe3rhxY7pDERGp\nt7IyQbj76+5+aWFhYbpDERGpt7IyQYiISOopQYiISExZeaNcuUVrtnLuY5/UaN0WBbkctU97ju7X\nnlZNGyU5MhGR7JfVCaK0zNm0fXeN1p3z9SbemLGCnIhxUI9WHNu/IyP6d6Br6ybVrywi0gA02Dup\ny8qcmcs2MnbWSsbOWsmclZsB6NexOSP2C5LFfnu1IBidU0Sk/oj3TuqsTBDlfTH17t37knnz5iVl\nm0vWbmXsrJW888VKpixZR5nDXoUFHNu/AyP268iQnq3Jy1GTjYhkv3qdIMqlqi+mtVt28n9frmLs\nrJVMnLeaHbvLgjaLfu0Z0b8jR+zTjmb5WV07JyINmBJEkmzfVcrEeat5Z9ZK/m/2StZv202jnAiH\n9m7Dsf07cOy+HWjfoiClMYiIJJMSRAqUlJYxdcn6oCpq1kq+WrcNgIFdW3Js/w78YL8O7N2umdot\nRCSjKUGkmLszd+UWxs76mndmrWRGcdDtR8+2TTlu/45cflRvVUOJSEZSgqhjKzZu593wzGLS/DX0\naNOU+88ZzL6dWqQ7NBGRPWg8iDrWqbAxPzmkB09fNJRnLj6YzTtLOPn+STz/yVdkcxIWkYZLCSIF\nDtm7DaOvPIyiHq248ZWZXPvidLbtKkl3WCIiCVGCSJF2zfN56sKhXDO8L//6bBkn3jeJueHNeCIi\n2UAJIoVyIsZVw/vwzEVD2bBtNyfe9wEvTVma7rBEROKSlQki2wYMOrR3W0ZfNYxBXVtx/cszuO6l\n6WzfVZrusEREqpSVCSIbBwxq37yAf1w8lCuP6cM/pxVz0v0fMH+VqpxEJHNlZYLIVjkR45fH9uWp\nC4ewdssuRv2/SbwyrTjdYYmIxKQEkQaH9WnH6KsO44Auhfzyxenc8PIMduxWlZOIZBbd6psmHVoU\n8OzFQ/nLu3O5//0FTC/ewM+P6k3jvBzyciPkRSz4nxMhL8fC/xGaF+TStll+usMXkQZAd1JngHFz\nVnHNC5+xflv1gx+Zwe9P3p9zhnavg8hEpD6K905qnUFkgCP3ac+EXx1F8frt7C4tY3eph//LvvP8\nn1OXceurX9C7XTOG9mqT7tBFpB5TgsgQzQvy2LdTXrXLHd2vA6fcP4nLn5nGa78YRueWjesgOhFp\niNRInWUKG+fx8LlF7Cop49Knpuh+ChFJGSWILNS7fTPuOXMgs1Zs4oZ/zlBngCKSEkoQWeqYfTtw\n3Yh9eG36ch6asDDd4YhIPaQEkcUuP3Jvjh/Qibve/pJxc1alOxwRqWeUILKYmfG/pw2gX8cW/OK5\nT1m4eku6QxKReiRjEoSZ7WtmD5rZy2b2s3THky2aNMrl4Z8cSF5OhEuemsLmHdXfSyEiEo+UJggz\ne8zMVpnZ5xWmjzSzOWY238xuBHD32e5+GXA6UO0NHPKtrq2bcP/Zg1m8dhvH/XUi1744nac/WsyM\n4g3sKilLd3gikqVSfR/EE8B9wFPlE8wsB7gfOBYoBiab2WvuPsvMTgRuDNeRBByydxseOGcwL05Z\nyvi5q/hn2Algo5wI/fdqwSmDOnPeoT3SG6SIZJWUJgh3n2BmPSpMHgLMd/eFAGb2PHASMMvdXwNe\nM7M3gWdTGVt9NGK/jozYryPuzvKNO/jsqw1ML97AxwvXcutrX9A4L4fTD+qa7jBFJEuk407qzkD0\nsGrFwFAzOxI4FcgHRle2spldClwK0K1bt9RFmcXMjM4tG9O5ZWOOH9CJktIyLnhiMjf/eyY92zXl\noB6t0x2iiGSBdDRSW4xp7u7j3P1Kd/+pu99f2cru/rC7F7l7Ubt27VIYZv2RmxPhvrMG07VVEy57\neirF67elOyQRyQLpSBDFQHQ9RxdgeSIbyLYhRzNBYZM8HjmviF2lZVz85BS27ixJd0gikuHSkSAm\nA33MrKeZNQLOBF5LZAPZOORoJti7XTPuO3swc1du5poXPqOsTF10iEjlUn2Z63PAR8A+ZlZsZhe5\newlwBTAGmA286O5fpDIO+dYRfdtx8/H9eWfWSv40dk66wxGRDJbqq5jOqmT6aKpoiK6OmY0CRvXu\n3bumm2jQLvx+D+Z+vZn7319A47wcrji6T7pDEpEMlDF3UidCVUy1Y2bcccr+nDKoM3e/M5c/vTNH\nPcKKyHdowKAGKjcnwt0/+h6NciL8v/fms6ukjBuP64dZrIvMRKQhUoJowHIixp2nHkCj3AgPTVjI\nzpIyrh3Rl8Z5OeTmZOXJpYgkUVYmCLVBJE8kYtx+0n40yo3w9w8W8cSHiwHIjRhN83MZuV9HLjm8\nF73bN0tvoCJS5yyb656Liop8ypQp6Q6jXnB3xnyxkqXrtrFjdynbd5eyctNO3pixnJ0lZRzbvwOX\nH7k3g7q1SneoIlJLZjbV3avtFFUJQqq0dstOnvxwMU9+tISN23dz2oFduPG4frRtlp/u0ESkhup1\ngoiqYrpk3rx56Q6nQdi6s4T73p/PoxMXUpCXw69G9uPHQ7upUVskC8WbILKyJVKXuda9pvm53DCy\nH29ddTgDuhTym39/zuOTFqc7LBFJoaxMEJI+vds34+kLhzKifwd+/+Ys3v9SY2GL1FdKEJKwSMS4\n58yB7NspGAv7y683pTskEUkBJQipkSaNcnn0vCKaNMrh4iensEljYYvUO1mZINTdd2boVNiYB39y\nIMs3bOeON2anOxwRSbKsTBBqpM4cg7u14qdH7M0LU5aqPUKknsnKBCGZ5erhfejboRk3vjKDjdtU\n1SRSXyhBSK3l5+bwpx8NZM2WXdwxela6wxGRJFGCkKQ4oEshFw/ryUtTi5lZrLYhkfogKxOEGqkz\n0xVH96ZN00bc/sYXGl9CpB7IygShRurM1Lwgj2tH7MPkxet5c+aKdIcjIrWUld19S+Y6vagrT320\nhN+9MYtZyzfRpVUTRuzXQZ37iWShrOysr5x6c81Mn361nmtfnM5X67ZRUuY0y8/lZ0fuzQ8P6ETj\nvBxaNc0jPzcn3WGKNFhJ683VzH4EvO3um83sFmAw8Ht3n5acUGtOCSKzlZY581Zt5u4xc3l39so9\n5rVtls/BvVpz2oFdOKJvO/UKK1KHkpkgZrj7ADMbBtwJ3A382t2HJifUmlOCyB6fLd3AojVb2Lar\nlLVbdrFk7Tb+78uVbNi2m2P7d+DuH32PwsZ56Q5TpEGIN0HE0wZRGv4/HnjA3V81s9/WJjhpeAZ2\nbcnAri33mLazpJSnP1rCH976klP+NonRVx5GQZ6qnkQyRTxXMS0zs4eA04HRZpYf53opo8tc64f8\n3BwuPqwX9509iIWrtzJ21srqVxKROhPPF/3pwBhgpLtvAFoD16c0qmroMtf65dj+HencsjEvTS1O\ndygiEqXaBOHu24BVwLBwUgmgcT4laXIixn8N7szEeatZvmF7usMRkVC1CcLMbgVuAG4KJ+UB/0hl\nUNLwnHZgV9zh6Y+XpDsUEQnF00h9CjAImAbg7svNrHlKo5IGp1ubJvzwgI48MG4B67fu4sJhPenT\nvpkufxVJo3gSxC53dzNzADNrmuKYpIG698xBdGs9lwfHL+D5yUs5dO82PHxuEc3ydcO/SDrE00j9\nYngVU0szuwR4F3gktWFJQ5SbE+HG4/rx4Y1Hc/MP9+U/i9ZxziMfM3/V5nSHJtIgxdXVhpkdC4wA\nDBjj7mNTHVg8dKNc/TZ21kp++cJnbNtdyoj+Hfh+77Yc0LmQ71W4n0JEEpPMO6l7AivcfUf4vDHQ\nwd0XJyPQ2lCCqP/WbtnJA+MW8O/PlrNmy04AXv3595UkRGoh3gQRTxXTS0BZ1PPScJpIyrVpls8t\nJ/TnP78+hgnXH0XLJnnc8+7cdIcl0iDEkyBy3X1X+ZPwcaPUhVQ93Und8OREjG5tmnDp4b14f85q\nZhRvSHdIIvVePAlitZmdWP7EzE4C1qQupOrpTuqG6ycHd6dRboRXpi1Ldygi9V48CeIy4Ndm9pWZ\nLSW4ae6nqQ1LJLbmBXkc0689b8xYTklpWfUriEiNxdPVxgJ3PxjoD/R390PdfX7qQxOJ7cTv7cWa\nLbu4Y/Rsdc0hkkLV3oEU9t76X0APILf8zlZ3vz2lkYlU4qh+7dmnQ3Men7SYjxas5Y1fDCM3JyuH\nVxfJaPF8ql4FTiLopG9r1J9IWhTk5TDmmsN54JzBfPn1Zv5n9Jds3rE73WGJ1Dvx9GHQxd1HpjwS\nkQSN3L8jpwzqzGOTFvH25yt48CcHMqCL7o8QSZZ4ziA+NLMDUh6JSILMjL+cMZCXLzsEM+O8xz5h\nxUa1SYgkSzwJYhgw1czmmNkMM5tpZjNSHZhIvIp6tOapi4aws6SMkfdM5MHxC9Idkki9EE8V03Ep\nj0KklvZu14zHzz+Iu9+Zw5/emcNJA/eiU2HjdIclktXiucx1ibsvAbYDHvUnklGG9mrDn08fSJnD\naQ98xM3/msnasP8mEUlcPCPKnWhm84BFwHhgMfBWiuMSqZGurZvw6x/uy97tm/HilKX89vVZ6Q5J\nJGvF0wbxO+BgYK679wSOASalNCqRWrhoWE+eunAIFw7ryZszljPna40nIVIT8SSI3e6+FoiYWcTd\n3wcGpiIYMzvZzB4xs1fNbEQq9iENx8XDetGqSSNd3SRSQ/EkiA1m1gyYADxjZn8luGkuLmb2mJmt\nMrPPK0wfGV4ZNd/MbgRw93+7+yXA+cAZcb8KkRjaNc/nHxcPZdOO3Zx03yQeGLeAsjI1n4nEK54E\ncRKwDbgGeBtYAIxKYB9PAHvcaGdmOcD9BFdI9QfOMrP+UYvcEs4XqZV9O7XgnjMG0qJxHne9/SXn\nPvYJS9aqIwCReFSZIMIv8lfdvczdS9z9SXe/N6xyiou7TwDWVZg8BJjv7gvD8SWeB06ywF3AW+4+\nLcHXIhLTiP06Mvaaw7njlP35bOkGhv95PFc8O01XOIlUo8r7INy91My2mVmhuydzdJ7OwNKo58XA\nUOAXwHCg0Mx6u/uDFVc0s0uBSwG6deuWxJCkPjMzzhnanWP6deDRiQt56uMlLNuwnSuP6UN+boQh\nPVqrwz+RCuK5UW4HMNPMxhLVSZ+7X1mL/VqMae7u9wL3VrWiuz8MPAzBmNS1iEEaoI6FBdxyQn8O\n6FLIVc9/xgWPTwZgWO+2/HJEXwZ2aUkkEuvwFGl44kkQb4Z/yVQMdI163gVYnuR9iFTqpIGdGdCl\nJRu27WJG8UZ+/+YsTv3bh/Rp34yrhvfhhAF7pTtEkbSrNkG4+5Mp2O9koI+Z9QSWAWcCZ8e7spmN\nAkb17t07BaFJQ9GzbVOgKYO6teKIvu34z6K1PD5pMVc8+yndWjdRz7DS4Jl71bU0ZtYHuJPgaqOC\n8unu3iuuHZg9BxwJtAVWAre6+9/N7IfAPUAO8Ji735Fo8EVFRT5lypREVxOp1OYduznif8exu6SM\no/q15/C+7RixXwdaFOSlOzSRpDGzqe5eVO1ycSSID4Bbgb8QXN56QbjerckItCaiziAumTdvXrrC\nkHpqweot/HnsXD6Yt4aN23fTqkkeJwzYixuO60ez/HhqZUUyWzITxFR3P9DMZrr7AeG0ie5+WJJi\nrTGdQUgq7SwpZeqS9dzz7jw+WbSORjkRTh3cmf8e1Z8mjZQoJHvFmyDiuorJzCLAPDO7gqDNoH1t\nAxTJdPm5ORy6d1sO3bst7325kkcnLuL5yUt56/OvufKYPpx/aA9ydMWT1GPxXPh9NdAEuBI4EPgx\ncF4qgxLJNEf368AzFw/lkXOLaNE4l9+9MYurnv9UfTxJvVZtFdM3C5o1dfeM6KNAbRCSTqVlzvA/\nj2fRmuDjMKx3W64e3oeiHq3THJlIfJLZBnEI8Hegmbt3M7PvAT9198uTE2rNqQ1C0mXH7lJmr9jE\n0x8v4ZVpywA4o6grpx/UhQO7K1FIZktmgvgPcBrwmrsPCqd97u77JyXSWlCCkEywaM1WbnplBh8v\nDLocO6BzIX88bQD7dmqR5shEYos3QcTV+Yy7L60wqbRGUSWJmY0ys4c3bkxm91AiNdOzbVOev/QQ\nptwynB8f3I2ZyzZyxbPTeGvmCrbsjLtnfJGME0+CWGpmhwJuZo3M7DpgdorjqpK7v+7ulxYWFqYz\nDJE9tG2Wz+9PPoA/njaABau38rNnpjHwtnf4y9i5GodCslI8VUxtgb8S9LIaAcYAVyXS5XeqqIpJ\nMtW2XSW8MX0FD01YwILVW+nXsTnXjdiHo/u1V2eAknZJa4PIZEoQkunKypw/jZ3D/e8vAIIBjM49\npDsHdC5k/846A5b0SGYjdS+CM4iDAQc+Aq5x94XJCLQ2lCAkW6zavIO/jJ3H85O/ovwjd+4h3fn5\nUb3p0KKg6pVFkiyZCeJjguE/nwsnnQn8wt2H1jrKGtJ9EJKt1m/dxfKN27nyuU9ZsDq4j+L4Azpx\n8qDOHNu/Q5qjk4YiqZe5VkwGZvaxux9cyxhrTWcQkq1Ky5w3Z67gztGzWbFxBwBtmzXiwmE9Oe+Q\nHjRVp4CSQslMEH8ANhCMG+3AGUA+wVkF7l5xvOk6owQh2a60zFm2fjt/Gzef5ycHV5PnRozLjtib\nY/t3YECXQszUqC3JlcwEsaiK2R7vuBCpoAQh9cm6rbt4aPwCHprwbfNen/bNOO/QHpw9pJuufpKk\n0VVMIllq7ZadzFqxibvHzGF6cXAzaIcW+fz759+nU2HjNEcn9UEyzyAKgMuBYQRVTBOBB919RzIC\nrQk1UktDsXxD0KA9Zcl6AM47pDs3/XBfCvJy0hyZZLNkJogXgc3AP8JJZwGt3P1HtY6ylnQGIQ3F\nc598xU2vzASga+vGjLn6cA1aJDWWzL6Y9nH3i9z9/fDvUqBv7UMUkXidNaQbU28ZTp/2zVi6bjuX\n/WMau0vL0h2W1HPxJIhPzeybS1rNbCgwKXUhiUgsbZrlM+bqw2nVJI8Jc1fzg79MYPaKTekOS+qx\neBLEUOBDM1tsZosJ7qQ+wsxmmtmMlEYnInuIRIwx1xxO55aNWbhmK8f9dSKTF6ftSnOp5+Jpg+he\n1Xx3X5LUiBKgNghpyF6eWsx1L00H4KdH9OKKo3rTvCAvzVFJNkhaG4S7LwmTwHaCq5g8mPzNdBFJ\ng9MO7MKj5waf8YfGL+SwP77P0x8tplRdi0uSVJsgzOxEM5sHLALGA4uBt1IcV3UxacAgEWB4/w5M\n/NVR9O/Ugg3bdvObV7/g+HsnMn/V5nSHJvVAPFVM04GjgXfdfZCZHQWcFV7NlFaqYhIJuDtrtuzi\ngic+4fNlQcP1D/brwP1nDyY3J66BI6UBSeZlrrvDwYEiZhZx9/eBgbWOUESSxsxo1zyfN35xGNcM\n70t+boQxX6yk7y1v8cx/VBMsNRNPgthgZs2ACcAzZvZXQAPtimSoq4b34ZNfD+eC7/egzOHmf31O\njxvf5MrnPmVnSanaKCRu8VQxNSVooI4A5wCFwDMaclQk8320YC2T5q/hvvfnfzOtXfN8xl5zOC0K\n8tQBYAOlzvpE5BuL1mzlrc9XsHTdNp77JOhWfGjP1rzw00PSHJmkQ7wJQp25iDQAPds25fIje1NS\nWsa+nVrw/pereH/Oanrc+CYpTxzRAAAP4klEQVSdCgt479ojadxIHQDKnnQGIdIArdq8g+c/Wcqi\nNVv516fLaNkkj1MHdeG/R/VPd2hSB3QGISKVat+8gCuP6cP2XaW0b57P+3NW8cx/ljC9eAM3jOzH\nkJ6t0x2iZIBKzyDMbCbBXdPfmUVwJ/WAVAYWD51BiCTHxwvX8uD4BXw4fy292zdjSM/W/Pyo3rRr\nnp/u0CQFknEGcUIS40mqqAGD0h2KSL1wcK82HNyrDde/NJ33vlzFEx8uZmdJGUf0bcdR/dqRn6v2\niYZIbRAisoeyMqfojndZt3UXAH88bQCnDOpMnu7IrjeSdie1mR1sZpPNbIuZ7TKzUjNTJ/Qi9VQk\nYrx37RG8c83h5ESMX708g31/8zZT1K14gxNPI/V9wJnAS0ARcC6guh2Reqxlk0a0bNKI+88exKzl\nm7j3vfncMXo2Pdo05bA+bTl1cJd0hyh1IK6rmNx9vpnluHsp8LiZfZjiuEQkA4zcvxMj+nfks+KN\nLF6zlblfb+aTResY0KWQds0LKGys8Sfqs3gSxDYzawR8ZmZ/BFYATVMblohkikjEeOrCIQDc+dZs\nHhq/kOF/nkCXVo354Iaj0xydpFI8rU4/CZe7AtgKdAVOTWVQIpKZLj+iN/efPZjjB3SieP12bv7X\nTO54cxabd+xOd2iSAvEkiJPdfYe7b3L329z9l2TwJbAikjqFTfI4fkAnzjqoGx1a5PPGjBU8MnER\nHy1Ie9+dkgLx9OY6zd0HV5j2qbsPSmlkcdBlriLpVbx+G8Puep/m+bnk5+VQkBfh8fMPok+H5ukO\nTapQ6xvlzOws4Gygp5m9FjWrBaCfCyJC55aNuWZ4X1Zu3sG2nSX8+7PlTJy3hhaN88jPjdCySaN0\nhyi1UFVXG92BnsCdwI1RszYDM9w97YMG6QxCJHNs21XC/reOoXw8IjN49effZ0CXlukNTL6j1mcQ\n7r4EWAIcYmYdgIPCWbMzITmISGZp0iiXJy4YQvH67azdspM/jZ3Lfxauo2l+LgZ0b9OUHA1QlFXi\naYP4EXA3MI6go77DgOvd/eWUR1cNnUGIZKaN23Yz8HfvEP31cvXwPlw9vG/6gpJvJLO771uAg9x9\nVbjhdsC7QNoThIhkpsImeTx/ycF8vWkHALe9Posla7exY3cpALkRI1d9O2W8eBJEpDw5hNYS3+Wx\nCTGzXsDNQKG7n5bs7YtI3Rraq803jx8av5B/fbqMf326DICOLQqYeMNR6gAww8WTIN42szHAc+Hz\nM4C34tm4mT1GcM/EKnffP2r6SOCvQA7wqLv/wd0XAheZmc5MROqZ20/aj8mL1wPw6VfreWfWSjZt\n302bZhpvIpNVmyDc/XozOxUYRtAG8bC7/yvO7T9B0NnfU+UTzCwHuB84FigGJpvZa+4+K8HYRSRL\nFPVoTVGPYJS6l6cW886slfz82WkU5OWQlxPhluP3pXsb9eCTaapNEGZ2l7vfALwSY1qV3H2CmfWo\nMHkIMD88Y8DMngdOAuJKEGZ2KXApQLdu3eJZRUQySFH3Vgzp0Zrtu0rZtL2EWSs2ceQ+7ZQgMlA8\nFYDHxph2XC322RlYGvW8GOhsZm3M7EFgkJndVNnK7v6wuxe5e1G7du1qEYaIpEOPtk158bJDePWK\nYbz8s0MA+GrdNhat2cqiNVtZvXlnmiOUclXdSf0z4HKgl5nNiJrVHJhUi33GuhDa3X0tcFkttisi\nWaZx2D3HQ+MX8tD4hQBEDD644Wj2atk4zdFJVVVMzxI0Rn/nTmp3r83QUsUEPcKW6wIsT2QDGpNa\npH4wM5695GC+WrsNgNkrNvHQhIWs3LRDCSIDVHUn9UZgI3BWkvc5GehjZj2BZQSj1Z2dyAbc/XXg\n9aKiokuSHJuI1LHB3VoxuFsrADoVFvDQhIV8vHAdm3YEHTY0bZTDgd1bYaa7sOtaXCPK1ZSZPQcc\nCbQ1s2LgVnf/u5ldAYwhuMz1MXf/IsHt6gxCpB5q36IAgLve/nKP6f/82aEc2L1VOkJq0KrtaiOT\nqasNkfpn3srN35w9LFi9hV+9PINHzy1ieP8OaY6s/khmVxsiInUmeiyJ5gXBV9SWnSXsLCn9ZnrE\nTHdh1wElCBHJWE0a5QBw9QufcfUL305vlBPhnz87lAO6FKYpsoYhKxOE2iBEGobOLRtz138dwJot\nu76ZtnbLLh6btIgl67YqQaRYViYIXcUk0jCYGWcctGePCV+t3cZjkxaxc3dZmqJqOLIyQYhIw5Wf\nF7Q9fL58I+1bfNvZX8cWBRoLO8myMkGoikmk4WqWn0tejvH4pMU8PmnxN9PzcyN8ftsP1HidRFmZ\nIFTFJNJwNc3PZew1R7Bmy7d9Nr02fTlPfbSEHbtLlSCSKCsThIg0bD3aNqVH2297f/1i+SYAdpWo\nXSKZlGpFJOs1yg2+ynYqQSSVziBEJOvlhwni7Ec+jlnFdMlhvTj9oK7fmS5Vy8oEoUZqEYl26N5t\nOWVQ5z3uti43Ye4axs9drQRRA1mZINRILSLROhYW8JczBsacN/KeCewqVdVTTagNQkTqtUa5EXYr\nQdSIEoSI1Gt5ORFKSrO31+p0ysoqJhGReOXlGMs2bOe5T76KOb9JoxyOP6ATubp/4juyMkGokVpE\n4rVXy8Z8vHAdN70ys9JlOrYoYGivNnUYVXbQgEEiUq+VljmrN++MOW/mso1c8tQUnrxwCEf0bVfH\nkaWPBgwSEQFyIkbHwoKY81Zu2gFAiRqxY1Klm4g0WDkRA6CkLHtrUlJJCUJEGqzyu65LlSBiUoIQ\nkQZLZxBVUxuEiDRYuWGCmDRvDdt2llS63Pd7t6Vr6yZ1FVbGyMoEoctcRSQZWjVpRKPcCC9MWcoL\nU5ZWutwJAzpx39mD6zCyzJCVCUJ9MYlIMhQ2yWPKLcPZWsXZw7l//4Qdu7/bCWBDkJUJQkQkWVoU\n5NGiIK/S+QV5OQ22EVuN1CIiVYhEjIbalZMShIhIFXIMynQGISIiFeVETFVMIiLyXREzSrO4z7ra\nUIIQEamCziBERCSmhpwgdJmriEgVciLG8g3buXvMnLjXMYNTBnWmV7tmKYws9bIyQehOahGpK/t2\nasHEeWt4YPyCuNcpLXO27Czh1lH7pTCy1NOAQSIiSTbo9ncY9b29uP2k/dMdSkzxDhikNggRkSSL\nmFGWxT++yylBiIgkmZlRD/KDEoSISLKZQX248EkJQkQkySIG2dy+W04JQkQkydQGISIiMRmoDUJE\nRL7LzNQGISIi3xWJqA1CRERiMIzsTw9KECIiSRcx1EgtIiLfFVEbhIiIxFRPziAypjdXM2sK/A3Y\nBYxz92fSHJKISI1EzKgPjRApPYMws8fMbJWZfV5h+kgzm2Nm883sxnDyqcDL7n4JcGIq4xIRSaX6\n0gaR6jOIJ4D7gKfKJ5hZDnA/cCxQDEw2s9eALsDMcLHSFMclIpIyhjF96QZ++cJnKdvH2UO7UdSj\ndcq2DylOEO4+wcx6VJg8BJjv7gsBzOx54CSCZNEF+IwqzmzM7FLgUoBu3bolP2gRkVo6vG9b3v7i\nayYvWZeyfYzYr2PKtl0uHW0QnYGlUc+LgaHAvcB9ZnY88HplK7v7w8DDEAwYlMI4RURq5Obj+3Pz\n8f3THUatpSNBWIxp7u5bgQvqOhgREYktHZe5FgNdo553AZYnsgEzG2VmD2/cuDGpgYmIyLfSkSAm\nA33MrKeZNQLOBF5LZAPu/rq7X1pYWJiSAEVEJPWXuT4HfATsY2bFZnaRu5cAVwBjgNnAi+7+RSrj\nEBGRxKX6KqazKpk+Ghhd0+2a2ShgVO/evWu6CRERqUZWdrWhKiYRkdTLygQhIiKpl5UJQlcxiYik\nnmXzqEdmthpYEj4tBCpmjIrTop+3BdakKLRYsSRzvaqWq2xePOUTa1p9KLPqllGZJb5MMsus4nyV\nWdXHVKzniZZZd3dvV+1S7l4v/oCHq5sW/RyYUpexJHO9qparbF485VNfy6y6ZVRm6S2zGOWnMqvi\nmKrLMsvKKqZKxOqeo+K0SrvwSLKa7ife9aparrJ58ZRPrGn1ocyqW0ZllvgyySyzuiqv2uwr3WWW\nlmMsq6uYasPMprh7UbrjyCYqs8SpzBKnMktcqsqsPp1BJOrhdAeQhVRmiVOZJU5llriUlFmDPYMQ\nEZGqNeQzCBERqYIShIiIxKQEISIiMSlBhMysqZk9aWaPmNk56Y4nG5hZLzP7u5m9nO5YsoWZnRwe\nY6+a2Yh0x5MNzGxfM3vQzF42s5+lO55sEH6fTTWzE2qznXqdIMzsMTNbZWafV5g+0szmmNl8M7sx\nnHwq8LK7XwKcWOfBZohEyszdF7r7RemJNHMkWGb/Do+x84Ez0hBuRkiwzGa7+2XA6UCDvPw1we8y\ngBuAF2u733qdIIAngJHRE8wsB7gfOA7oD5xlZv0JRrYrHyu7tA5jzDRPEH+ZSeAJEi+zW8L5DdUT\nJFBmZnYi8AHwf3UbZsZ4gjjLy8yGA7OAlbXdab1OEO4+AVhXYfIQYH7463cX8DxwEsFQqF3CZep1\nuVQlwTITEiszC9wFvOXu0+o61kyR6HHm7q+5+6FAg6z+TbC8jgIOBs4GLjGzGn+fpXTAoAzVmW/P\nFCBIDEOBe4H7zOx46vbW/2wQs8zMrA1wBzDIzG5y9zvTEl1mquw4+wUwHCg0s97u/mA6gstQlR1n\nRxJUAedTi4HG6qGY5eXuVwCY2fnAGncvq+kOGmKCsBjT3N23AhfUdTBZorIyWwtcVtfBZInKyuxe\ngh8j8l2Vldk4YFzdhpIVYpbXNw/cn6jtDhpiVUox0DXqeRdgeZpiyRYqs8SpzBKnMktMysurISaI\nyUAfM+tpZo2AM4HX0hxTplOZJU5lljiVWWJSXl71OkGY2XPAR8A+ZlZsZhe5ewlwBTAGmA286O5f\npDPOTKIyS5zKLHEqs8Skq7zUWZ+IiMRUr88gRESk5pQgREQkJiUIERGJSQlCRERiUoIQEZGYlCBE\nRCQmJQiROmRmvzWz69Idh0g8lCBEaijsmVWfIam3dHCLJMDMepjZbDP7GzAN+LuZTTGzL8zstqjl\nFpvZbWY2zcxmmlm/GNu6xMzeMrPGdfkaROKlBCGSuH2Ap9x9EHCtuxcBA4AjzGxA1HJr3H0w8ACw\nR7WSmV0BjAJOdvftdRS3SEKUIEQSt8TdPw4fn25m04BPgf0IRvYq90r4fyrQI2r6TwhGAfsvd9+Z\n4lhFakwJQiRxWwHMrCfBmcEx7j4AeBMoiFqu/Mu/lD3HXvmcIGF0QSSDKUGI1FwLgmSx0cw6EJwV\nxONT4KfAa2a2V6qCE6ktJQiRGnL36QRf9l8AjwGTElj3A4KzjzfNrG1qIhSpHXX3LSIiMekMQkRE\nYlKCEBGRmJQgREQkJiUIERGJSQlCRERiUoIQEZGYlCBERCQmJQgREYnp/wPg34tFx7LZUAAAAABJ\nRU5ErkJggg==\n",
344 |       "text/plain": [
345 |        "<matplotlib.figure.Figure at 0x1074f3ef0>"
346 |       ]
347 |      },
348 |      "metadata": {},
349 |      "output_type": "display_data"
350 |     }
351 |    ],
352 |    "source": [
353 |     "import matplotlib.pyplot as plt\n",
354 |     "%matplotlib inline\n",
355 |     "plt.plot([vocabcount[w] for w in vocab]);\n",
356 |     "plt.gca().set_xscale(\"log\", nonposx='clip')\n",
357 |     "plt.gca().set_yscale(\"log\", nonposy='clip')\n",
358 |     "plt.title('word distribution in headlines and discription')\n",
359 |     "plt.xlabel('rank')\n",
360 |     "plt.ylabel('total appearances');"
361 |    ]
362 |   },
363 |   {
364 |    "cell_type": "code",
365 |    "execution_count": 66,
366 |    "metadata": {
367 |     "collapsed": true
368 |    },
369 |    "outputs": [],
370 |    "source": [
371 |     "empty = 0 # RNN mask of no data\n",
372 |     "eos = 1  # end of sentence\n",
373 |     "start_idx = eos+1 # first real word\n",
374 |     "\n",
375 |     "def get_idx(vocab, vocabcount):\n",
376 |     "    word2idx = dict((word, idx+start_idx) for idx,word in enumerate(vocab))\n",
377 |     "    word2idx['<empty>'] = empty\n",
378 |     "    word2idx['<eos>'] = eos\n",
379 |     "    \n",
380 |     "    idx2word = dict((idx,word) for word,idx in word2idx.items())\n",
381 |     "\n",
382 |     "    return word2idx, idx2word"
383 |    ]
384 |   },
385 |   {
386 |    "cell_type": "code",
387 |    "execution_count": 67,
388 |    "metadata": {
389 |     "collapsed": true
390 |    },
391 |    "outputs": [],
392 |    "source": [
393 |     "word2idx, idx2word = get_idx(vocab, vocabcount)"
394 |    ]
395 |   },
396 |   {
397 |    "cell_type": "markdown",
398 |    "metadata": {},
399 |    "source": [
400 |     "Read Glove"
401 |    ]
402 |   },
403 |   {
404 |    "cell_type": "code",
405 |    "execution_count": 68,
406 |    "metadata": {
407 |     "collapsed": true
408 |    },
409 |    "outputs": [],
410 |    "source": [
411 |     "FN  =  'vocabulary-embedding'\n",
412 |     "seed=42\n",
413 |     "vocab_size = 40000\n",
414 |     "embedding_dim = 100\n",
415 |     "lower = False # dont lower case the text"
416 |    ]
417 |   },
418 |   {
419 |    "cell_type": "code",
420 |    "execution_count": 69,
421 |    "metadata": {},
422 |    "outputs": [
423 |     {
424 |      "name": "stdout",
425 |      "output_type": "stream",
426 |      "text": [
427 |       "unzip:  cannot find or open /Users/michellezhao/.keras/datasets//Users/michellezhao/.keras/datasets/glove.6B.zip, /Users/michellezhao/.keras/datasets//Users/michellezhao/.keras/datasets/glove.6B.zip.zip or /Users/michellezhao/.keras/datasets//Users/michellezhao/.keras/datasets/glove.6B.zip.ZIP.\r\n"
428 |      ]
429 |     }
430 |    ],
431 |    "source": [
432 |     "fname = 'glove.6B.%dd.txt'%embedding_dim\n",
433 |     "from keras.utils.data_utils import get_file\n",
434 |     "import os\n",
435 |     "datadir_base = os.path.expanduser(os.path.join('~', '.keras'))\n",
436 |     "if not os.access(datadir_base, os.W_OK):\n",
437 |     "    datadir_base = os.path.join('/tmp', '.keras')\n",
438 |     "datadir = os.path.join(datadir_base, 'datasets')\n",
439 |     "glove_name = os.path.join(datadir, fname)\n",
440 |     "if not os.path.exists(glove_name):\n",
441 |     "    path = 'glove.6B.zip'\n",
442 |     "    path = get_file(path, origin=\"http://nlp.stanford.edu/data/glove.6B.zip\")\n",
443 |     "    !unzip {datadir}/{path}"
444 |    ]
445 |   },
446 |   {
447 |    "cell_type": "code",
448 |    "execution_count": 70,
449 |    "metadata": {
450 |     "collapsed": true
451 |    },
452 |    "outputs": [],
453 |    "source": [
454 |     "import zipfile\n",
455 |     "zip = zipfile.ZipFile(path)\n",
456 |     "zip.extractall()"
457 |    ]
458 |   },
459 |   {
460 |    "cell_type": "code",
461 |    "execution_count": 71,
462 |    "metadata": {
463 |     "collapsed": true
464 |    },
465 |    "outputs": [],
466 |    "source": [
467 |     "datadir = ''\n",
468 |     "glove_name = os.path.join(datadir, fname)\n",
469 |     "glove_n_symbols = sum(1 for line in open(glove_name))"
470 |    ]
471 |   },
472 |   {
473 |    "cell_type": "code",
474 |    "execution_count": 74,
475 |    "metadata": {},
476 |    "outputs": [
477 |     {
478 |      "data": {
479 |       "text/plain": [
480 |        "400000"
481 |       ]
482 |      },
483 |      "execution_count": 74,
484 |      "metadata": {},
485 |      "output_type": "execute_result"
486 |     }
487 |    ],
488 |    "source": [
489 |     "glove_n_symbols"
490 |    ]
491 |   },
492 |   {
493 |    "cell_type": "code",
494 |    "execution_count": 75,
495 |    "metadata": {},
496 |    "outputs": [],
497 |    "source": [
498 |     "import numpy as np\n",
499 |     "glove_index_dict = {}\n",
500 |     "glove_embedding_weights = np.empty((glove_n_symbols, embedding_dim))\n",
501 |     "globale_scale=.1\n",
502 |     "with open(glove_name, 'r') as fp:\n",
503 |     "    i = 0\n",
504 |     "    for l in fp:\n",
505 |     "        l = l.strip().split()\n",
506 |     "        w = l[0]\n",
507 |     "        glove_index_dict[w] = i\n",
508 |     "        glove_embedding_weights[i,:] = list(map(float,l[1:]))\n",
509 |     "        i += 1\n",
510 |     "glove_embedding_weights *= globale_scale"
511 |    ]
512 |   },
513 |   {
514 |    "cell_type": "code",
515 |    "execution_count": 76,
516 |    "metadata": {},
517 |    "outputs": [
518 |     {
519 |      "data": {
520 |       "text/plain": [
521 |        "0.040815727600190289"
522 |       ]
523 |      },
524 |      "execution_count": 76,
525 |      "metadata": {},
526 |      "output_type": "execute_result"
527 |     }
528 |    ],
529 |    "source": [
530 |     "glove_embedding_weights.std()"
531 |    ]
532 |   },
533 |   {
534 |    "cell_type": "code",
535 |    "execution_count": 77,
536 |    "metadata": {
537 |     "collapsed": true
538 |    },
539 |    "outputs": [],
540 |    "source": [
541 |     "for w, i in  glove_index_dict.items():\n",
542 |     "    w = w.lower()\n",
543 |     "    if w not in glove_index_dict:\n",
544 |     "        glove_index_dict[w] = i"
545 |    ]
546 |   },
547 |   {
548 |    "cell_type": "code",
549 |    "execution_count": 78,
550 |    "metadata": {},
551 |    "outputs": [
552 |     {
553 |      "name": "stdout",
554 |      "output_type": "stream",
555 |      "text": [
556 |       "random-embedding/glove scale 0.0706949139514 std 0.0408138249575\n",
557 |       "number of tokens, in small vocab, found in glove and copied to embedding 7960 0.199\n"
558 |      ]
559 |     }
560 |    ],
561 |    "source": [
562 |     "# generate random embedding with same scale as glove\n",
563 |     "np.random.seed(seed)\n",
564 |     "shape = (vocab_size, embedding_dim)\n",
565 |     "scale = glove_embedding_weights.std()*np.sqrt(12)/2 # uniform and not normal\n",
566 |     "embedding = np.random.uniform(low=-scale, high=scale, size=shape)\n",
567 |     "print ('random-embedding/glove scale', scale, 'std', embedding.std())\n",
568 |     "\n",
569 |     "# copy from glove weights of words that appear in our short vocabulary (idx2word)\n",
570 |     "c = 0\n",
571 |     "for i in range(vocab_size):\n",
572 |     "    #print(i)\n",
573 |     "    if(i not in idx2word):\n",
574 |     "        continue\n",
575 |     "    w = idx2word[i]\n",
576 |     "    #print(w)\n",
577 |     "    g = glove_index_dict.get(w, glove_index_dict.get(w.lower()))\n",
578 |     "    if g is None and w.startswith('#'): # glove has no hastags (I think...)\n",
579 |     "        w = w[1:]\n",
580 |     "        g = glove_index_dict.get(w, glove_index_dict.get(w.lower()))\n",
581 |     "    if g is not None:\n",
582 |     "        embedding[i,:] = glove_embedding_weights[g,:]\n",
583 |     "        c+=1\n",
584 |     "print ('number of tokens, in small vocab, found in glove and copied to embedding', c,c/float(vocab_size))\n"
585 |    ]
586 |   },
587 |   {
588 |    "cell_type": "code",
589 |    "execution_count": 79,
590 |    "metadata": {
591 |     "collapsed": true
592 |    },
593 |    "outputs": [],
594 |    "source": [
595 |     "glove_thr = 0.5\n",
596 |     "word2glove = {}\n",
597 |     "for w in word2idx:\n",
598 |     "    if w in glove_index_dict:\n",
599 |     "        g = w\n",
600 |     "    elif w.lower() in glove_index_dict:\n",
601 |     "        g = w.lower()\n",
602 |     "    elif w.startswith('#') and w[1:] in glove_index_dict:\n",
603 |     "        g = w[1:]\n",
604 |     "    elif w.startswith('#') and w[1:].lower() in glove_index_dict:\n",
605 |     "        g = w[1:].lower()\n",
606 |     "    else:\n",
607 |     "        continue\n",
608 |     "    word2glove[w] = g"
609 |    ]
610 |   },
611 |   {
612 |    "cell_type": "code",
613 |    "execution_count": 80,
614 |    "metadata": {},
615 |    "outputs": [
616 |     {
617 |      "name": "stdout",
618 |      "output_type": "stream",
619 |      "text": [
620 |       "# of glove substitutes found 0\n"
621 |      ]
622 |     }
623 |    ],
624 |    "source": [
625 |     "normed_embedding = embedding/np.array([np.sqrt(np.dot(gweight,gweight)) for gweight in embedding])[:,None]\n",
626 |     "\n",
627 |     "nb_unknown_words = 100\n",
628 |     "\n",
629 |     "glove_match = []\n",
630 |     "for w,idx in word2idx.items():\n",
631 |     "    if idx >= vocab_size-nb_unknown_words and w.isalpha() and w in word2glove:\n",
632 |     "        gidx = glove_index_dict[word2glove[w]]\n",
633 |     "        gweight = glove_embedding_weights[gidx,:].copy()\n",
634 |     "        # find row in embedding that has the highest cos score with gweight\n",
635 |     "        gweight /= np.sqrt(np.dot(gweight,gweight))\n",
636 |     "        score = np.dot(normed_embedding[:vocab_size-nb_unknown_words], gweight)\n",
637 |     "        while True:\n",
638 |     "            embedding_idx = score.argmax()\n",
639 |     "            s = score[embedding_idx]\n",
640 |     "            if s < glove_thr:\n",
641 |     "                break\n",
642 |     "            if idx2word[embedding_idx] in word2glove :\n",
643 |     "                glove_match.append((w, embedding_idx, s)) \n",
644 |     "                break\n",
645 |     "            score[embedding_idx] = -1\n",
646 |     "glove_match.sort(key = lambda x: -x[2])\n",
647 |     "print ('# of glove substitutes found', len(glove_match))"
648 |    ]
649 |   },
650 |   {
651 |    "cell_type": "code",
652 |    "execution_count": 81,
653 |    "metadata": {},
654 |    "outputs": [],
655 |    "source": [
656 |     "for orig, sub, score in glove_match[-10:]:\n",
657 |     "    print (score, orig,'=>', idx2word[sub])"
658 |    ]
659 |   },
660 |   {
661 |    "cell_type": "code",
662 |    "execution_count": 82,
663 |    "metadata": {
664 |     "collapsed": true
665 |    },
666 |    "outputs": [],
667 |    "source": [
668 |     "glove_idx2idx = dict((word2idx[w],embedding_idx) for  w, embedding_idx, _ in glove_match)"
669 |    ]
670 |   },
671 |   {
672 |    "cell_type": "code",
673 |    "execution_count": 83,
674 |    "metadata": {},
675 |    "outputs": [
676 |     {
677 |      "data": {
678 |       "text/plain": [
679 |        "379"
680 |       ]
681 |      },
682 |      "execution_count": 83,
683 |      "metadata": {},
684 |      "output_type": "execute_result"
685 |     }
686 |    ],
687 |    "source": [
688 |     "Y = [[word2idx[token] for token in title.split()] for title in titles]\n",
689 |     "len(Y)"
690 |    ]
691 |   },
692 |   {
693 |    "cell_type": "code",
694 |    "execution_count": 84,
695 |    "metadata": {},
696 |    "outputs": [
697 |     {
698 |      "data": {
699 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXUAAAD8CAYAAACINTRsAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAC65JREFUeJzt3WGIpIddx/HvzzNBaYUkzSYcucSN\nEkqDYCJLCEQkpFajEXNCUxqknBA5X7SSomDPvrGCwlW07RsRThM8oTYNbfVCU9AQE6pQontpbBKP\ncjGc9cxxdyUNbd4oaf6+2OfwvN29nZmd3dn9z/cDYed5dibz58mT7z33zDwzqSokST38wKwHkCRN\nj1GXpEaMuiQ1YtQlqRGjLkmNGHVJasSoS1IjRl2SGjHqktTID27nk1177bW1uLi4nU8pSbve8ePH\nv11VC6Pcd1ujvri4yPLy8nY+pSTtekn+Y9T7evpFkhox6pLUiFGXpEaMuiQ1YtQlqRGjLkmNGHVJ\nasSoS1IjRl2SGtnWK0q1tsVDT665/tTh+7Z5Ekm7nUfqktSIUZekRoy6JDVi1CWpEaMuSY0YdUlq\nxKhLUiNGXZIaMeqS1IhRl6RGjLokNWLUJakRoy5JjfgpjXPAT4GU5odH6pLUiFGXpEaMuiQ1YtQl\nqZGRo55kT5KvJ/nysHxzkueSnEzy+SRXbt2YkqRRjHOk/jBw4qLlTwKfrqpbgO8AD01zMEnS+EaK\nepJ9wH3AXwzLAe4BvjDc5SiwfysGlCSNbtQj9c8AvwO8PSy/C3ijqt4alk8DN0x5NknSmDa8+CjJ\nLwHnqup4krsvrF7jrrXO4w8CBwFuuummCcfUdvJiJWn3GuVI/S7gl5OcAh5j5bTLZ4Crklz4Q2Ef\n8NpaD66qI1W1VFVLCwsLUxhZkrSeDaNeVb9bVfuqahH4IPAPVfWrwDPA+4e7HQCObdmUkqSRbOZ9\n6h8DfivJK6ycY39kOiNJkiY11gd6VdWzwLPD7VeBO6Y/kiRpUl5RKkmNGHVJasSoS1IjRl2SGjHq\nktSIUZekRoy6JDVi1CWpEaMuSY2MdUWpdgY/RVHSejxSl6RGjLokNWLUJakRoy5JjRh1SWrEqEtS\nI0Zdkhox6pLUiFGXpEaMuiQ1YtQlqRGjLkmNGHVJasSoS1IjRl2SGjHqktSIUZekRvzmo03wG4g2\n5jaStpdH6pLUiFGXpEaMuiQ1YtQlqRGjLkmNGHVJasSoS1IjRl2SGjHqktTIhlFP8kNJ/jnJvyZ5\nOcnvD+tvTvJckpNJPp/kyq0fV5J0OaMcqf83cE9V/SRwG3BvkjuBTwKfrqpbgO8AD23dmJKkUWwY\n9Vrx5rB4xfBPAfcAXxjWHwX2b8mEkqSRjXROPcmeJC8A54CngH8H3qiqt4a7nAZu2JoRJUmjGinq\nVfX9qroN2AfcAbxnrbut9dgkB5MsJ1k+f/785JNKkjY01rtfquoN4FngTuCqJBc+uncf8No6jzlS\nVUtVtbSwsLCZWSVJGxjl3S8LSa4abv8w8LPACeAZ4P3D3Q4Ax7ZqSEnSaEb5koy9wNEke1j5Q+Dx\nqvpykn8DHkvyB8DXgUe2cE5J0gg2jHpVfQO4fY31r7Jyfl2StEN4RakkNWLUJakRoy5JjRh1SWrE\nqEtSI0Zdkhox6pLUiFGXpEaMuiQ1YtQlqRGjLkmNGHVJasSoS1IjRl2SGjHqktSIUZekRoy6JDVi\n1CWpkVG+o1TaNouHnlxz/anD923zJNLu5JG6JDVi1CWpEaMuSY0YdUlqxKhLUiNGXZIaMeqS1IhR\nl6RGjLokNWLUJakRoy5JjRh1SWrEqEtSI0Zdkhox6pLUiFGXpEaMuiQ14jcfaVfzm5Kk/2/DI/Uk\nNyZ5JsmJJC8neXhYf02Sp5KcHH5evfXjSpIuZ5TTL28Bv11V7wHuBD6c5FbgEPB0Vd0CPD0sS5Jm\naMOoV9WZqnp+uP094ARwA3A/cHS421Fg/1YNKUkazVgvlCZZBG4HngOur6ozsBJ+4LppDydJGs/I\nUU/yTuCLwEer6rtjPO5gkuUky+fPn59kRknSiEaKepIrWAn6Z6vqS8Pqs0n2Dr/fC5xb67FVdaSq\nlqpqaWFhYRozS5LWMcq7XwI8Apyoqk9d9KsngAPD7QPAsemPJ0kaxyjvU78L+BDwYpIXhnUfBw4D\njyd5CPgW8MDWjChJGtWGUa+qfwKyzq/fO91xJEmb4ccESFIjRl2SGjHqktSIUZekRoy6JDVi1CWp\nEaMuSY0YdUlqxKhLUiNGXZIaMeqS1IhRl6RGjLokNWLUJakRoy5JjRh1SWrEqEtSI0ZdkhoZ5TtK\nd4TFQ0+uuf7U4ft2xb9fO5v//dWFR+qS1IhRl6RGjLokNWLUJakRoy5JjRh1SWrEqEtSI0Zdkhox\n6pLUiFGXpEaMuiQ1YtQlqRGjLkmNGHVJasSoS1IjRl2SGjHqktTIrvnmI2kn8ZuStFNteKSe5NEk\n55K8dNG6a5I8leTk8PPqrR1TkjSKUU6//CVw7yXrDgFPV9UtwNPDsiRpxjaMelV9FXj9ktX3A0eH\n20eB/VOeS5I0gUlfKL2+qs4ADD+vm95IkqRJbfm7X5IcTLKcZPn8+fNb/XSSNNcmjfrZJHsBhp/n\n1rtjVR2pqqWqWlpYWJjw6SRJo5g06k8AB4bbB4Bj0xlHkrQZo7yl8XPA14B3Jzmd5CHgMPC+JCeB\n9w3LkqQZ2/Dio6p6cJ1fvXfKs0jaAl4oNV/8mABJasSoS1IjRl2SGjHqktSIUZekRoy6JDVi1CWp\nEaMuSY0YdUlqpO3X2XkVnXa7We3D/r+zu3mkLkmNGHVJasSoS1Ijbc+pSzuN56q1HTxSl6RGjLok\nNWLUJakRoy5JjRh1SWrEqEtSI0Zdkhox6pLUiBcfSdoUL6raWTxSl6RGjLokNWLUJakRoy5JjfhC\nqaRdyxdpV/NIXZIaMeqS1IhRl6RGjLokNeILpZI0JTvhhVuP1CWpEaMuSY0YdUlqxHPqkrbdTjj3\nPIrdMufFNnWknuTeJN9M8kqSQ9MaSpI0mYmjnmQP8KfALwC3Ag8muXVag0mSxreZI/U7gFeq6tWq\n+h/gMeD+6YwlSZrEZqJ+A/CfFy2fHtZJkmYkVTXZA5MHgJ+vql8flj8E3FFVv3nJ/Q4CB4fFdwPf\nnHzcXeFa4NuzHmKHcZus5jZZzW2y2oVt8qNVtTDKAzbz7pfTwI0XLe8DXrv0TlV1BDiyiefZVZIs\nV9XSrOfYSdwmq7lNVnObrDbJNtnM6Zd/AW5JcnOSK4EPAk9s4t8nSdqkiY/Uq+qtJB8B/g7YAzxa\nVS9PbTJJ0tg2dfFRVX0F+MqUZulibk41jcFtsprbZDW3yWpjb5OJXyiVJO08fvaLJDVi1Kckyakk\nLyZ5IcnyrOeZlSSPJjmX5KWL1l2T5KkkJ4efV89yxu22zjb5RJL/GvaXF5L84ixn3G5JbkzyTJIT\nSV5O8vCwfm73lctsk7H2FU+/TEmSU8BSVc31+2yT/AzwJvBXVfUTw7o/Al6vqsPDZwRdXVUfm+Wc\n22mdbfIJ4M2q+uNZzjYrSfYCe6vq+SQ/AhwH9gO/xpzuK5fZJh9gjH3FI3VNVVV9FXj9ktX3A0eH\n20dZ2VHnxjrbZK5V1Zmqen64/T3gBCtXpM/tvnKZbTIWoz49Bfx9kuPDVbT6P9dX1RlY2XGB62Y8\nz07xkSTfGE7PzM1phkslWQRuB57DfQVYtU1gjH3FqE/PXVX1U6x8auWHh79yS+v5M+DHgduAM8Cf\nzHac2UjyTuCLwEer6ruznmcnWGObjLWvGPUpqarXhp/ngL9h5VMsteLscL7wwnnDczOeZ+aq6mxV\nfb+q3gb+nDncX5JcwUq8PltVXxpWz/W+stY2GXdfMepTkOQdwwsbJHkH8HPAS5d/1Fx5Ajgw3D4A\nHJvhLDvChXANfoU521+SBHgEOFFVn7roV3O7r6y3TcbdV3z3yxQk+TFWjs5h5Srdv66qP5zhSDOT\n5HPA3ax8utxZ4PeAvwUeB24CvgU8UFVz88LhOtvkblb+Ol3AKeA3LpxLngdJfhr4R+BF4O1h9cdZ\nOYc8l/vKZbbJg4yxrxh1SWrE0y+S1IhRl6RGjLokNWLUJakRoy5JjRh1SWrEqEtSI0Zdkhr5XzjN\n5muAK5tRAAAAAElFTkSuQmCC\n",
700 |       "text/plain": [
701 |        "<matplotlib.figure.Figure at 0x105acc710>"
702 |       ]
703 |      },
704 |      "metadata": {},
705 |      "output_type": "display_data"
706 |     }
707 |    ],
708 |    "source": [
709 |     "plt.hist(list(map(len,Y)),bins=50);"
710 |    ]
711 |   },
712 |   {
713 |    "cell_type": "code",
714 |    "execution_count": 85,
715 |    "metadata": {},
716 |    "outputs": [
717 |     {
718 |      "data": {
719 |       "text/plain": [
720 |        "379"
721 |       ]
722 |      },
723 |      "execution_count": 85,
724 |      "metadata": {},
725 |      "output_type": "execute_result"
726 |     }
727 |    ],
728 |    "source": [
729 |     "X = [[word2idx[token] for token in text.split()] for text in texts]\n",
730 |     "len(X)"
731 |    ]
732 |   },
733 |   {
734 |    "cell_type": "code",
735 |    "execution_count": 86,
736 |    "metadata": {},
737 |    "outputs": [
738 |     {
739 |      "data": {
740 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAADpRJREFUeJzt3W+MZfVdx/H3V0qpKUSgO5ANsA4Q\nUkuMLmTckKxpsNiWgnEhoQZiYB9gttFiIGJ0aRPFByZohBqThmYRZFVKWwsE0mJbAhjSRMFZusCS\nLQXbVYHN7hLaQp9Uga8P7m9g3Myf+3fu2S/vVzK555577tzP/nbmM+eee373RmYiSarlZ6YdQJI0\nfpa7JBVkuUtSQZa7JBVkuUtSQZa7JBVkuUtSQauWe0ScFhGPRsTeiHg2Iq5t62+MiJciYnf7umjy\ncSVJ/YjVJjFFxHpgfWY+GRHHAbuAS4DfAn6SmX81+ZiSpEG8Z7UNMnM/sL8tvx4Re4FThnmwdevW\n5ezs7DB3laR3rV27dr2SmTOD3GfVcl8sImaBc4DHgc3ANRFxFTAPXJ+ZP1zp/rOzs8zPzw/ykJL0\nrhcR/znoffp+QTUijgXuAa7LzNeAW4EzgY309uxvXuZ+2yJiPiLmDx06NGg+SdIQ+ir3iDiaXrHf\nlZn3AmTmgcx8MzPfAm4DNi1138zckZlzmTk3MzPQswpJ0pD6OVsmgNuBvZl5y6L16xdtdimwZ/zx\nJEnD6OeY+2bgSuCZiNjd1n0GuCIiNgIJ7AM+NZGEkqSB9XO2zLeBWOKmB8cfR5I0Ds5QlaSCLHdJ\nKshyl6SCLHdJKmigGaqqZXb715dcv++mi9c4iaRxc89dkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWp\nIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtd\nkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpoFXLPSJO\ni4hHI2JvRDwbEde29SdGxEMR8Xy7PGHycSVJ/ehnz/0N4PrM/BBwHvDpiDgb2A48nJlnAQ+365Kk\nDli13DNzf2Y+2ZZfB/YCpwBbgJ1ts53AJZMKKUkazEDH3CNiFjgHeBw4OTP3Q+8PAHDSMvfZFhHz\nETF/6NCh0dJKkvrSd7lHxLHAPcB1mflav/fLzB2ZOZeZczMzM8NklCQNqK9yj4ij6RX7XZl5b1t9\nICLWt9vXAwcnE1GSNKh+zpYJ4HZgb2besuimB4CtbXkrcP/440mShvGePrbZDFwJPBMRu9u6zwA3\nAV+JiKuB/wI+OZmIkqRBrVrumfltIJa5+YLxxpEkjYMzVCWpIMtdkgqy3CWpoH5eUJWmbnb71wfa\nft9NF08oiXRkcM9dkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgryPHeVtNx58YOe/77S+fWe\nS68uc89dkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpICcxaSwGnTQ0rklGkpbmnrsk\nFWS5S1JBlrskFWS5S1JBlrskFWS5S1JBlrskFWS5S1JBTmI6AjkBaHiOnd4t3HOXpIIsd0kqyHKX\npIIsd0kqaNVyj4g7IuJgROxZtO7GiHgpIna3r4smG1OSNIh+9tzvBC5cYv3nMnNj+3pwvLEkSaNY\ntdwz8zHg1TXIIkkak1HOc78mIq4C5oHrM/OHS20UEduAbQAbNmwY4eG0VpY7Fxw8H1w6Ugz7guqt\nwJnARmA/cPNyG2bmjsycy8y5mZmZIR9OkjSIoco9Mw9k5puZ+RZwG7BpvLEkSaMYqtwjYv2iq5cC\ne5bbVpK09lY95h4RdwPnA+si4kXgT4HzI2IjkMA+4FMTzChJGtCq5Z6ZVyyx+vYJZJEkjYkzVCWp\nIMtdkgqy3CWpID+soxA/iGJtOd7qMvfcJakgy12SCrLcJakgy12SCrLcJakgy12SCrLcJakgy12S\nCnISkzRmTm5SF7jnLkkFWe6SVJDlLkkFWe6SVJDlLkkFWe6SVJDlLkkFeZ77BHTtPOfl8kiqyz13\nSSrIcpekgix3SSrIcpekgix3SSrIcpekgix3SSrIcpekgpzE1GEVJh8N+m+o8G+WusA9d0kqyHKX\npIIsd0kqyHKXpIJWLfeIuCMiDkbEnkXrToyIhyLi+XZ5wmRjSpIG0c+e+53AhYet2w48nJlnAQ+3\n65Kkjli13DPzMeDVw1ZvAXa25Z3AJWPOJUkawbDH3E/OzP0A7fKk8UWSJI1q4pOYImIbsA1gw4YN\nk344aShOnlI1w+65H4iI9QDt8uByG2bmjsycy8y5mZmZIR9OkjSIYcv9AWBrW94K3D+eOJKkcejn\nVMi7gX8FPhgRL0bE1cBNwEcj4nngo+26JKkjVj3mnplXLHPTBWPOIkkaE2eoSlJBlrskFWS5S1JB\nlrskFWS5S1JBlrskFWS5S1JBlrskFWS5S1JBlrskFWS5S1JBlrskFTTxD+tQLX6oxfCWG7t9N128\nxkn0buCeuyQVZLlLUkGWuyQVZLlLUkGWuyQVZLlLUkGWuyQVdMSc5+45wpLUP/fcJakgy12SCrLc\nJakgy12SCrLcJakgy12SCrLcJakgy12SCjpiJjF10aAfXOFELElrxT13SSrIcpekgix3SSrIcpek\ngkZ6QTUi9gGvA28Cb2Tm3DhCSZJGM46zZX4tM18Zw/eRJI2Jh2UkqaBRyz2Bb0XErojYNo5AkqTR\njXpYZnNmvhwRJwEPRcR3M/OxxRu00t8GsGHDhhEfrqZBJ0NJgxh08pyT7WoYac89M19ulweB+4BN\nS2yzIzPnMnNuZmZmlIeTJPVp6HKPiPdHxHELy8DHgD3jCiZJGt4oh2VOBu6LiIXv88XM/MZYUkmS\nRjJ0uWfm94FfHmMWSdKYeCqkJBVkuUtSQZa7JBXkh3X0wfPQVZE/16s7ks/5d89dkgqy3CWpIMtd\nkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpICcxSVNW+cM0Bp0o1cV/w5HKPXdJKshyl6SCLHdJKshy\nl6SCLHdJKshyl6SCLHdJKshyl6SCyk5iGuZTZpxAoS4Z9Gd40p+sVGHy1KATw45k7rlLUkGWuyQV\nZLlLUkGWuyQVZLlLUkGWuyQVZLlLUkFlz3MfRsVzXaUKxnWO/bTmAsDazwdwz12SCrLcJakgy12S\nCrLcJamgkco9Ii6MiOci4oWI2D6uUJKk0Qxd7hFxFPB54BPA2cAVEXH2uIJJkoY3yp77JuCFzPx+\nZv4P8CVgy3hiSZJGMUq5nwL896LrL7Z1kqQpi8wc7o4RnwQ+npm/065fCWzKzN8/bLttwLZ29YPA\nc0M83DrglaGCTl6Xs4H5RmW+0XQ5X5ezwf/P9/OZOTPInUeZofoicNqi66cCLx++UWbuAHaM8DhE\nxHxmzo3yPSaly9nAfKMy32i6nK/L2WD0fKMclvl34KyIOD0i3gtcDjwwwveTJI3J0HvumflGRFwD\nfBM4CrgjM58dWzJJ0tBGeuOwzHwQeHBMWVYy0mGdCetyNjDfqMw3mi7n63I2GPVw9rAvqEqSusu3\nH5Ckgjpd7l18e4OI2BcRz0TE7oiYb+tOjIiHIuL5dnnCGua5IyIORsSeReuWzBM9f9PG8+mIOHdK\n+W6MiJfaGO6OiIsW3XZDy/dcRHx8wtlOi4hHI2JvRDwbEde29Z0YvxXydWX83hcRT0TEUy3fn7X1\np0fE4238vtxOuCAijmnXX2i3z04p350R8YNF47exrZ/G78dREfGdiPhauz6+scvMTn7Re5H2P4Az\ngPcCTwFndyDXPmDdYev+EtjelrcDf7GGeT4MnAvsWS0PcBHwz0AA5wGPTynfjcAfLrHt2e3/+Rjg\n9Pb/f9QEs60Hzm3LxwHfaxk6MX4r5OvK+AVwbFs+Gni8jctXgMvb+i8Av9uWfw/4Qlu+HPjyhMdv\nuXx3Apctsf00fj/+APgi8LV2fWxj1+U99yPp7Q22ADvb8k7gkrV64Mx8DHi1zzxbgL/Pnn8Djo+I\n9VPIt5wtwJcy86eZ+QPgBXo/B5PKtj8zn2zLrwN76c2y7sT4rZBvOWs9fpmZP2lXj25fCXwE+Gpb\nf/j4LYzrV4ELIiKmkG85a/r/GxGnAhcDf9uuB2Mcuy6Xe1ff3iCBb0XErujNvgU4OTP3Q+8XEjhp\naulWztOlMb2mPfW9Y9FhrKnla09zz6G3d9e58TssH3Rk/Nphhd3AQeAhes8WfpSZbyyR4e187fYf\nAx9Yy3yZuTB+f97G73MRcczh+ZbIPgl/DfwR8Fa7/gHGOHZdLvel/ip14dSezZl5Lr13w/x0RHx4\n2oEG0JUxvRU4E9gI7Adubuunki8ijgXuAa7LzNdW2nSJddPI15nxy8w3M3MjvRnqm4APrZBh6vki\n4heBG4BfAH4FOBH447XOFxG/ARzMzF2LV6/w+ANn63K59/X2BmstM19ulweB++j9QB9YePrWLg9O\nLyGskKcTY5qZB9ov3VvAbbxz6GDN80XE0fSK867MvLet7sz4LZWvS+O3IDN/BPwLvWPVx0fEwhya\nxRneztdu/zn6P2Q3rnwXtsNdmZk/Bf6O6YzfZuA3I2IfvUPOH6G3Jz+2setyuXfu7Q0i4v0RcdzC\nMvAxYE/LtbVtthW4fzoJ37ZcngeAq9pZAecBP144/LCWDjuOeSm9MVzId3k7M+B04CzgiQnmCOB2\nYG9m3rLopk6M33L5OjR+MxFxfFv+WeDX6b0u8ChwWdvs8PFbGNfLgEeyvUK4hvm+u+gPd9A7pr14\n/Nbk/zczb8jMUzNzll63PZKZv804x27SrwaP8kXv1evv0TuO99kO5DmD3tkITwHPLmSid+zrYeD5\ndnniGma6m95T8/+l99f96uXy0Htq9/k2ns8Ac1PK9w/t8Z9uP7TrF23/2ZbvOeATE872q/Se2j4N\n7G5fF3Vl/FbI15Xx+yXgOy3HHuBPFv2ePEHvBd1/Ao5p69/Xrr/Qbj9jSvkeaeO3B/hH3jmjZs1/\nP9rjns87Z8uMbeycoSpJBXX5sIwkaUiWuyQVZLlLUkGWuyQVZLlLUkGWuyQVZLlLUkGWuyQV9H+4\nytjNNBPLlwAAAABJRU5ErkJggg==\n",
741 |       "text/plain": [
742 |        "<matplotlib.figure.Figure at 0x105b6e390>"
743 |       ]
744 |      },
745 |      "metadata": {},
746 |      "output_type": "display_data"
747 |     }
748 |    ],
749 |    "source": [
750 |     "plt.hist(list(map(len,X)),bins=50);"
751 |    ]
752 |   },
753 |   {
754 |    "cell_type": "code",
755 |    "execution_count": 87,
756 |    "metadata": {},
757 |    "outputs": [
758 |     {
759 |      "name": "stdout",
760 |      "output_type": "stream",
761 |      "text": [
762 |       "[[-0.01773875  0.06372642  0.03280158 ..., -0.01024496 -0.06710091\n",
763 |       "  -0.05544016]\n",
764 |       " [-0.06625115  0.01928704 -0.02624818 ...,  0.05614735  0.05473008\n",
765 |       "   0.03957156]\n",
766 |       " [-0.0038194  -0.024487    0.072812   ..., -0.01459     0.08278     0.027062  ]\n",
767 |       " ..., \n",
768 |       " [ 0.06982313 -0.02670071 -0.03871925 ..., -0.00267477 -0.01187393\n",
769 |       "  -0.05748738]\n",
770 |       " [ 0.04286668 -0.0481842  -0.01529906 ...,  0.0564503   0.03692646\n",
771 |       "   0.03450374]\n",
772 |       " [ 0.01269256  0.03835368 -0.04946906 ...,  0.01590619 -0.05742016\n",
773 |       "   0.03449618]]\n"
774 |      ]
775 |     }
776 |    ],
777 |    "source": [
778 |     "print(embedding)"
779 |    ]
780 |   },
781 |   {
782 |    "cell_type": "code",
783 |    "execution_count": 88,
784 |    "metadata": {
785 |     "collapsed": true
786 |    },
787 |    "outputs": [],
788 |    "source": [
789 |     "with open('%s.pkl'%FN,\"wb\") as fp:\n",
790 |     "    pickle.dump((embedding, idx2word, word2idx, glove_idx2idx),fp,-1)\n",
791 |     "    \n",
792 |     "with open('%s.data.pkl'%FN,\"wb\") as fp:\n",
793 |     "    pickle.dump((X,Y),fp,-1)"
794 |    ]
795 |   },
796 |   {
797 |    "cell_type": "code",
798 |    "execution_count": null,
799 |    "metadata": {
800 |     "collapsed": true
801 |    },
802 |    "outputs": [],
803 |    "source": []
804 |   },
805 |   {
806 |    "cell_type": "code",
807 |    "execution_count": null,
808 |    "metadata": {
809 |     "collapsed": true
810 |    },
811 |    "outputs": [],
812 |    "source": []
813 |   },
814 |   {
815 |    "cell_type": "code",
816 |    "execution_count": null,
817 |    "metadata": {
818 |     "collapsed": true
819 |    },
820 |    "outputs": [],
821 |    "source": []
822 |   }
823 |  ],
824 |  "metadata": {
825 |   "kernelspec": {
826 |    "display_name": "Python 3",
827 |    "language": "python",
828 |    "name": "python3"
829 |   },
830 |   "language_info": {
831 |    "codemirror_mode": {
832 |     "name": "ipython",
833 |     "version": 3
834 |    },
835 |    "file_extension": ".py",
836 |    "mimetype": "text/x-python",
837 |    "name": "python",
838 |    "nbconvert_exporter": "python",
839 |    "pygments_lexer": "ipython3",
840 |    "version": "3.6.3"
841 |   }
842 |  },
843 |  "nbformat": 4,
844 |  "nbformat_minor": 2
845 | }
846 | 


--------------------------------------------------------------------------------
/p1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/p1.png


--------------------------------------------------------------------------------
/p2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/p2.png


--------------------------------------------------------------------------------
/p3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/p3.png


--------------------------------------------------------------------------------
/p4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/p4.png


--------------------------------------------------------------------------------
/p5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/p5.png


--------------------------------------------------------------------------------
/p6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/p6.png


--------------------------------------------------------------------------------
/p7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/p7.png


--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
  1 | """Predict a title for a recipe."""
  2 | from os import path
  3 | import random
  4 | import json
  5 | import pickle
  6 | import h5py
  7 | import numpy as np
  8 | from utils import str_shape
  9 | import keras.backend as K
 10 | import argparse
 11 | 
 12 | from config import path_models, path_data
 13 | from constants import FN1, FN0, nb_unknown_words, eos
 14 | from model import create_model
 15 | from sample_gen import gensamples
 16 | 
 17 | # set seeds in random libraries
 18 | seed = 42
 19 | random.seed(seed)
 20 | np.random.seed(seed)
 21 | 
 22 | 
 23 | def load_weights(model, filepath):
 24 |     """Load all weights possible into model from filepath.
 25 | 
 26 |     This is a modified version of keras load_weights that loads as much as it can
 27 |     if there is a mismatch between file and model. It returns the weights
 28 |     of the first layer in which the mismatch has happened
 29 |     """
 30 |     print('Loading', filepath, 'to', model.name)
 31 |     with h5py.File(filepath, mode='r') as f:
 32 |         # new file format
 33 |         layer_names = [n.decode('utf8') for n in f.attrs['layer_names']]
 34 | 
 35 |         # we batch weight value assignments in a single backend call
 36 |         # which provides a speedup in TensorFlow.
 37 |         weight_value_tuples = []
 38 |         for name in layer_names:
 39 |             print(name)
 40 |             g = f[name]
 41 |             weight_names = [n.decode('utf8') for n in g.attrs['weight_names']]
 42 |             if len(weight_names):
 43 |                 weight_values = [g[weight_name] for weight_name in weight_names]
 44 |                 try:
 45 |                     layer = model.get_layer(name=name)
 46 |                 except:
 47 |                     layer = None
 48 |                 if not layer:
 49 |                     print('failed to find layer', name, 'in model')
 50 |                     print('weights', ' '.join(str_shape(w) for w in weight_values))
 51 |                     print('stopping to load all other layers')
 52 |                     weight_values = [np.array(w) for w in weight_values]
 53 |                     break
 54 |                 symbolic_weights = layer.trainable_weights + layer.non_trainable_weights
 55 |                 weight_value_tuples += zip(symbolic_weights, weight_values)
 56 |                 weight_values = None
 57 |         K.batch_set_value(weight_value_tuples)
 58 |     return weight_values
 59 | 
 60 | 
 61 | def main(sample_str=None):
 62 |     """Predict a title for a recipe."""
 63 |     # load model parameters used for training
 64 |     with open(path.join(path_models, 'model_params.json'), 'r') as f:
 65 |         model_params = json.load(f)
 66 | 
 67 |     # create placeholder model
 68 |     model = create_model(**model_params)
 69 | 
 70 |     # load weights from training run
 71 |     load_weights(model, path.join(path_models, '{}.hdf5'.format(FN1)))
 72 | 
 73 |     # load recipe titles and descriptions
 74 |     with open(path.join(path_data, 'vocabulary-embedding.data.pkl'), 'rb') as fp:
 75 |         X_data, Y_data = pickle.load(fp)
 76 | 
 77 |     # load vocabulary
 78 |     with open(path.join(path_data, '{}.pkl'.format(FN0)), 'rb') as fp:
 79 |         embedding, idx2word, word2idx, glove_idx2idx = pickle.load(fp)
 80 |     vocab_size, embedding_size = embedding.shape
 81 |     oov0 = vocab_size - nb_unknown_words
 82 | 
 83 |     if sample_str is None:
 84 |         # load random recipe description if none provided
 85 |         i = np.random.randint(len(X_data))
 86 |         sample_str = ''
 87 |         sample_title = ''
 88 |         for w in X_data[i]:
 89 |             sample_str += idx2word[w] + ' '
 90 |         for w in Y_data[i]:
 91 |             sample_title += idx2word[w] + ' '
 92 |         y = Y_data[i]
 93 |         print('Randomly sampled recipe:')
 94 |         print(sample_title)
 95 |         print(sample_str)
 96 |     else:
 97 |         sample_title = ''
 98 |         y = [eos]
 99 | 
100 |     x = [word2idx[w.rstrip('^')] for w in sample_str.split()]
101 | 
102 |     samples = gensamples(
103 |         skips=2,
104 |         k=1,
105 |         batch_size=2,
106 |         short=False,
107 |         temperature=1.,
108 |         use_unk=True,
109 |         model=model,
110 |         data=(x, y),
111 |         idx2word=idx2word,
112 |         oov0=oov0,
113 |         glove_idx2idx=glove_idx2idx,
114 |         vocab_size=vocab_size,
115 |         nb_unknown_words=nb_unknown_words,
116 |     )
117 | 
118 |     headline = samples[0][0][len(samples[0][1]):]
119 |     ' '.join(idx2word[w] for w in headline)
120 | 
121 | if __name__ == '__main__':
122 |     parser = argparse.ArgumentParser()
123 |     parser.add_argument('--sample-str', type=str, default=None, help='Sample recipe description')
124 |     args = parser.parse_args()
125 |     main(sample_str=args.sample_str)
126 | 


--------------------------------------------------------------------------------
/simpler.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import pickle\n",
 13 |     "import string "
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": 2,
 19 |    "metadata": {
 20 |     "collapsed": true
 21 |    },
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "import numpy as np\n",
 25 |     "import os\n",
 26 |     "\n",
 27 |     "def split():\n",
 28 |     "    titles = []\n",
 29 |     "    texts = []\n",
 30 |     "    root = 'Part1'\n",
 31 |     "    \n",
 32 |     "    #dirr = 'Part1/awards_1990/awd_1990_00/'\n",
 33 |     "    dirs = os.listdir('Part1/awards_1990/awd_1990_00/')\n",
 34 |     "\n",
 35 |     "    for filename in dirs[1:]:\n",
 36 |     "    #iter = 0\n",
 37 |     "    #print(dirs[1])\n",
 38 |     "            \n",
 39 |     "                #print(iter)\n",
 40 |     "                #iter += 1\n",
 41 |     "                #print(dirs[1:])\n",
 42 |     "                #filename = 'Part1/awards_1990/awd_1990_00/a9000006.txt'\n",
 43 |     "        f = open('Part1/awards_1990/awd_1990_00/' + str(filename))\n",
 44 |     "        addTitle = False\n",
 45 |     "        addTexts = False\n",
 46 |     "        title = []\n",
 47 |     "        text = []\n",
 48 |     "        for word in f.read().split():\n",
 49 |     "            if (word == \"Title\"):\n",
 50 |     "                addTitle = True\n",
 51 |     "                continue\n",
 52 |     "\n",
 53 |     "            if (word == \"Type\"):\n",
 54 |     "                addTitle = False\n",
 55 |     "\n",
 56 |     "#             if (addTexts == True and word == \"\\n\"):\n",
 57 |     "#                 addTexts = False\n",
 58 |     "#                 break\n",
 59 |     "\n",
 60 |     "\n",
 61 |     "            if (word == \"Abstract\"):\n",
 62 |     "                addTexts = True\n",
 63 |     "                continue\n",
 64 |     "\n",
 65 |     "            if(addTitle == True):\n",
 66 |     "                title.append(word)\n",
 67 |     "\n",
 68 |     "            if(addTexts == True):\n",
 69 |     "                text.append(word)\n",
 70 |     "\n",
 71 |     "        for i in range(len(title)):\n",
 72 |     "            s = title[i]\n",
 73 |     "            table = str.maketrans({key: None for key in string.punctuation})\n",
 74 |     "            new_s = s.translate(table)\n",
 75 |     "            title[i] = new_s\n",
 76 |     "        for i in range(len(text)):\n",
 77 |     "            s = text[i]\n",
 78 |     "            table = str.maketrans({key: None for key in string.punctuation})\n",
 79 |     "            new_s = s.translate(table)\n",
 80 |     "            text[i] = new_s\n",
 81 |     "\n",
 82 |     "        title = ' '.join(title)\n",
 83 |     "        text =' '.join(text)\n",
 84 |     "        titles.append(title)\n",
 85 |     "        texts.append(text)\n",
 86 |     "\n",
 87 |     "#     f=open(\"titles.txt\", 'w')\n",
 88 |     "#     for i in titles:\n",
 89 |     "#         f.write(i)\n",
 90 |     "#         f.write(' ')\n",
 91 |     "\n",
 92 |     "#     t=open(\"texts.txt\", 'w')\n",
 93 |     "#     for i in texts:\n",
 94 |     "#         t.write(i)\n",
 95 |     "#         t.write(' ')\n",
 96 |     "\n",
 97 |     "#     f.close()\n",
 98 |     "#     t.close()\n",
 99 |     "    return titles, texts\n"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 3,
105 |    "metadata": {
106 |     "collapsed": true
107 |    },
108 |    "outputs": [],
109 |    "source": [
110 |     "titles, texts = split()\n"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": 4,
116 |    "metadata": {
117 |     "scrolled": false
118 |    },
119 |    "outputs": [
120 |     {
121 |      "name": "stdout",
122 |      "output_type": "stream",
123 |      "text": [
124 |       "379\n"
125 |      ]
126 |     }
127 |    ],
128 |    "source": [
129 |     "print(len(titles))"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 5,
135 |    "metadata": {
136 |     "collapsed": true
137 |    },
138 |    "outputs": [],
139 |    "source": [
140 |     "from collections import Counter\n",
141 |     "from itertools import chain\n",
142 |     "def get_vocab(lst):\n",
143 |     "    vocabcount = Counter(w for txt in lst for w in txt.split())\n",
144 |     "    vocab = map(lambda x: x[0], sorted(vocabcount.items(), key=lambda x: -x[1]))\n",
145 |     "    return list(vocab), vocabcount"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": 7,
151 |    "metadata": {},
152 |    "outputs": [
153 |     {
154 |      "data": {
155 |       "text/plain": [
156 |        "' CRB Genetic Diversity of Endangered Populations of Mysticete Whales Mitochondrial DNA and Historical Demography'"
157 |       ]
158 |      },
159 |      "execution_count": 7,
160 |      "metadata": {},
161 |      "output_type": "execute_result"
162 |     }
163 |    ],
164 |    "source": [
165 |     "titles[0]"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 8,
171 |    "metadata": {
172 |     "collapsed": true
173 |    },
174 |    "outputs": [],
175 |    "source": [
176 |     "vocab, vocabcount = get_vocab(titles+texts)"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": 10,
182 |    "metadata": {},
183 |    "outputs": [
184 |     {
185 |      "data": {
186 |       "text/plain": [
187 |        "9073"
188 |       ]
189 |      },
190 |      "execution_count": 10,
191 |      "metadata": {},
192 |      "output_type": "execute_result"
193 |     }
194 |    ],
195 |    "source": [
196 |     "len(vocabcount)"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": 31,
202 |    "metadata": {
203 |     "collapsed": true
204 |    },
205 |    "outputs": [],
206 |    "source": [
207 |     "from keras.models import Sequential\n",
208 |     "from keras.engine.topology import Input\n",
209 |     "from keras.layers.core import Dense, Activation, Dropout, RepeatVector\n",
210 |     "from keras.layers.wrappers import TimeDistributed\n",
211 |     "from keras.layers.recurrent import LSTM\n",
212 |     "from keras.layers.embeddings import Embedding\n",
213 |     "from keras.regularizers import l2\n",
214 |     "from keras.layers import merge, SpatialDropout1D\n",
215 |     "from keras.callbacks import TensorBoard\n",
216 |     "from keras import Model\n",
217 |     "import keras\n",
218 |     "import random"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "code",
223 |    "execution_count": 20,
224 |    "metadata": {
225 |     "collapsed": true
226 |    },
227 |    "outputs": [],
228 |    "source": [
229 |     "vocab_size = 9073\n",
230 |     "src_txt_length = 379\n",
231 |     "sum_txt_length = 379\n",
232 |     "# encoder input model\n",
233 |     "inputs = Input(shape=(src_txt_length,))\n",
234 |     "encoder1 = Embedding(vocab_size, 128)(inputs)\n",
235 |     "encoder2 = LSTM(128)(encoder1)\n",
236 |     "encoder3 = RepeatVector(sum_txt_length)(encoder2)\n",
237 |     "# decoder output model\n",
238 |     "decoder1 = LSTM(128, return_sequences=True)(encoder3)\n",
239 |     "outputs = TimeDistributed(Dense(vocab_size, activation='softmax'))(decoder1)\n",
240 |     "# tie it together\n",
241 |     "model = Model(inputs=inputs, outputs=outputs)\n",
242 |     "model.compile(loss='categorical_crossentropy', optimizer='adam')"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": 23,
248 |    "metadata": {
249 |     "collapsed": true
250 |    },
251 |    "outputs": [],
252 |    "source": [
253 |     "batch_size = 10\n",
254 |     "seed = 0\n",
255 |     "def gen(Xd, Xh, batch_size=batch_size, nb_batches=None, nflips=None, model=None, debug=False, seed=seed):\n",
256 |     "    \"\"\"yield batches. for training use nb_batches=None\n",
257 |     "    for validation generate deterministic results repeating every nb_batches\n",
258 |     "    \n",
259 |     "    while training it is good idea to flip once in a while the values of the headlines from the\n",
260 |     "    value taken from Xh to value generated by the model.\n",
261 |     "    \"\"\"\n",
262 |     "    c = nb_batches if nb_batches else 0\n",
263 |     "    while True:\n",
264 |     "        xds = []\n",
265 |     "        xhs = []\n",
266 |     "        if nb_batches and c >= nb_batches:\n",
267 |     "            c = 0\n",
268 |     "        new_seed = random.randint(0, 9223372036854775807)\n",
269 |     "        random.seed(c+123456789+seed)\n",
270 |     "        for b in range(batch_size):\n",
271 |     "            t = random.randint(0,len(Xd)-1)\n",
272 |     "\n",
273 |     "            xd = Xd[t]\n",
274 |     "            s = random.randint(min(maxlend,len(xd)), max(maxlend,len(xd)))\n",
275 |     "            xds.append(xd[:s])\n",
276 |     "            \n",
277 |     "            xh = Xh[t]\n",
278 |     "            s = random.randint(min(maxlenh,len(xh)), max(maxlenh,len(xh)))\n",
279 |     "            xhs.append(xh[:s])\n",
280 |     "\n",
281 |     "        # undo the seeding before we yield inorder not to affect the caller\n",
282 |     "        c+= 1\n",
283 |     "        random.seed(new_seed)\n",
284 |     "\n",
285 |     "        yield conv_seq_labels(xds, xhs, nflips=nflips, model=model, debug=debug)"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": 6,
291 |    "metadata": {},
292 |    "outputs": [
293 |     {
294 |      "name": "stderr",
295 |      "output_type": "stream",
296 |      "text": [
297 |       "Using TensorFlow backend.\n"
298 |      ]
299 |     }
300 |    ],
301 |    "source": [
302 |     "FN = 'train'\n",
303 |     "FN0 = 'vocabulary-embedding'\n",
304 |     "FN1 = 'train'\n",
305 |     "\n",
306 |     "import os\n",
307 |     "import keras\n",
308 |     "keras.__version__\n",
309 |     "with open('%s.data.pkl'%FN0, 'rb') as fp:\n",
310 |     "    X, Y = pickle.load(fp)"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "code",
315 |    "execution_count": 7,
316 |    "metadata": {},
317 |    "outputs": [
318 |     {
319 |      "ename": "NameError",
320 |      "evalue": "name 'gen' is not defined",
321 |      "output_type": "error",
322 |      "traceback": [
323 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
324 |       "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
325 |       "\u001b[0;32m<ipython-input-7-55de1447b583>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mnflips\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtraingen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnflips\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnflips\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
326 |       "\u001b[0;31mNameError\u001b[0m: name 'gen' is not defined"
327 |      ]
328 |     }
329 |    ],
330 |    "source": [
331 |     "nflips = 10\n",
332 |     "traingen = gen(X, Y, batch_size=batch_size, nflips=nflips, model=model)"
333 |    ]
334 |   },
335 |   {
336 |    "cell_type": "code",
337 |    "execution_count": null,
338 |    "metadata": {},
339 |    "outputs": [],
340 |    "source": [
341 |     "len(Y)"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": null,
347 |    "metadata": {},
348 |    "outputs": [],
349 |    "source": [
350 |     "model.fit(X, Y)"
351 |    ]
352 |   },
353 |   {
354 |    "cell_type": "code",
355 |    "execution_count": null,
356 |    "metadata": {
357 |     "collapsed": true
358 |    },
359 |    "outputs": [],
360 |    "source": []
361 |   },
362 |   {
363 |    "cell_type": "code",
364 |    "execution_count": null,
365 |    "metadata": {
366 |     "collapsed": true
367 |    },
368 |    "outputs": [],
369 |    "source": []
370 |   }
371 |  ],
372 |  "metadata": {
373 |   "kernelspec": {
374 |    "display_name": "Python 3",
375 |    "language": "python",
376 |    "name": "python3"
377 |   },
378 |   "language_info": {
379 |    "codemirror_mode": {
380 |     "name": "ipython",
381 |     "version": 3
382 |    },
383 |    "file_extension": ".py",
384 |    "mimetype": "text/x-python",
385 |    "name": "python",
386 |    "nbconvert_exporter": "python",
387 |    "pygments_lexer": "ipython3",
388 |    "version": "3.6.3"
389 |   }
390 |  },
391 |  "nbformat": 4,
392 |  "nbformat_minor": 2
393 | }
394 | 


--------------------------------------------------------------------------------
/stopWords.txt:
--------------------------------------------------------------------------------
  1 | a
  2 | a's
  3 | able
  4 | about
  5 | above
  6 | according
  7 | accordingly
  8 | across
  9 | actually
 10 | after
 11 | afterwards
 12 | again
 13 | against
 14 | ain't
 15 | all
 16 | allow
 17 | allows
 18 | almost
 19 | alone
 20 | along
 21 | already
 22 | also
 23 | although
 24 | always
 25 | am
 26 | among
 27 | amongst
 28 | an
 29 | and
 30 | another
 31 | any
 32 | anybody
 33 | anyhow
 34 | anyone
 35 | anything
 36 | anyway
 37 | anyways
 38 | anywhere
 39 | apart
 40 | appear
 41 | appreciate
 42 | appropriate
 43 | are
 44 | aren't
 45 | around
 46 | as
 47 | aside
 48 | ask
 49 | asking
 50 | associated
 51 | at
 52 | available
 53 | away
 54 | awfully
 55 | b
 56 | be
 57 | became
 58 | because
 59 | become
 60 | becomes
 61 | becoming
 62 | been
 63 | before
 64 | beforehand
 65 | behind
 66 | being
 67 | believe
 68 | below
 69 | beside
 70 | besides
 71 | best
 72 | better
 73 | between
 74 | beyond
 75 | both
 76 | brief
 77 | but
 78 | by
 79 | c
 80 | c'mon
 81 | c's
 82 | came
 83 | can
 84 | can't
 85 | cannot
 86 | cant
 87 | cause
 88 | causes
 89 | certain
 90 | certainly
 91 | changes
 92 | clearly
 93 | co
 94 | com
 95 | come
 96 | comes
 97 | concerning
 98 | consequently
 99 | consider
100 | considering
101 | contain
102 | containing
103 | contains
104 | corresponding
105 | could
106 | couldn't
107 | course
108 | currently
109 | d
110 | definitely
111 | described
112 | despite
113 | did
114 | didn't
115 | different
116 | do
117 | does
118 | doesn't
119 | doing
120 | don't
121 | done
122 | down
123 | downwards
124 | during
125 | e
126 | each
127 | edu
128 | eg
129 | eight
130 | either
131 | else
132 | elsewhere
133 | enough
134 | entirely
135 | especially
136 | et
137 | etc
138 | even
139 | ever
140 | every
141 | everybody
142 | everyone
143 | everything
144 | everywhere
145 | ex
146 | exactly
147 | example
148 | except
149 | f
150 | far
151 | few
152 | fifth
153 | first
154 | five
155 | followed
156 | following
157 | follows
158 | for
159 | former
160 | formerly
161 | forth
162 | four
163 | from
164 | further
165 | furthermore
166 | g
167 | get
168 | gets
169 | getting
170 | given
171 | gives
172 | go
173 | goes
174 | going
175 | gone
176 | got
177 | gotten
178 | greetings
179 | h
180 | had
181 | hadn't
182 | happens
183 | hardly
184 | has
185 | hasn't
186 | have
187 | haven't
188 | having
189 | he
190 | he's
191 | hello
192 | help
193 | hence
194 | her
195 | here
196 | here's
197 | hereafter
198 | hereby
199 | herein
200 | hereupon
201 | hers
202 | herself
203 | hi
204 | him
205 | himself
206 | his
207 | hither
208 | hopefully
209 | how
210 | howbeit
211 | however
212 | i
213 | i'd
214 | i'll
215 | i'm
216 | i've
217 | ie
218 | if
219 | ignored
220 | immediate
221 | in
222 | inasmuch
223 | inc
224 | indeed
225 | indicate
226 | indicated
227 | indicates
228 | inner
229 | insofar
230 | instead
231 | into
232 | inward
233 | is
234 | isn't
235 | it
236 | it'd
237 | it'll
238 | it's
239 | its
240 | itself
241 | j
242 | just
243 | k
244 | keep
245 | keeps
246 | kept
247 | know
248 | knows
249 | known
250 | l
251 | last
252 | lately
253 | later
254 | latter
255 | latterly
256 | least
257 | less
258 | lest
259 | let
260 | let's
261 | like
262 | liked
263 | likely
264 | little
265 | look
266 | looking
267 | looks
268 | ltd
269 | m
270 | mainly
271 | many
272 | may
273 | maybe
274 | me
275 | mean
276 | meanwhile
277 | merely
278 | might
279 | more
280 | moreover
281 | most
282 | mostly
283 | much
284 | must
285 | my
286 | myself
287 | n
288 | name
289 | namely
290 | nd
291 | near
292 | nearly
293 | necessary
294 | need
295 | needs
296 | neither
297 | never
298 | nevertheless
299 | new
300 | next
301 | nine
302 | no
303 | nobody
304 | non
305 | none
306 | noone
307 | nor
308 | normally
309 | not
310 | nothing
311 | novel
312 | now
313 | nowhere
314 | o
315 | obviously
316 | of
317 | off
318 | often
319 | oh
320 | ok
321 | okay
322 | old
323 | on
324 | once
325 | one
326 | ones
327 | only
328 | onto
329 | or
330 | other
331 | others
332 | otherwise
333 | ought
334 | our
335 | ours
336 | ourselves
337 | out
338 | outside
339 | over
340 | overall
341 | own
342 | p
343 | particular
344 | particularly
345 | per
346 | perhaps
347 | placed
348 | please
349 | plus
350 | possible
351 | presumably
352 | probably
353 | provides
354 | q
355 | que
356 | quite
357 | qv
358 | r
359 | rather
360 | rd
361 | re
362 | really
363 | reasonably
364 | regarding
365 | regardless
366 | regards
367 | relatively
368 | respectively
369 | right
370 | s
371 | said
372 | same
373 | saw
374 | say
375 | saying
376 | says
377 | second
378 | secondly
379 | see
380 | seeing
381 | seem
382 | seemed
383 | seeming
384 | seems
385 | seen
386 | self
387 | selves
388 | sensible
389 | sent
390 | serious
391 | seriously
392 | seven
393 | several
394 | shall
395 | she
396 | should
397 | shouldn't
398 | since
399 | six
400 | so
401 | some
402 | somebody
403 | somehow
404 | someone
405 | something
406 | sometime
407 | sometimes
408 | somewhat
409 | somewhere
410 | soon
411 | sorry
412 | specified
413 | specify
414 | specifying
415 | still
416 | sub
417 | such
418 | sup
419 | sure
420 | t
421 | t's
422 | take
423 | taken
424 | tell
425 | tends
426 | th
427 | than
428 | thank
429 | thanks
430 | thanx
431 | that
432 | that's
433 | thats
434 | the
435 | their
436 | theirs
437 | them
438 | themselves
439 | then
440 | thence
441 | there
442 | there's
443 | thereafter
444 | thereby
445 | therefore
446 | therein
447 | theres
448 | thereupon
449 | these
450 | they
451 | they'd
452 | they'll
453 | they're
454 | they've
455 | think
456 | third
457 | this
458 | thorough
459 | thoroughly
460 | those
461 | though
462 | three
463 | through
464 | throughout
465 | thru
466 | thus
467 | to
468 | together
469 | too
470 | took
471 | toward
472 | towards
473 | tried
474 | tries
475 | truly
476 | try
477 | trying
478 | twice
479 | two
480 | u
481 | un
482 | under
483 | unfortunately
484 | unless
485 | unlikely
486 | until
487 | unto
488 | up
489 | upon
490 | us
491 | use
492 | used
493 | useful
494 | uses
495 | using
496 | usually
497 | uucp
498 | v
499 | value
500 | various
501 | very
502 | via
503 | viz
504 | vs
505 | w
506 | want
507 | wants
508 | was
509 | wasn't
510 | way
511 | we
512 | we'd
513 | we'll
514 | we're
515 | we've
516 | welcome
517 | well
518 | went
519 | were
520 | weren't
521 | what
522 | what's
523 | whatever
524 | when
525 | whence
526 | whenever
527 | where
528 | where's
529 | whereafter
530 | whereas
531 | whereby
532 | wherein
533 | whereupon
534 | wherever
535 | whether
536 | which
537 | while
538 | whither
539 | who
540 | who's
541 | whoever
542 | whole
543 | whom
544 | whose
545 | why
546 | will
547 | willing
548 | wish
549 | with
550 | within
551 | without
552 | won't
553 | wonder
554 | would
555 | would
556 | wouldn't
557 | x
558 | y
559 | yes
560 | yet
561 | you
562 | you'd
563 | you'll
564 | you're
565 | you've
566 | your
567 | yours
568 | yourself
569 | yourselves
570 | z
571 | zero


--------------------------------------------------------------------------------
/testing.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 3,
 6 |    "metadata": {},
 7 |    "outputs": [
 8 |     {
 9 |      "ename": "AttributeError",
10 |      "evalue": "'list' object has no attribute 'ndim'",
11 |      "output_type": "error",
12 |      "traceback": [
13 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
14 |       "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
15 |       "\u001b[0;32m<ipython-input-3-5db1a25b992c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     23\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mloss\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'binary_crossentropy'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'adam'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'accuracy'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     24\u001b[0m \u001b[0;31m# Fit the model\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 25\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m150\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     26\u001b[0m \u001b[0;31m# evaluate the model\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     27\u001b[0m \u001b[0mscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
16 |       "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/keras/models.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)\u001b[0m\n\u001b[1;32m    963\u001b[0m                               \u001b[0minitial_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minitial_epoch\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    964\u001b[0m                               \u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 965\u001b[0;31m                               validation_steps=validation_steps)\n\u001b[0m\u001b[1;32m    966\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    967\u001b[0m     def evaluate(self, x=None, y=None,\n",
17 |       "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)\u001b[0m\n\u001b[1;32m   1591\u001b[0m             \u001b[0mclass_weight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mclass_weight\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1592\u001b[0m             \u001b[0mcheck_batch_axis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1593\u001b[0;31m             batch_size=batch_size)\n\u001b[0m\u001b[1;32m   1594\u001b[0m         \u001b[0;31m# Prepare validation data.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1595\u001b[0m         \u001b[0mdo_validation\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
18 |       "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36m_standardize_user_data\u001b[0;34m(self, x, y, sample_weight, class_weight, check_batch_axis, batch_size)\u001b[0m\n\u001b[1;32m   1424\u001b[0m                                     \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_feed_input_shapes\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1425\u001b[0m                                     \u001b[0mcheck_batch_axis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1426\u001b[0;31m                                     exception_prefix='input')\n\u001b[0m\u001b[1;32m   1427\u001b[0m         y = _standardize_input_data(y, self._feed_output_names,\n\u001b[1;32m   1428\u001b[0m                                     \u001b[0moutput_shapes\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
19 |       "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36m_standardize_input_data\u001b[0;34m(data, names, shapes, check_batch_axis, exception_prefix)\u001b[0m\n\u001b[1;32m     68\u001b[0m     \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     69\u001b[0m         \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'DataFrame'\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m         \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexpand_dims\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     71\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     72\u001b[0m         \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'DataFrame'\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
20 |       "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m     68\u001b[0m     \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     69\u001b[0m         \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'DataFrame'\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m         \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexpand_dims\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     71\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     72\u001b[0m         \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'DataFrame'\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
21 |       "\u001b[0;31mAttributeError\u001b[0m: 'list' object has no attribute 'ndim'"
22 |      ]
23 |     }
24 |    ],
25 |    "source": [
26 |     "from keras.models import Sequential\n",
27 |     "from keras.layers import Dense\n",
28 |     "import numpy\n",
29 |     "# fix random seed for reproducibility\n",
30 |     "numpy.random.seed(7)\n",
31 |     "# load pima indians dataset\n",
32 |     "FN = 'train'\n",
33 |     "FN0 = 'vocabulary-embedding'\n",
34 |     "FN1 = 'train'\n",
35 |     "\n",
36 |     "import os\n",
37 |     "import keras\n",
38 |     "import pickle\n",
39 |     "keras.__version__\n",
40 |     "with open('%s.data.pkl'%FN0, 'rb') as fp:\n",
41 |     "    X, Y = pickle.load(fp)\n",
42 |     "# create model\n",
43 |     "model = Sequential()\n",
44 |     "model.add(Dense(12, input_dim=8, activation='relu'))\n",
45 |     "model.add(Dense(8, activation='relu'))\n",
46 |     "model.add(Dense(1, activation='sigmoid'))\n",
47 |     "# Compile model\n",
48 |     "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
49 |     "# Fit the model\n",
50 |     "model.fit(X, Y, epochs=150, batch_size=10)\n",
51 |     "# evaluate the model\n",
52 |     "scores = model.evaluate(X, Y)\n",
53 |     "print(\"\\n%s: %.2f%%\" % (model.metrics_names[1], scores[1]*100))"
54 |    ]
55 |   },
56 |   {
57 |    "cell_type": "code",
58 |    "execution_count": null,
59 |    "metadata": {
60 |     "collapsed": true
61 |    },
62 |    "outputs": [],
63 |    "source": []
64 |   }
65 |  ],
66 |  "metadata": {
67 |   "kernelspec": {
68 |    "display_name": "Python 3",
69 |    "language": "python",
70 |    "name": "python3"
71 |   },
72 |   "language_info": {
73 |    "codemirror_mode": {
74 |     "name": "ipython",
75 |     "version": 3
76 |    },
77 |    "file_extension": ".py",
78 |    "mimetype": "text/x-python",
79 |    "name": "python",
80 |    "nbconvert_exporter": "python",
81 |    "pygments_lexer": "ipython3",
82 |    "version": "3.6.3"
83 |   }
84 |  },
85 |  "nbformat": 4,
86 |  "nbformat_minor": 2
87 | }
88 | 


--------------------------------------------------------------------------------
/tokenize_recipes.py:
--------------------------------------------------------------------------------
 1 | """Tokenize recipes."""
 2 | import _pickle as pickle
 3 | from os import path
 4 | from nltk.tokenize import word_tokenize
 5 | from nltk import download
 6 | from tqdm import tqdm
 7 | 
 8 | import config
 9 | import prep_data
10 | from parse_ingredients import parse_ingredient_list
11 | 
12 | 
13 | def tokenize_sentence(sentence):
14 |     """Tokenize a sentence."""
15 |     try:
16 |         return ' '.join(list(filter(
17 |             lambda x: x.lower() != "advertisement",
18 |             word_tokenize(sentence))))
19 |     except LookupError:
20 |         print('Downloading NLTK data')
21 |         download()
22 |         return ' '.join(list(filter(
23 |             lambda x: x.lower() != "advertisement",
24 |             word_tokenize(sentence))))
25 | 
26 | 
27 | def recipe_is_complete(r):
28 |     """Return True if recipe is complete and False otherwise.
29 | 
30 |     Completeness is defined as the recipe containing a title and instructions.
31 |     """
32 |     if ('title' not in r) or ('instructions' not in r):
33 |         return False
34 |     if (r['title'] is None) or (r['instructions'] is None):
35 |         return False
36 |     return True
37 | 
38 | 
39 | def tokenize_recipes(recipes):
40 |     """Tokenise all recipes."""
41 |     tokenized = []
42 |     for r in tqdm(recipes.values()):
43 |         if recipe_is_complete(r):
44 |             ingredients = '; '.join(parse_ingredient_list(r['ingredients'])) + '; '
45 |             tokenized.append((
46 |                 tokenize_sentence(r['title']),
47 |                 tokenize_sentence(ingredients) + tokenize_sentence(r['instructions'])))
48 |     return tuple(map(list, zip(*tokenized)))
49 | 
50 | 
51 | def pickle_recipes(recipes):
52 |     """Pickle all recipe tokens to disk."""
53 |     with open(path.join(config.path_data, 'tokens.pkl'), 'wb') as f:
54 |         pickle.dump(recipes, f, 2)
55 | 
56 | 
57 | def load_recipes():
58 |     """Read pickled recipe tokens from disk."""
59 |     with open(path.join(config.path_data, 'tokens.pkl'), 'rb') as f:
60 |         recipes = pickle.load(f)
61 |     return recipes
62 | 
63 | 
64 | def main():
65 |     """Tokenize recipes."""
66 |     recipes = prep_data.load_recipes()
67 |     text_sum_data = tokenize_recipes(recipes)
68 |     pickle_recipes(text_sum_data)
69 | 
70 | if __name__ == '__main__':
71 |     main()
72 | 


--------------------------------------------------------------------------------
/train2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 19,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "\"\"\"Define constant variables.\"\"\"\n",
 12 |     "\n",
 13 |     "# define empty and end-of-sentence vocab idx\n",
 14 |     "empty = 0\n",
 15 |     "eos = 1\n",
 16 |     "\n",
 17 |     "# input data (X) is made from maxlend description words followed by eos followed by\n",
 18 |     "# headline words followed by eos if description is shorter than maxlend it will be\n",
 19 |     "# left padded with empty if entire data is longer than maxlen it will be clipped and\n",
 20 |     "# if it is shorter it will be right padded with empty. labels (Y) are the headline\n",
 21 |     "# words followed by eos and clipped or padded to maxlenh. In other words the input is\n",
 22 |     "# made from a maxlend half in which the description is padded from the left and a\n",
 23 |     "# maxlenh half in which eos is followed by a headline followed by another eos if there\n",
 24 |     "# is enough space. The labels match only the second half and the first label matches\n",
 25 |     "# the eos at the start of the second half (following the description in the first half)\n",
 26 |     "maxlend = 100\n",
 27 |     "maxlenh = 15\n",
 28 |     "maxlen = maxlend + maxlenh\n",
 29 |     "activation_rnn_size = 40 if maxlend else 0\n",
 30 |     "nb_unknown_words = 10\n",
 31 |     "\n",
 32 |     "# function names\n",
 33 |     "FN0 = 'vocabulary-embedding'  # filename of vocab embeddings\n",
 34 |     "FN1 = 'train'  # filename of model weights\n",
 35 |     "\n",
 36 |     "# training variables\n",
 37 |     "seed = 42\n",
 38 |     "optimizer = 'adam'\n",
 39 |     "p_W, p_U, p_dense, p_emb, weight_decay = 0, 0, 0, 0, 0\n",
 40 |     "regularizer = None\n"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 20,
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "from keras.models import Sequential\n",
 50 |     "from keras.layers.core import Dense, Activation, Dropout\n",
 51 |     "from keras.layers.wrappers import TimeDistributed\n",
 52 |     "from keras.layers.recurrent import LSTM\n",
 53 |     "from keras.layers.embeddings import Embedding\n",
 54 |     "from keras.layers.core import Lambda\n",
 55 |     "import keras.backend as K\n",
 56 |     "import numpy as np\n",
 57 |     "\n",
 58 |     "#from utils import str_shape\n",
 59 |     "#from constants import maxlend, maxlenh, maxlen, activation_rnn_size, optimizer, p_W, p_U, p_dense, p_emb, regularizer\n",
 60 |     "\n",
 61 |     "\n",
 62 |     "def inspect_model(model):\n",
 63 |     "    \"\"\"Print the structure of Keras `model`.\"\"\"\n",
 64 |     "    for i, l in enumerate(model.layers):\n",
 65 |     "        print(i, 'cls={} name={}'.format(type(l).__name__, l.name))\n",
 66 |     "        weights = l.get_weights()\n",
 67 |     "        print_str = ''\n",
 68 |     "#         for weight in weights:\n",
 69 |     "#             print_str += str_shape(weight) + ' '\n",
 70 |     "        print(print_str)\n",
 71 |     "        print()\n",
 72 |     "\n",
 73 |     "\n",
 74 |     "class SimpleContext(Lambda):\n",
 75 |     "    \"\"\"Class to implement `simple_context` method as a Keras layer.\"\"\"\n",
 76 |     "\n",
 77 |     "    def __init__(self, fn, rnn_size, **kwargs):\n",
 78 |     "        \"\"\"Initialize SimpleContext.\"\"\"\n",
 79 |     "        self.rnn_size = rnn_size\n",
 80 |     "        super(SimpleContext, self).__init__(fn, **kwargs)\n",
 81 |     "        self.supports_masking = True\n",
 82 |     "\n",
 83 |     "    def compute_mask(self, input, input_mask=None):\n",
 84 |     "        \"\"\"Compute mask of maxlend.\"\"\"\n",
 85 |     "        return input_mask[:, maxlend:]\n",
 86 |     "\n",
 87 |     "    def get_output_shape_for(self, input_shape):\n",
 88 |     "        \"\"\"Get output shape for a given `input_shape`.\"\"\"\n",
 89 |     "        nb_samples = input_shape[0]\n",
 90 |     "        n = 2 * (self.rnn_size - activation_rnn_size)\n",
 91 |     "        return (nb_samples, maxlenh, n)\n",
 92 |     "\n",
 93 |     "\n",
 94 |     "def create_model(vocab_size, embedding_size, LR, rnn_layers, rnn_size, embedding=None):\n",
 95 |     "    \"\"\"Construct and compile LSTM model.\"\"\"\n",
 96 |     "    # create a standard stacked LSTM\n",
 97 |     "    if embedding is not None:\n",
 98 |     "        embedding = [embedding]\n",
 99 |     "    model = Sequential()\n",
100 |     "    model.add(Embedding(vocab_size, embedding_size,\n",
101 |     "                        input_length=maxlen,\n",
102 |     "                        W_regularizer=regularizer, dropout=p_emb, weights=embedding, mask_zero=True,\n",
103 |     "                        name='embedding_1'))\n",
104 |     "    for i in range(rnn_layers):\n",
105 |     "        lstm = LSTM(rnn_size, return_sequences=True,\n",
106 |     "                    W_regularizer=regularizer, U_regularizer=regularizer,\n",
107 |     "                    b_regularizer=regularizer, dropout_W=p_W, dropout_U=p_U,\n",
108 |     "                    name='lstm_{}'.format(i + 1))\n",
109 |     "        model.add(lstm)\n",
110 |     "        model.add(Dropout(p_dense, name='dropout_{}'.format(i + 1)))\n",
111 |     "\n",
112 |     "    def simple_context(X, mask, n=activation_rnn_size):\n",
113 |     "        \"\"\"Reduce the input just to its headline part (second half).\n",
114 |     "        For each word in this part it concatenate the output of the previous layer (RNN)\n",
115 |     "        with a weighted average of the outputs of the description part.\n",
116 |     "        In this only the last `rnn_size - activation_rnn_size` are used from each output.\n",
117 |     "        The first `activation_rnn_size` output is used to computer the weights for the averaging.\n",
118 |     "        \"\"\"\n",
119 |     "        desc, head = X[:, :maxlend, :], X[:, maxlend:, :]\n",
120 |     "        head_activations, head_words = head[:, :, :n], head[:, :, n:]\n",
121 |     "        desc_activations, desc_words = desc[:, :, :n], desc[:, :, n:]\n",
122 |     "\n",
123 |     "        # RTFM http://deeplearning.net/software/theano/library/tensor/basic.html#theano.tensor.batched_tensordot\n",
124 |     "        # activation for every head word and every desc word\n",
125 |     "        activation_energies = K.batch_dot(head_activations, desc_activations, axes=(2, 2))\n",
126 |     "        # make sure we dont use description words that are masked out\n",
127 |     "        activation_energies = activation_energies + -1e20 * K.expand_dims(\n",
128 |     "            1. - K.cast(mask[:, :maxlend], 'float32'), 1)\n",
129 |     "\n",
130 |     "        # for every head word compute weights for every desc word\n",
131 |     "        activation_energies = K.reshape(activation_energies, (-1, maxlend))\n",
132 |     "        activation_weights = K.softmax(activation_energies)\n",
133 |     "        activation_weights = K.reshape(activation_weights, (-1, maxlenh, maxlend))\n",
134 |     "\n",
135 |     "        # for every head word compute weighted average of desc words\n",
136 |     "        desc_avg_word = K.batch_dot(activation_weights, desc_words, axes=(2, 1))\n",
137 |     "        return K.concatenate((desc_avg_word, head_words))\n",
138 |     "\n",
139 |     "    if activation_rnn_size:\n",
140 |     "        model.add(SimpleContext(simple_context, rnn_size, name='simplecontext_1'))\n",
141 |     "\n",
142 |     "    model.add(TimeDistributed(Dense(\n",
143 |     "        vocab_size,\n",
144 |     "        W_regularizer=regularizer,\n",
145 |     "        b_regularizer=regularizer,\n",
146 |     "        name='timedistributed_1')))\n",
147 |     "    model.add(Activation('softmax', name='activation_1'))\n",
148 |     "\n",
149 |     "    # opt = Adam(lr=LR)  # keep calm and reduce learning rate\n",
150 |     "    model.compile(loss='categorical_crossentropy', optimizer=optimizer)\n",
151 |     "\n",
152 |     "    K.set_value(model.optimizer.lr, np.float32(LR))\n",
153 |     "    return model"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "metadata": {
160 |     "collapsed": true
161 |    },
162 |    "outputs": [],
163 |    "source": [
164 |     "# %load constants.py\n",
165 |     "\"\"\"Define constant variables.\"\"\"\n",
166 |     "\n",
167 |     "# define empty and end-of-sentence vocab idx\n",
168 |     "empty = 0\n",
169 |     "eos = 1\n",
170 |     "\n",
171 |     "# input data (X) is made from maxlend description words followed by eos followed by\n",
172 |     "# headline words followed by eos if description is shorter than maxlend it will be\n",
173 |     "# left padded with empty if entire data is longer than maxlen it will be clipped and\n",
174 |     "# if it is shorter it will be right padded with empty. labels (Y) are the headline\n",
175 |     "# words followed by eos and clipped or padded to maxlenh. In other words the input is\n",
176 |     "# made from a maxlend half in which the description is padded from the left and a\n",
177 |     "# maxlenh half in which eos is followed by a headline followed by another eos if there\n",
178 |     "# is enough space. The labels match only the second half and the first label matches\n",
179 |     "# the eos at the start of the second half (following the description in the first half)\n",
180 |     "maxlend = 100\n",
181 |     "maxlenh = 15\n",
182 |     "maxlen = maxlend + maxlenh\n",
183 |     "activation_rnn_size = 40 if maxlend else 0\n",
184 |     "nb_unknown_words = 10\n",
185 |     "\n",
186 |     "# function names\n",
187 |     "FN0 = 'vocabulary-embedding'  # filename of vocab embeddings\n",
188 |     "FN1 = 'train'  # filename of model weights\n",
189 |     "\n",
190 |     "# training variables\n",
191 |     "seed = 42\n",
192 |     "optimizer = 'adam'\n",
193 |     "p_W, p_U, p_dense, p_emb, weight_decay = 0, 0, 0, 0, 0\n",
194 |     "regularizer = None\n"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": 27,
200 |    "metadata": {},
201 |    "outputs": [
202 |     {
203 |      "ename": "ImportError",
204 |      "evalue": "cannot import name 'empty'",
205 |      "output_type": "error",
206 |      "traceback": [
207 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
208 |       "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
209 |       "\u001b[0;32m<ipython-input-27-907f12301f30>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     11\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0msample_gen\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mgensamples\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     13\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mutils\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mprt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mload_embedding\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprocess_vocab\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mload_split_data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     14\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcreate_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minspect_model\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
210 |       "\u001b[0;32m~/Documents/cs141/sample_gen.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mkeras\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpreprocessing\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0msequence\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mconstants\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mempty\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmaxlend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmaxlenh\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmaxlen\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     12\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
211 |       "\u001b[0;31mImportError\u001b[0m: cannot import name 'empty'"
212 |      ]
213 |     }
214 |    ],
215 |    "source": [
216 |     "import os\n",
217 |     "import time\n",
218 |     "import random\n",
219 |     "import argparse\n",
220 |     "import json\n",
221 |     "\n",
222 |     "\n",
223 |     "import numpy as np\n",
224 |     "from keras.callbacks import TensorBoard\n",
225 |     "\n",
226 |     "import config\n",
227 |     "from sample_gen import gensamples\n",
228 |     "from utils import prt, load_embedding, process_vocab, load_split_data\n",
229 |     "from model import create_model, inspect_model\n",
230 |     "from generate import gen\n",
231 |     "#from constants import FN1, seed, nb_unknown_words\n",
232 |     "\n",
233 |     "# parse arguments\n",
234 |     "parser = argparse.ArgumentParser()\n",
235 |     "parser.add_argument('--batch-size', type=int, default=32, help='input batch size')\n",
236 |     "parser.add_argument('--epochs', type=int, default=10, help='number of epochs')\n",
237 |     "parser.add_argument('--rnn-size', type=int, default=512, help='size of RNN layers')\n",
238 |     "parser.add_argument('--rnn-layers', type=int, default=3, help='number of RNN layers')\n",
239 |     "parser.add_argument('--nsamples', type=int, default=640, help='number of samples per epoch')\n",
240 |     "parser.add_argument('--nflips', type=int, default=0, help='number of flips')\n",
241 |     "parser.add_argument('--temperature', type=float, default=.8, help='RNN temperature')\n",
242 |     "parser.add_argument('--lr', type=float, default=0.0001, help='learning rate, default=0.0001')\n",
243 |     "parser.add_argument('--warm-start', action='store_true')\n",
244 |     "args = parser.parse_args()\n",
245 |     "batch_size = args.batch_size\n",
246 |     "\n",
247 |     "# set sample sizes\n",
248 |     "nb_train_samples = np.int(np.floor(args.nsamples / batch_size)) * batch_size  # num training samples\n",
249 |     "nb_val_samples = nb_train_samples  # num validation samples\n",
250 |     "\n",
251 |     "# seed weight initialization\n",
252 |     "random.seed(seed)\n",
253 |     "np.random.seed(seed)\n",
254 |     "\n",
255 |     "embedding, idx2word, word2idx, glove_idx2idx = load_embedding(nb_unknown_words)\n",
256 |     "vocab_size, embedding_size = embedding.shape\n",
257 |     "oov0 = vocab_size - nb_unknown_words\n",
258 |     "idx2word = process_vocab(idx2word, vocab_size, oov0, nb_unknown_words)\n",
259 |     "X_train, X_test, Y_train, Y_test = load_split_data(nb_val_samples, seed)\n",
260 |     "\n",
261 |     "# print a sample recipe to make sure everything looks right\n",
262 |     "print('Random head, description:')\n",
263 |     "i = 811\n",
264 |     "prt('H', Y_train[i], idx2word)\n",
265 |     "prt('D', X_train[i], idx2word)\n",
266 |     "\n",
267 |     "# save model initialization parameters\n",
268 |     "model_params = (dict(\n",
269 |     "    vocab_size=vocab_size,\n",
270 |     "    embedding_size=embedding_size,\n",
271 |     "    LR=args.lr,\n",
272 |     "    rnn_layers=args.rnn_layers,\n",
273 |     "    rnn_size=args.rnn_size,\n",
274 |     "))\n",
275 |     "with open(os.path.join(config.path_models, 'model_params.json'), 'w') as f:\n",
276 |     "    json.dump(model_params, f)\n",
277 |     "\n",
278 |     "\n",
279 |     "model = create_model(\n",
280 |     "    vocab_size=vocab_size,\n",
281 |     "    embedding_size=embedding_size,\n",
282 |     "    LR=args.lr,\n",
283 |     "    embedding=embedding,\n",
284 |     "    rnn_layers=args.rnn_layers,\n",
285 |     "    rnn_size=args.rnn_size,\n",
286 |     ")\n",
287 |     "inspect_model(model)\n",
288 |     "\n",
289 |     "# load pre-trained model weights\n",
290 |     "FN1_filename = os.path.join(config.path_models, '{}.hdf5'.format(FN1))\n",
291 |     "if args.warm_start and FN1 and os.path.exists(FN1_filename):\n",
292 |     "    model.load_weights(FN1_filename)\n",
293 |     "    print('Model weights loaded from {}'.format(FN1_filename))\n",
294 |     "\n",
295 |     "# print samples before training\n",
296 |     "gensamples(\n",
297 |     "    skips=2,\n",
298 |     "    k=10,\n",
299 |     "    batch_size=batch_size,\n",
300 |     "    short=False,\n",
301 |     "    temperature=args.temperature,\n",
302 |     "    use_unk=True,\n",
303 |     "    model=model,\n",
304 |     "    data=(X_test, Y_test),\n",
305 |     "    idx2word=idx2word,\n",
306 |     "    oov0=oov0,\n",
307 |     "    glove_idx2idx=glove_idx2idx,\n",
308 |     "    vocab_size=vocab_size,\n",
309 |     "    nb_unknown_words=nb_unknown_words,\n",
310 |     ")\n",
311 |     "\n",
312 |     "# get train and validation generators\n",
313 |     "r = next(gen(X_train, Y_train, batch_size=batch_size, nb_batches=None, nflips=None, model=None, debug=False, oov0=oov0, glove_idx2idx=glove_idx2idx, vocab_size=vocab_size, nb_unknown_words=nb_unknown_words, idx2word=idx2word))\n",
314 |     "traingen = gen(X_train, Y_train, batch_size=batch_size, nb_batches=None, nflips=args.nflips, model=model, debug=False, oov0=oov0, glove_idx2idx=glove_idx2idx, vocab_size=vocab_size, nb_unknown_words=nb_unknown_words, idx2word=idx2word)\n",
315 |     "valgen = gen(X_test, Y_test, batch_size=batch_size, nb_batches=nb_val_samples // batch_size, nflips=None, model=None, debug=False, oov0=oov0, glove_idx2idx=glove_idx2idx, vocab_size=vocab_size, nb_unknown_words=nb_unknown_words, idx2word=idx2word)\n",
316 |     "\n",
317 |     "# define callbacks for training\n",
318 |     "callbacks = [TensorBoard(\n",
319 |     "    log_dir=os.path.join(config.path_logs, str(time.time())),\n",
320 |     "    histogram_freq=2, write_graph=False, write_images=False)]\n",
321 |     "\n",
322 |     "# train model and save weights\n",
323 |     "h = model.fit_generator(\n",
324 |     "    traingen, samples_per_epoch=nb_train_samples,\n",
325 |     "    nb_epoch=args.epochs, validation_data=valgen, nb_val_samples=nb_val_samples,\n",
326 |     "    callbacks=callbacks,\n",
327 |     ")\n",
328 |     "model.save_weights(FN1_filename, overwrite=True)\n",
329 |     "\n",
330 |     "# print samples after training\n",
331 |     "gensamples(\n",
332 |     "    skips=2,\n",
333 |     "    k=10,\n",
334 |     "    batch_size=batch_size,\n",
335 |     "    short=False,\n",
336 |     "    temperature=args.temperature,\n",
337 |     "    use_unk=True,\n",
338 |     "    model=model,\n",
339 |     "    data=(X_test, Y_test),\n",
340 |     "    idx2word=idx2word,\n",
341 |     "    oov0=oov0,\n",
342 |     "    glove_idx2idx=glove_idx2idx,\n",
343 |     "    vocab_size=vocab_size,\n",
344 |     "    nb_unknown_words=nb_unknown_words,\n",
345 |     ")"
346 |    ]
347 |   },
348 |   {
349 |    "cell_type": "code",
350 |    "execution_count": null,
351 |    "metadata": {
352 |     "collapsed": true
353 |    },
354 |    "outputs": [],
355 |    "source": []
356 |   }
357 |  ],
358 |  "metadata": {
359 |   "kernelspec": {
360 |    "display_name": "Python 3",
361 |    "language": "python",
362 |    "name": "python3"
363 |   },
364 |   "language_info": {
365 |    "codemirror_mode": {
366 |     "name": "ipython",
367 |     "version": 3
368 |    },
369 |    "file_extension": ".py",
370 |    "mimetype": "text/x-python",
371 |    "name": "python",
372 |    "nbconvert_exporter": "python",
373 |    "pygments_lexer": "ipython3",
374 |    "version": "3.6.3"
375 |   }
376 |  },
377 |  "nbformat": 4,
378 |  "nbformat_minor": 2
379 | }
380 | 


--------------------------------------------------------------------------------
/train3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 2
6 | }
7 | 


--------------------------------------------------------------------------------
/train_seq2seq.py:
--------------------------------------------------------------------------------
  1 | """Train a sequence to sequence model.
  2 | 
  3 | This script is sourced from Siraj Rival
  4 | https://github.com/llSourcell/How_to_make_a_text_summarizer/blob/master/train.ipynb
  5 | """
  6 | import os
  7 | import time
  8 | import random
  9 | import argparse
 10 | import json
 11 | 
 12 | import numpy as np
 13 | from keras.callbacks import TensorBoard
 14 | 
 15 | import config
 16 | from sample_gen import gensamples
 17 | from utils import prt, load_embedding, process_vocab, load_split_data
 18 | from model import create_model, inspect_model
 19 | from generate import gen
 20 | from constants import FN1, seed, nb_unknown_words
 21 | 
 22 | # parse arguments
 23 | parser = argparse.ArgumentParser()
 24 | parser.add_argument('--batch-size', type=int, default=32, help='input batch size')
 25 | parser.add_argument('--epochs', type=int, default=10, help='number of epochs')
 26 | parser.add_argument('--rnn-size', type=int, default=512, help='size of RNN layers')
 27 | parser.add_argument('--rnn-layers', type=int, default=3, help='number of RNN layers')
 28 | parser.add_argument('--nsamples', type=int, default=640, help='number of samples per epoch')
 29 | parser.add_argument('--nflips', type=int, default=0, help='number of flips')
 30 | parser.add_argument('--temperature', type=float, default=.8, help='RNN temperature')
 31 | parser.add_argument('--lr', type=float, default=0.0001, help='learning rate, default=0.0001')
 32 | parser.add_argument('--warm-start', action='store_true')
 33 | args = parser.parse_args()
 34 | batch_size = args.batch_size
 35 | 
 36 | # set sample sizes
 37 | nb_train_samples = np.int(np.floor(args.nsamples / batch_size)) * batch_size  # num training samples
 38 | nb_val_samples = nb_train_samples  # num validation samples
 39 | 
 40 | # seed weight initialization
 41 | random.seed(seed)
 42 | np.random.seed(seed)
 43 | 
 44 | embedding, idx2word, word2idx, glove_idx2idx = load_embedding(nb_unknown_words)
 45 | vocab_size, embedding_size = embedding.shape
 46 | oov0 = vocab_size - nb_unknown_words
 47 | idx2word = process_vocab(idx2word, vocab_size, oov0, nb_unknown_words)
 48 | X_train, X_test, Y_train, Y_test = load_split_data(nb_val_samples, seed)
 49 | 
 50 | # print a sample recipe to make sure everything looks right
 51 | print('Random head, description:')
 52 | i = 811
 53 | prt('H', Y_train[i], idx2word)
 54 | prt('D', X_train[i], idx2word)
 55 | 
 56 | # save model initialization parameters
 57 | model_params = (dict(
 58 |     vocab_size=vocab_size,
 59 |     embedding_size=embedding_size,
 60 |     LR=args.lr,
 61 |     rnn_layers=args.rnn_layers,
 62 |     rnn_size=args.rnn_size,
 63 | ))
 64 | with open(os.path.join(config.path_models, 'model_params.json'), 'w') as f:
 65 |     json.dump(model_params, f)
 66 | 
 67 | 
 68 | model = create_model(
 69 |     vocab_size=vocab_size,
 70 |     embedding_size=embedding_size,
 71 |     LR=args.lr,
 72 |     embedding=embedding,
 73 |     rnn_layers=args.rnn_layers,
 74 |     rnn_size=args.rnn_size,
 75 | )
 76 | inspect_model(model)
 77 | 
 78 | # load pre-trained model weights
 79 | FN1_filename = os.path.join(config.path_models, '{}.hdf5'.format(FN1))
 80 | if args.warm_start and FN1 and os.path.exists(FN1_filename):
 81 |     model.load_weights(FN1_filename)
 82 |     print('Model weights loaded from {}'.format(FN1_filename))
 83 | 
 84 | # print samples before training
 85 | gensamples(
 86 |     skips=2,
 87 |     k=10,
 88 |     batch_size=batch_size,
 89 |     short=False,
 90 |     temperature=args.temperature,
 91 |     use_unk=True,
 92 |     model=model,
 93 |     data=(X_test, Y_test),
 94 |     idx2word=idx2word,
 95 |     oov0=oov0,
 96 |     glove_idx2idx=glove_idx2idx,
 97 |     vocab_size=vocab_size,
 98 |     nb_unknown_words=nb_unknown_words,
 99 | )
100 | 
101 | # get train and validation generators
102 | r = next(gen(X_train, Y_train, batch_size=batch_size, nb_batches=None, nflips=None, model=None, debug=False, oov0=oov0, glove_idx2idx=glove_idx2idx, vocab_size=vocab_size, nb_unknown_words=nb_unknown_words, idx2word=idx2word))
103 | traingen = gen(X_train, Y_train, batch_size=batch_size, nb_batches=None, nflips=args.nflips, model=model, debug=False, oov0=oov0, glove_idx2idx=glove_idx2idx, vocab_size=vocab_size, nb_unknown_words=nb_unknown_words, idx2word=idx2word)
104 | valgen = gen(X_test, Y_test, batch_size=batch_size, nb_batches=nb_val_samples // batch_size, nflips=None, model=None, debug=False, oov0=oov0, glove_idx2idx=glove_idx2idx, vocab_size=vocab_size, nb_unknown_words=nb_unknown_words, idx2word=idx2word)
105 | 
106 | # define callbacks for training
107 | callbacks = [TensorBoard(
108 |     log_dir=os.path.join(config.path_logs, str(time.time())),
109 |     histogram_freq=2, write_graph=False, write_images=False)]
110 | 
111 | # train model and save weights
112 | h = model.fit_generator(
113 |     traingen, samples_per_epoch=nb_train_samples,
114 |     nb_epoch=args.epochs, validation_data=valgen, nb_val_samples=nb_val_samples,
115 |     callbacks=callbacks,
116 | )
117 | model.save_weights(FN1_filename, overwrite=True)
118 | 
119 | # print samples after training
120 | gensamples(
121 |     skips=2,
122 |     k=10,
123 |     batch_size=batch_size,
124 |     short=False,
125 |     temperature=args.temperature,
126 |     use_unk=True,
127 |     model=model,
128 |     data=(X_test, Y_test),
129 |     idx2word=idx2word,
130 |     oov0=oov0,
131 |     glove_idx2idx=glove_idx2idx,
132 |     vocab_size=vocab_size,
133 |     nb_unknown_words=nb_unknown_words,
134 | )
135 | 


--------------------------------------------------------------------------------