├── EmbeddingAttempt4.ipynb ├── Extractive vs. Abstractive Text Summarization.pdf ├── MICHELLEZHAO_CS141_finalreport.pdf ├── README.md ├── Reduction.ipynb ├── TrainAttempt4.ipynb ├── _config.yml ├── algo1.png ├── cs141_final_poster_toSize.pptx ├── embedding2.ipynb ├── embeddingNotebook.ipynb ├── p1.png ├── p2.png ├── p3.png ├── p4.png ├── p5.png ├── p6.png ├── p7.png ├── predict.py ├── simpleTrain.ipynb ├── simpler.ipynb ├── stopWords.txt ├── testing.ipynb ├── tokenize_recipes.py ├── train.ipynb ├── train2.ipynb ├── train3.ipynb ├── trainNotebook.ipynb └── train_seq2seq.py /EmbeddingAttempt4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "GloVe Loaded.\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "import numpy as np\n", 18 | "from __future__ import division\n", 19 | "\n", 20 | "filename = 'glove.6B.50d.txt' \n", 21 | "# (glove data set from: https://nlp.stanford.edu/projects/glove/)\n", 22 | "\n", 23 | "\n", 24 | "def loadGloVe(filename):\n", 25 | " vocab = []\n", 26 | " embd = []\n", 27 | " file = open(filename,'r')\n", 28 | " for line in file.readlines():\n", 29 | " row = line.strip().split(' ')\n", 30 | " vocab.append(row[0])\n", 31 | " embd.append(row[1:])\n", 32 | " print('GloVe Loaded.')\n", 33 | " file.close()\n", 34 | " return vocab,embd\n", 35 | "\n", 36 | "# Pre-trained GloVe embedding\n", 37 | "vocab,embd = loadGloVe(filename)\n", 38 | "\n", 39 | "embedding = np.asarray(embd)\n", 40 | "embedding = embedding.astype(np.float32)\n", 41 | "\n", 42 | "word_vec_dim = len(embd[0]) # word_vec_dim = dimension of each word vectors" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 12, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import csv\n", 52 | "import nltk as nlp\n", 53 | "from nltk 
import word_tokenize\n", 54 | "import string\n", 55 | "\n", 56 | "summaries = []\n", 57 | "texts = []\n", 58 | "\n", 59 | "def clean(text):\n", 60 | " text = text.lower()\n", 61 | " printable = set(string.printable)\n", 62 | " return filter(lambda x: x in printable, text) #filter funny characters, if any. \n", 63 | " \n", 64 | "import numpy as np\n", 65 | "import os\n", 66 | "\n", 67 | "def split():\n", 68 | " titles = []\n", 69 | " texts = []\n", 70 | " root = 'Part1'\n", 71 | " \n", 72 | " #dirr = 'Part1/awards_1990/awd_1990_00/'\n", 73 | " dirs = os.listdir('Part1/awards_1990/awd_1990_00/')\n", 74 | "\n", 75 | " for filename in dirs[1:]:\n", 76 | " #iter = 0\n", 77 | " #print(dirs[1])\n", 78 | " \n", 79 | " #print(iter)\n", 80 | " #iter += 1\n", 81 | " #print(dirs[1:])\n", 82 | " #filename = 'Part1/awards_1990/awd_1990_00/a9000006.txt'\n", 83 | " f = open('Part1/awards_1990/awd_1990_00/' + str(filename))\n", 84 | " addTitle = False\n", 85 | " addTexts = False\n", 86 | " title = []\n", 87 | " text = []\n", 88 | " for word in f.read().split():\n", 89 | " if (word == \"Title\"):\n", 90 | " addTitle = True\n", 91 | " continue\n", 92 | "\n", 93 | " if (word == \"Type\"):\n", 94 | " addTitle = False\n", 95 | "\n", 96 | "# if (addTexts == True and word == \"\\n\"):\n", 97 | "# addTexts = False\n", 98 | "# break\n", 99 | "\n", 100 | "\n", 101 | " if (word == \"Abstract\"):\n", 102 | " addTexts = True\n", 103 | " continue\n", 104 | "\n", 105 | " if(addTitle == True):\n", 106 | " title.append(word)\n", 107 | "\n", 108 | " if(addTexts == True):\n", 109 | " text.append(word)\n", 110 | "\n", 111 | " for i in range(len(title)):\n", 112 | " s = title[i]\n", 113 | " table = str.maketrans({key: None for key in string.punctuation})\n", 114 | " new_s = s.translate(table)\n", 115 | " title[i] = new_s\n", 116 | " for i in range(len(text)):\n", 117 | " s = text[i]\n", 118 | " table = str.maketrans({key: None for key in string.punctuation})\n", 119 | " new_s = s.translate(table)\n", 
120 | " text[i] = new_s\n", 121 | "\n", 122 | " title = ' '.join(title)\n", 123 | " text =' '.join(text)\n", 124 | " titles.append(word_tokenize(title))\n", 125 | " texts.append(word_tokenize(text))\n", 126 | "\n", 127 | " return titles, texts\n", 128 | "\n", 129 | "summaries, texts = split()\n", 130 | "\n" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 13, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "SAMPLE CLEANED & TOKENIZED TEXT: \n", 143 | "\n", 144 | "['Research', 'in', 'Particulate', 'systems', 'Engineering', 'in', 'crucial', 'to', 'advances', 'in', 'combustion', 'atmospheric', 'and', 'environmental', 'sciences', 'nuclear', 'winter', 'studies', 'nuclear', 'reactor', 'safety', 'and', 'materials', 'manufacturing', 'The', 'purpose', 'of', 'this', 'project', 'is', 'to', 'provide', 'research', 'experiences', 'for', 'undergraduate', 'students', 'in', 'this', 'area', 'At', 'least', 'half', 'of', 'the', 'students', 'will', 'be', 'selected', 'from', 'institutions', 'other', 'the', 'UMC', 'with', 'several', 'recruited', 'from', 'Stephens', 'College', 'a', 'womens', 'college', 'Lincoln', 'University', 'HCBU', 'and', 'other', 'undergraduate', 'schools', 'in', 'the', 'State', 'Each', 'student', 'will', 'undertake', 'a', 'specific', 'research', 'project', 'and', 'a', 'faculty', 'advisor', 'participating', 'in', 'the', 'REU', 'program', 'will', 'work', 'with', 'the', 'student', 'to', 'develop', 'hisher', 'research', 'skills', 'An', 'essential', 'element', 'of', 'the', 'program', 'will', 'be', 'its', 'emphasis', 'on', 'professional', 'development', 'of', 'the', 'students', 'eg', 'paper', 'writing', 'and', 'technical', 'presentations', 'The', 'purpose', 'is', 'to', 'infuse', 'bright', 'undergraduate', 'students', 'with', 'an', 'enthusiasm', 'towards', 'research', 'careers', 'including', 'graduate', 'level', 'education', 'The', 'University', 'of', 
'MissouriColumbia', 'has', 'made', 'substantial', 'institutional', 'commitments', 'to', 'the', 'program', 'These', 'include', 'support', 'of', 'faculty', 'time', 'waiver', 'of', 'tuition', 'and', 'fees', 'for', 'the', 'student', 'Participants', 'waiver', 'of', 'all', 'overhead', 'costs', 'and', 'stipends', 'for', 'two', 'minority', 'students', 'in', 'addition', 'to', 'those', 'supported', 'by', 'the', 'NSF', 'funds']\n", 145 | "\n", 146 | "SAMPLE CLEANED & TOKENIZED SUMMARY: \n", 147 | "\n", 148 | "['Research', 'Expereinces', 'for', 'Undergraduate', 'in', 'Particulate', 'Systems', 'Engineering']\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "import random\n", 154 | "\n", 155 | "index = random.randint(0,len(texts)-1)\n", 156 | "\n", 157 | "print (\"SAMPLE CLEANED & TOKENIZED TEXT: \\n\\n\"+str(texts[index]))\n", 158 | "print (\"\\nSAMPLE CLEANED & TOKENIZED SUMMARY: \\n\\n\"+str(summaries[index]))" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 14, 164 | "metadata": { 165 | "collapsed": true 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "def np_nearest_neighbour(x):\n", 170 | " #returns array in embedding that's most similar (in terms of cosine similarity) to x\n", 171 | " \n", 172 | " xdoty = np.multiply(embedding,x)\n", 173 | " xdoty = np.sum(xdoty,1)\n", 174 | " xlen = np.square(x)\n", 175 | " xlen = np.sum(xlen,0)\n", 176 | " xlen = np.sqrt(xlen)\n", 177 | " ylen = np.square(embedding)\n", 178 | " ylen = np.sum(ylen,1)\n", 179 | " ylen = np.sqrt(ylen)\n", 180 | " xlenylen = np.multiply(xlen,ylen)\n", 181 | " cosine_similarities = np.divide(xdoty,xlenylen)\n", 182 | "\n", 183 | " return embedding[np.argmax(cosine_similarities)]\n", 184 | " \n", 185 | "\n", 186 | "\n", 187 | "def word2vec(word): # converts a given word into its vector representation\n", 188 | " if word in vocab:\n", 189 | " return embedding[vocab.index(word)]\n", 190 | " else:\n", 191 | " return embedding[vocab.index('unk')]\n", 192 | "\n", 193 | "def 
vec2word(vec): # converts a given vector representation into the represented word \n", 194 | " for x in xrange(0, len(embedding)):\n", 195 | " if np.array_equal(embedding[x],np.asarray(vec)):\n", 196 | " return vocab[x]\n", 197 | " return vec2word(np_nearest_neighbour(np.asarray(vec)))" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 15, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "Vector representation of 'unk':\n", 210 | "\n", 211 | "[ -7.91490018e-01 8.66169989e-01 1.19980000e-01 9.22870007e-04\n", 212 | " 2.77599990e-01 -4.91849989e-01 5.01950026e-01 6.07919996e-04\n", 213 | " -2.58450001e-01 1.78650007e-01 2.53500015e-01 7.65720010e-01\n", 214 | " 5.06640017e-01 4.02500004e-01 -2.13879999e-03 -2.83969998e-01\n", 215 | " -5.03239989e-01 3.04490000e-01 5.17790020e-01 1.50899999e-02\n", 216 | " -3.50309998e-01 -1.12779999e+00 3.32529992e-01 -3.52499992e-01\n", 217 | " 4.13260013e-02 1.08630002e+00 3.39099988e-02 3.35640013e-01\n", 218 | " 4.97449994e-01 -7.01309964e-02 -1.21920002e+00 -4.85119998e-01\n", 219 | " -3.85119990e-02 -1.35539994e-01 -1.63800001e-01 5.23209989e-01\n", 220 | " -3.13180000e-01 -1.65500000e-01 1.19089998e-01 -1.51150003e-01\n", 221 | " -1.56210005e-01 -6.26550019e-01 -6.23359978e-01 -4.21499997e-01\n", 222 | " 4.18729991e-01 -9.24719989e-01 1.10490000e+00 -2.99959987e-01\n", 223 | " -6.30029989e-03 3.95399988e-01]\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "word = \"unk\"\n", 229 | "print (\"Vector representation of '\"+str(word)+\"':\\n\")\n", 230 | "print (word2vec(word))" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 16, 236 | "metadata": { 237 | "collapsed": true 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "#REDUCE DATA (FOR SPEEDING UP THE NEXT STEPS)\n", 242 | "\n", 243 | "MAXIMUM_DATA_NUM = 50000\n", 244 | "\n", 245 | "texts = texts[0:MAXIMUM_DATA_NUM]\n", 246 | "summaries = 
summaries[0:MAXIMUM_DATA_NUM]" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 17, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "vocab_limit = []\n", 256 | "embd_limit = []\n", 257 | "\n", 258 | "i=0\n", 259 | "for text in texts:\n", 260 | " for word in text:\n", 261 | " if word not in vocab_limit:\n", 262 | " if word in vocab:\n", 263 | " vocab_limit.append(word)\n", 264 | " embd_limit.append(word2vec(word))" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 18, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "for summary in summaries:\n", 276 | " for word in summary:\n", 277 | " if word not in vocab_limit:\n", 278 | " if word in vocab:\n", 279 | " vocab_limit.append(word)\n", 280 | " embd_limit.append(word2vec(word))" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 19, 286 | "metadata": { 287 | "collapsed": true 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "if 'eos' not in vocab_limit:\n", 292 | " vocab_limit.append('eos')\n", 293 | " embd_limit.append(word2vec('eos'))\n", 294 | "if 'unk' not in vocab_limit:\n", 295 | " vocab_limit.append('unk')\n", 296 | " embd_limit.append(word2vec('unk'))\n", 297 | "\n", 298 | "null_vector = np.zeros([word_vec_dim])\n", 299 | "\n", 300 | "vocab_limit.append('')\n", 301 | "embd_limit.append(null_vector)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 20, 307 | "metadata": { 308 | "collapsed": true 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "vec_summaries = []\n", 313 | "\n", 314 | "for summary in summaries:\n", 315 | " \n", 316 | " vec_summary = []\n", 317 | " \n", 318 | " for word in summary:\n", 319 | " vec_summary.append(word2vec(word))\n", 320 | " \n", 321 | " vec_summary.append(word2vec('eos'))\n", 322 | " \n", 323 | " vec_summary = np.asarray(vec_summary)\n", 324 | " vec_summary = vec_summary.astype(np.float32)\n", 325 
| " \n", 326 | " vec_summaries.append(vec_summary)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 21, 332 | "metadata": { 333 | "collapsed": true 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "vec_texts = []\n", 338 | "\n", 339 | "for text in texts:\n", 340 | " \n", 341 | " vec_text = []\n", 342 | " \n", 343 | " for word in text:\n", 344 | " vec_text.append(word2vec(word))\n", 345 | " \n", 346 | " vec_text = np.asarray(vec_text)\n", 347 | " vec_text = vec_text.astype(np.float32)\n", 348 | " \n", 349 | " vec_texts.append(vec_text)" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 22, 355 | "metadata": { 356 | "collapsed": true 357 | }, 358 | "outputs": [], 359 | "source": [ 360 | "import pickle\n", 361 | "with open('vocab_limit', 'wb') as fp:\n", 362 | " pickle.dump(vocab_limit, fp)\n", 363 | "with open('embd_limit', 'wb') as fp:\n", 364 | " pickle.dump(embd_limit, fp)\n", 365 | "with open('vec_summaries', 'wb') as fp:\n", 366 | " pickle.dump(vec_summaries, fp)\n", 367 | "with open('vec_texts', 'wb') as fp:\n", 368 | " pickle.dump(vec_texts, fp)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": { 375 | "collapsed": true 376 | }, 377 | "outputs": [], 378 | "source": [] 379 | } 380 | ], 381 | "metadata": { 382 | "kernelspec": { 383 | "display_name": "Python 3", 384 | "language": "python", 385 | "name": "python3" 386 | }, 387 | "language_info": { 388 | "codemirror_mode": { 389 | "name": "ipython", 390 | "version": 3 391 | }, 392 | "file_extension": ".py", 393 | "mimetype": "text/x-python", 394 | "name": "python", 395 | "nbconvert_exporter": "python", 396 | "pygments_lexer": "ipython3", 397 | "version": "3.6.3" 398 | } 399 | }, 400 | "nbformat": 4, 401 | "nbformat_minor": 2 402 | } 403 | -------------------------------------------------------------------------------- /Extractive vs. 
Abstractive Text Summarization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/Extractive vs. Abstractive Text Summarization.pdf -------------------------------------------------------------------------------- /MICHELLEZHAO_CS141_finalreport.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/MICHELLEZHAO_CS141_finalreport.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Extractive vs. Abstractive Text Summarization Methods: An Analysis 2 | 3 | Text summarization solves the problem of condensing information into a more compact form, while maintaining the important information in the text. The methods of automatic text summarization fall into two primary categories: extractive and abstractive. A common approach of extractive summarization involves selecting the most representative sentences that best cover the information expressed by the original text based on a ranking of sentences by relevance. A popular method of abstractive text summarization is using an encoder-decoder structure, which generates a latent factor representation of the data, and decodes it to generate a summary. The goal of the project was to analyze and compare the effectiveness of both methods when applied specifically to scientific texts. 4 | 5 | ## Motivation 6 | 7 | My motivation for this project came from personal experience. As a student in college, I'm often faced with a large number of scientific papers and research articles that pertain to my interests, yet I don't have the time to read them all. 
I wanted a way to be able to get summaries of the main ideas for the papers, without significant loss of important content. Text summarization is a widely implemented algorithm, but I wanted to explore different text summarization methods applied to scientific writing in particular. 8 | 9 | ## Introduction 10 | 11 | Automatic text summarization is the process of shortening a text documentation using a system for prioritizing information. Technologies that generate summaries take into account variables such as length, style, and syntax. Text summarization from the perspective of humans is taking a chunk of information and extracting what one deems most important. Automatic text summarization is based on the logical quantification of features of the text including, weighting keywords, and sentence ranking. 12 | 13 | ### Extractive Text Summarization 14 | Extractive text summarization does not use words aside from the ones already in the text, and selects some combination of the existing words most relevant to the meaning of the source. Techniques of extractive summarization include ranking sentences and phrases in order of importance and selecting the most important components of the document to construct the summary. These methods tend to more robust because they use existing phrases, but lack flexibility since they cannot use new words or paraphrase. 15 | 16 | ### Abstractive Text Summarization 17 | Abstractive text summarization involves generating entirely new phrases and sentences to capture the meaning of the text. Abstractive methods tend to be more complex, because the machine must read over the text and deem certain concepts to be important, and then learn to construct some cohesive phrasing of the relevant concepts. Abstractive summarization is most similar to how humans summarize, as humans often summarize by paraphrasing. 
18 | 19 | ## Materials and Methods 20 | Although the primary goal of my project was to be able to summarize entire scientific papers, and essentially create abstracts given papers, a paper was too long of an input text to start with. I decided to first work with generating summaries given abstracts, which are much shorter than entire papers. Essentially, my project can be thought of as generating paper titles, given abstracts. First, I needed a dataset of abstract texts with their corresponding titles. 21 | 22 | I used the NSF Research Award Abstracts 1990-2003 Data Set from the UCI machine learning repository. The dataset consisted of abstracts that had won the NSF research awards from 1990 to 2003, along with the title of the paper. For my abstractive learning, the training input X was the abstract and the title was the training input Y. 23 | 24 | ### Extractive Methods 25 | For extractive summarization, I used the TextRank algorithm, which is based on Google’s PageRank algorithm. TextRanks works by transforming the text into a graph. It regards words as vertices and the relation between words in phrases or sentences as edges. Each edge also has different weight. When one vertex links to another one, it is basically casting a vote of importance for that vertex. The importance of the vertex also dictates how heavily weighted its votes are. TextRank uses the structure of the text and the known parts of speech for words to assign a score to words that are keywords for the text. 26 | 27 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/algo1.png) 28 | 29 | #### Algorithm: TextRank Algorithm 30 | 1. Identify filtered text units most representative of the text and add them as vertices to the graph. 31 | 2. Identify relations that connect such text units, and use these relations to draw edges between vertices in the graph. 32 | 3. Iterate the graph-based ranking algorithm until convergence. 33 | 4. Sort vertices based on their final score. 
Use the values attached to each vertex for ranking/selection decisions. 34 | 35 | 36 | First, we take the input text and split the entire text down to individual words. Using a list of stop words, words are filtered so that only nouns and adjectives are considered. Then a graph of words is created where the words are the nodes/vertices. Each vertex’s edges are defined by connections of a word to other words that are close to it in the text. The TextRank algorithm is then run on the graph. Each node is given a weight of 1. Then, we go through the list of nodes and collect the number of edges and connections the word has, which is essentially the influence of the connected vertex. The scores are computed and normalized for every node, and the algorithm takes the top-scoring words that have been identified as important keywords. The algorithm sums up the scores for each of the keywords in all of the sentences, and ranks the sentences in order of score and significance. Finally, the top K sentences are returned to become the TextRank generated summary. 37 | 38 | ### Code for TextRank Reduction 39 | 40 | First, we take the input text and split the entire text down to individual words. Using a list of stop words, words are filtered so that only nouns and adjectives are considered. Then a graph of words is created where the words are the nodes/vertices. Each vertex’s edges are defined by connections of a word to other words that are close to it in the text. The TextRank algorithm is then run on the graph. Each node is given a weight of 1. Then, we go through the list of nodes and collect the number of edges and connections the word has, which is essentially the influence of the connected vertex. The scores are computed and normalized for every node, and the algorithm takes the top-scoring words that have been identified as important keywords. 
The algorithm sums up the scores for each of the keywords in all of the sentences, and ranks the sentences in order of score and significance. Finally, the top K sentences are returned to become the TextRank generated summary. 41 | 42 | 43 | ```python 44 | def reduce(self, text, reductionRatio): 45 | stopWordsFile = 'stopWords.txt' 46 | stopWords= open(stopWordsFile).read().splitlines() 47 | 48 | lines = text.splitlines() 49 | contentLines = filter(lambda w: w.strip() != '', lines) 50 | 51 | paragraphs = self.getParagraphs(contentLines, stopWords) 52 | print("paragraphs", paragraphs) 53 | 54 | rankedSentences = self.sentenceRank(paragraphs) 55 | 56 | orderedSentences = [] 57 | for p in paragraphs: 58 | for s in p.Sentences: 59 | orderedSentences.append(s) 60 | 61 | reducedSentences = [] 62 | i = 0 63 | while i < math.trunc(len(rankedSentences) * reductionRatio): 64 | s = rankedSentences[i][0].Sentence 65 | position = orderedSentences.index(s) 66 | reducedSentences.append((s, position)) 67 | i = i + 1 68 | reducedSentences = sorted(reducedSentences, key=lambda x: x[1]) 69 | 70 | reducedText = [] 71 | for s,r in reducedSentences: 72 | reducedText.append(s.getFullSentence()) 73 | return reducedText 74 | 75 | 76 | ``` 77 | 78 | 79 | 80 | ### Abstractive Methods 81 | First, we need to preprocess the data by constructing an embedding of the text. Embedding the input converts the text into numbers, a more interpretable numerical representation of the data for the encoder-decoder network to work with. I experimented with two different embedding methods: Word2Vec and Global-Vectors (GloVe). Word2Vec is algorithm that combines continuous bag of words and the Skip-gram model to generate word vector representations. GloVe is an unsupervised learning algorithm for obtaining vector representations for words, training from a dictionary of common words. 
82 | 83 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/p1.png) 84 | 85 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/p2.png) 86 | 87 | The encoder-decoder model is composed of multiple recurrent neural networks, one of which works as an encoder, and one as a decoder. The encoder converts an input document into a latent representation (a vector), and the decoder reads the latent input, generating a summary as it decodes. With encoder decoder structures, issues to consider include determining how to set the focus on the import sentences and keywords, how to handle novel or rare words in the document, how to handle incredibly long documents, and how to make summaries readable and flexible with a large vocabulary. 88 | 89 | 90 | The encoder-decoder recurrent neural network architecture has been shown to be effective when applied to text summarization. The architecture involves two components: an encoder and a decoder. The encoder reads the entire input sequence and encodes it into an internal representation, often a fixed-length vector. The decoder reads the encoded input sequence from the decoder and generates the output sequence, which is the summary. Both the encoder and decoder sub-models are trained jointly, meaning their output feed into the other as input. 91 | 92 | ```python 93 | model = Sequential() 94 | model.add(Embedding(vocab_size, embedding_size, 95 | input_length=maxlen, 96 | W_regularizer=regularizer, weights=[embedding], mask_zero=True, 97 | name='embedding_1')) 98 | 99 | for i in range(rnn_layers): 100 | lstm = LSTM(40000, return_sequences=True, 101 | W_regularizer=regularizer, U_regularizer=regularizer, 102 | b_regularizer=regularizer, dropout_W=p_W, dropout_U=p_U, 103 | name='lstm_%d'%(i+1) 104 | ) 105 | model.add(lstm) 106 | ``` 107 | The encoder is a bidirectional LSTM recurrent neural network (RNN). RNNs can use their internal state (memory) to process sequences of inputs. 
LSTMs are capable of learning long term dependencies by storing long-term states and inputs in gated cell memory. The tokenized words of the text are fed one-by-one into the encoder, a single-layer bidirectional LSTM, producing a sequence of hidden states, which is a latent representation of the input. The decoder is a single-layer unidirectional LSTM, which receives the word embedding of the previous word, and the embedding is transformed into a word representation, which is part of the summary. 108 | 109 | I used the one-shot encoder-decoder model, where the entire output sequence is generated in a one-shot manner, meaning the decoder uses the latent context vector alone to generate the output summary. 110 | 111 | Abstractive methods like the encoder-decoder network are capable of generating entirely new phrases and sentences to capture the meaning of the text. They tend to be more complex than extractive methods, since they learn to construct some cohesive phrasing of the relevant concepts. However, this also means they are more susceptible to error. 112 | 113 | ## Results 114 | ### Extractive Results 115 | The TextRank algorithm generated the following summary. I specified how many sentences to reduce, and generated a 70% reduction summary and a 90% reduction summary which contained the top 3 most important sentences and the top 1 most important sentence, respectively. 116 | 117 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/p3.png) 118 | 119 | I summarized this text from one of the scientific paper abstracts. 120 | #### Text: 121 | Commercial exploitation over the past two hundred years drove the great Mysticete whales to near extinction. Variation in the sizes of populations prior to exploitation, minimal population size during exploitation and current population sizes permit analyses of the effects of differing levels of exploitation on species with different biogeographical distributions and life-history characteristics. Dr. 
Stephen Palumbi at the University of Hawaii will study the genetic population structure of three whale species in this context, the Humpback Whale, the Gray Whale and the Bowhead Whale. The effect of demographic history will be determined by comparing the genetic structure of the three species. Additional studies will be carried out on the Humpback Whale. The humpback has a world-wide distribution, but the Atlantic and Pacific populations of the northern hemisphere appear to be discrete populations, as is the population of the southern hemispheric oceans. Each of these oceanic populations may be further subdivided into smaller isolates, each with its own migratory pattern and somewhat distinct gene pool. This study will provide information on the level of genetic isolation among populations and the levels of gene flow and genealogical relationships among populations. This detailed genetic information will facilitate international policy decisions regarding the conservation and management of these magnificent mammals. 122 | 123 | 124 | #### Summary: 125 | We can specific how many sentences to output and define what percentage of reduction of the text we will perform. 126 | 127 | ##### 70% Reduction: 128 | Variation in the sizes of populations prior to exploitation, minimal population size during exploitation and current population sizes permit analyses of the effects of differing levels of exploitation on species with different biogeographical distributions and life-history characteristics. Stephen Palumbi at the University of Hawaii will study the genetic population structure of three whale species in this context, the Humpback Whale, the Gray Whale and the Bowhead Whale. This study will provide information on the level of genetic isolation among populations and the levels of gene flow and genealogical relationships among populations. 
129 | 130 | ##### 90% Reduction: 131 | Stephen Palumbi at the University of Hawaii will study the genetic population structure of three whale species in this context, the Humpback Whale, the Gray Whale and the Bowhead Whale. 132 | 133 | ### Abstractive Results 134 | As part of the pre-processing analysis, ranking the words in order of number of appearances, we saw this distribution of keywords and their frequencies in the training data. The distribution of set of text input words is much larger and wider than that of words in the summaries. 135 | 136 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/p5.png) 137 | 138 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/p6.png) 139 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/p7.png) 140 | 141 | ![alt text](https://github.com/mzhao98/text-summarization/blob/master/p4.png) 142 | 143 | The encoder decoder network generated the following two summaries on the testing input. 144 | #### Text: 145 | Proposal seeks to demonstrate a technique for observing ocean currents by electric field measurements using a towed instrument of recent design measurements will be made in conjunction with a cruise across the in which several additional observational techniques will be employed several data types will be to improve the accuracy of the methods. 146 | 147 | #### Summary: 148 | ##### Summary #1 149 | Drum frame multidisciplinary 150 | 151 | ##### Summary #2 152 | Extension solver bearing. 
153 | 154 | Experimenting with generated longer summaries to discover whether a longer generated summary would be a better summary for the information, I reran the encoder-decoder network to get a 4-word summary: 155 | 156 | #### Summary #3 157 | Exceptional geology goal visited 158 | 159 | This summary is much better than the 3-word summaries, which poses an interesting question: what is the relationship between length of summary and quality of summary generated by encoder-decoder structures? 160 | 161 | 162 | 163 | ### Discussion 164 | TextRank selected the two most significant sentences in the text. E-D generated two different three-word summaries, using words not present in the text, but the summaries generated were not representative of the text and did not make logical sense. 165 | By analyzing the two summaries generated by the encoder-decoder network, I found qualitatively that the extractive summarizer worked better than the abstractive text summarizer. The summaries generated by the extractive method were more understandable and representatively of the text than the abstractive summaries. The extractive summaries were generated much quicker than the abstractive ones. The TextRank algorithm took about 2 seconds to generate a summary, while the encoder-decoder network took about 15 minutes to train. 166 | The encoder decoder network performed rather poorly in comparison to the extractive method. This may have been because the encoder-decoder network didn’t have enough training. If the encoder-decoder network perhaps have had more epochs of training, it would have performed better. The training input may have also been too small. 167 | 168 | 169 | ### Conclusions 170 | In conclusion, the TextRank summarization method was very effective in choosing important sentences. As a further extension to the TextRank algorithm, it would be worthwhile to experiment with more ways of choosing “connections” from words to other words. 
Instead of using the proximity of words to other words in sentences, there may be other ways to measure connections between words, such as using proximity to other words with high connection to the word in question. 171 | The encoder-decoder network was found to be less effective than TextRank, likely because abstractive methods in general are less flexible and more susceptible to error than extractive methods, especially since words not in the text can be used. The next steps in improving the encoder-decoder network are to train on a larger training set, experiment with model hyper-parameters, use beam search, and explore different preprocessing methods. 172 | 173 | 174 | ### References 175 | 1.Jing, Hongyan. “Sentence Reduction for Automatic Text Summarization.” Proceedings of the Sixth Conference on Applied Natural Language Processing -, 2000, doi:10.3115/974147.974190. 176 | 2.Garg, Sneh, and Sunil Chhillar. “Review of Text Reduction Algorithms and Text Reduction Using Sentence Vectorization.” International Journal of Computer Applications, vol. 107, no. 12, 2014, pp. 39–42., doi:10.5120/18806-0380. 177 | 3.JRC1995. “JRC1995/Abstractive-Summarization.” GitHub, github.com/JRC1995/Abstractive-Summarization/blob/master/Summarization_model.ipynb. 178 | 4.“A Gentle Introduction to Text Summarization.” Machine Learning Mastery, 21 Nov. 2017, machinelearningmastery.com/gentle-introduction-text-summarization/. 179 | 5.“A Survey of Relevant Text Content Summarization Techniques.” International Journal of Science and Research (IJSR), vol. 5, no. 1, 2016, pp. 129–132., doi:10.21275/v5i1.nov152644. 180 | 6.“Text Summarization in Python: Extractive vs. Abstractive Techniques Revisited.” Pragmatic Machine Learning, rare-technologies.com/text-summarization-in-python-extractive-vs-abstractive-techniques-revisited/. 181 | 7.“Neural Machine Translation (seq2seq) Tutorial | TensorFlow.” TensorFlow, www.tensorflow.org/tutorials/seq2seq. 
182 | 8.“Encoder-Decoder Long Short-Term Memory Networks.” Machine Learning Mastery, 20 July 2017, machinelearningmastery.com/encoder-decoder-long-short-term-memory-networks/. 183 | 9.Dalalkrish. “Dalalkrish/Text-Summarization-Keras.” GitHub, github.com/dalalkrish/text-summarization-keras/blob/master/Text_Summarization.ipynb. 184 | 10.“Text Summarization with TensorFlow.” Google AI Blog, 24 Aug. 2016, ai.googleblog.com/2016/08/text-summarization-with-tensorflow.html. 185 | 11.llSourcell. “LlSourcell/How_to_make_a_text_summarizer.” GitHub, github.com/llSourcell/How_to_make_a_text_summarizer/blob/master/train.ipynb. 186 | 187 | 188 | 189 | ### Contact 190 | 191 | Michelle Zhao 192 | mzhao@caltech.edu 193 | 194 | -------------------------------------------------------------------------------- /Reduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import re, pdb, sys, math\n", 12 | "from collections import defaultdict" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 3, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "class Graph:\n", 24 | " def __init__(self):\n", 25 | " self.Vertices = []\n", 26 | " self.Edges = []\n", 27 | "\n", 28 | " def getRankedVertices(self):\n", 29 | " res = defaultdict(float)\n", 30 | " for e in self.Edges:\n", 31 | " res[e.Vertex1] += e.Weight\n", 32 | " return sorted(res.items(), key=lambda x: x[1], reverse=True)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 4, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "class Vertex:\n", 44 | " def __init__(self):\n", 45 | " self.Sentence = None" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 5, 51 | "metadata": { 52 | "collapsed": 
true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "class Edge:\n", 57 | " def __init__(self):\n", 58 | " self.Vertex1 = None\n", 59 | " self.Vertex2 = None\n", 60 | " self.Weight = 0" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 6, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "class WordType:\n", 72 | " Content=0\n", 73 | " Function=1\n", 74 | " ContentPunctuation=2\n", 75 | " FunctionPunctuation=3" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 7, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "class Word:\n", 87 | " def __init__(self):\n", 88 | " self.Text=''\n", 89 | " self.Type=''" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 8, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "class Sentence:\n", 101 | " def __init__(self):\n", 102 | " self.Words = []\n", 103 | "\n", 104 | " def getFullSentence(self):\n", 105 | " text = ''\n", 106 | " for w in self.Words:\n", 107 | " text += w.Text\n", 108 | " return text.strip()\n", 109 | "\n", 110 | " def getReducedSentence(self):\n", 111 | " sentenceText = ''\n", 112 | " sentenceEnd = self.Words[len(self.Words)-1]\n", 113 | " contentWords = filter(lambda w: w.Type == WordType.Content, self.Words)\n", 114 | " i = 0\n", 115 | " while i < len(contentWords):\n", 116 | " w = contentWords[i]\n", 117 | " # upper case the first character of the sentence\n", 118 | " if i == 0:\n", 119 | " li = list(w.Text)\n", 120 | " li[0] = li[0].upper()\n", 121 | " w.Text = ''.join(li)\n", 122 | " sentenceText += w.Text\n", 123 | " if i < len(contentWords)-1:\n", 124 | " sentenceText += ' '\n", 125 | " elif sentenceEnd.Text != w.Text:\n", 126 | " sentenceText += sentenceEnd.Text\n", 127 | " i = i+1\n", 128 | " return sentenceText\n", 129 | "\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 9, 135 | 
"metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "class Paragraph:\n", 141 | " def __init__(self):\n", 142 | " self.Sentences = []" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 16, 148 | "metadata": { 149 | "collapsed": true 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "class Reduction:\n", 154 | " functionPunctuation = ' ,-'\n", 155 | " contentPunctuation = '.?!\\n'\n", 156 | " punctuationCharacters = functionPunctuation+contentPunctuation\n", 157 | " sentenceEndCharacters = '.?!'\n", 158 | "\n", 159 | " def isContentPunctuation(self, text):\n", 160 | " for c in self.contentPunctuation:\n", 161 | " if text.lower() == c.lower():\n", 162 | " return True\n", 163 | " return False\n", 164 | "\n", 165 | " def isFunctionPunctuation(self, text):\n", 166 | " for c in self.functionPunctuation:\n", 167 | " if text.lower() == c.lower():\n", 168 | " return True\n", 169 | " return False\n", 170 | "\n", 171 | " def isFunction(self, text, stopWords):\n", 172 | " for w in stopWords:\n", 173 | " if text.lower() == w.lower():\n", 174 | " return True\n", 175 | " return False\n", 176 | "\n", 177 | " def tag(self, sampleWords, stopWords):\n", 178 | " taggedWords = []\n", 179 | " for w in sampleWords:\n", 180 | " tw = Word()\n", 181 | " tw.Text = w\n", 182 | " if self.isContentPunctuation(w):\n", 183 | " tw.Type = WordType.ContentPunctuation\n", 184 | " elif self.isFunctionPunctuation(w):\n", 185 | " tw.Type = WordType.FunctionPunctuation\n", 186 | " elif self.isFunction(w, stopWords):\n", 187 | " tw.Type = WordType.Function\n", 188 | " else:\n", 189 | " tw.Type = WordType.Content\n", 190 | " taggedWords.append(tw)\n", 191 | " return taggedWords\n", 192 | "\n", 193 | " def tokenize(self, text):\n", 194 | " return filter(lambda w: w != '', re.split('([{0}])'.format(self.punctuationCharacters), text))\t\n", 195 | "\n", 196 | " def getWords(self, sentenceText, stopWords):\n", 197 | " return 
self.tag(self.tokenize(sentenceText), stopWords) \n", 198 | "\n", 199 | " def getSentences(self, line, stopWords):\n", 200 | " sentences = []\n", 201 | " sentenceTexts = filter(lambda w: w.strip() != '', re.split('[{0}]'.format(self.sentenceEndCharacters), line))\t\n", 202 | " sentenceEnds = re.findall('[{0}]'.format(self.sentenceEndCharacters), line)\n", 203 | " sentenceEnds.reverse()\n", 204 | " for t in sentenceTexts:\n", 205 | " if len(sentenceEnds) > 0:\n", 206 | " t += sentenceEnds.pop()\n", 207 | " sentence = Sentence()\n", 208 | " sentence.Words = self.getWords(t, stopWords)\n", 209 | " sentences.append(sentence)\n", 210 | " return sentences\n", 211 | "\n", 212 | " def getParagraphs(self, lines, stopWords):\n", 213 | " paragraphs = []\n", 214 | " for line in lines:\n", 215 | " paragraph = Paragraph()\n", 216 | " paragraph.Sentences = self.getSentences(line, stopWords)\n", 217 | " paragraphs.append(paragraph)\n", 218 | " return paragraphs\n", 219 | "\n", 220 | " def findWeight(self, sentence1, sentence2):\n", 221 | " length1 = len(list(filter(lambda w: w.Type == WordType.Content, sentence1.Words)))\n", 222 | " length2 = len(list(filter(lambda w: w.Type == WordType.Content, sentence2.Words)))\n", 223 | " if length1 < 4 or length2 < 4:\n", 224 | " return 0\n", 225 | " weight = 0\n", 226 | " for w1 in filter(lambda w: w.Type == WordType.Content, sentence1.Words):\n", 227 | " for w2 in filter(lambda w: w.Type == WordType.Content, sentence2.Words):\n", 228 | " if w1.Text.lower() == w2.Text.lower():\n", 229 | " weight = weight + 1\n", 230 | " normalised1 = 0\n", 231 | " if length1 > 0:\n", 232 | " normalised1 = math.log(length1)\n", 233 | " normalised2 = 0\n", 234 | " if length2 > 0:\n", 235 | " normalised2 = math.log(length2)\n", 236 | " norm = normalised1 + normalised2\n", 237 | " if norm == 0:\n", 238 | " return 0\n", 239 | " return weight / float(norm)\n", 240 | "\n", 241 | " def buildGraph(self, sentences):\n", 242 | " g = Graph()\n", 243 | " for s in 
sentences:\n", 244 | " v = Vertex()\n", 245 | " v.Sentence = s\n", 246 | " g.Vertices.append(v)\n", 247 | " for i in g.Vertices:\n", 248 | " for j in g.Vertices:\n", 249 | " if i != j:\n", 250 | " w = self.findWeight(i.Sentence, j.Sentence)\n", 251 | " e = Edge()\n", 252 | " e.Vertex1 = i\n", 253 | " e.Vertex2 = j\n", 254 | " e.Weight = w\n", 255 | " g.Edges.append(e)\n", 256 | " return g\n", 257 | "\n", 258 | " def sentenceRank(self, paragraphs):\n", 259 | " sentences = []\n", 260 | " for p in paragraphs:\n", 261 | " for s in p.Sentences:\n", 262 | " sentences.append(s)\n", 263 | " g = self.buildGraph(sentences)\n", 264 | " return g.getRankedVertices()\n", 265 | "\n", 266 | " def reduce(self, text, reductionRatio):\n", 267 | " stopWordsFile = 'stopWords.txt'\n", 268 | " stopWords= open(stopWordsFile).read().splitlines()\n", 269 | "\n", 270 | " lines = text.splitlines()\n", 271 | " print(\"lines\", lines)\n", 272 | " contentLines = filter(lambda w: w.strip() != '', lines)\n", 273 | " print(\"contentLines\", contentLines)\n", 274 | "\n", 275 | " paragraphs = self.getParagraphs(contentLines, stopWords)\n", 276 | " print(\"paragraphs\", paragraphs)\n", 277 | "\n", 278 | " rankedSentences = self.sentenceRank(paragraphs)\n", 279 | "\n", 280 | " orderedSentences = []\n", 281 | " for p in paragraphs:\n", 282 | " for s in p.Sentences:\n", 283 | " orderedSentences.append(s)\n", 284 | "\n", 285 | " reducedSentences = []\n", 286 | " i = 0\n", 287 | " while i < math.trunc(len(rankedSentences) * reductionRatio):\n", 288 | " s = rankedSentences[i][0].Sentence\n", 289 | " position = orderedSentences.index(s)\n", 290 | " reducedSentences.append((s, position))\n", 291 | " i = i + 1\n", 292 | " reducedSentences = sorted(reducedSentences, key=lambda x: x[1])\n", 293 | "\n", 294 | " reducedText = []\n", 295 | " for s,r in reducedSentences:\n", 296 | " reducedText.append(s.getFullSentence())\n", 297 | " return reducedText\t" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | 
"execution_count": 17, 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "name": "stdout", 307 | "output_type": "stream", 308 | "text": [ 309 | "lines [': Commercial exploitation over the past two hundred years drove the great Mysticete whales to near extinction. Variation in the sizes of populations prior to exploitation, minimal population size during exploitation and current population sizes permit analyses of the effects of differing levels of exploitation on species with different biogeographical distributions and life-history characteristics. Dr. Stephen Palumbi at the University of Hawaii will study the genetic population structure of three whale species in this context, the Humpback Whale, the Gray Whale and the Bowhead Whale. The effect of demographic history will be determined by comparing the genetic structure of the three species. Additional studies will be carried out on the Humpback Whale. The humpback has a world-wide distribution, but the Atlantic and Pacific populations of the northern hemisphere appear to be discrete populations, as is the population of the southern hemispheric oceans. Each of these oceanic populations may be further subdivided into smaller isolates, each with its own migratory pattern and somewhat distinct gene pool. This study will provide information on the level of genetic isolation among populations and the levels of gene flow and genealogical relationships among populations. 
This detailed genetic information will facilitate international policy decisions regarding the conservation and management of these magnificent mammals.']\n", 310 | "contentLines \n", 311 | "paragraphs [<__main__.Paragraph object at 0x105b0e0f0>]\n", 312 | "['Stephen Palumbi at the University of Hawaii will study the genetic population structure of three whale species in this context, the Humpback Whale, the Gray Whale and the Bowhead Whale.']\n" 313 | ] 314 | } 315 | ], 316 | "source": [ 317 | "import string\n", 318 | "reduction = Reduction()\n", 319 | "filename = 'Part1/awards_1990/awd_1990_00/a9000006.txt'\n", 320 | "f = open(filename)\n", 321 | "addTitle = False\n", 322 | "addTexts = False\n", 323 | "title = []\n", 324 | "text = []\n", 325 | "for word in f.read().split():\n", 326 | " if (word == \"Title\"):\n", 327 | " addTitle = True\n", 328 | " continue\n", 329 | "\n", 330 | " if (word == \"\\n\"):\n", 331 | " addTitle = False\n", 332 | "\n", 333 | " if (addTexts == True and word == \"\\n\"):\n", 334 | " addTexts = False\n", 335 | " break\n", 336 | "\n", 337 | "\n", 338 | " if (word == \"Abstract\"):\n", 339 | " addTexts = True\n", 340 | " continue\n", 341 | "\n", 342 | " if(addTitle == True):\n", 343 | " title.append(word)\n", 344 | "\n", 345 | " if(addTexts == True):\n", 346 | " text.append(word)\n", 347 | "\n", 348 | "\n", 349 | "title = ' '.join(title)\n", 350 | "text =' '.join(text)\n", 351 | "\n", 352 | "reduction_ratio = 0.1\n", 353 | "reduced_text = reduction.reduce(text, reduction_ratio)\n", 354 | "\n", 355 | "#output = open('output.txt')\n", 356 | "print(reduced_text)\n" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 12, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "name": "stdout", 366 | "output_type": "stream", 367 | "text": [ 368 | ": Commercial exploitation over the past two hundred years drove the great Mysticete whales to near extinction. 
Variation in the sizes of populations prior to exploitation, minimal population size during exploitation and current population sizes permit analyses of the effects of differing levels of exploitation on species with different biogeographical distributions and life-history characteristics. Dr. Stephen Palumbi at the University of Hawaii will study the genetic population structure of three whale species in this context, the Humpback Whale, the Gray Whale and the Bowhead Whale. The effect of demographic history will be determined by comparing the genetic structure of the three species. Additional studies will be carried out on the Humpback Whale. The humpback has a world-wide distribution, but the Atlantic and Pacific populations of the northern hemisphere appear to be discrete populations, as is the population of the southern hemispheric oceans. Each of these oceanic populations may be further subdivided into smaller isolates, each with its own migratory pattern and somewhat distinct gene pool. This study will provide information on the level of genetic isolation among populations and the levels of gene flow and genealogical relationships among populations. 
This detailed genetic information will facilitate international policy decisions regarding the conservation and management of these magnificent mammals.\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "print(text)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "collapsed": true 381 | }, 382 | "outputs": [], 383 | "source": [] 384 | } 385 | ], 386 | "metadata": { 387 | "kernelspec": { 388 | "display_name": "Python 3", 389 | "language": "python", 390 | "name": "python3" 391 | }, 392 | "language_info": { 393 | "codemirror_mode": { 394 | "name": "ipython", 395 | "version": 3 396 | }, 397 | "file_extension": ".py", 398 | "mimetype": "text/x-python", 399 | "name": "python", 400 | "nbconvert_exporter": "python", 401 | "pygments_lexer": "ipython3", 402 | "version": "3.6.3" 403 | } 404 | }, 405 | "nbformat": 4, 406 | "nbformat_minor": 2 407 | } 408 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-time-machine -------------------------------------------------------------------------------- /algo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/algo1.png -------------------------------------------------------------------------------- /cs141_final_poster_toSize.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/cs141_final_poster_toSize.pptx -------------------------------------------------------------------------------- /embedding2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | 
"execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "SyntaxError", 10 | "evalue": "invalid syntax (config.py, line 733)", 11 | "output_type": "error", 12 | "traceback": [ 13 | "Traceback \u001b[0;36m(most recent call last)\u001b[0m:\n", 14 | " File \u001b[1;32m\"/Users/michellezhao/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py\"\u001b[0m, line \u001b[1;32m2862\u001b[0m, in \u001b[1;35mrun_code\u001b[0m\n exec(code_obj, self.user_global_ns, self.user_ns)\n", 15 | "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m7\u001b[0;36m, in \u001b[0;35m\u001b[0;36m\u001b[0m\n\u001b[0;31m import config\u001b[0m\n", 16 | "\u001b[0;36m File \u001b[0;32m\"/Users/michellezhao/anaconda3/lib/python3.6/site-packages/config.py\"\u001b[0;36m, line \u001b[0;32m733\u001b[0m\n\u001b[0;31m except Exception, e:\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "\"\"\"Generate intial word embedding for headlines and description.\n", 22 | "\n", 23 | "The embedding is limited to a fixed vocabulary size (`vocab_size`) but\n", 24 | "a vocabulary of all the words that appeared in the data is built.\n", 25 | "\"\"\"\n", 26 | "from os import path\n", 27 | "import config\n", 28 | "import _pickle as pickle\n", 29 | "from collections import Counter\n", 30 | "import numpy as np\n", 31 | "\n", 32 | "from prep_data import plt\n", 33 | "\n", 34 | "# static vars\n", 35 | "FN = 'vocabulary-embedding'\n", 36 | "seed = 42\n", 37 | "vocab_size = 40000\n", 38 | "embedding_dim = 100\n", 39 | "lower = False\n", 40 | "\n", 41 | "# index words\n", 42 | "empty = 0 # RNN mask of no data\n", 43 | "eos = 1 # end of sentence\n", 44 | "start_idx = eos + 1 # first real word\n", 45 | "\n", 46 | "# set random seed\n", 47 | "np.random.seed(seed)\n", 48 | "\n", 49 | "\n", 50 | "def build_vocab(lst):\n", 51 | " \"\"\"Return vocabulary for iterable `lst`.\"\"\"\n", 
52 | " vocab_count = Counter(w for txt in lst for w in txt.split())\n", 53 | " vocab = list(map(lambda x: x[0], sorted(vocab_count.items(), key=lambda x: -x[1])))\n", 54 | " return vocab, vocab_count\n", 55 | "\n", 56 | "\n", 57 | "def load_text():\n", 58 | " \"\"\"Return vocabulary for pickled headlines and descriptions.\"\"\"\n", 59 | " # read tokenized headlines and descriptions\n", 60 | " with open(path.join(config.path_data, 'tokens.pkl'), 'rb') as fp:\n", 61 | " headlines, desc = pickle.load(fp)\n", 62 | "\n", 63 | " # map headlines and descriptions to lower case\n", 64 | " if lower:\n", 65 | " headlines = [h.lower() for h in headlines]\n", 66 | " desc = [h.lower() for h in desc]\n", 67 | "\n", 68 | " return headlines, desc\n", 69 | "\n", 70 | "\n", 71 | "def print_most_popular_tokens(vocab):\n", 72 | " \"\"\"Print th most popular tokens in vocabulary dictionary `vocab`.\"\"\"\n", 73 | " print('Most popular tokens:')\n", 74 | " print(vocab[:50])\n", 75 | " print('Total vocab size: {:,}'.format(len(vocab)))\n", 76 | "\n", 77 | "\n", 78 | "def plot_word_distributions(vocab, vocab_count):\n", 79 | " \"\"\"Plot word distribution in headlines and discription.\"\"\"\n", 80 | " plt.plot([vocab_count[w] for w in vocab])\n", 81 | " plt.gca().set_xscale(\"log\", nonposx='clip')\n", 82 | " plt.gca().set_yscale(\"log\", nonposy='clip')\n", 83 | " title = 'word distribution in headlines and discription'\n", 84 | " plt.title(title)\n", 85 | " plt.xlabel('rank')\n", 86 | " plt.ylabel('total appearances')\n", 87 | " plt.savefig(path.join(config.path_outputs, '{}.png'.format(title)))\n", 88 | "\n", 89 | "\n", 90 | "def get_idx(vocab):\n", 91 | " \"\"\"Add empty and end-of-sentence tokens to vocabulary and return tuple (vocabulary, reverse-vocabulary).\"\"\"\n", 92 | " word2idx = dict((word, idx + start_idx) for idx, word in enumerate(vocab))\n", 93 | " word2idx[''] = empty\n", 94 | " word2idx[''] = eos\n", 95 | " idx2word = dict((idx, word) for word, idx in 
word2idx.items())\n", 96 | " return word2idx, idx2word\n", 97 | "\n", 98 | "\n", 99 | "def get_glove():\n", 100 | " \"\"\"Load GloVe embedding weights and indices.\"\"\"\n", 101 | " glove_name = path.join(config.path_data, 'glove.6B.{}d.txt'.format(embedding_dim))\n", 102 | " glove_n_symbols = sum(1 for line in open(glove_name))\n", 103 | " print('{:,} GloVe symbols'.format(glove_n_symbols))\n", 104 | "\n", 105 | " # load embedding weights and index dictionary\n", 106 | " glove_index_dict = {}\n", 107 | " glove_embedding_weights = np.empty((glove_n_symbols, embedding_dim))\n", 108 | " globale_scale = .1\n", 109 | " with open(glove_name, 'r') as fp:\n", 110 | " i = 0\n", 111 | " for l in fp:\n", 112 | " l = l.strip().split()\n", 113 | " w = l[0]\n", 114 | " glove_index_dict[w] = i\n", 115 | " glove_embedding_weights[i, :] = list(map(float, l[1:]))\n", 116 | " i += 1\n", 117 | " glove_embedding_weights *= globale_scale\n", 118 | " print('GloVe std dev: {:.4f}'.format(glove_embedding_weights.std()))\n", 119 | "\n", 120 | " # add lower case version of the keys to the dict\n", 121 | " for w, i in glove_index_dict.items():\n", 122 | " w = w.lower()\n", 123 | " if w not in glove_index_dict:\n", 124 | " glove_index_dict[w] = i\n", 125 | "\n", 126 | " return glove_embedding_weights, glove_index_dict\n", 127 | "\n", 128 | "\n", 129 | "def initialize_embedding(vocab_size, embedding_dim, glove_embedding_weights):\n", 130 | " \"\"\"Use GloVe to initialize random embedding matrix with same scale as glove.\"\"\"\n", 131 | " shape = (vocab_size, embedding_dim)\n", 132 | " scale = glove_embedding_weights.std() * np.sqrt(12) / 2 # uniform and not normal\n", 133 | " embedding = np.random.uniform(low=-scale, high=scale, size=shape)\n", 134 | " print('random-embedding/glove scale: {:.4f} std: {:.4f}'.format(scale, embedding.std()))\n", 135 | " return embedding\n", 136 | "\n", 137 | "\n", 138 | "def copy_glove_weights(embedding, idx2word, glove_embedding_weights, glove_index_dict):\n", 
139 | " \"\"\"Copy from glove weights of words that appear in our short vocabulary (idx2word).\"\"\"\n", 140 | " c = 0\n", 141 | " for i in range(vocab_size):\n", 142 | " w = idx2word[i]\n", 143 | " g = glove_index_dict.get(w, glove_index_dict.get(w.lower()))\n", 144 | " if g is None and w.startswith('#'): # glove has no hastags (I think...)\n", 145 | " w = w[1:]\n", 146 | " g = glove_index_dict.get(w, glove_index_dict.get(w.lower()))\n", 147 | " if g is not None:\n", 148 | " embedding[i, :] = glove_embedding_weights[g, :]\n", 149 | " c += 1\n", 150 | " print('number of tokens, in small vocab: {:,} found in glove and copied to embedding: {:.4f}'.format(c, c / float(vocab_size)))\n", 151 | " return embedding\n", 152 | "\n", 153 | "\n", 154 | "def build_word_to_glove(embedding, word2idx, idx2word, glove_index_dict, glove_embedding_weights):\n", 155 | " \"\"\"Map full vocabulary to glove based on cosine distance.\"\"\"\n", 156 | " glove_thr = 0.5\n", 157 | " word2glove = {}\n", 158 | " for w in word2idx:\n", 159 | " if w in glove_index_dict:\n", 160 | " g = w\n", 161 | " elif w.lower() in glove_index_dict:\n", 162 | " g = w.lower()\n", 163 | " elif w.startswith('#') and w[1:] in glove_index_dict:\n", 164 | " g = w[1:]\n", 165 | " elif w.startswith('#') and w[1:].lower() in glove_index_dict:\n", 166 | " g = w[1:].lower()\n", 167 | " else:\n", 168 | " continue\n", 169 | " word2glove[w] = g\n", 170 | "\n", 171 | " # for every word outside the embedding matrix find the closest word inside the mebedding matrix.\n", 172 | " # Use cos distance of GloVe vectors.\n", 173 | " # Allow for the last `nb_unknown_words` words inside the embedding matrix to be considered to be outside.\n", 174 | " # Dont accept distances below `glove_thr`\n", 175 | " normed_embedding = embedding / np.array(\n", 176 | " [np.sqrt(np.dot(gweight, gweight)) for gweight in embedding])[:, None]\n", 177 | "\n", 178 | " nb_unknown_words = 100\n", 179 | "\n", 180 | " glove_match = []\n", 181 | " for w, idx in 
word2idx.items():\n", 182 | " if idx >= vocab_size - nb_unknown_words and w.isalpha() and w in word2glove:\n", 183 | " gidx = glove_index_dict[word2glove[w]]\n", 184 | " gweight = glove_embedding_weights[gidx, :].copy()\n", 185 | "\n", 186 | " # find row in embedding that has the highest cos score with gweight\n", 187 | " gweight /= np.sqrt(np.dot(gweight, gweight))\n", 188 | " score = np.dot(normed_embedding[:vocab_size - nb_unknown_words], gweight)\n", 189 | " while True:\n", 190 | " embedding_idx = score.argmax()\n", 191 | " s = score[embedding_idx]\n", 192 | " if s < glove_thr:\n", 193 | " break\n", 194 | " if idx2word[embedding_idx] in word2glove:\n", 195 | " glove_match.append((w, embedding_idx, s))\n", 196 | " break\n", 197 | " score[embedding_idx] = -1\n", 198 | "\n", 199 | " glove_match.sort(key=lambda x: -x[2])\n", 200 | " print()\n", 201 | " print('# of GloVe substitutes found: {:,}'.format(len(glove_match)))\n", 202 | "\n", 203 | " # manually check that the worst substitutions we are going to do are good enough\n", 204 | " for orig, sub, score in glove_match[-10:]:\n", 205 | " print('{:.4f}'.format(score), orig, '=>', idx2word[sub])\n", 206 | "\n", 207 | " # return a lookup table of index of outside words to index of inside words\n", 208 | " return dict((word2idx[w], embedding_idx) for w, embedding_idx, _ in glove_match)\n", 209 | "\n", 210 | "\n", 211 | "def to_dense_vector(word2idx, corpus, description, bins=50):\n", 212 | " \"\"\"Create a dense vector representation of headlines.\"\"\"\n", 213 | " data = [[word2idx[token] for token in txt.split()] for txt in corpus]\n", 214 | " plt.hist(list(map(len, data)), bins=bins)\n", 215 | " plt.savefig(path.join(config.path_outputs, '{}_distribution.png'.format(description)))\n", 216 | " return data\n", 217 | "\n", 218 | "\n", 219 | "def summarize_vocab(vocab, vocab_count):\n", 220 | " \"\"\"Print the most popular tokens and plot token distributions.\"\"\"\n", 221 | " print_most_popular_tokens(vocab)\n", 222 | 
" plot_word_distributions(vocab, vocab_count)\n" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 2, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "ename": "NameError", 232 | "evalue": "name 'load_text' is not defined", 233 | "output_type": "error", 234 | "traceback": [ 235 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 236 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 237 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m__name__\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'__main__'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 30\u001b[0;31m \u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 238 | "\u001b[0;32m\u001b[0m in \u001b[0;36mmain\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\"\"\"Generate intial word embedding for headlines and description.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mheadlines\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdesc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# load headlines and descriptions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mvocab\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocab_count\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbuild_vocab\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mheadlines\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mdesc\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# build vocabulary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m 
\u001b[0msummarize_vocab\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvocab\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocab_count\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# summarize vocabulary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 239 | "\u001b[0;31mNameError\u001b[0m: name 'load_text' is not defined" 240 | ] 241 | } 242 | ], 243 | "source": [ 244 | "\n", 245 | "def main():\n", 246 | " \"\"\"Generate intial word embedding for headlines and description.\"\"\"\n", 247 | " headlines, desc = load_text() # load headlines and descriptions\n", 248 | " vocab, vocab_count = build_vocab(headlines + desc) # build vocabulary\n", 249 | " summarize_vocab(vocab, vocab_count) # summarize vocabulary\n", 250 | " word2idx, idx2word = get_idx(vocab) # add special tokens and get reverse vocab lookup\n", 251 | " glove_embedding_weights, glove_index_dict = get_glove() # load GloVe data\n", 252 | "\n", 253 | " # initialize embedding\n", 254 | " embedding = initialize_embedding(vocab_size, embedding_dim, glove_embedding_weights)\n", 255 | " embedding = copy_glove_weights(embedding, idx2word, glove_embedding_weights, glove_index_dict)\n", 256 | "\n", 257 | " # map vocab to GloVe using cosine similarity\n", 258 | " glove_idx2idx = build_word_to_glove(embedding, word2idx, idx2word, glove_index_dict, glove_embedding_weights)\n", 259 | "\n", 260 | " # create a dense vector representation of headlines and descriptions\n", 261 | " description_vector = to_dense_vector(word2idx, desc, 'description')\n", 262 | " headline_vector = to_dense_vector(word2idx, headlines, 'headline')\n", 263 | "\n", 264 | " # write vocabulary to disk\n", 265 | " with open(path.join(config.path_data, '{}.pkl'.format(FN)), 'wb') as fp:\n", 266 | " pickle.dump((embedding, idx2word, word2idx, glove_idx2idx), fp, 2)\n", 267 | "\n", 268 | " # write data to disk\n", 269 | " with open(path.join(config.path_data, '{}.data.pkl'.format(FN)), 'wb') as fp:\n", 270 | " pickle.dump((description_vector, headline_vector), fp, 2)\n", 271 | 
"\n", 272 | "if __name__ == '__main__':\n", 273 | " main()\n" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "collapsed": true 281 | }, 282 | "outputs": [], 283 | "source": [] 284 | } 285 | ], 286 | "metadata": { 287 | "kernelspec": { 288 | "display_name": "Python 3", 289 | "language": "python", 290 | "name": "python3" 291 | }, 292 | "language_info": { 293 | "codemirror_mode": { 294 | "name": "ipython", 295 | "version": 3 296 | }, 297 | "file_extension": ".py", 298 | "mimetype": "text/x-python", 299 | "name": "python", 300 | "nbconvert_exporter": "python", 301 | "pygments_lexer": "ipython3", 302 | "version": "3.6.3" 303 | } 304 | }, 305 | "nbformat": 4, 306 | "nbformat_minor": 2 307 | } 308 | -------------------------------------------------------------------------------- /embeddingNotebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 54, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pickle\n", 13 | "import string " 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 55, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "def transform_input_text(self, texts):\n", 25 | " temp = []\n", 26 | " for line in texts:\n", 27 | " x = []\n", 28 | " for word in line.lower().split(' '):\n", 29 | " wid = 1\n", 30 | " if word in self.input_word2idx:\n", 31 | " wid = self.input_word2idx[word]\n", 32 | " x.append(wid)\n", 33 | " if len(x) >= self.max_input_seq_length:\n", 34 | " break\n", 35 | " temp.append(x)\n", 36 | " temp = pad_sequences(temp, maxlen=self.max_input_seq_length)\n", 37 | "\n", 38 | " print(temp.shape)\n", 39 | " return temp" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 56, 45 | "metadata": { 46 | "collapsed": true 47 | }, 
48 | "outputs": [], 49 | "source": [ 50 | "import numpy as np\n", 51 | "import os\n", 52 | "\n", 53 | "def split():\n", 54 | " titles = []\n", 55 | " texts = []\n", 56 | " root = 'Part1'\n", 57 | " stop = 0\n", 58 | " for p1 in os.listdir(root)[1:]:\n", 59 | " if(stop == 1):\n", 60 | " break\n", 61 | " dir1 = root + '/' + str(p1)\n", 62 | " print(dir1)\n", 63 | " for p2 in os.listdir(dir1)[1:]:\n", 64 | " if(stop == 1):\n", 65 | " break\n", 66 | " dir2 = dir1 + '/' + str(p2)\n", 67 | " print(dir2)\n", 68 | " for filename in os.listdir(dir2)[1:]:\n", 69 | " if (filename == 'a9302502.txt'):\n", 70 | " stop = 1\n", 71 | " break\n", 72 | " print(dir2 + '/' + filename)\n", 73 | " #print(iter)\n", 74 | " #iter += 1\n", 75 | " #print(dirs[1:])\n", 76 | " #filename = 'Part1/awards_1990/awd_1990_00/a9000006.txt'\n", 77 | " f = open(dir2 + '/' + filename)\n", 78 | " addTitle = False\n", 79 | " addTexts = False\n", 80 | " title = []\n", 81 | " text = []\n", 82 | " for word in f.read().split():\n", 83 | " if (word == \"Title\"):\n", 84 | " addTitle = True\n", 85 | " continue\n", 86 | "\n", 87 | " if (word == \"\\n\"):\n", 88 | " addTitle = False\n", 89 | " if (word == \"Type\"):\n", 90 | " addTitle = False\n", 91 | " \n", 92 | "# if (addTexts == True and word == \"\\n\"):\n", 93 | "# addTexts = False\n", 94 | "# break\n", 95 | " \n", 96 | "\n", 97 | " if (word == \"Abstract\"):\n", 98 | " addTexts = True\n", 99 | " continue\n", 100 | "\n", 101 | " if(addTitle == True):\n", 102 | " title.append(word)\n", 103 | "\n", 104 | " if(addTexts == True):\n", 105 | " text.append(word)\n", 106 | "\n", 107 | " for i in range(len(title)):\n", 108 | " s = title[i]\n", 109 | " table = str.maketrans({key: None for key in string.punctuation})\n", 110 | " new_s = s.translate(table)\n", 111 | " title[i] = new_s\n", 112 | " for i in range(len(text)):\n", 113 | " s = text[i]\n", 114 | " table = str.maketrans({key: None for key in string.punctuation})\n", 115 | " new_s = s.translate(table)\n", 116 | 
" text[i] = new_s\n", 117 | "\n", 118 | " title = ' '.join(title)\n", 119 | " text =' '.join(text)\n", 120 | " titles.append(title)\n", 121 | " texts.append(text)\n", 122 | "\n", 123 | "# f=open(\"titles.txt\", 'w')\n", 124 | "# for i in titles:\n", 125 | "# f.write(i)\n", 126 | "# f.write(' ')\n", 127 | "\n", 128 | "# t=open(\"texts.txt\", 'w')\n", 129 | "# for i in texts:\n", 130 | "# t.write(i)\n", 131 | "# t.write(' ')\n", 132 | "\n", 133 | "# f.close()\n", 134 | "# t.close()\n", 135 | " return titles, texts\n" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 57, 141 | "metadata": { 142 | "collapsed": true 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "import numpy as np\n", 147 | "import os\n", 148 | "\n", 149 | "def split():\n", 150 | " titles = []\n", 151 | " texts = []\n", 152 | " root = 'Part1'\n", 153 | " \n", 154 | " #dirr = 'Part1/awards_1990/awd_1990_00/'\n", 155 | " dirs = os.listdir('Part1/awards_1990/awd_1990_00/')\n", 156 | "\n", 157 | " for filename in dirs[1:]:\n", 158 | " #iter = 0\n", 159 | " #print(dirs[1])\n", 160 | " \n", 161 | " #print(iter)\n", 162 | " #iter += 1\n", 163 | " #print(dirs[1:])\n", 164 | " #filename = 'Part1/awards_1990/awd_1990_00/a9000006.txt'\n", 165 | " f = open('Part1/awards_1990/awd_1990_00/' + str(filename))\n", 166 | " addTitle = False\n", 167 | " addTexts = False\n", 168 | " title = []\n", 169 | " text = []\n", 170 | " for word in f.read().split():\n", 171 | " if (word == \"Title\"):\n", 172 | " addTitle = True\n", 173 | " continue\n", 174 | "\n", 175 | " if (word == \"Type\"):\n", 176 | " addTitle = False\n", 177 | "\n", 178 | "# if (addTexts == True and word == \"\\n\"):\n", 179 | "# addTexts = False\n", 180 | "# break\n", 181 | "\n", 182 | "\n", 183 | " if (word == \"Abstract\"):\n", 184 | " addTexts = True\n", 185 | " continue\n", 186 | "\n", 187 | " if(addTitle == True):\n", 188 | " title.append(word)\n", 189 | "\n", 190 | " if(addTexts == True):\n", 191 | " 
text.append(word)\n", 192 | "\n", 193 | " for i in range(len(title)):\n", 194 | " s = title[i]\n", 195 | " table = str.maketrans({key: None for key in string.punctuation})\n", 196 | " new_s = s.translate(table)\n", 197 | " title[i] = new_s\n", 198 | " for i in range(len(text)):\n", 199 | " s = text[i]\n", 200 | " table = str.maketrans({key: None for key in string.punctuation})\n", 201 | " new_s = s.translate(table)\n", 202 | " text[i] = new_s\n", 203 | "\n", 204 | " title = ' '.join(title)\n", 205 | " text =' '.join(text)\n", 206 | " titles.append(title)\n", 207 | " texts.append(text)\n", 208 | "\n", 209 | "# f=open(\"titles.txt\", 'w')\n", 210 | "# for i in titles:\n", 211 | "# f.write(i)\n", 212 | "# f.write(' ')\n", 213 | "\n", 214 | "# t=open(\"texts.txt\", 'w')\n", 215 | "# for i in texts:\n", 216 | "# t.write(i)\n", 217 | "# t.write(' ')\n", 218 | "\n", 219 | "# f.close()\n", 220 | "# t.close()\n", 221 | " return titles, texts\n" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 58, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "if __name__ == '__main__':\n", 231 | " titles, texts = split()\n" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 59, 237 | "metadata": { 238 | "scrolled": true 239 | }, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "379\n" 246 | ] 247 | } 248 | ], 249 | "source": [ 250 | "print(len(titles))" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 60, 256 | "metadata": { 257 | "collapsed": true 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "from collections import Counter\n", 262 | "from itertools import chain\n", 263 | "def get_vocab(lst):\n", 264 | " vocabcount = Counter(w for txt in lst for w in txt.split())\n", 265 | " vocab = map(lambda x: x[0], sorted(vocabcount.items(), key=lambda x: -x[1]))\n", 266 | " return list(vocab), vocabcount" 267 | ] 268 | }, 269 | { 270 | 
"cell_type": "code", 271 | "execution_count": 61, 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "data": { 276 | "text/plain": [ 277 | "' RFLP Patterns as a Measure of Diversity in Small Populations'" 278 | ] 279 | }, 280 | "execution_count": 61, 281 | "metadata": {}, 282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "titles[1]" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 62, 292 | "metadata": { 293 | "collapsed": true 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "vocab, vocabcount = get_vocab(titles+texts)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 63, 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "name": "stdout", 307 | "output_type": "stream", 308 | "text": [ 309 | "['the', 'of', 'and', 'to', 'in']\n", 310 | "9073\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "print(vocab[:5])\n", 316 | "print(len(vocab))" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 64, 322 | "metadata": { 323 | "collapsed": true, 324 | "scrolled": true 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "import string \n", 329 | "for i in range(len(vocab)):\n", 330 | " s = vocab[i]\n", 331 | " table = str.maketrans({key: None for key in string.punctuation})\n", 332 | " new_s = s.translate(table)\n", 333 | " vocab[i] = new_s" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 65, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "data": { 343 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYgAAAEaCAYAAAAL7cBuAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3Xl8VOXZ//HPNUlIWMO+yI6AiEoB\nI6jFHSlWcXusa+uutda6VK1a7WO19bH2sa31p3Wta92trRuK+CiLqJVFAQXZQQLIvu9Jrt8f50SH\nOElmkpnMTPJ9v155Zeas19xzZq45933OfZu7IyIiUlEk3QGIiEhmUoIQEZGYlCBERCQmJQgREYlJ\nCUJERGJSghARkZiUILKAmf3WzP6RwPJuZr3Dxw+a2W+SFEc3M9tiZjnh83FmdnEyth1u7y0zOy9Z\n24vabo3LINmvsZp9Rb9vT5jZ78PHh5nZnLqIIRXM7EgzK05g+W/K3MzOMbN3khzPHsdxDdZPekyZ\nKjfdAUhquftl8SxnZouBi9393Sq29RXQLBlxmdlvgd7u/uOo7R+XjG1XFG8ZZCp3nwjsk+440sHd\nnwGeSfI24z6OzawHsAjIc/eSVMWUqXQGkUEskJHviZnpx4RkPR3HicnIL6NsYGYXmNnrUc/nm9mL\nUc+XmtnA8PGhZjbZzDaG/w+NWm6cmd1hZpOAbUAvM+tpZuPNbLOZjQXaVhPL9Wa2wsyWm9mFFeZF\nV1W0NbM3zGyDma0zs4lmFjGzp4FuwOvhqfevzKxHWOVxkZl9BbwXNS36Q7a3mX0SvrZXzax1uK/v\nVCuY2WIzG25mI4FfA2eE+5seVRblVQsRM7vFzJaY2Soze8rMCsN55XGcZ2ZfmdkaM7u5ivKJLoMj\nzazYzK4Nt7vCzC6oqnyB7mY2KXw/3jGzb94PMzvYzD4My3S6mR0ZNe8CM5sdrrfQzH4a7/tWYbk9\nyjIsx+vMbEZY7i+YWUHU/BPM7LMwpg/NbEDUvBvMbFkY0xwzO6aSfR5vZp+a2abwWP5t1Lwqy9/M\nGodlvt7MZgEHVVW4ZnasmX0Zvpb7AIuad76ZfRA+NjP7S/i+bQxf//5R+/xTeLxsNLMPwmnVHsfh\ncXdnrOMYmBD+3xAeq4dExxSuX93n+3eVHT8Zz931V4M/oBewgSDJdgKWAMui5q0P57UOH/+EoErv\nrPB5m3DZccBXwH7h/DzgI+DPQD5wOLAZ+EclcYwEVgL7A02BZwEnqL4BeAL4ffj4TuDBcB95wGGA\nhfMWA8Ojttsj3M5T4XYbR03LjYp9WdS+/1keJ3AkUFwh1m/2Afy24msKt3dx+PhCYH5Yls2AV4Cn\nK8T2SBjX94CdwL6VlFF0GRwJlAC3h2XwQ4LE3KqSdccBC4C+4b7GAX8I53UG1obbiADHhs/bhfOP\nB/Ym+MI7ItzP4Bq8b3uUZViOnwB7ERxfs4HLwnmDgVXAUCAHOC9cPp+gmmopsFdUOe5dyes+Ejgg\nfF0DwlhPjqf8gT8AE8PYugKfU+FYiNpPW2ATcFr4flwTvj/lx8H5wAfh4x8AU4GWYZnuC3QK590f\nvjedw9d9aPiay2Ot6XG8x7IxYorn8x3z+MmGP51B1JC7LyT44h5I8OEfAywzs37h84nuXkbwJTHP\n3Z929xJ3fw74EhgVtbkn3P0LD+o4OxH84vqNu+909wnA61TudOBxd//c3bcSfPFWZne4/e7uvtvd\nJ3p4FFfht+6+1d23VzL/6ah9/wY43WrY+FfBOcCf3X2hu28BbgLOtD3PXm5z9+3uPh2YTvBFFY/d\nwO1hGYwGtlB1Hf/j7j43LIMXCd5zgB8Do919tLuXuftYYApBwsDd33T3BR4YD7xDkJQhsfctlnvd\nfbm7ryM4PspjugR4yN3/4+6l7v4kwZf3wUApwZdmfzPLc/fF7
r4g1sbdfZy7zwxf1wzgOYLjOlpl\n5X86cIe7r3P3pcC9VbyOHwKz3P1ld98N3AN8Xcmyu4HmQD+CHzaz3X2FBdWyFwJXufuy8HV/6O47\no9ZN1XEcz+e7suMn4ylB1M54gl9ah4ePxxF8iI4In0PwK29JhfWWEPzSKbc06vFewPrwQI1evjJ7\nVVi/qmX/l+BX+TthlceNVSwbK7bq5i8h+BWYjFPoiuW2hOAXWoeoadFfJNuIvwF9bZiM4123sv10\nB34UVuVsMLMNwDCCJIyZHWdmH1tQnbeB4MuwvGwSed8SjenaCjF1JThrmA9cTZCMVpnZ82a2V6yN\nm9lQM3vfzFab2UbgMr77vlYWQyKvbY9lwx8sMY85d38PuI/gbGGlmT1sZi3CuAoIfqlXJlXHcTyf\n75oep2mnBFE75QnisPDxeL6bIJYTfGijdSM4pS0X/St+BdDKzJpWWL4yKwi+AKpd1t03u/u17t6L\n4BfOL6PqoCs7k6juDKPivncDa4CtQJPyGeGvsXYJbLdiuXUjqHpYWc16dWkpwS/PllF/Td39D2aW\nT1BVcTfQwd1bAqP5tn497vetBjHdUSGmJuEvW9z9WXcfRlC2DtxVyXaeBV4Durp7IUHVpFWybEWJ\nvLY9ljUzq7DuHtz9Xnc/kKBKti9wPcHxtoOgOq/SVauJubLjONHjtHz9ZTGWzTpKELUzHjgKaOzu\nxQT1riOBNsCn4TKjgb5mdraZ5ZrZGUB/4I1YG3T3JQTVFLeZWSMzG8aep6sVvQicb2b9zawJcGtl\nC4aNl73DD+EmgiqH0nD2SoL6/kT9OGrftwMvu3spMBcoCBs784BbCKo3yq0EeljlV209B1xjQYN9\nM+B/gBcq/PJPt38Ao8zsB2aWY2YFFjQodwEaEbze1UCJmR0HjIhaN+73LUGPAJeFZwBmZk3D96C5\nme1jZkeHyWsHsJ1v3/+KmgPr3H2HmQ0Bzk4ghheBm8ysVVgWv6hi2TeB/czs1LD68EqgY6wFzeyg\n8HXlEfwA2QGUhlW5jwF/NrO9wvfikPB1xquy43g1UEbln42EPt/ZRgmiFtx9LkH99cTw+SZgITAp\nPLhw97XACcC1BA2YvwJOcPc1VWz6bIJGxnUEXxxPVRHDWwT1tu8RVB+9V8V2+wDvhjF/BPzN3ceF\n8+4EbgmrJa6rYhsVPU3QoPo1wWn+lWFcG4HLgUcJfk1tBaKvanop/L/WzKbF2O5j4bYnEFyHvoOq\nv2jqXFi/fhLBFVmrCX69Xw9E3H0zQVm8SNBoeTbBL/LydRN53xKJaQpBO8R94X7nEzSqQpCw/kDw\ny/hroH0YeyyXA7eb2Wbgv8PXEa/bCKpZFhG0uzxdRbxrgB+Fca0lOEYnVbJ4C4IEuD7c/lqCMzSA\n64CZwGSCz81dJPb9VtlxvA24A5gUfjYOrhB/TT7fWaP8ChYRkQbJzMYRXLX0aLpjyTQ6gxARkZiU\nIEREJCZVMYmISEw6gxARkZiUIEREJKas7NnQzEYBo5o3b35J37590x2OiEhWmTp16hp3b1fdclnd\nBlFUVORTpkxJdxgiIlnFzKa6e1F1y6mKSUREYlKCEBGRmJQgREQkpqxMEGY2yswe3rhxY7pDERGp\nt7IyQbj76+5+aWFhYbpDERGpt7IyQYiISOopQYiISExZeaNcuUVrtnLuY5/UaN0WBbkctU97ju7X\nnlZNGyU5MhGR7JfVCaK0zNm0fXeN1p3z9SbemLGCnIhxUI9WHNu/IyP6d6Br6ybVrywi0gA02Dup\ny8qcmcs2MnbWSsbOWsmclZsB6NexOSP2C5LFfnu1IBidU0Sk/oj3TuqsTBDlfTH17t37knnz5iVl\nm0vWbmXsrJW888VKpixZR5nDXoUFHNu/AyP268iQnq3Jy1GTjYhkv3qdIMqlqi+mtVt28n9frmLs\nrJVMnLeaHbvLgjaLfu0Z0
b8jR+zTjmb5WV07JyINmBJEkmzfVcrEeat5Z9ZK/m/2StZv202jnAiH\n9m7Dsf07cOy+HWjfoiClMYiIJJMSRAqUlJYxdcn6oCpq1kq+WrcNgIFdW3Js/w78YL8O7N2umdot\nRCSjKUGkmLszd+UWxs76mndmrWRGcdDtR8+2TTlu/45cflRvVUOJSEZSgqhjKzZu593wzGLS/DX0\naNOU+88ZzL6dWqQ7NBGRPWg8iDrWqbAxPzmkB09fNJRnLj6YzTtLOPn+STz/yVdkcxIWkYZLCSIF\nDtm7DaOvPIyiHq248ZWZXPvidLbtKkl3WCIiCVGCSJF2zfN56sKhXDO8L//6bBkn3jeJueHNeCIi\n2UAJIoVyIsZVw/vwzEVD2bBtNyfe9wEvTVma7rBEROKSlQki2wYMOrR3W0ZfNYxBXVtx/cszuO6l\n6WzfVZrusEREqpSVCSIbBwxq37yAf1w8lCuP6cM/pxVz0v0fMH+VqpxEJHNlZYLIVjkR45fH9uWp\nC4ewdssuRv2/SbwyrTjdYYmIxKQEkQaH9WnH6KsO44Auhfzyxenc8PIMduxWlZOIZBbd6psmHVoU\n8OzFQ/nLu3O5//0FTC/ewM+P6k3jvBzyciPkRSz4nxMhL8fC/xGaF+TStll+usMXkQZAd1JngHFz\nVnHNC5+xflv1gx+Zwe9P3p9zhnavg8hEpD6K905qnUFkgCP3ac+EXx1F8frt7C4tY3eph//LvvP8\nn1OXceurX9C7XTOG9mqT7tBFpB5TgsgQzQvy2LdTXrXLHd2vA6fcP4nLn5nGa78YRueWjesgOhFp\niNRInWUKG+fx8LlF7Cop49Knpuh+ChFJGSWILNS7fTPuOXMgs1Zs4oZ/zlBngCKSEkoQWeqYfTtw\n3Yh9eG36ch6asDDd4YhIPaQEkcUuP3Jvjh/Qibve/pJxc1alOxwRqWeUILKYmfG/pw2gX8cW/OK5\nT1m4eku6QxKReiRjEoSZ7WtmD5rZy2b2s3THky2aNMrl4Z8cSF5OhEuemsLmHdXfSyEiEo+UJggz\ne8zMVpnZ5xWmjzSzOWY238xuBHD32e5+GXA6UO0NHPKtrq2bcP/Zg1m8dhvH/XUi1744nac/WsyM\n4g3sKilLd3gikqVSfR/EE8B9wFPlE8wsB7gfOBYoBiab2WvuPsvMTgRuDNeRBByydxseOGcwL05Z\nyvi5q/hn2Algo5wI/fdqwSmDOnPeoT3SG6SIZJWUJgh3n2BmPSpMHgLMd/eFAGb2PHASMMvdXwNe\nM7M3gWdTGVt9NGK/jozYryPuzvKNO/jsqw1ML97AxwvXcutrX9A4L4fTD+qa7jBFJEuk407qzkD0\nsGrFwFAzOxI4FcgHRle2spldClwK0K1bt9RFmcXMjM4tG9O5ZWOOH9CJktIyLnhiMjf/eyY92zXl\noB6t0x2iiGSBdDRSW4xp7u7j3P1Kd/+pu99f2cru/rC7F7l7Ubt27VIYZv2RmxPhvrMG07VVEy57\neirF67elOyQRyQLpSBDFQHQ9RxdgeSIbyLYhRzNBYZM8HjmviF2lZVz85BS27ixJd0gikuHSkSAm\nA33MrKeZNQLOBF5LZAPZOORoJti7XTPuO3swc1du5poXPqOsTF10iEjlUn2Z63PAR8A+ZlZsZhe5\newlwBTAGmA286O5fpDIO+dYRfdtx8/H9eWfWSv40dk66wxGRDJbqq5jOqmT6aKpoiK6OmY0CRvXu\n3bumm2jQLvx+D+Z+vZn7319A47wcrji6T7pDEpEMlDF3UidCVUy1Y2bcccr+nDKoM3e/M5c/vTNH\nPcKKyHdowKAGKjcnwt0/+h6NciL8v/fms6ukjBuP64dZrIvMRKQhUoJowHIixp2nHkCj3AgPTVjI\nzpIyrh3Rl8Z5OeTmZOXJpYgkUVYmCLVBJE8kYtx+0n40yo3w9w8W8cSHiwHIjRhN83MZuV9
HLjm8\nF73bN0tvoCJS5yyb656Liop8ypQp6Q6jXnB3xnyxkqXrtrFjdynbd5eyctNO3pixnJ0lZRzbvwOX\nH7k3g7q1SneoIlJLZjbV3avtFFUJQqq0dstOnvxwMU9+tISN23dz2oFduPG4frRtlp/u0ESkhup1\ngoiqYrpk3rx56Q6nQdi6s4T73p/PoxMXUpCXw69G9uPHQ7upUVskC8WbILKyJVKXuda9pvm53DCy\nH29ddTgDuhTym39/zuOTFqc7LBFJoaxMEJI+vds34+kLhzKifwd+/+Ys3v9SY2GL1FdKEJKwSMS4\n58yB7NspGAv7y683pTskEUkBJQipkSaNcnn0vCKaNMrh4iensEljYYvUO1mZINTdd2boVNiYB39y\nIMs3bOeON2anOxwRSbKsTBBqpM4cg7u14qdH7M0LU5aqPUKknsnKBCGZ5erhfejboRk3vjKDjdtU\n1SRSXyhBSK3l5+bwpx8NZM2WXdwxela6wxGRJFGCkKQ4oEshFw/ryUtTi5lZrLYhkfogKxOEGqkz\n0xVH96ZN00bc/sYXGl9CpB7IygShRurM1Lwgj2tH7MPkxet5c+aKdIcjIrWUld19S+Y6vagrT320\nhN+9MYtZyzfRpVUTRuzXQZ37iWShrOysr5x6c81Mn361nmtfnM5X67ZRUuY0y8/lZ0fuzQ8P6ETj\nvBxaNc0jPzcn3WGKNFhJ683VzH4EvO3um83sFmAw8Ht3n5acUGtOCSKzlZY581Zt5u4xc3l39so9\n5rVtls/BvVpz2oFdOKJvO/UKK1KHkpkgZrj7ADMbBtwJ3A382t2HJifUmlOCyB6fLd3AojVb2Lar\nlLVbdrFk7Tb+78uVbNi2m2P7d+DuH32PwsZ56Q5TpEGIN0HE0wZRGv4/HnjA3V81s9/WJjhpeAZ2\nbcnAri33mLazpJSnP1rCH976klP+NonRVx5GQZ6qnkQyRTxXMS0zs4eA04HRZpYf53opo8tc64f8\n3BwuPqwX9509iIWrtzJ21srqVxKROhPPF/3pwBhgpLtvAFoD16c0qmroMtf65dj+HencsjEvTS1O\ndygiEqXaBOHu24BVwLBwUgmgcT4laXIixn8N7szEeatZvmF7usMRkVC1CcLMbgVuAG4KJ+UB/0hl\nUNLwnHZgV9zh6Y+XpDsUEQnF00h9CjAImAbg7svNrHlKo5IGp1ubJvzwgI48MG4B67fu4sJhPenT\nvpkufxVJo3gSxC53dzNzADNrmuKYpIG698xBdGs9lwfHL+D5yUs5dO82PHxuEc3ydcO/SDrE00j9\nYngVU0szuwR4F3gktWFJQ5SbE+HG4/rx4Y1Hc/MP9+U/i9ZxziMfM3/V5nSHJtIgxdXVhpkdC4wA\nDBjj7mNTHVg8dKNc/TZ21kp++cJnbNtdyoj+Hfh+77Yc0LmQ71W4n0JEEpPMO6l7AivcfUf4vDHQ\nwd0XJyPQ2lCCqP/WbtnJA+MW8O/PlrNmy04AXv3595UkRGoh3gQRTxXTS0BZ1PPScJpIyrVpls8t\nJ/TnP78+hgnXH0XLJnnc8+7cdIcl0iDEkyBy3X1X+ZPwcaPUhVQ93Und8OREjG5tmnDp4b14f85q\nZhRvSHdIIvVePAlitZmdWP7EzE4C1qQupOrpTuqG6ycHd6dRboRXpi1Ldygi9V48CeIy4Ndm9pWZ\nLSW4ae6nqQ1LJLbmBXkc0689b8xYTklpWfUriEiNxdPVxgJ3PxjoD/R390PdfX7qQxOJ7cTv7cWa\nLbu4Y/Rsdc0hkkLV3oEU9t76X0APILf8zlZ3vz2lkYlU4qh+7dmnQ3Men7SYjxas5Y1fDCM3JyuH\nVxfJaPF8ql4FTiLopG9r1J9IWhTk5TDmmsN54JzBfPn1Zv5n9Jds3rE73WGJ1Dvx9GHQxd1HpjwS\nkQSN3L8jpwzqzGOTFvH25yt48CcHMqCL7o8QSZZ4ziA
+NLMDUh6JSILMjL+cMZCXLzsEM+O8xz5h\nxUa1SYgkSzwJYhgw1czmmNkMM5tpZjNSHZhIvIp6tOapi4aws6SMkfdM5MHxC9Idkki9EE8V03Ep\nj0KklvZu14zHzz+Iu9+Zw5/emcNJA/eiU2HjdIclktXiucx1ibsvAbYDHvUnklGG9mrDn08fSJnD\naQ98xM3/msnasP8mEUlcPCPKnWhm84BFwHhgMfBWiuMSqZGurZvw6x/uy97tm/HilKX89vVZ6Q5J\nJGvF0wbxO+BgYK679wSOASalNCqRWrhoWE+eunAIFw7ryZszljPna40nIVIT8SSI3e6+FoiYWcTd\n3wcGpiIYMzvZzB4xs1fNbEQq9iENx8XDetGqSSNd3SRSQ/EkiA1m1gyYADxjZn8luGkuLmb2mJmt\nMrPPK0wfGV4ZNd/MbgRw93+7+yXA+cAZcb8KkRjaNc/nHxcPZdOO3Zx03yQeGLeAsjI1n4nEK54E\ncRKwDbgGeBtYAIxKYB9PAHvcaGdmOcD9BFdI9QfOMrP+UYvcEs4XqZV9O7XgnjMG0qJxHne9/SXn\nPvYJS9aqIwCReFSZIMIv8lfdvczdS9z9SXe/N6xyiou7TwDWVZg8BJjv7gvD8SWeB06ywF3AW+4+\nLcHXIhLTiP06Mvaaw7njlP35bOkGhv95PFc8O01XOIlUo8r7INy91My2mVmhuydzdJ7OwNKo58XA\nUOAXwHCg0Mx6u/uDFVc0s0uBSwG6deuWxJCkPjMzzhnanWP6deDRiQt56uMlLNuwnSuP6UN+boQh\nPVqrwz+RCuK5UW4HMNPMxhLVSZ+7X1mL/VqMae7u9wL3VrWiuz8MPAzBmNS1iEEaoI6FBdxyQn8O\n6FLIVc9/xgWPTwZgWO+2/HJEXwZ2aUkkEuvwFGl44kkQb4Z/yVQMdI163gVYnuR9iFTqpIGdGdCl\nJRu27WJG8UZ+/+YsTv3bh/Rp34yrhvfhhAF7pTtEkbSrNkG4+5Mp2O9koI+Z9QSWAWcCZ8e7spmN\nAkb17t07BaFJQ9GzbVOgKYO6teKIvu34z6K1PD5pMVc8+yndWjdRz7DS4Jl71bU0ZtYHuJPgaqOC\n8unu3iuuHZg9BxwJtAVWAre6+9/N7IfAPUAO8Ji735Fo8EVFRT5lypREVxOp1OYduznif8exu6SM\no/q15/C+7RixXwdaFOSlOzSRpDGzqe5eVO1ycSSID4Bbgb8QXN56QbjerckItCaiziAumTdvXrrC\nkHpqweot/HnsXD6Yt4aN23fTqkkeJwzYixuO60ez/HhqZUUyWzITxFR3P9DMZrr7AeG0ie5+WJJi\nrTGdQUgq7SwpZeqS9dzz7jw+WbSORjkRTh3cmf8e1Z8mjZQoJHvFmyDiuorJzCLAPDO7gqDNoH1t\nAxTJdPm5ORy6d1sO3bst7325kkcnLuL5yUt56/OvufKYPpx/aA9ydMWT1GPxXPh9NdAEuBI4EPgx\ncF4qgxLJNEf368AzFw/lkXOLaNE4l9+9MYurnv9UfTxJvVZtFdM3C5o1dfeM6KNAbRCSTqVlzvA/\nj2fRmuDjMKx3W64e3oeiHq3THJlIfJLZBnEI8Hegmbt3M7PvAT9198uTE2rNqQ1C0mXH7lJmr9jE\n0x8v4ZVpywA4o6grpx/UhQO7K1FIZktmgvgPcBrwmrsPCqd97u77JyXSWlCCkEywaM1WbnplBh8v\nDLocO6BzIX88bQD7dmqR5shEYos3QcTV+Yy7L60wqbRGUSWJmY0ys4c3bkxm91AiNdOzbVOev/QQ\nptwynB8f3I2ZyzZyxbPTeGvmCrbsjLtnfJGME0+CWGpmhwJuZo3M7DpgdorjqpK7v+7ulxYWFqYz\nDJE9tG2Wz+9PPoA/njaABau38rNnpjHwtnf4y9i5GodCslI8VUxtgb8S9LIaAcYAVyXS5XeqqIpJ\nMtW2XSW8MX0FD01
YwILVW+nXsTnXjdiHo/u1V2eAknZJa4PIZEoQkunKypw/jZ3D/e8vAIIBjM49\npDsHdC5k/846A5b0SGYjdS+CM4iDAQc+Aq5x94XJCLQ2lCAkW6zavIO/jJ3H85O/ovwjd+4h3fn5\nUb3p0KKg6pVFkiyZCeJjguE/nwsnnQn8wt2H1jrKGtJ9EJKt1m/dxfKN27nyuU9ZsDq4j+L4Azpx\n8qDOHNu/Q5qjk4YiqZe5VkwGZvaxux9cyxhrTWcQkq1Ky5w3Z67gztGzWbFxBwBtmzXiwmE9Oe+Q\nHjRVp4CSQslMEH8ANhCMG+3AGUA+wVkF7l5xvOk6owQh2a60zFm2fjt/Gzef5ycHV5PnRozLjtib\nY/t3YECXQszUqC3JlcwEsaiK2R7vuBCpoAQh9cm6rbt4aPwCHprwbfNen/bNOO/QHpw9pJuufpKk\n0VVMIllq7ZadzFqxibvHzGF6cXAzaIcW+fz759+nU2HjNEcn9UEyzyAKgMuBYQRVTBOBB919RzIC\nrQk1UktDsXxD0KA9Zcl6AM47pDs3/XBfCvJy0hyZZLNkJogXgc3AP8JJZwGt3P1HtY6ylnQGIQ3F\nc598xU2vzASga+vGjLn6cA1aJDWWzL6Y9nH3i9z9/fDvUqBv7UMUkXidNaQbU28ZTp/2zVi6bjuX\n/WMau0vL0h2W1HPxJIhPzeybS1rNbCgwKXUhiUgsbZrlM+bqw2nVJI8Jc1fzg79MYPaKTekOS+qx\neBLEUOBDM1tsZosJ7qQ+wsxmmtmMlEYnInuIRIwx1xxO55aNWbhmK8f9dSKTF6ftSnOp5+Jpg+he\n1Xx3X5LUiBKgNghpyF6eWsx1L00H4KdH9OKKo3rTvCAvzVFJNkhaG4S7LwmTwHaCq5g8mPzNdBFJ\ng9MO7MKj5waf8YfGL+SwP77P0x8tplRdi0uSVJsgzOxEM5sHLALGA4uBt1IcV3UxacAgEWB4/w5M\n/NVR9O/Ugg3bdvObV7/g+HsnMn/V5nSHJvVAPFVM04GjgXfdfZCZHQWcFV7NlFaqYhIJuDtrtuzi\ngic+4fNlQcP1D/brwP1nDyY3J66BI6UBSeZlrrvDwYEiZhZx9/eBgbWOUESSxsxo1zyfN35xGNcM\n70t+boQxX6yk7y1v8cx/VBMsNRNPgthgZs2ACcAzZvZXQAPtimSoq4b34ZNfD+eC7/egzOHmf31O\njxvf5MrnPmVnSanaKCRu8VQxNSVooI4A5wCFwDMaclQk8320YC2T5q/hvvfnfzOtXfN8xl5zOC0K\n8tQBYAOlzvpE5BuL1mzlrc9XsHTdNp77JOhWfGjP1rzw00PSHJmkQ7wJQp25iDQAPds25fIje1NS\nWsa+nVrw/pereH/Oanrc+CYpTxzRAAAP4klEQVSdCgt479ojadxIHQDKnnQGIdIArdq8g+c/Wcqi\nNVv516fLaNkkj1MHdeG/R/VPd2hSB3QGISKVat+8gCuP6cP2XaW0b57P+3NW8cx/ljC9eAM3jOzH\nkJ6t0x2iZIBKzyDMbCbBXdPfmUVwJ/WAVAYWD51BiCTHxwvX8uD4BXw4fy292zdjSM/W/Pyo3rRr\nnp/u0CQFknEGcUIS40mqqAGD0h2KSL1wcK82HNyrDde/NJ33vlzFEx8uZmdJGUf0bcdR/dqRn6v2\niYZIbRAisoeyMqfojndZt3UXAH88bQCnDOpMnu7IrjeSdie1mR1sZpPNbIuZ7TKzUjNTJ/Qi9VQk\nYrx37RG8c83h5ESMX708g31/8zZT1K14gxNPI/V9wJnAS0ARcC6guh2Reqxlk0a0bNKI+88exKzl\nm7j3vfncMXo2Pdo05bA+bTl1cJd0hyh1IK6rmNx9vpnluHsp8LiZfZjiuEQkA4zcvxMj+nfks+KN\nLF6zlblfb+aTResY0KWQds0LKGys8Sfqs3gSxDYzawR8ZmZ/BFYATVMblohkikjEe
OrCIQDc+dZs\nHhq/kOF/nkCXVo354Iaj0xydpFI8rU4/CZe7AtgKdAVOTWVQIpKZLj+iN/efPZjjB3SieP12bv7X\nTO54cxabd+xOd2iSAvEkiJPdfYe7b3L329z9l2TwJbAikjqFTfI4fkAnzjqoGx1a5PPGjBU8MnER\nHy1Ie9+dkgLx9OY6zd0HV5j2qbsPSmlkcdBlriLpVbx+G8Puep/m+bnk5+VQkBfh8fMPok+H5ukO\nTapQ6xvlzOws4Gygp5m9FjWrBaCfCyJC55aNuWZ4X1Zu3sG2nSX8+7PlTJy3hhaN88jPjdCySaN0\nhyi1UFVXG92BnsCdwI1RszYDM9w97YMG6QxCJHNs21XC/reOoXw8IjN49effZ0CXlukNTL6j1mcQ\n7r4EWAIcYmYdgIPCWbMzITmISGZp0iiXJy4YQvH67azdspM/jZ3Lfxauo2l+LgZ0b9OUHA1QlFXi\naYP4EXA3MI6go77DgOvd/eWUR1cNnUGIZKaN23Yz8HfvEP31cvXwPlw9vG/6gpJvJLO771uAg9x9\nVbjhdsC7QNoThIhkpsImeTx/ycF8vWkHALe9Posla7exY3cpALkRI1d9O2W8eBJEpDw5hNYS3+Wx\nCTGzXsDNQKG7n5bs7YtI3Rraq803jx8av5B/fbqMf326DICOLQqYeMNR6gAww8WTIN42szHAc+Hz\nM4C34tm4mT1GcM/EKnffP2r6SOCvQA7wqLv/wd0XAheZmc5MROqZ20/aj8mL1wPw6VfreWfWSjZt\n302bZhpvIpNVmyDc/XozOxUYRtAG8bC7/yvO7T9B0NnfU+UTzCwHuB84FigGJpvZa+4+K8HYRSRL\nFPVoTVGPYJS6l6cW886slfz82WkU5OWQlxPhluP3pXsb9eCTaapNEGZ2l7vfALwSY1qV3H2CmfWo\nMHkIMD88Y8DMngdOAuJKEGZ2KXApQLdu3eJZRUQySFH3Vgzp0Zrtu0rZtL2EWSs2ceQ+7ZQgMlA8\nFYDHxph2XC322RlYGvW8GOhsZm3M7EFgkJndVNnK7v6wuxe5e1G7du1qEYaIpEOPtk158bJDePWK\nYbz8s0MA+GrdNhat2cqiNVtZvXlnmiOUclXdSf0z4HKgl5nNiJrVHJhUi33GuhDa3X0tcFkttisi\nWaZx2D3HQ+MX8tD4hQBEDD644Wj2atk4zdFJVVVMzxI0Rn/nTmp3r83QUsUEPcKW6wIsT2QDGpNa\npH4wM5695GC+WrsNgNkrNvHQhIWs3LRDCSIDVHUn9UZgI3BWkvc5GehjZj2BZQSj1Z2dyAbc/XXg\n9aKiokuSHJuI1LHB3VoxuFsrADoVFvDQhIV8vHAdm3YEHTY0bZTDgd1bYaa7sOtaXCPK1ZSZPQcc\nCbQ1s2LgVnf/u5ldAYwhuMz1MXf/IsHt6gxCpB5q36IAgLve/nKP6f/82aEc2L1VOkJq0KrtaiOT\nqasNkfpn3srN35w9LFi9hV+9PINHzy1ieP8OaY6s/khmVxsiInUmeiyJ5gXBV9SWnSXsLCn9ZnrE\nTHdh1wElCBHJWE0a5QBw9QufcfUL305vlBPhnz87lAO6FKYpsoYhKxOE2iBEGobOLRtz138dwJot\nu76ZtnbLLh6btIgl67YqQaRYViYIXcUk0jCYGWcctGePCV+t3cZjkxaxc3dZmqJqOLIyQYhIw5Wf\nF7Q9fL58I+1bfNvZX8cWBRoLO8myMkGoikmk4WqWn0tejvH4pMU8PmnxN9PzcyN8ftsP1HidRFmZ\nIFTFJNJwNc3PZew1R7Bmy7d9Nr02fTlPfbSEHbtLlSCSKCsThIg0bD3aNqVH2297f/1i+SYAdpWo\nXSKZlGpFJOs1yg2+ynYqQSSVziBEJOvlhwni7Ec+jlnFdMlhvTj9oK7fmS5Vy8oEoUZqEYl26N5t\nOWVQ5z3uti43Ye4axs9drQRRA1mZINRILSLRO
hYW8JczBsacN/KeCewqVdVTTagNQkTqtUa5EXYr\nQdSIEoSI1Gt5ORFKSrO31+p0ysoqJhGReOXlGMs2bOe5T76KOb9JoxyOP6ATubp/4juyMkGokVpE\n4rVXy8Z8vHAdN70ys9JlOrYoYGivNnUYVXbQgEEiUq+VljmrN++MOW/mso1c8tQUnrxwCEf0bVfH\nkaWPBgwSEQFyIkbHwoKY81Zu2gFAiRqxY1Klm4g0WDkRA6CkLHtrUlJJCUJEGqzyu65LlSBiUoIQ\nkQZLZxBVUxuEiDRYuWGCmDRvDdt2llS63Pd7t6Vr6yZ1FVbGyMoEoctcRSQZWjVpRKPcCC9MWcoL\nU5ZWutwJAzpx39mD6zCyzJCVCUJ9MYlIMhQ2yWPKLcPZWsXZw7l//4Qdu7/bCWBDkJUJQkQkWVoU\n5NGiIK/S+QV5OQ22EVuN1CIiVYhEjIbalZMShIhIFXIMynQGISIiFeVETFVMIiLyXREzSrO4z7ra\nUIIQEamCziBERCSmhpwgdJmriEgVciLG8g3buXvMnLjXMYNTBnWmV7tmKYws9bIyQehOahGpK/t2\nasHEeWt4YPyCuNcpLXO27Czh1lH7pTCy1NOAQSIiSTbo9ncY9b29uP2k/dMdSkzxDhikNggRkSSL\nmFGWxT++yylBiIgkmZlRD/KDEoSISLKZQX248EkJQkQkySIG2dy+W04JQkQkydQGISIiMRmoDUJE\nRL7LzNQGISIi3xWJqA1CRERiMIzsTw9KECIiSRcx1EgtIiLfFVEbhIiIxFRPziAypjdXM2sK/A3Y\nBYxz92fSHJKISI1EzKgPjRApPYMws8fMbJWZfV5h+kgzm2Nm883sxnDyqcDL7n4JcGIq4xIRSaX6\n0gaR6jOIJ4D7gKfKJ5hZDnA/cCxQDEw2s9eALsDMcLHSFMclIpIyhjF96QZ++cJnKdvH2UO7UdSj\ndcq2DylOEO4+wcx6VJg8BJjv7gsBzOx54CSCZNEF+IwqzmzM7FLgUoBu3bolP2gRkVo6vG9b3v7i\nayYvWZeyfYzYr2PKtl0uHW0QnYGlUc+LgaHAvcB9ZnY88HplK7v7w8DDEAwYlMI4RURq5Obj+3Pz\n8f3THUatpSNBWIxp7u5bgQvqOhgREYktHZe5FgNdo553AZYnsgEzG2VmD2/cuDGpgYmIyLfSkSAm\nA33MrKeZNQLOBF5LZAPu/rq7X1pYWJiSAEVEJPWXuT4HfATsY2bFZnaRu5cAVwBjgNnAi+7+RSrj\nEBGRxKX6KqazKpk+Ghhd0+2a2ShgVO/evWu6CRERqUZWdrWhKiYRkdTLygQhIiKpl5UJQlcxiYik\nnmXzqEdmthpYEj4tBCpmjIrTop+3BdakKLRYsSRzvaqWq2xePOUTa1p9KLPqllGZJb5MMsus4nyV\nWdXHVKzniZZZd3dvV+1S7l4v/oCHq5sW/RyYUpexJHO9qparbF485VNfy6y6ZVRm6S2zGOWnMqvi\nmKrLMsvKKqZKxOqeo+K0SrvwSLKa7ife9aparrJ58ZRPrGn1ocyqW0ZllvgyySyzuiqv2uwr3WWW\nlmMsq6uYasPMprh7UbrjyCYqs8SpzBKnMktcqsqsPp1BJOrhdAeQhVRmiVOZJU5llriUlFmDPYMQ\nEZGqNeQzCBERqYIShIiIxKQEISIiMSlBhMysqZk9aWaPmNk56Y4nG5hZLzP7u5m9nO5YsoWZnRwe\nY6+a2Yh0x5MNzGxfM3vQzF42s5+lO55sEH6fTTWzE2qznXqdIMzsMTNbZWafV5g+0szmmNl8M7sx\nnHwq8LK7XwKcWOfBZohEyszdF7r7RemJNHMkWGb/Do+x84Ez0hBuRkiwzGa7+2XA6UCDvPw1we8y\ngBuAF2u733qdIIAngJHRE8wsB7gfOA7oD5xlZv0JRrYrHyu7tA5jzDRPEH+ZSeAJEi+zW8L5DdUT\nJFBmZnYi8
AHwf3UbZsZ4gjjLy8yGA7OAlbXdab1OEO4+AVhXYfIQYH7463cX8DxwEsFQqF3CZep1\nuVQlwTITEiszC9wFvOXu0+o61kyR6HHm7q+5+6FAg6z+TbC8jgIOBs4GLjGzGn+fpXTAoAzVmW/P\nFCBIDEOBe4H7zOx46vbW/2wQs8zMrA1wBzDIzG5y9zvTEl1mquw4+wUwHCg0s97u/mA6gstQlR1n\nRxJUAedTi4HG6qGY5eXuVwCY2fnAGncvq+kOGmKCsBjT3N23AhfUdTBZorIyWwtcVtfBZInKyuxe\ngh8j8l2Vldk4YFzdhpIVYpbXNw/cn6jtDhpiVUox0DXqeRdgeZpiyRYqs8SpzBKnMktMysurISaI\nyUAfM+tpZo2AM4HX0hxTplOZJU5lljiVWWJSXl71OkGY2XPAR8A+ZlZsZhe5ewlwBTAGmA286O5f\npDPOTKIyS5zKLHEqs8Skq7zUWZ+IiMRUr88gRESk5pQgREQkJiUIERGJSQlCRERiUoIQEZGYlCBE\nRCQmJQiROmRmvzWz69Idh0g8lCBEaijsmVWfIam3dHCLJMDMepjZbDP7GzAN+LuZTTGzL8zstqjl\nFpvZbWY2zcxmmlm/GNu6xMzeMrPGdfkaROKlBCGSuH2Ap9x9EHCtuxcBA4AjzGxA1HJr3H0w8ACw\nR7WSmV0BjAJOdvftdRS3SEKUIEQSt8TdPw4fn25m04BPgf0IRvYq90r4fyrQI2r6TwhGAfsvd9+Z\n4lhFakwJQiRxWwHMrCfBmcEx7j4AeBMoiFqu/Mu/lD3HXvmcIGF0QSSDKUGI1FwLgmSx0cw6EJwV\nxONT4KfAa2a2V6qCE6ktJQiRGnL36QRf9l8AjwGTElj3A4KzjzfNrG1qIhSpHXX3LSIiMekMQkRE\nYlKCEBGRmJQgREQkJiUIERGJSQlCRERiUoIQEZGYlCBERCQmJQgREYnp/wPg34tFx7LZUAAAAABJ\nRU5ErkJggg==\n", 344 | "text/plain": [ 345 | "" 346 | ] 347 | }, 348 | "metadata": {}, 349 | "output_type": "display_data" 350 | } 351 | ], 352 | "source": [ 353 | "import matplotlib.pyplot as plt\n", 354 | "%matplotlib inline\n", 355 | "plt.plot([vocabcount[w] for w in vocab]);\n", 356 | "plt.gca().set_xscale(\"log\", nonposx='clip')\n", 357 | "plt.gca().set_yscale(\"log\", nonposy='clip')\n", 358 | "plt.title('word distribution in headlines and discription')\n", 359 | "plt.xlabel('rank')\n", 360 | "plt.ylabel('total appearances');" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 66, 366 | "metadata": { 367 | "collapsed": true 368 | }, 369 | "outputs": [], 370 | "source": [ 371 | "empty = 0 # RNN mask of no data\n", 372 | "eos = 1 # end of sentence\n", 373 | "start_idx = eos+1 # first real word\n", 374 | "\n", 375 | "def get_idx(vocab, vocabcount):\n", 376 | " word2idx = dict((word, idx+start_idx) for idx,word in enumerate(vocab))\n", 377 | " 
word2idx[''] = empty\n", 378 | " word2idx[''] = eos\n", 379 | " \n", 380 | " idx2word = dict((idx,word) for word,idx in word2idx.items())\n", 381 | "\n", 382 | " return word2idx, idx2word" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 67, 388 | "metadata": { 389 | "collapsed": true 390 | }, 391 | "outputs": [], 392 | "source": [ 393 | "word2idx, idx2word = get_idx(vocab, vocabcount)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "Read Glove" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 68, 406 | "metadata": { 407 | "collapsed": true 408 | }, 409 | "outputs": [], 410 | "source": [ 411 | "FN = 'vocabulary-embedding'\n", 412 | "seed=42\n", 413 | "vocab_size = 40000\n", 414 | "embedding_dim = 100\n", 415 | "lower = False # dont lower case the text" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 69, 421 | "metadata": {}, 422 | "outputs": [ 423 | { 424 | "name": "stdout", 425 | "output_type": "stream", 426 | "text": [ 427 | "unzip: cannot find or open /Users/michellezhao/.keras/datasets//Users/michellezhao/.keras/datasets/glove.6B.zip, /Users/michellezhao/.keras/datasets//Users/michellezhao/.keras/datasets/glove.6B.zip.zip or /Users/michellezhao/.keras/datasets//Users/michellezhao/.keras/datasets/glove.6B.zip.ZIP.\r\n" 428 | ] 429 | } 430 | ], 431 | "source": [ 432 | "fname = 'glove.6B.%dd.txt'%embedding_dim\n", 433 | "from keras.utils.data_utils import get_file\n", 434 | "import os\n", 435 | "datadir_base = os.path.expanduser(os.path.join('~', '.keras'))\n", 436 | "if not os.access(datadir_base, os.W_OK):\n", 437 | " datadir_base = os.path.join('/tmp', '.keras')\n", 438 | "datadir = os.path.join(datadir_base, 'datasets')\n", 439 | "glove_name = os.path.join(datadir, fname)\n", 440 | "if not os.path.exists(glove_name):\n", 441 | " path = 'glove.6B.zip'\n", 442 | " path = get_file(path, 
origin=\"http://nlp.stanford.edu/data/glove.6B.zip\")\n", 443 | " !unzip {datadir}/{path}" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 70, 449 | "metadata": { 450 | "collapsed": true 451 | }, 452 | "outputs": [], 453 | "source": [ 454 | "import zipfile\n", 455 | "zip = zipfile.ZipFile(path)\n", 456 | "zip.extractall()" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 71, 462 | "metadata": { 463 | "collapsed": true 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "datadir = ''\n", 468 | "glove_name = os.path.join(datadir, fname)\n", 469 | "glove_n_symbols = sum(1 for line in open(glove_name))" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 74, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/plain": [ 480 | "400000" 481 | ] 482 | }, 483 | "execution_count": 74, 484 | "metadata": {}, 485 | "output_type": "execute_result" 486 | } 487 | ], 488 | "source": [ 489 | "glove_n_symbols" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 75, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "import numpy as np\n", 499 | "glove_index_dict = {}\n", 500 | "glove_embedding_weights = np.empty((glove_n_symbols, embedding_dim))\n", 501 | "globale_scale=.1\n", 502 | "with open(glove_name, 'r') as fp:\n", 503 | " i = 0\n", 504 | " for l in fp:\n", 505 | " l = l.strip().split()\n", 506 | " w = l[0]\n", 507 | " glove_index_dict[w] = i\n", 508 | " glove_embedding_weights[i,:] = list(map(float,l[1:]))\n", 509 | " i += 1\n", 510 | "glove_embedding_weights *= globale_scale" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 76, 516 | "metadata": {}, 517 | "outputs": [ 518 | { 519 | "data": { 520 | "text/plain": [ 521 | "0.040815727600190289" 522 | ] 523 | }, 524 | "execution_count": 76, 525 | "metadata": {}, 526 | "output_type": "execute_result" 527 | } 528 | ], 529 | "source": [ 530 | 
"glove_embedding_weights.std()" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 77, 536 | "metadata": { 537 | "collapsed": true 538 | }, 539 | "outputs": [], 540 | "source": [ 541 | "for w, i in glove_index_dict.items():\n", 542 | " w = w.lower()\n", 543 | " if w not in glove_index_dict:\n", 544 | " glove_index_dict[w] = i" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 78, 550 | "metadata": {}, 551 | "outputs": [ 552 | { 553 | "name": "stdout", 554 | "output_type": "stream", 555 | "text": [ 556 | "random-embedding/glove scale 0.0706949139514 std 0.0408138249575\n", 557 | "number of tokens, in small vocab, found in glove and copied to embedding 7960 0.199\n" 558 | ] 559 | } 560 | ], 561 | "source": [ 562 | "# generate random embedding with same scale as glove\n", 563 | "np.random.seed(seed)\n", 564 | "shape = (vocab_size, embedding_dim)\n", 565 | "scale = glove_embedding_weights.std()*np.sqrt(12)/2 # uniform and not normal\n", 566 | "embedding = np.random.uniform(low=-scale, high=scale, size=shape)\n", 567 | "print ('random-embedding/glove scale', scale, 'std', embedding.std())\n", 568 | "\n", 569 | "# copy from glove weights of words that appear in our short vocabulary (idx2word)\n", 570 | "c = 0\n", 571 | "for i in range(vocab_size):\n", 572 | " #print(i)\n", 573 | " if(i not in idx2word):\n", 574 | " continue\n", 575 | " w = idx2word[i]\n", 576 | " #print(w)\n", 577 | " g = glove_index_dict.get(w, glove_index_dict.get(w.lower()))\n", 578 | " if g is None and w.startswith('#'): # glove has no hastags (I think...)\n", 579 | " w = w[1:]\n", 580 | " g = glove_index_dict.get(w, glove_index_dict.get(w.lower()))\n", 581 | " if g is not None:\n", 582 | " embedding[i,:] = glove_embedding_weights[g,:]\n", 583 | " c+=1\n", 584 | "print ('number of tokens, in small vocab, found in glove and copied to embedding', c,c/float(vocab_size))\n" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 
79, 590 | "metadata": { 591 | "collapsed": true 592 | }, 593 | "outputs": [], 594 | "source": [ 595 | "glove_thr = 0.5\n", 596 | "word2glove = {}\n", 597 | "for w in word2idx:\n", 598 | " if w in glove_index_dict:\n", 599 | " g = w\n", 600 | " elif w.lower() in glove_index_dict:\n", 601 | " g = w.lower()\n", 602 | " elif w.startswith('#') and w[1:] in glove_index_dict:\n", 603 | " g = w[1:]\n", 604 | " elif w.startswith('#') and w[1:].lower() in glove_index_dict:\n", 605 | " g = w[1:].lower()\n", 606 | " else:\n", 607 | " continue\n", 608 | " word2glove[w] = g" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": 80, 614 | "metadata": {}, 615 | "outputs": [ 616 | { 617 | "name": "stdout", 618 | "output_type": "stream", 619 | "text": [ 620 | "# of glove substitutes found 0\n" 621 | ] 622 | } 623 | ], 624 | "source": [ 625 | "normed_embedding = embedding/np.array([np.sqrt(np.dot(gweight,gweight)) for gweight in embedding])[:,None]\n", 626 | "\n", 627 | "nb_unknown_words = 100\n", 628 | "\n", 629 | "glove_match = []\n", 630 | "for w,idx in word2idx.items():\n", 631 | " if idx >= vocab_size-nb_unknown_words and w.isalpha() and w in word2glove:\n", 632 | " gidx = glove_index_dict[word2glove[w]]\n", 633 | " gweight = glove_embedding_weights[gidx,:].copy()\n", 634 | " # find row in embedding that has the highest cos score with gweight\n", 635 | " gweight /= np.sqrt(np.dot(gweight,gweight))\n", 636 | " score = np.dot(normed_embedding[:vocab_size-nb_unknown_words], gweight)\n", 637 | " while True:\n", 638 | " embedding_idx = score.argmax()\n", 639 | " s = score[embedding_idx]\n", 640 | " if s < glove_thr:\n", 641 | " break\n", 642 | " if idx2word[embedding_idx] in word2glove :\n", 643 | " glove_match.append((w, embedding_idx, s)) \n", 644 | " break\n", 645 | " score[embedding_idx] = -1\n", 646 | "glove_match.sort(key = lambda x: -x[2])\n", 647 | "print ('# of glove substitutes found', len(glove_match))" 648 | ] 649 | }, 650 | { 651 | "cell_type": 
"code", 652 | "execution_count": 81, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "for orig, sub, score in glove_match[-10:]:\n", 657 | " print (score, orig,'=>', idx2word[sub])" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 82, 663 | "metadata": { 664 | "collapsed": true 665 | }, 666 | "outputs": [], 667 | "source": [ 668 | "glove_idx2idx = dict((word2idx[w],embedding_idx) for w, embedding_idx, _ in glove_match)" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 83, 674 | "metadata": {}, 675 | "outputs": [ 676 | { 677 | "data": { 678 | "text/plain": [ 679 | "379" 680 | ] 681 | }, 682 | "execution_count": 83, 683 | "metadata": {}, 684 | "output_type": "execute_result" 685 | } 686 | ], 687 | "source": [ 688 | "Y = [[word2idx[token] for token in title.split()] for title in titles]\n", 689 | "len(Y)" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": 84, 695 | "metadata": {}, 696 | "outputs": [ 697 | { 698 | "data": { 699 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXUAAAD8CAYAAACINTRsAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAC65JREFUeJzt3WGIpIddx/HvzzNBaYUkzSYcucSN\nEkqDYCJLCEQkpFajEXNCUxqknBA5X7SSomDPvrGCwlW07RsRThM8oTYNbfVCU9AQE6pQontpbBKP\ncjGc9cxxdyUNbd4oaf6+2OfwvN29nZmd3dn9z/cDYed5dibz58mT7z33zDwzqSokST38wKwHkCRN\nj1GXpEaMuiQ1YtQlqRGjLkmNGHVJasSoS1IjRl2SGjHqktTID27nk1177bW1uLi4nU8pSbve8ePH\nv11VC6Pcd1ujvri4yPLy8nY+pSTtekn+Y9T7evpFkhox6pLUiFGXpEaMuiQ1YtQlqRGjLkmNGHVJ\nasSoS1IjRl2SGtnWK0q1tsVDT665/tTh+7Z5Ekm7nUfqktSIUZekRoy6JDVi1CWpEaMuSY0YdUlq\nxKhLUiNGXZIaMeqS1IhRl6RGjLokNWLUJakRoy5JjfgpjXPAT4GU5odH6pLUiFGXpEaMuiQ1YtQl\nqZGRo55kT5KvJ/nysHxzkueSnEzy+SRXbt2YkqRRjHOk/jBw4qLlTwKfrqpbgO8AD01zMEnS+EaK\nepJ9wH3AXwzLAe4BvjDc5SiwfysGlCSNbtQj9c8AvwO8PSy/C3ijqt4alk8DN0x5NknSmDa8+CjJ\nLwHnqup4krsvrF7jrrXO4w8CBwFuuummCcfUdvJiJWn3GuVI/S7gl5OcAh5j5bTLZ4Crklz4Q2Ef\n8NpaD66qI1W1VFVLCwsLUxhZkrSeDaNeVb9bVfuqahH4IPAPVfWrwDPA+4e7HQCObdmUkqSRbOZ9\n6h8DfivJK6ycY39kOiNJkiY11gd6VdWzwLPD7VeBO6Y/kiRpUl5RKkmNGHVJasSoS1IjRl2SGjHq\nktSIUZekRoy6JDVi1CWpEaMuSY2MdUWpdgY/RVHSejxSl6RGjLokNWLUJakRoy5JjRh1SWrEqEtS\nI0Zdkhox6pLUiFGXpEaMuiQ1YtQlqRGjLkmNGHVJasSoS1IjRl2SGjHqktSIUZekRvzmo03wG4g2\n5jaStpdH6pLUiFGXpEaMuiQ1YtQlqRGjLkmNGHVJasSoS1IjRl2SGjHqktTIhlFP8kNJ/jnJvyZ5\nOcnvD+tvTvJckpNJPp/kyq0fV5J0OaMcqf83cE9V/SRwG3BvkjuBTwKfrqpbgO8AD23dmJKkUWwY\n9Vrx5rB4xfBPAfcAXxjWHwX2b8mEkqSRjXROPcmeJC8A54CngH8H3qiqt4a7nAZu2JoRJUmjGinq\nVfX9qroN2AfcAbxnrbut9dgkB5MsJ1k+f/785JNKkjY01rtfquoN4FngTuCqJBc+uncf8No6jzlS\nVUtVtbSwsLCZWSVJGxjl3S8LSa4abv8w8LPACeAZ4P3D3Q4Ax7ZqSEnSaEb5koy9wNEke1j5Q+Dx\nqvpykn8DHkvyB8DXgUe2cE5J0gg2jHpVfQO4fY31r7Jyfl2StEN4RakkNWLUJakRoy5JjRh1SWrE\nqEtSI0Zdkhox6pLUiFGXpEaMuiQ1YtQlqRGjLkmNGHVJasSoS1IjRl2SGjHqktSIUZekRoy6JDVi\n1CWpkVG+o1TaNouHnlxz/anD923zJNLu5JG6JDVi1CWpEaMuSY0YdUlqxKhLUiNGXZIaMeqS1IhR\nl6RGjLokNWLUJakRoy5JjRh1SWrEqEtSI0Zdkhox6pLUiFGXpEaMuiQ14jcfaVfzm5Kk/2/DI/Uk\nNyZ5JsmJJC8neXhYf02Sp5KcHH5evfXjSpIuZ5TTL28Bv11V7
wHuBD6c5FbgEPB0Vd0CPD0sS5Jm\naMOoV9WZqnp+uP094ARwA3A/cHS421Fg/1YNKUkazVgvlCZZBG4HngOur6ozsBJ+4LppDydJGs/I\nUU/yTuCLwEer6rtjPO5gkuUky+fPn59kRknSiEaKepIrWAn6Z6vqS8Pqs0n2Dr/fC5xb67FVdaSq\nlqpqaWFhYRozS5LWMcq7XwI8Apyoqk9d9KsngAPD7QPAsemPJ0kaxyjvU78L+BDwYpIXhnUfBw4D\njyd5CPgW8MDWjChJGtWGUa+qfwKyzq/fO91xJEmb4ccESFIjRl2SGjHqktSIUZekRoy6JDVi1CWp\nEaMuSY0YdUlqxKhLUiNGXZIaMeqS1IhRl6RGjLokNWLUJakRoy5JjRh1SWrEqEtSI0ZdkhoZ5TtK\nd4TFQ0+uuf7U4ft2xb9fO5v//dWFR+qS1IhRl6RGjLokNWLUJakRoy5JjRh1SWrEqEtSI0Zdkhox\n6pLUiFGXpEaMuiQ1YtQlqRGjLkmNGHVJasSoS1IjRl2SGjHqktTIrvnmI2kn8ZuStFNteKSe5NEk\n55K8dNG6a5I8leTk8PPqrR1TkjSKUU6//CVw7yXrDgFPV9UtwNPDsiRpxjaMelV9FXj9ktX3A0eH\n20eB/VOeS5I0gUlfKL2+qs4ADD+vm95IkqRJbfm7X5IcTLKcZPn8+fNb/XSSNNcmjfrZJHsBhp/n\n1rtjVR2pqqWqWlpYWJjw6SRJo5g06k8AB4bbB4Bj0xlHkrQZo7yl8XPA14B3Jzmd5CHgMPC+JCeB\n9w3LkqQZ2/Dio6p6cJ1fvXfKs0jaAl4oNV/8mABJasSoS1IjRl2SGjHqktSIUZekRoy6JDVi1CWp\nEaMuSY0YdUlqpO3X2XkVnXa7We3D/r+zu3mkLkmNGHVJasSoS1Ijbc+pSzuN56q1HTxSl6RGjLok\nNWLUJakRoy5JjRh1SWrEqEtSI0Zdkhox6pLUiBcfSdoUL6raWTxSl6RGjLokNWLUJakRoy5JjfhC\nqaRdyxdpV/NIXZIaMeqS1IhRl6RGjLokNeILpZI0JTvhhVuP1CWpEaMuSY0YdUlqxHPqkrbdTjj3\nPIrdMufFNnWknuTeJN9M8kqSQ9MaSpI0mYmjnmQP8KfALwC3Ag8muXVag0mSxreZI/U7gFeq6tWq\n+h/gMeD+6YwlSZrEZqJ+A/CfFy2fHtZJkmYkVTXZA5MHgJ+vql8flj8E3FFVv3nJ/Q4CB4fFdwPf\nnHzcXeFa4NuzHmKHcZus5jZZzW2y2oVt8qNVtTDKAzbz7pfTwI0XLe8DXrv0TlV1BDiyiefZVZIs\nV9XSrOfYSdwmq7lNVnObrDbJNtnM6Zd/AW5JcnOSK4EPAk9s4t8nSdqkiY/Uq+qtJB8B/g7YAzxa\nVS9PbTJJ0tg2dfFRVX0F+MqUZulibk41jcFtsprbZDW3yWpjb5OJXyiVJO08fvaLJDVi1Kckyakk\nLyZ5IcnyrOeZlSSPJjmX5KWL1l2T5KkkJ4efV89yxu22zjb5RJL/GvaXF5L84ixn3G5JbkzyTJIT\nSV5O8vCwfm73lctsk7H2FU+/TEmSU8BSVc31+2yT/AzwJvBXVfUTw7o/Al6vqsPDZwRdXVUfm+Wc\n22mdbfIJ4M2q+uNZzjYrSfYCe6vq+SQ/AhwH9gO/xpzuK5fZJh9gjH3FI3VNVVV9FXj9ktX3A0eH\n20dZ2VHnxjrbZK5V1Zmqen64/T3gBCtXpM/tvnKZbTIWoz49Bfx9kuPDVbT6P9dX1RlY2XGB62Y8\nz07xkSTfGE7PzM1phkslWQRuB57DfQVYtU1gjH3FqE/PXVX1U6x8auWHh79yS+v5M+DHgduAM8Cf\nzHac2UjyTuCLwEer6ruznmcnWGObjLWvGPUpqarXhp/ngL9h5VMsteLscL7wwnnDczOeZ+aq6mxV\nfb+q3gb+nDncX5JcwUq8P
ltVXxpWz/W+stY2GXdfMepTkOQdwwsbJHkH8HPAS5d/1Fx5Ajgw3D4A\nHJvhLDvChXANfoU521+SBHgEOFFVn7roV3O7r6y3TcbdV3z3yxQk+TFWjs5h5Srdv66qP5zhSDOT\n5HPA3ax8utxZ4PeAvwUeB24CvgU8UFVz88LhOtvkblb+Ol3AKeA3LpxLngdJfhr4R+BF4O1h9cdZ\nOYc8l/vKZbbJg4yxrxh1SWrE0y+S1IhRl6RGjLokNWLUJakRoy5JjRh1SWrEqEtSI0Zdkhr5XzjN\n5muAK5tRAAAAAElFTkSuQmCC\n", 700 | "text/plain": [ 701 | "" 702 | ] 703 | }, 704 | "metadata": {}, 705 | "output_type": "display_data" 706 | } 707 | ], 708 | "source": [ 709 | "plt.hist(list(map(len,Y)),bins=50);" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 85, 715 | "metadata": {}, 716 | "outputs": [ 717 | { 718 | "data": { 719 | "text/plain": [ 720 | "379" 721 | ] 722 | }, 723 | "execution_count": 85, 724 | "metadata": {}, 725 | "output_type": "execute_result" 726 | } 727 | ], 728 | "source": [ 729 | "X = [[word2idx[token] for token in text.split()] for text in texts]\n", 730 | "len(X)" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": 86, 736 | "metadata": {}, 737 | "outputs": [ 738 | { 739 | "data": { 740 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAADpRJREFUeJzt3W+MZfVdx/H3V0qpKUSgO5ANsA4Q\nUkuMLmTckKxpsNiWgnEhoQZiYB9gttFiIGJ0aRPFByZohBqThmYRZFVKWwsE0mJbAhjSRMFZusCS\nLQXbVYHN7hLaQp9Uga8P7m9g3Myf+3fu2S/vVzK555577tzP/nbmM+eee373RmYiSarlZ6YdQJI0\nfpa7JBVkuUtSQZa7JBVkuUtSQZa7JBVkuUtSQauWe0ScFhGPRsTeiHg2Iq5t62+MiJciYnf7umjy\ncSVJ/YjVJjFFxHpgfWY+GRHHAbuAS4DfAn6SmX81+ZiSpEG8Z7UNMnM/sL8tvx4Re4FThnmwdevW\n5ezs7DB3laR3rV27dr2SmTOD3GfVcl8sImaBc4DHgc3ANRFxFTAPXJ+ZP1zp/rOzs8zPzw/ykJL0\nrhcR/znoffp+QTUijgXuAa7LzNeAW4EzgY309uxvXuZ+2yJiPiLmDx06NGg+SdIQ+ir3iDiaXrHf\nlZn3AmTmgcx8MzPfAm4DNi1138zckZlzmTk3MzPQswpJ0pD6OVsmgNuBvZl5y6L16xdtdimwZ/zx\nJEnD6OeY+2bgSuCZiNjd1n0GuCIiNgIJ7AM+NZGEkqSB9XO2zLeBWOKmB8cfR5I0Ds5QlaSCLHdJ\nKshyl6SCLHdJKmigGaqqZXb715dcv++mi9c4iaRxc89dkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWp\nIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtd\nkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpoFXLPSJO\ni4hHI2JvRDwbEde29SdGxEMR8Xy7PGHycSVJ/ehnz/0N4PrM/BBwHvDpiDgb2A48nJlnAQ+365Kk\nDli13DNzf2Y+2ZZfB/YCpwBbgJ1ts53AJZMKKUkazEDH3CNiFjgHeBw4OTP3Q+8PAHDSMvfZFhHz\nETF/6NCh0dJKkvrSd7lHxLHAPcB1mflav/fLzB2ZOZeZczMzM8NklCQNqK9yj4ij6RX7XZl5b1t9\nICLWt9vXAwcnE1GSNKh+zpYJ4HZgb2besuimB4CtbXkrcP/440mShvGePrbZDFwJPBMRu9u6zwA3\nAV+JiKuB/wI+OZmIkqRBrVrumfltIJa5+YLxxpEkjYMzVCWpIMtdkgqy3CWpoH5eUJWmbnb71wfa\nft9NF08oiXRkcM9dkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgryPHeVtNx58YOe/77S+fWe\nS68uc89dkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpICcxaSwGnTQ0rklGkpbmnrsk\nFWS5S1JBlrskFWS5S1JBlrskFWS5S1JBlrskFWS5S1JBTmI6AjkBaHiOnd4t3HOXpIIsd0kqyHKX\npIIsd0kqaNVyj4g7IuJgROxZtO7GiHgpIna3r4smG1OSNIh+9tzvBC5cYv3nMnNj+3pwvLEkSaNY\ntdwz8zHg1TXIIkkak1HOc78mIq4C5oHrM/OHS20UEduAbQAbNmwY4eG0VpY7Fxw8H1w6Ugz7guqt\nwJnARmA/cPNyG2bmjsycy8y5mZmZIR9OkjSIoco9Mw9k5puZ+RZwG7BpvLEkSaMYqtwjYv2iq5cC\ne5bbVpK09lY95h4RdwPnA+si4kXgT4HzI2IjkMA+4FMTzChJG
tCq5Z6ZVyyx+vYJZJEkjYkzVCWp\nIMtdkgqy3CWpID+soxA/iGJtOd7qMvfcJakgy12SCrLcJakgy12SCrLcJakgy12SCrLcJakgy12S\nCnISkzRmTm5SF7jnLkkFWe6SVJDlLkkFWe6SVJDlLkkFWe6SVJDlLkkFeZ77BHTtPOfl8kiqyz13\nSSrIcpekgix3SSrIcpekgix3SSrIcpekgix3SSrIcpekgpzE1GEVJh8N+m+o8G+WusA9d0kqyHKX\npIIsd0kqyHKXpIJWLfeIuCMiDkbEnkXrToyIhyLi+XZ5wmRjSpIG0c+e+53AhYet2w48nJlnAQ+3\n65Kkjli13DPzMeDVw1ZvAXa25Z3AJWPOJUkawbDH3E/OzP0A7fKk8UWSJI1q4pOYImIbsA1gw4YN\nk344aShOnlI1w+65H4iI9QDt8uByG2bmjsycy8y5mZmZIR9OkjSIYcv9AWBrW94K3D+eOJKkcejn\nVMi7gX8FPhgRL0bE1cBNwEcj4nngo+26JKkjVj3mnplXLHPTBWPOIkkaE2eoSlJBlrskFWS5S1JB\nlrskFWS5S1JBlrskFWS5S1JBlrskFWS5S1JBlrskFWS5S1JBlrskFTTxD+tQLX6oxfCWG7t9N128\nxkn0buCeuyQVZLlLUkGWuyQVZLlLUkGWuyQVZLlLUkGWuyQVdMSc5+45wpLUP/fcJakgy12SCrLc\nJakgy12SCrLcJakgy12SCrLcJakgy12SCjpiJjF10aAfXOFELElrxT13SSrIcpekgix3SSrIcpek\ngkZ6QTUi9gGvA28Cb2Tm3DhCSZJGM46zZX4tM18Zw/eRJI2Jh2UkqaBRyz2Bb0XErojYNo5AkqTR\njXpYZnNmvhwRJwEPRcR3M/OxxRu00t8GsGHDhhEfrqZBJ0NJgxh08pyT7WoYac89M19ulweB+4BN\nS2yzIzPnMnNuZmZmlIeTJPVp6HKPiPdHxHELy8DHgD3jCiZJGt4oh2VOBu6LiIXv88XM/MZYUkmS\nRjJ0uWfm94FfHmMWSdKYeCqkJBVkuUtSQZa7JBXkh3X0wfPQVZE/16s7ks/5d89dkgqy3CWpIMtd\nkgqy3CWpIMtdkgqy3CWpIMtdkgqy3CWpICcxSVNW+cM0Bp0o1cV/w5HKPXdJKshyl6SCLHdJKshy\nl6SCLHdJKshyl6SCLHdJKshyl6SCyk5iGuZTZpxAoS4Z9Gd40p+sVGHy1KATw45k7rlLUkGWuyQV\nZLlLUkGWuyQVZLlLUkGWuyQVZLlLUkFlz3MfRsVzXaUKxnWO/bTmAsDazwdwz12SCrLcJakgy12S\nCrLcJamgkco9Ii6MiOci4oWI2D6uUJKk0Qxd7hFxFPB54BPA2cAVEXH2uIJJkoY3yp77JuCFzPx+\nZv4P8CVgy3hiSZJGMUq5nwL896LrL7Z1kqQpi8wc7o4RnwQ+npm/065fCWzKzN8/bLttwLZ29YPA\nc0M83DrglaGCTl6Xs4H5RmW+0XQ5X5ezwf/P9/OZOTPInUeZofoicNqi66cCLx++UWbuAHaM8DhE\nxHxmzo3yPSaly9nAfKMy32i6nK/L2WD0fKMclvl34KyIOD0i3gtcDjwwwveTJI3J0HvumflGRFwD\nfBM4CrgjM58dWzJJ0tBGeuOwzHwQeHBMWVYy0mGdCetyNjDfqMw3mi7n63I2GPVw9rAvqEqSusu3\nH5Ckgjpd7l18e4OI2BcRz0TE7oiYb+tOjIiHIuL5dnnCGua5IyIORsSeReuWzBM9f9PG8+mIOHdK\n+W6MiJfaGO6OiIsW3XZDy/dcRHx8wtlOi4hHI2JvRDwbEde29Z0YvxXydWX83hcRT0TEUy3fn7X1\np0fE4238vtxOuCAijmnXX2i3z04p350R8YNF47exrZ/G78dREfGdiPhauz6+scvMTn7Re5H2P4Az\ngPcCTwFndyDXPmDdYev+E
tjelrcDf7GGeT4MnAvsWS0PcBHwz0AA5wGPTynfjcAfLrHt2e3/+Rjg\n9Pb/f9QEs60Hzm3LxwHfaxk6MX4r5OvK+AVwbFs+Gni8jctXgMvb+i8Av9uWfw/4Qlu+HPjyhMdv\nuXx3Apctsf00fj/+APgi8LV2fWxj1+U99yPp7Q22ADvb8k7gkrV64Mx8DHi1zzxbgL/Pnn8Djo+I\n9VPIt5wtwJcy86eZ+QPgBXo/B5PKtj8zn2zLrwN76c2y7sT4rZBvOWs9fpmZP2lXj25fCXwE+Gpb\nf/j4LYzrV4ELIiKmkG85a/r/GxGnAhcDf9uuB2Mcuy6Xe1ff3iCBb0XErujNvgU4OTP3Q+8XEjhp\naulWztOlMb2mPfW9Y9FhrKnla09zz6G3d9e58TssH3Rk/Nphhd3AQeAhes8WfpSZbyyR4e187fYf\nAx9Yy3yZuTB+f97G73MRcczh+ZbIPgl/DfwR8Fa7/gHGOHZdLvel/ip14dSezZl5Lr13w/x0RHx4\n2oEG0JUxvRU4E9gI7Adubuunki8ijgXuAa7LzNdW2nSJddPI15nxy8w3M3MjvRnqm4APrZBh6vki\n4heBG4BfAH4FOBH447XOFxG/ARzMzF2LV6/w+ANn63K59/X2BmstM19ulweB++j9QB9YePrWLg9O\nLyGskKcTY5qZB9ov3VvAbbxz6GDN80XE0fSK867MvLet7sz4LZWvS+O3IDN/BPwLvWPVx0fEwhya\nxRneztdu/zn6P2Q3rnwXtsNdmZk/Bf6O6YzfZuA3I2IfvUPOH6G3Jz+2setyuXfu7Q0i4v0RcdzC\nMvAxYE/LtbVtthW4fzoJ37ZcngeAq9pZAecBP144/LCWDjuOeSm9MVzId3k7M+B04CzgiQnmCOB2\nYG9m3rLopk6M33L5OjR+MxFxfFv+WeDX6b0u8ChwWdvs8PFbGNfLgEeyvUK4hvm+u+gPd9A7pr14\n/Nbk/zczb8jMUzNzll63PZKZv804x27SrwaP8kXv1evv0TuO99kO5DmD3tkITwHPLmSid+zrYeD5\ndnniGma6m95T8/+l99f96uXy0Htq9/k2ns8Ac1PK9w/t8Z9uP7TrF23/2ZbvOeATE872q/Se2j4N\n7G5fF3Vl/FbI15Xx+yXgOy3HHuBPFv2ePEHvBd1/Ao5p69/Xrr/Qbj9jSvkeaeO3B/hH3jmjZs1/\nP9rjns87Z8uMbeycoSpJBXX5sIwkaUiWuyQVZLlLUkGWuyQVZLlLUkGWuyQVZLlLUkGWuyQV9H+4\nytjNNBPLlwAAAABJRU5ErkJggg==\n", 741 | "text/plain": [ 742 | "" 743 | ] 744 | }, 745 | "metadata": {}, 746 | "output_type": "display_data" 747 | } 748 | ], 749 | "source": [ 750 | "plt.hist(list(map(len,X)),bins=50);" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": 87, 756 | "metadata": {}, 757 | "outputs": [ 758 | { 759 | "name": "stdout", 760 | "output_type": "stream", 761 | "text": [ 762 | "[[-0.01773875 0.06372642 0.03280158 ..., -0.01024496 -0.06710091\n", 763 | " -0.05544016]\n", 764 | " [-0.06625115 0.01928704 -0.02624818 ..., 0.05614735 0.05473008\n", 765 | " 0.03957156]\n", 766 | " [-0.0038194 -0.024487 0.072812 ..., -0.01459 0.08278 0.027062 ]\n", 
767 | " ..., \n", 768 | " [ 0.06982313 -0.02670071 -0.03871925 ..., -0.00267477 -0.01187393\n", 769 | " -0.05748738]\n", 770 | " [ 0.04286668 -0.0481842 -0.01529906 ..., 0.0564503 0.03692646\n", 771 | " 0.03450374]\n", 772 | " [ 0.01269256 0.03835368 -0.04946906 ..., 0.01590619 -0.05742016\n", 773 | " 0.03449618]]\n" 774 | ] 775 | } 776 | ], 777 | "source": [ 778 | "print(embedding)" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": 88, 784 | "metadata": { 785 | "collapsed": true 786 | }, 787 | "outputs": [], 788 | "source": [ 789 | "with open('%s.pkl'%FN,\"wb\") as fp:\n", 790 | " pickle.dump((embedding, idx2word, word2idx, glove_idx2idx),fp,-1)\n", 791 | " \n", 792 | "with open('%s.data.pkl'%FN,\"wb\") as fp:\n", 793 | " pickle.dump((X,Y),fp,-1)" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": null, 799 | "metadata": { 800 | "collapsed": true 801 | }, 802 | "outputs": [], 803 | "source": [] 804 | }, 805 | { 806 | "cell_type": "code", 807 | "execution_count": null, 808 | "metadata": { 809 | "collapsed": true 810 | }, 811 | "outputs": [], 812 | "source": [] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "execution_count": null, 817 | "metadata": { 818 | "collapsed": true 819 | }, 820 | "outputs": [], 821 | "source": [] 822 | } 823 | ], 824 | "metadata": { 825 | "kernelspec": { 826 | "display_name": "Python 3", 827 | "language": "python", 828 | "name": "python3" 829 | }, 830 | "language_info": { 831 | "codemirror_mode": { 832 | "name": "ipython", 833 | "version": 3 834 | }, 835 | "file_extension": ".py", 836 | "mimetype": "text/x-python", 837 | "name": "python", 838 | "nbconvert_exporter": "python", 839 | "pygments_lexer": "ipython3", 840 | "version": "3.6.3" 841 | } 842 | }, 843 | "nbformat": 4, 844 | "nbformat_minor": 2 845 | } 846 | -------------------------------------------------------------------------------- /p1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/p1.png -------------------------------------------------------------------------------- /p2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/p2.png -------------------------------------------------------------------------------- /p3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/p3.png -------------------------------------------------------------------------------- /p4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/p4.png -------------------------------------------------------------------------------- /p5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/p5.png -------------------------------------------------------------------------------- /p6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/p6.png -------------------------------------------------------------------------------- /p7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mzhao98/text-summarization/61be858da5b81515615a9f047435e3abf41a79f3/p7.png -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | """Predict a title for a 
recipe.""" 2 | from os import path 3 | import random 4 | import json 5 | import pickle 6 | import h5py 7 | import numpy as np 8 | from utils import str_shape 9 | import keras.backend as K 10 | import argparse 11 | 12 | from config import path_models, path_data 13 | from constants import FN1, FN0, nb_unknown_words, eos 14 | from model import create_model 15 | from sample_gen import gensamples 16 | 17 | # set seeds in random libraries 18 | seed = 42 19 | random.seed(seed) 20 | np.random.seed(seed) 21 | 22 | 23 | def load_weights(model, filepath): 24 | """Load all weights possible into model from filepath. 25 | 26 | This is a modified version of keras load_weights that loads as much as it can 27 | if there is a mismatch between file and model. It returns the weights 28 | of the first layer in which the mismatch has happened 29 | """ 30 | print('Loading', filepath, 'to', model.name) 31 | with h5py.File(filepath, mode='r') as f: 32 | # new file format 33 | layer_names = [n.decode('utf8') for n in f.attrs['layer_names']] 34 | 35 | # we batch weight value assignments in a single backend call 36 | # which provides a speedup in TensorFlow. 
37 | weight_value_tuples = [] 38 | for name in layer_names: 39 | print(name) 40 | g = f[name] 41 | weight_names = [n.decode('utf8') for n in g.attrs['weight_names']] 42 | if len(weight_names): 43 | weight_values = [g[weight_name] for weight_name in weight_names] 44 | try: 45 | layer = model.get_layer(name=name) 46 | except: 47 | layer = None 48 | if not layer: 49 | print('failed to find layer', name, 'in model') 50 | print('weights', ' '.join(str_shape(w) for w in weight_values)) 51 | print('stopping to load all other layers') 52 | weight_values = [np.array(w) for w in weight_values] 53 | break 54 | symbolic_weights = layer.trainable_weights + layer.non_trainable_weights 55 | weight_value_tuples += zip(symbolic_weights, weight_values) 56 | weight_values = None 57 | K.batch_set_value(weight_value_tuples) 58 | return weight_values 59 | 60 | 61 | def main(sample_str=None): 62 | """Predict a title for a recipe.""" 63 | # load model parameters used for training 64 | with open(path.join(path_models, 'model_params.json'), 'r') as f: 65 | model_params = json.load(f) 66 | 67 | # create placeholder model 68 | model = create_model(**model_params) 69 | 70 | # load weights from training run 71 | load_weights(model, path.join(path_models, '{}.hdf5'.format(FN1))) 72 | 73 | # load recipe titles and descriptions 74 | with open(path.join(path_data, 'vocabulary-embedding.data.pkl'), 'rb') as fp: 75 | X_data, Y_data = pickle.load(fp) 76 | 77 | # load vocabulary 78 | with open(path.join(path_data, '{}.pkl'.format(FN0)), 'rb') as fp: 79 | embedding, idx2word, word2idx, glove_idx2idx = pickle.load(fp) 80 | vocab_size, embedding_size = embedding.shape 81 | oov0 = vocab_size - nb_unknown_words 82 | 83 | if sample_str is None: 84 | # load random recipe description if none provided 85 | i = np.random.randint(len(X_data)) 86 | sample_str = '' 87 | sample_title = '' 88 | for w in X_data[i]: 89 | sample_str += idx2word[w] + ' ' 90 | for w in Y_data[i]: 91 | sample_title += idx2word[w] + ' ' 92 
| y = Y_data[i] 93 | print('Randomly sampled recipe:') 94 | print(sample_title) 95 | print(sample_str) 96 | else: 97 | sample_title = '' 98 | y = [eos] 99 | 100 | x = [word2idx[w.rstrip('^')] for w in sample_str.split()] 101 | 102 | samples = gensamples( 103 | skips=2, 104 | k=1, 105 | batch_size=2, 106 | short=False, 107 | temperature=1., 108 | use_unk=True, 109 | model=model, 110 | data=(x, y), 111 | idx2word=idx2word, 112 | oov0=oov0, 113 | glove_idx2idx=glove_idx2idx, 114 | vocab_size=vocab_size, 115 | nb_unknown_words=nb_unknown_words, 116 | ) 117 | 118 | headline = samples[0][0][len(samples[0][1]):] 119 | ' '.join(idx2word[w] for w in headline) 120 | 121 | if __name__ == '__main__': 122 | parser = argparse.ArgumentParser() 123 | parser.add_argument('--sample-str', type=str, default=None, help='Sample recipe description') 124 | args = parser.parse_args() 125 | main(sample_str=args.sample_str) 126 | -------------------------------------------------------------------------------- /simpler.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pickle\n", 13 | "import string " 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import os\n", 26 | "\n", 27 | "def split():\n", 28 | " titles = []\n", 29 | " texts = []\n", 30 | " root = 'Part1'\n", 31 | " \n", 32 | " #dirr = 'Part1/awards_1990/awd_1990_00/'\n", 33 | " dirs = os.listdir('Part1/awards_1990/awd_1990_00/')\n", 34 | "\n", 35 | " for filename in dirs[1:]:\n", 36 | " #iter = 0\n", 37 | " #print(dirs[1])\n", 38 | " \n", 39 | " #print(iter)\n", 40 | " #iter += 1\n", 41 | " #print(dirs[1:])\n", 42 | " #filename = 
'Part1/awards_1990/awd_1990_00/a9000006.txt'\n", 43 | " f = open('Part1/awards_1990/awd_1990_00/' + str(filename))\n", 44 | " addTitle = False\n", 45 | " addTexts = False\n", 46 | " title = []\n", 47 | " text = []\n", 48 | " for word in f.read().split():\n", 49 | " if (word == \"Title\"):\n", 50 | " addTitle = True\n", 51 | " continue\n", 52 | "\n", 53 | " if (word == \"Type\"):\n", 54 | " addTitle = False\n", 55 | "\n", 56 | "# if (addTexts == True and word == \"\\n\"):\n", 57 | "# addTexts = False\n", 58 | "# break\n", 59 | "\n", 60 | "\n", 61 | " if (word == \"Abstract\"):\n", 62 | " addTexts = True\n", 63 | " continue\n", 64 | "\n", 65 | " if(addTitle == True):\n", 66 | " title.append(word)\n", 67 | "\n", 68 | " if(addTexts == True):\n", 69 | " text.append(word)\n", 70 | "\n", 71 | " for i in range(len(title)):\n", 72 | " s = title[i]\n", 73 | " table = str.maketrans({key: None for key in string.punctuation})\n", 74 | " new_s = s.translate(table)\n", 75 | " title[i] = new_s\n", 76 | " for i in range(len(text)):\n", 77 | " s = text[i]\n", 78 | " table = str.maketrans({key: None for key in string.punctuation})\n", 79 | " new_s = s.translate(table)\n", 80 | " text[i] = new_s\n", 81 | "\n", 82 | " title = ' '.join(title)\n", 83 | " text =' '.join(text)\n", 84 | " titles.append(title)\n", 85 | " texts.append(text)\n", 86 | "\n", 87 | "# f=open(\"titles.txt\", 'w')\n", 88 | "# for i in titles:\n", 89 | "# f.write(i)\n", 90 | "# f.write(' ')\n", 91 | "\n", 92 | "# t=open(\"texts.txt\", 'w')\n", 93 | "# for i in texts:\n", 94 | "# t.write(i)\n", 95 | "# t.write(' ')\n", 96 | "\n", 97 | "# f.close()\n", 98 | "# t.close()\n", 99 | " return titles, texts\n" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 3, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "titles, texts = split()\n" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 4, 116 | "metadata": { 117 | 
"scrolled": false 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "379\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "print(len(titles))" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 5, 135 | "metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "from collections import Counter\n", 141 | "from itertools import chain\n", 142 | "def get_vocab(lst):\n", 143 | " vocabcount = Counter(w for txt in lst for w in txt.split())\n", 144 | " vocab = map(lambda x: x[0], sorted(vocabcount.items(), key=lambda x: -x[1]))\n", 145 | " return list(vocab), vocabcount" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 7, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "' CRB Genetic Diversity of Endangered Populations of Mysticete Whales Mitochondrial DNA and Historical Demography'" 157 | ] 158 | }, 159 | "execution_count": 7, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "titles[0]" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 8, 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "vocab, vocabcount = get_vocab(titles+texts)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 10, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "9073" 188 | ] 189 | }, 190 | "execution_count": 10, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "len(vocabcount)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 31, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "from keras.models import Sequential\n", 208 | "from keras.engine.topology import Input\n", 209 | "from 
keras.layers.core import Dense, Activation, Dropout, RepeatVector\n", 210 | "from keras.layers.wrappers import TimeDistributed\n", 211 | "from keras.layers.recurrent import LSTM\n", 212 | "from keras.layers.embeddings import Embedding\n", 213 | "from keras.regularizers import l2\n", 214 | "from keras.layers import merge, SpatialDropout1D\n", 215 | "from keras.callbacks import TensorBoard\n", 216 | "from keras import Model\n", 217 | "import keras\n", 218 | "import random" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 20, 224 | "metadata": { 225 | "collapsed": true 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "vocab_size = 9073\n", 230 | "src_txt_length = 379\n", 231 | "sum_txt_length = 379\n", 232 | "# encoder input model\n", 233 | "inputs = Input(shape=(src_txt_length,))\n", 234 | "encoder1 = Embedding(vocab_size, 128)(inputs)\n", 235 | "encoder2 = LSTM(128)(encoder1)\n", 236 | "encoder3 = RepeatVector(sum_txt_length)(encoder2)\n", 237 | "# decoder output model\n", 238 | "decoder1 = LSTM(128, return_sequences=True)(encoder3)\n", 239 | "outputs = TimeDistributed(Dense(vocab_size, activation='softmax'))(decoder1)\n", 240 | "# tie it together\n", 241 | "model = Model(inputs=inputs, outputs=outputs)\n", 242 | "model.compile(loss='categorical_crossentropy', optimizer='adam')" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 23, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "batch_size = 10\n", 254 | "seed = 0\n", 255 | "def gen(Xd, Xh, batch_size=batch_size, nb_batches=None, nflips=None, model=None, debug=False, seed=seed):\n", 256 | " \"\"\"yield batches. 
for training use nb_batches=None\n", 257 | " for validation generate deterministic results repeating every nb_batches\n", 258 | " \n", 259 | " while training it is good idea to flip once in a while the values of the headlines from the\n", 260 | " value taken from Xh to value generated by the model.\n", 261 | " \"\"\"\n", 262 | " c = nb_batches if nb_batches else 0\n", 263 | " while True:\n", 264 | " xds = []\n", 265 | " xhs = []\n", 266 | " if nb_batches and c >= nb_batches:\n", 267 | " c = 0\n", 268 | " new_seed = random.randint(0, 9223372036854775807)\n", 269 | " random.seed(c+123456789+seed)\n", 270 | " for b in range(batch_size):\n", 271 | " t = random.randint(0,len(Xd)-1)\n", 272 | "\n", 273 | " xd = Xd[t]\n", 274 | " s = random.randint(min(maxlend,len(xd)), max(maxlend,len(xd)))\n", 275 | " xds.append(xd[:s])\n", 276 | " \n", 277 | " xh = Xh[t]\n", 278 | " s = random.randint(min(maxlenh,len(xh)), max(maxlenh,len(xh)))\n", 279 | " xhs.append(xh[:s])\n", 280 | "\n", 281 | " # undo the seeding before we yield inorder not to affect the caller\n", 282 | " c+= 1\n", 283 | " random.seed(new_seed)\n", 284 | "\n", 285 | " yield conv_seq_labels(xds, xhs, nflips=nflips, model=model, debug=debug)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 6, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "name": "stderr", 295 | "output_type": "stream", 296 | "text": [ 297 | "Using TensorFlow backend.\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "FN = 'train'\n", 303 | "FN0 = 'vocabulary-embedding'\n", 304 | "FN1 = 'train'\n", 305 | "\n", 306 | "import os\n", 307 | "import keras\n", 308 | "keras.__version__\n", 309 | "with open('%s.data.pkl'%FN0, 'rb') as fp:\n", 310 | " X, Y = pickle.load(fp)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 7, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "ename": "NameError", 320 | "evalue": "name 'gen' is not defined", 321 | "output_type": "error", 322 | 
"traceback": [ 323 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 324 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 325 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mnflips\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtraingen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnflips\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnflips\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 326 | "\u001b[0;31mNameError\u001b[0m: name 'gen' is not defined" 327 | ] 328 | } 329 | ], 330 | "source": [ 331 | "nflips = 10\n", 332 | "traingen = gen(X, Y, batch_size=batch_size, nflips=nflips, model=model)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "len(Y)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "model.fit(X, Y)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": { 357 | "collapsed": true 358 | }, 359 | "outputs": [], 360 | "source": [] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": { 366 | "collapsed": true 367 | }, 368 | "outputs": [], 369 | "source": [] 370 | } 371 | ], 372 | "metadata": { 373 | "kernelspec": { 374 | "display_name": "Python 3", 375 | "language": "python", 376 | "name": "python3" 377 | }, 378 | "language_info": { 379 | 
"codemirror_mode": { 380 | "name": "ipython", 381 | "version": 3 382 | }, 383 | "file_extension": ".py", 384 | "mimetype": "text/x-python", 385 | "name": "python", 386 | "nbconvert_exporter": "python", 387 | "pygments_lexer": "ipython3", 388 | "version": "3.6.3" 389 | } 390 | }, 391 | "nbformat": 4, 392 | "nbformat_minor": 2 393 | } 394 | -------------------------------------------------------------------------------- /stopWords.txt: -------------------------------------------------------------------------------- 1 | a 2 | a's 3 | able 4 | about 5 | above 6 | according 7 | accordingly 8 | across 9 | actually 10 | after 11 | afterwards 12 | again 13 | against 14 | ain't 15 | all 16 | allow 17 | allows 18 | almost 19 | alone 20 | along 21 | already 22 | also 23 | although 24 | always 25 | am 26 | among 27 | amongst 28 | an 29 | and 30 | another 31 | any 32 | anybody 33 | anyhow 34 | anyone 35 | anything 36 | anyway 37 | anyways 38 | anywhere 39 | apart 40 | appear 41 | appreciate 42 | appropriate 43 | are 44 | aren't 45 | around 46 | as 47 | aside 48 | ask 49 | asking 50 | associated 51 | at 52 | available 53 | away 54 | awfully 55 | b 56 | be 57 | became 58 | because 59 | become 60 | becomes 61 | becoming 62 | been 63 | before 64 | beforehand 65 | behind 66 | being 67 | believe 68 | below 69 | beside 70 | besides 71 | best 72 | better 73 | between 74 | beyond 75 | both 76 | brief 77 | but 78 | by 79 | c 80 | c'mon 81 | c's 82 | came 83 | can 84 | can't 85 | cannot 86 | cant 87 | cause 88 | causes 89 | certain 90 | certainly 91 | changes 92 | clearly 93 | co 94 | com 95 | come 96 | comes 97 | concerning 98 | consequently 99 | consider 100 | considering 101 | contain 102 | containing 103 | contains 104 | corresponding 105 | could 106 | couldn't 107 | course 108 | currently 109 | d 110 | definitely 111 | described 112 | despite 113 | did 114 | didn't 115 | different 116 | do 117 | does 118 | doesn't 119 | doing 120 | don't 121 | done 122 | down 123 | downwards 124 | 
during 125 | e 126 | each 127 | edu 128 | eg 129 | eight 130 | either 131 | else 132 | elsewhere 133 | enough 134 | entirely 135 | especially 136 | et 137 | etc 138 | even 139 | ever 140 | every 141 | everybody 142 | everyone 143 | everything 144 | everywhere 145 | ex 146 | exactly 147 | example 148 | except 149 | f 150 | far 151 | few 152 | fifth 153 | first 154 | five 155 | followed 156 | following 157 | follows 158 | for 159 | former 160 | formerly 161 | forth 162 | four 163 | from 164 | further 165 | furthermore 166 | g 167 | get 168 | gets 169 | getting 170 | given 171 | gives 172 | go 173 | goes 174 | going 175 | gone 176 | got 177 | gotten 178 | greetings 179 | h 180 | had 181 | hadn't 182 | happens 183 | hardly 184 | has 185 | hasn't 186 | have 187 | haven't 188 | having 189 | he 190 | he's 191 | hello 192 | help 193 | hence 194 | her 195 | here 196 | here's 197 | hereafter 198 | hereby 199 | herein 200 | hereupon 201 | hers 202 | herself 203 | hi 204 | him 205 | himself 206 | his 207 | hither 208 | hopefully 209 | how 210 | howbeit 211 | however 212 | i 213 | i'd 214 | i'll 215 | i'm 216 | i've 217 | ie 218 | if 219 | ignored 220 | immediate 221 | in 222 | inasmuch 223 | inc 224 | indeed 225 | indicate 226 | indicated 227 | indicates 228 | inner 229 | insofar 230 | instead 231 | into 232 | inward 233 | is 234 | isn't 235 | it 236 | it'd 237 | it'll 238 | it's 239 | its 240 | itself 241 | j 242 | just 243 | k 244 | keep 245 | keeps 246 | kept 247 | know 248 | knows 249 | known 250 | l 251 | last 252 | lately 253 | later 254 | latter 255 | latterly 256 | least 257 | less 258 | lest 259 | let 260 | let's 261 | like 262 | liked 263 | likely 264 | little 265 | look 266 | looking 267 | looks 268 | ltd 269 | m 270 | mainly 271 | many 272 | may 273 | maybe 274 | me 275 | mean 276 | meanwhile 277 | merely 278 | might 279 | more 280 | moreover 281 | most 282 | mostly 283 | much 284 | must 285 | my 286 | myself 287 | n 288 | name 289 | namely 290 | nd 291 | near 292 
| nearly 293 | necessary 294 | need 295 | needs 296 | neither 297 | never 298 | nevertheless 299 | new 300 | next 301 | nine 302 | no 303 | nobody 304 | non 305 | none 306 | noone 307 | nor 308 | normally 309 | not 310 | nothing 311 | novel 312 | now 313 | nowhere 314 | o 315 | obviously 316 | of 317 | off 318 | often 319 | oh 320 | ok 321 | okay 322 | old 323 | on 324 | once 325 | one 326 | ones 327 | only 328 | onto 329 | or 330 | other 331 | others 332 | otherwise 333 | ought 334 | our 335 | ours 336 | ourselves 337 | out 338 | outside 339 | over 340 | overall 341 | own 342 | p 343 | particular 344 | particularly 345 | per 346 | perhaps 347 | placed 348 | please 349 | plus 350 | possible 351 | presumably 352 | probably 353 | provides 354 | q 355 | que 356 | quite 357 | qv 358 | r 359 | rather 360 | rd 361 | re 362 | really 363 | reasonably 364 | regarding 365 | regardless 366 | regards 367 | relatively 368 | respectively 369 | right 370 | s 371 | said 372 | same 373 | saw 374 | say 375 | saying 376 | says 377 | second 378 | secondly 379 | see 380 | seeing 381 | seem 382 | seemed 383 | seeming 384 | seems 385 | seen 386 | self 387 | selves 388 | sensible 389 | sent 390 | serious 391 | seriously 392 | seven 393 | several 394 | shall 395 | she 396 | should 397 | shouldn't 398 | since 399 | six 400 | so 401 | some 402 | somebody 403 | somehow 404 | someone 405 | something 406 | sometime 407 | sometimes 408 | somewhat 409 | somewhere 410 | soon 411 | sorry 412 | specified 413 | specify 414 | specifying 415 | still 416 | sub 417 | such 418 | sup 419 | sure 420 | t 421 | t's 422 | take 423 | taken 424 | tell 425 | tends 426 | th 427 | than 428 | thank 429 | thanks 430 | thanx 431 | that 432 | that's 433 | thats 434 | the 435 | their 436 | theirs 437 | them 438 | themselves 439 | then 440 | thence 441 | there 442 | there's 443 | thereafter 444 | thereby 445 | therefore 446 | therein 447 | theres 448 | thereupon 449 | these 450 | they 451 | they'd 452 | they'll 453 | 
they're 454 | they've 455 | think 456 | third 457 | this 458 | thorough 459 | thoroughly 460 | those 461 | though 462 | three 463 | through 464 | throughout 465 | thru 466 | thus 467 | to 468 | together 469 | too 470 | took 471 | toward 472 | towards 473 | tried 474 | tries 475 | truly 476 | try 477 | trying 478 | twice 479 | two 480 | u 481 | un 482 | under 483 | unfortunately 484 | unless 485 | unlikely 486 | until 487 | unto 488 | up 489 | upon 490 | us 491 | use 492 | used 493 | useful 494 | uses 495 | using 496 | usually 497 | uucp 498 | v 499 | value 500 | various 501 | very 502 | via 503 | viz 504 | vs 505 | w 506 | want 507 | wants 508 | was 509 | wasn't 510 | way 511 | we 512 | we'd 513 | we'll 514 | we're 515 | we've 516 | welcome 517 | well 518 | went 519 | were 520 | weren't 521 | what 522 | what's 523 | whatever 524 | when 525 | whence 526 | whenever 527 | where 528 | where's 529 | whereafter 530 | whereas 531 | whereby 532 | wherein 533 | whereupon 534 | wherever 535 | whether 536 | which 537 | while 538 | whither 539 | who 540 | who's 541 | whoever 542 | whole 543 | whom 544 | whose 545 | why 546 | will 547 | willing 548 | wish 549 | with 550 | within 551 | without 552 | won't 553 | wonder 554 | would 555 | would 556 | wouldn't 557 | x 558 | y 559 | yes 560 | yet 561 | you 562 | you'd 563 | you'll 564 | you're 565 | you've 566 | your 567 | yours 568 | yourself 569 | yourselves 570 | z 571 | zero -------------------------------------------------------------------------------- /testing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "AttributeError", 10 | "evalue": "'list' object has no attribute 'ndim'", 11 | "output_type": "error", 12 | "traceback": [ 13 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 14 | 
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", 15 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mloss\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'binary_crossentropy'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'adam'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'accuracy'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;31m# Fit the model\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 25\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m150\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 26\u001b[0m \u001b[0;31m# evaluate the model\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0mscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 16 | "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/keras/models.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)\u001b[0m\n\u001b[1;32m 963\u001b[0m 
\u001b[0minitial_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minitial_epoch\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 964\u001b[0m \u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 965\u001b[0;31m validation_steps=validation_steps)\n\u001b[0m\u001b[1;32m 966\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 967\u001b[0m def evaluate(self, x=None, y=None,\n", 17 | "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)\u001b[0m\n\u001b[1;32m 1591\u001b[0m \u001b[0mclass_weight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mclass_weight\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1592\u001b[0m \u001b[0mcheck_batch_axis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1593\u001b[0;31m batch_size=batch_size)\n\u001b[0m\u001b[1;32m 1594\u001b[0m \u001b[0;31m# Prepare validation data.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1595\u001b[0m \u001b[0mdo_validation\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 18 | "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36m_standardize_user_data\u001b[0;34m(self, x, y, sample_weight, class_weight, check_batch_axis, batch_size)\u001b[0m\n\u001b[1;32m 1424\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_feed_input_shapes\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1425\u001b[0m 
\u001b[0mcheck_batch_axis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1426\u001b[0;31m exception_prefix='input')\n\u001b[0m\u001b[1;32m 1427\u001b[0m y = _standardize_input_data(y, self._feed_output_names,\n\u001b[1;32m 1428\u001b[0m \u001b[0moutput_shapes\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 19 | "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36m_standardize_input_data\u001b[0;34m(data, names, shapes, check_batch_axis, exception_prefix)\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'DataFrame'\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexpand_dims\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mx\u001b[0m 
\u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'DataFrame'\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 20 | "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'DataFrame'\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexpand_dims\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m 
\u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'DataFrame'\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 21 | "\u001b[0;31mAttributeError\u001b[0m: 'list' object has no attribute 'ndim'" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "from keras.models import Sequential\n", 27 | "from keras.layers import Dense\n", 28 | "import numpy\n", 29 | "# fix random seed for reproducibility\n", 30 | "numpy.random.seed(7)\n", 31 | "# load pima indians dataset\n", 32 | "FN = 'train'\n", 33 | "FN0 = 'vocabulary-embedding'\n", 34 | "FN1 = 'train'\n", 35 | "\n", 36 | "import os\n", 37 | "import keras\n", 38 | "import pickle\n", 39 | "keras.__version__\n", 40 | "with open('%s.data.pkl'%FN0, 'rb') as fp:\n", 41 | " X, Y = pickle.load(fp)\n", 42 | "# create model\n", 43 | "model = Sequential()\n", 44 | "model.add(Dense(12, input_dim=8, activation='relu'))\n", 45 | "model.add(Dense(8, activation='relu'))\n", 46 | "model.add(Dense(1, activation='sigmoid'))\n", 47 | "# Compile model\n", 48 | "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 49 | "# Fit the model\n", 50 | "model.fit(X, Y, epochs=150, batch_size=10)\n", 51 | "# evaluate the model\n", 52 | "scores = model.evaluate(X, 
Y)\n", 53 | "print(\"\\n%s: %.2f%%\" % (model.metrics_names[1], scores[1]*100))" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "Python 3", 69 | "language": "python", 70 | "name": "python3" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 3 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython3", 82 | "version": "3.6.3" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 2 87 | } 88 | -------------------------------------------------------------------------------- /tokenize_recipes.py: -------------------------------------------------------------------------------- 1 | """Tokenize recipes.""" 2 | import _pickle as pickle 3 | from os import path 4 | from nltk.tokenize import word_tokenize 5 | from nltk import download 6 | from tqdm import tqdm 7 | 8 | import config 9 | import prep_data 10 | from parse_ingredients import parse_ingredient_list 11 | 12 | 13 | def tokenize_sentence(sentence): 14 | """Tokenize a sentence.""" 15 | try: 16 | return ' '.join(list(filter( 17 | lambda x: x.lower() != "advertisement", 18 | word_tokenize(sentence)))) 19 | except LookupError: 20 | print('Downloading NLTK data') 21 | download() 22 | return ' '.join(list(filter( 23 | lambda x: x.lower() != "advertisement", 24 | word_tokenize(sentence)))) 25 | 26 | 27 | def recipe_is_complete(r): 28 | """Return True if recipe is complete and False otherwise. 29 | 30 | Completeness is defined as the recipe containing a title and instructions. 
31 | """ 32 | if ('title' not in r) or ('instructions' not in r): 33 | return False 34 | if (r['title'] is None) or (r['instructions'] is None): 35 | return False 36 | return True 37 | 38 | 39 | def tokenize_recipes(recipes): 40 | """Tokenise all recipes.""" 41 | tokenized = [] 42 | for r in tqdm(recipes.values()): 43 | if recipe_is_complete(r): 44 | ingredients = '; '.join(parse_ingredient_list(r['ingredients'])) + '; ' 45 | tokenized.append(( 46 | tokenize_sentence(r['title']), 47 | tokenize_sentence(ingredients) + tokenize_sentence(r['instructions']))) 48 | return tuple(map(list, zip(*tokenized))) 49 | 50 | 51 | def pickle_recipes(recipes): 52 | """Pickle all recipe tokens to disk.""" 53 | with open(path.join(config.path_data, 'tokens.pkl'), 'wb') as f: 54 | pickle.dump(recipes, f, 2) 55 | 56 | 57 | def load_recipes(): 58 | """Read pickled recipe tokens from disk.""" 59 | with open(path.join(config.path_data, 'tokens.pkl'), 'rb') as f: 60 | recipes = pickle.load(f) 61 | return recipes 62 | 63 | 64 | def main(): 65 | """Tokenize recipes.""" 66 | recipes = prep_data.load_recipes() 67 | text_sum_data = tokenize_recipes(recipes) 68 | pickle_recipes(text_sum_data) 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /train2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 19, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "\"\"\"Define constant variables.\"\"\"\n", 12 | "\n", 13 | "# define empty and end-of-sentence vocab idx\n", 14 | "empty = 0\n", 15 | "eos = 1\n", 16 | "\n", 17 | "# input data (X) is made from maxlend description words followed by eos followed by\n", 18 | "# headline words followed by eos if description is shorter than maxlend it will be\n", 19 | "# left padded with empty if entire data is longer than maxlen it 
will be clipped and\n", 20 | "# if it is shorter it will be right padded with empty. labels (Y) are the headline\n", 21 | "# words followed by eos and clipped or padded to maxlenh. In other words the input is\n", 22 | "# made from a maxlend half in which the description is padded from the left and a\n", 23 | "# maxlenh half in which eos is followed by a headline followed by another eos if there\n", 24 | "# is enough space. The labels match only the second half and the first label matches\n", 25 | "# the eos at the start of the second half (following the description in the first half)\n", 26 | "maxlend = 100\n", 27 | "maxlenh = 15\n", 28 | "maxlen = maxlend + maxlenh\n", 29 | "activation_rnn_size = 40 if maxlend else 0\n", 30 | "nb_unknown_words = 10\n", 31 | "\n", 32 | "# function names\n", 33 | "FN0 = 'vocabulary-embedding' # filename of vocab embeddings\n", 34 | "FN1 = 'train' # filename of model weights\n", 35 | "\n", 36 | "# training variables\n", 37 | "seed = 42\n", 38 | "optimizer = 'adam'\n", 39 | "p_W, p_U, p_dense, p_emb, weight_decay = 0, 0, 0, 0, 0\n", 40 | "regularizer = None\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 20, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "from keras.models import Sequential\n", 50 | "from keras.layers.core import Dense, Activation, Dropout\n", 51 | "from keras.layers.wrappers import TimeDistributed\n", 52 | "from keras.layers.recurrent import LSTM\n", 53 | "from keras.layers.embeddings import Embedding\n", 54 | "from keras.layers.core import Lambda\n", 55 | "import keras.backend as K\n", 56 | "import numpy as np\n", 57 | "\n", 58 | "#from utils import str_shape\n", 59 | "#from constants import maxlend, maxlenh, maxlen, activation_rnn_size, optimizer, p_W, p_U, p_dense, p_emb, regularizer\n", 60 | "\n", 61 | "\n", 62 | "def inspect_model(model):\n", 63 | " \"\"\"Print the structure of Keras `model`.\"\"\"\n", 64 | " for i, l in enumerate(model.layers):\n", 65 | " print(i, 
'cls={} name={}'.format(type(l).__name__, l.name))\n", 66 | " weights = l.get_weights()\n", 67 | " print_str = ''\n", 68 | "# for weight in weights:\n", 69 | "# print_str += str_shape(weight) + ' '\n", 70 | " print(print_str)\n", 71 | " print()\n", 72 | "\n", 73 | "\n", 74 | "class SimpleContext(Lambda):\n", 75 | " \"\"\"Class to implement `simple_context` method as a Keras layer.\"\"\"\n", 76 | "\n", 77 | " def __init__(self, fn, rnn_size, **kwargs):\n", 78 | " \"\"\"Initialize SimpleContext.\"\"\"\n", 79 | " self.rnn_size = rnn_size\n", 80 | " super(SimpleContext, self).__init__(fn, **kwargs)\n", 81 | " self.supports_masking = True\n", 82 | "\n", 83 | " def compute_mask(self, input, input_mask=None):\n", 84 | " \"\"\"Compute mask of maxlend.\"\"\"\n", 85 | " return input_mask[:, maxlend:]\n", 86 | "\n", 87 | " def get_output_shape_for(self, input_shape):\n", 88 | " \"\"\"Get output shape for a given `input_shape`.\"\"\"\n", 89 | " nb_samples = input_shape[0]\n", 90 | " n = 2 * (self.rnn_size - activation_rnn_size)\n", 91 | " return (nb_samples, maxlenh, n)\n", 92 | "\n", 93 | "\n", 94 | "def create_model(vocab_size, embedding_size, LR, rnn_layers, rnn_size, embedding=None):\n", 95 | " \"\"\"Construct and compile LSTM model.\"\"\"\n", 96 | " # create a standard stacked LSTM\n", 97 | " if embedding is not None:\n", 98 | " embedding = [embedding]\n", 99 | " model = Sequential()\n", 100 | " model.add(Embedding(vocab_size, embedding_size,\n", 101 | " input_length=maxlen,\n", 102 | " W_regularizer=regularizer, dropout=p_emb, weights=embedding, mask_zero=True,\n", 103 | " name='embedding_1'))\n", 104 | " for i in range(rnn_layers):\n", 105 | " lstm = LSTM(rnn_size, return_sequences=True,\n", 106 | " W_regularizer=regularizer, U_regularizer=regularizer,\n", 107 | " b_regularizer=regularizer, dropout_W=p_W, dropout_U=p_U,\n", 108 | " name='lstm_{}'.format(i + 1))\n", 109 | " model.add(lstm)\n", 110 | " model.add(Dropout(p_dense, name='dropout_{}'.format(i + 1)))\n", 111 | 
"\n", 112 | " def simple_context(X, mask, n=activation_rnn_size):\n", 113 | " \"\"\"Reduce the input just to its headline part (second half).\n", 114 | " For each word in this part it concatenate the output of the previous layer (RNN)\n", 115 | " with a weighted average of the outputs of the description part.\n", 116 | " In this only the last `rnn_size - activation_rnn_size` are used from each output.\n", 117 | " The first `activation_rnn_size` output is used to computer the weights for the averaging.\n", 118 | " \"\"\"\n", 119 | " desc, head = X[:, :maxlend, :], X[:, maxlend:, :]\n", 120 | " head_activations, head_words = head[:, :, :n], head[:, :, n:]\n", 121 | " desc_activations, desc_words = desc[:, :, :n], desc[:, :, n:]\n", 122 | "\n", 123 | " # RTFM http://deeplearning.net/software/theano/library/tensor/basic.html#theano.tensor.batched_tensordot\n", 124 | " # activation for every head word and every desc word\n", 125 | " activation_energies = K.batch_dot(head_activations, desc_activations, axes=(2, 2))\n", 126 | " # make sure we dont use description words that are masked out\n", 127 | " activation_energies = activation_energies + -1e20 * K.expand_dims(\n", 128 | " 1. 
- K.cast(mask[:, :maxlend], 'float32'), 1)\n", 129 | "\n", 130 | " # for every head word compute weights for every desc word\n", 131 | " activation_energies = K.reshape(activation_energies, (-1, maxlend))\n", 132 | " activation_weights = K.softmax(activation_energies)\n", 133 | " activation_weights = K.reshape(activation_weights, (-1, maxlenh, maxlend))\n", 134 | "\n", 135 | " # for every head word compute weighted average of desc words\n", 136 | " desc_avg_word = K.batch_dot(activation_weights, desc_words, axes=(2, 1))\n", 137 | " return K.concatenate((desc_avg_word, head_words))\n", 138 | "\n", 139 | " if activation_rnn_size:\n", 140 | " model.add(SimpleContext(simple_context, rnn_size, name='simplecontext_1'))\n", 141 | "\n", 142 | " model.add(TimeDistributed(Dense(\n", 143 | " vocab_size,\n", 144 | " W_regularizer=regularizer,\n", 145 | " b_regularizer=regularizer,\n", 146 | " name='timedistributed_1')))\n", 147 | " model.add(Activation('softmax', name='activation_1'))\n", 148 | "\n", 149 | " # opt = Adam(lr=LR) # keep calm and reduce learning rate\n", 150 | " model.compile(loss='categorical_crossentropy', optimizer=optimizer)\n", 151 | "\n", 152 | " K.set_value(model.optimizer.lr, np.float32(LR))\n", 153 | " return model" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "# %load constants.py\n", 165 | "\"\"\"Define constant variables.\"\"\"\n", 166 | "\n", 167 | "# define empty and end-of-sentence vocab idx\n", 168 | "empty = 0\n", 169 | "eos = 1\n", 170 | "\n", 171 | "# input data (X) is made from maxlend description words followed by eos followed by\n", 172 | "# headline words followed by eos if description is shorter than maxlend it will be\n", 173 | "# left padded with empty if entire data is longer than maxlen it will be clipped and\n", 174 | "# if it is shorter it will be right padded with empty. 
labels (Y) are the headline\n", 175 | "# words followed by eos and clipped or padded to maxlenh. In other words the input is\n", 176 | "# made from a maxlend half in which the description is padded from the left and a\n", 177 | "# maxlenh half in which eos is followed by a headline followed by another eos if there\n", 178 | "# is enough space. The labels match only the second half and the first label matches\n", 179 | "# the eos at the start of the second half (following the description in the first half)\n", 180 | "maxlend = 100\n", 181 | "maxlenh = 15\n", 182 | "maxlen = maxlend + maxlenh\n", 183 | "activation_rnn_size = 40 if maxlend else 0\n", 184 | "nb_unknown_words = 10\n", 185 | "\n", 186 | "# function names\n", 187 | "FN0 = 'vocabulary-embedding' # filename of vocab embeddings\n", 188 | "FN1 = 'train' # filename of model weights\n", 189 | "\n", 190 | "# training variables\n", 191 | "seed = 42\n", 192 | "optimizer = 'adam'\n", 193 | "p_W, p_U, p_dense, p_emb, weight_decay = 0, 0, 0, 0, 0\n", 194 | "regularizer = None\n" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 27, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "ename": "ImportError", 204 | "evalue": "cannot import name 'empty'", 205 | "output_type": "error", 206 | "traceback": [ 207 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 208 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 209 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0msample_gen\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mgensamples\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mutils\u001b[0m 
\u001b[0;32mimport\u001b[0m \u001b[0mprt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mload_embedding\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprocess_vocab\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mload_split_data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcreate_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minspect_model\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 210 | "\u001b[0;32m~/Documents/cs141/sample_gen.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mkeras\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpreprocessing\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0msequence\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mconstants\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mempty\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmaxlend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmaxlenh\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmaxlen\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 211 | "\u001b[0;31mImportError\u001b[0m: cannot import name 'empty'" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "import os\n", 217 | "import time\n", 218 | "import random\n", 219 | "import argparse\n", 220 | "import json\n", 221 | "\n", 222 | "\n", 223 | "import numpy as np\n", 224 | "from keras.callbacks import TensorBoard\n", 225 | "\n", 226 | "import config\n", 227 | "from sample_gen import gensamples\n", 228 | "from utils import prt, load_embedding, process_vocab, load_split_data\n", 229 | "from model import create_model, inspect_model\n", 230 | "from generate import gen\n", 231 | "#from constants import FN1, seed, nb_unknown_words\n", 232 | "\n", 233 | "# parse 
arguments\n", 234 | "parser = argparse.ArgumentParser()\n", 235 | "parser.add_argument('--batch-size', type=int, default=32, help='input batch size')\n", 236 | "parser.add_argument('--epochs', type=int, default=10, help='number of epochs')\n", 237 | "parser.add_argument('--rnn-size', type=int, default=512, help='size of RNN layers')\n", 238 | "parser.add_argument('--rnn-layers', type=int, default=3, help='number of RNN layers')\n", 239 | "parser.add_argument('--nsamples', type=int, default=640, help='number of samples per epoch')\n", 240 | "parser.add_argument('--nflips', type=int, default=0, help='number of flips')\n", 241 | "parser.add_argument('--temperature', type=float, default=.8, help='RNN temperature')\n", 242 | "parser.add_argument('--lr', type=float, default=0.0001, help='learning rate, default=0.0001')\n", 243 | "parser.add_argument('--warm-start', action='store_true')\n", 244 | "args = parser.parse_args()\n", 245 | "batch_size = args.batch_size\n", 246 | "\n", 247 | "# set sample sizes\n", 248 | "nb_train_samples = np.int(np.floor(args.nsamples / batch_size)) * batch_size # num training samples\n", 249 | "nb_val_samples = nb_train_samples # num validation samples\n", 250 | "\n", 251 | "# seed weight initialization\n", 252 | "random.seed(seed)\n", 253 | "np.random.seed(seed)\n", 254 | "\n", 255 | "embedding, idx2word, word2idx, glove_idx2idx = load_embedding(nb_unknown_words)\n", 256 | "vocab_size, embedding_size = embedding.shape\n", 257 | "oov0 = vocab_size - nb_unknown_words\n", 258 | "idx2word = process_vocab(idx2word, vocab_size, oov0, nb_unknown_words)\n", 259 | "X_train, X_test, Y_train, Y_test = load_split_data(nb_val_samples, seed)\n", 260 | "\n", 261 | "# print a sample recipe to make sure everything looks right\n", 262 | "print('Random head, description:')\n", 263 | "i = 811\n", 264 | "prt('H', Y_train[i], idx2word)\n", 265 | "prt('D', X_train[i], idx2word)\n", 266 | "\n", 267 | "# save model initialization parameters\n", 268 | "model_params = 
(dict(\n", 269 | " vocab_size=vocab_size,\n", 270 | " embedding_size=embedding_size,\n", 271 | " LR=args.lr,\n", 272 | " rnn_layers=args.rnn_layers,\n", 273 | " rnn_size=args.rnn_size,\n", 274 | "))\n", 275 | "with open(os.path.join(config.path_models, 'model_params.json'), 'w') as f:\n", 276 | " json.dump(model_params, f)\n", 277 | "\n", 278 | "\n", 279 | "model = create_model(\n", 280 | " vocab_size=vocab_size,\n", 281 | " embedding_size=embedding_size,\n", 282 | " LR=args.lr,\n", 283 | " embedding=embedding,\n", 284 | " rnn_layers=args.rnn_layers,\n", 285 | " rnn_size=args.rnn_size,\n", 286 | ")\n", 287 | "inspect_model(model)\n", 288 | "\n", 289 | "# load pre-trained model weights\n", 290 | "FN1_filename = os.path.join(config.path_models, '{}.hdf5'.format(FN1))\n", 291 | "if args.warm_start and FN1 and os.path.exists(FN1_filename):\n", 292 | " model.load_weights(FN1_filename)\n", 293 | " print('Model weights loaded from {}'.format(FN1_filename))\n", 294 | "\n", 295 | "# print samples before training\n", 296 | "gensamples(\n", 297 | " skips=2,\n", 298 | " k=10,\n", 299 | " batch_size=batch_size,\n", 300 | " short=False,\n", 301 | " temperature=args.temperature,\n", 302 | " use_unk=True,\n", 303 | " model=model,\n", 304 | " data=(X_test, Y_test),\n", 305 | " idx2word=idx2word,\n", 306 | " oov0=oov0,\n", 307 | " glove_idx2idx=glove_idx2idx,\n", 308 | " vocab_size=vocab_size,\n", 309 | " nb_unknown_words=nb_unknown_words,\n", 310 | ")\n", 311 | "\n", 312 | "# get train and validation generators\n", 313 | "r = next(gen(X_train, Y_train, batch_size=batch_size, nb_batches=None, nflips=None, model=None, debug=False, oov0=oov0, glove_idx2idx=glove_idx2idx, vocab_size=vocab_size, nb_unknown_words=nb_unknown_words, idx2word=idx2word))\n", 314 | "traingen = gen(X_train, Y_train, batch_size=batch_size, nb_batches=None, nflips=args.nflips, model=model, debug=False, oov0=oov0, glove_idx2idx=glove_idx2idx, vocab_size=vocab_size, nb_unknown_words=nb_unknown_words, 
idx2word=idx2word)\n", 315 | "valgen = gen(X_test, Y_test, batch_size=batch_size, nb_batches=nb_val_samples // batch_size, nflips=None, model=None, debug=False, oov0=oov0, glove_idx2idx=glove_idx2idx, vocab_size=vocab_size, nb_unknown_words=nb_unknown_words, idx2word=idx2word)\n", 316 | "\n", 317 | "# define callbacks for training\n", 318 | "callbacks = [TensorBoard(\n", 319 | " log_dir=os.path.join(config.path_logs, str(time.time())),\n", 320 | " histogram_freq=2, write_graph=False, write_images=False)]\n", 321 | "\n", 322 | "# train model and save weights\n", 323 | "h = model.fit_generator(\n", 324 | " traingen, samples_per_epoch=nb_train_samples,\n", 325 | " nb_epoch=args.epochs, validation_data=valgen, nb_val_samples=nb_val_samples,\n", 326 | " callbacks=callbacks,\n", 327 | ")\n", 328 | "model.save_weights(FN1_filename, overwrite=True)\n", 329 | "\n", 330 | "# print samples after training\n", 331 | "gensamples(\n", 332 | " skips=2,\n", 333 | " k=10,\n", 334 | " batch_size=batch_size,\n", 335 | " short=False,\n", 336 | " temperature=args.temperature,\n", 337 | " use_unk=True,\n", 338 | " model=model,\n", 339 | " data=(X_test, Y_test),\n", 340 | " idx2word=idx2word,\n", 341 | " oov0=oov0,\n", 342 | " glove_idx2idx=glove_idx2idx,\n", 343 | " vocab_size=vocab_size,\n", 344 | " nb_unknown_words=nb_unknown_words,\n", 345 | ")" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "collapsed": true 353 | }, 354 | "outputs": [], 355 | "source": [] 356 | } 357 | ], 358 | "metadata": { 359 | "kernelspec": { 360 | "display_name": "Python 3", 361 | "language": "python", 362 | "name": "python3" 363 | }, 364 | "language_info": { 365 | "codemirror_mode": { 366 | "name": "ipython", 367 | "version": 3 368 | }, 369 | "file_extension": ".py", 370 | "mimetype": "text/x-python", 371 | "name": "python", 372 | "nbconvert_exporter": "python", 373 | "pygments_lexer": "ipython3", 374 | "version": "3.6.3" 375 | } 376 | }, 377 | 
"""Train a sequence to sequence model.

This script is sourced from Siraj Rival
https://github.com/llSourcell/How_to_make_a_text_summarizer/blob/master/train.ipynb
"""
import os
import time
import random
import argparse
import json

import numpy as np
from keras.callbacks import TensorBoard

import config
from sample_gen import gensamples
from utils import prt, load_embedding, process_vocab, load_split_data
from model import create_model, inspect_model
from generate import gen
from constants import FN1, seed, nb_unknown_words

# parse arguments
parser = argparse.ArgumentParser()
parser.add_argument('--batch-size', type=int, default=32, help='input batch size')
parser.add_argument('--epochs', type=int, default=10, help='number of epochs')
parser.add_argument('--rnn-size', type=int, default=512, help='size of RNN layers')
parser.add_argument('--rnn-layers', type=int, default=3, help='number of RNN layers')
parser.add_argument('--nsamples', type=int, default=640, help='number of samples per epoch')
parser.add_argument('--nflips', type=int, default=0, help='number of flips')
parser.add_argument('--temperature', type=float, default=.8, help='RNN temperature')
parser.add_argument('--lr', type=float, default=0.0001, help='learning rate, default=0.0001')
parser.add_argument('--warm-start', action='store_true')
args = parser.parse_args()
batch_size = args.batch_size

# Round the sample count down to a whole number of batches.
# NOTE: the original used np.int(np.floor(...)); np.int is deprecated since
# NumPy 1.20 and removed in 1.24 — integer floor division is exact here.
nb_train_samples = (args.nsamples // batch_size) * batch_size  # num training samples
nb_val_samples = nb_train_samples  # num validation samples

# seed weight initialization (both stdlib and NumPy RNGs, for reproducibility)
random.seed(seed)
np.random.seed(seed)

# load the embedding matrix and vocabulary lookup tables, then the data split
embedding, idx2word, word2idx, glove_idx2idx = load_embedding(nb_unknown_words)
vocab_size, embedding_size = embedding.shape
oov0 = vocab_size - nb_unknown_words  # first index reserved for out-of-vocab words
idx2word = process_vocab(idx2word, vocab_size, oov0, nb_unknown_words)
X_train, X_test, Y_train, Y_test = load_split_data(nb_val_samples, seed)

# print a sample recipe to make sure everything looks right
print('Random head, description:')
i = 811
prt('H', Y_train[i], idx2word)
prt('D', X_train[i], idx2word)

# Model hyper-parameters. Saved to disk so downstream scripts can rebuild the
# same architecture, and reused directly when instantiating the model below.
model_params = dict(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    LR=args.lr,
    rnn_layers=args.rnn_layers,
    rnn_size=args.rnn_size,
)
with open(os.path.join(config.path_models, 'model_params.json'), 'w') as f:
    json.dump(model_params, f)


model = create_model(embedding=embedding, **model_params)
inspect_model(model)

# load pre-trained model weights, if requested and available
FN1_filename = os.path.join(config.path_models, '{}.hdf5'.format(FN1))
if args.warm_start and FN1 and os.path.exists(FN1_filename):
    model.load_weights(FN1_filename)
    print('Model weights loaded from {}'.format(FN1_filename))

# Shared arguments for the before/after sample printouts.
sample_params = dict(
    skips=2,
    k=10,
    batch_size=batch_size,
    short=False,
    temperature=args.temperature,
    use_unk=True,
    model=model,
    data=(X_test, Y_test),
    idx2word=idx2word,
    oov0=oov0,
    glove_idx2idx=glove_idx2idx,
    vocab_size=vocab_size,
    nb_unknown_words=nb_unknown_words,
)

# print samples before training
gensamples(**sample_params)

# get train and validation generators
# (the first next() is a smoke test that the generator yields without error)
r = next(gen(X_train, Y_train, batch_size=batch_size, nb_batches=None, nflips=None, model=None, debug=False, oov0=oov0, glove_idx2idx=glove_idx2idx, vocab_size=vocab_size, nb_unknown_words=nb_unknown_words, idx2word=idx2word))
traingen = gen(X_train, Y_train, batch_size=batch_size, nb_batches=None, nflips=args.nflips, model=model, debug=False, oov0=oov0, glove_idx2idx=glove_idx2idx, vocab_size=vocab_size, nb_unknown_words=nb_unknown_words, idx2word=idx2word)
valgen = gen(X_test, Y_test, batch_size=batch_size, nb_batches=nb_val_samples // batch_size, nflips=None, model=None, debug=False, oov0=oov0, glove_idx2idx=glove_idx2idx, vocab_size=vocab_size, nb_unknown_words=nb_unknown_words, idx2word=idx2word)

# define callbacks for training
callbacks = [TensorBoard(
    log_dir=os.path.join(config.path_logs, str(time.time())),
    histogram_freq=2, write_graph=False, write_images=False)]

# train model and save weights
# NOTE: fit_generator with samples_per_epoch/nb_epoch/nb_val_samples is the
# Keras 1 API this project targets; newer Keras uses fit(steps_per_epoch=...).
h = model.fit_generator(
    traingen, samples_per_epoch=nb_train_samples,
    nb_epoch=args.epochs, validation_data=valgen, nb_val_samples=nb_val_samples,
    callbacks=callbacks,
)
model.save_weights(FN1_filename, overwrite=True)

# print samples after training
gensamples(**sample_params)