├── .travis.yaml ├── README.md ├── Research_LDA.ipynb ├── TLDR_LDA_and_Text_Summarization.ipynb ├── requirements.txt ├── teal_deer.py └── test_tldr.py /.travis.yaml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.6" 4 | - "2.7" 5 | - "3.2" 6 | - "3.3" 7 | - "3.4" 8 | - "3.5" 9 | - "3.5-dev" # 3.5 development branch 10 | - "3.6" 11 | - "3.6-dev" # 3.6 development branch 12 | - "3.7-dev" # 3.7 development branch 13 | - "nightly" # currently points to 3.7-dev 14 | # command to install dependencies 15 | install: "pip install -r requirements.txt" 16 | # command to run tests 17 | script: nosetests test_tldr.py 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Teal Deer 2 | ### TLDR_LDA_and_Text_Summarization.ipynb is the primary current notebook. 3 | Currently just a hacking notebook. The notebook scrapes text from a directory of academic research PDFs, 4 | runs LDA on the extracted text to prioritize reading, and then trains a text summarization model on the abstracts of the papers, so that if you find more papers you can use the model to summarize them right away. The dataset for this run included just a handful of papers on chatbots from arXiv, but this notebook has been tested on corpora of several hundred documents from both arXiv and Google Scholar in another domain with favorable results. The OCR portion relies on https://github.com/euske/pdfminer/blob/master/tools/pdf2txt.py (sketches of the OCR and LDA steps follow). 5 |
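A minimal sketch of the OCR step, mirroring the notebook cell that shells out to pdf2txt.py (the `papers/` paths are illustrative):

```python
import glob
import os

# Convert each PDF in a directory to plain text with pdfminer's pdf2txt.py.
# Assumes pdf2txt.py is on the PATH; the paths here are illustrative.
for pdf_path in glob.glob('papers/*.pdf'):
    txt_path = pdf_path.replace('.pdf', '.txt')
    os.system('pdf2txt.py -o ' + txt_path + ' -t text ' + pdf_path)
```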

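The LDA step itself is only a few lines with gensim; a sketch lifted from the notebook, assuming `docs` is a list of tokenized (stopword-filtered, stemmed) documents:

```python
from gensim import corpora, models

# docs: one token list per document, e.g. the notebook's df.clean_text
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
lda = models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=20)
print(lda.print_topics(num_topics=10, num_words=4))
```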
6 | NOTE: Partially refactored as a Discord bot, deployed via Heroku, that summarizes channels in Discord and direct-messages the results to the user. Still working on a more permanent deployment plan. 7 | **In process:**
8 | Converting the notebook to a straight .py script and adding the text summarization, which aims to generate abstracts or short summaries for large blocks 9 | of text (e.g., an abstract for the rest of a paper). That way, papers could not only be prioritized but 10 | summarized as well; a rough extractive stand-in is sketched below. 11 |
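Until that model is trained, the TextRank summarizer bundled with the pinned gensim release works as a rough extractive stand-in (not the planned abstractive model); `paper.txt` is a placeholder input:

```python
from gensim.summarization import summarize

# Extractive (TextRank) summary of a large block of text.
paper_text = open('paper.txt').read()  # placeholder path
print(summarize(paper_text, word_count=150))
```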

12 | **Planned updates - See project tab as well:**
13 | + Finish out the OCR-from-PDF portion 14 | + Complete the text summarization portion - Thanks to Siraj Raval for making the video: https://www.youtube.com/watch?v=ogrJaOIuBx4 15 | + Clean up into Python scripts with test suites 16 | + Experiment with other front-end use cases: e.g., a Slack bot is currently underway (notebook to be added later). 17 | + Add a CI framework to this repo. 18 | + Cartoon for a fun logo :-) 19 | -------------------------------------------------------------------------------- /Research_LDA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# OCR and LDA on Lit Review Docs" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 319, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import pdfminer as pdf\n", 19 | "import os\n", 20 | "import glob\n", 21 | "import pickle as pkl\n", 22 | "import pandas as pd\n", 23 | "from gensim.utils import simple_preprocess\n", 24 | "from gensim.parsing.preprocessing import STOPWORDS\n", 25 | "import gensim\n", 26 | "import unicodedata\n", 27 | "import re, string\n", 28 | "from gensim import corpora, models\n", 29 | "import gensim" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 6, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "# Glob the PDF filenames and turn them into a list\n", 41 | "files = glob.glob('/Users/sararogis/Dropbox/FoodRecommender/MealRec_LitReview/*.pdf')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 9, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/plain": [ 54 | "['/Users/sararogis/Dropbox/FoodRecommender/MealRec_LitReview/AutomatedRecHealthyPersMeals.pdf',\n", 55 | " '/Users/sararogis/Dropbox/FoodRecommender/MealRec_LitReview/Biologeek.pdf',\n", 56 | " '/Users/sararogis/Dropbox/FoodRecommender/MealRec_LitReview/BuonApetitoRestMealRec.pdf',\n", 57 | " '/Users/sararogis/Dropbox/FoodRecommender/MealRec_LitReview/ChallengesNutrRecSys.pdf',\n", 58 | " '/Users/sararogis/Dropbox/FoodRecommender/MealRec_LitReview/ClusteringEliminateSpam.pdf']" 59 | ] 60 | }, 61 | "execution_count": 9, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "files[0:5]" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 16, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "text_files = glob.glob('/Users/sararogis/Dropbox/FoodRecommender/MealRec_LitReview/LitRev_Text/*.txt')" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 357, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "for x in files:\n", 90 | " dest_loc = x.replace('.pdf','.txt').replace('MealRec_LitReview/','MealRec_LitReview/LitRev_Text/')\n", 91 | " ocr_cmd = 'pdf2txt.py -o ' + dest_loc + ' -t text ' + x\n", 92 | " os.system(ocr_cmd)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "text_files[0:5]" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 20, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "# OK, now pull these text files in and let's do some LDA" 
115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 359, 120 | "metadata": { 121 | "collapsed": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "all_texts = []" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 360, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "for textpath in text_files:\n", 137 | " # Set an open string to place document text into\n", 138 | " all_doc_string = ''\n", 139 | " \n", 140 | " # Open the file and save contents to variable 'file_text'\n", 141 | " file_text = open(textpath, 'r')\n", 142 | " \n", 143 | " # Read the lines of the file into a variable. \n", 144 | " file_lines = file_text.readlines()\n", 145 | " \n", 146 | " # Iterate through the lines, and append them to the string to make one big text blob\n", 147 | " for line in file_lines:\n", 148 | " all_doc_string = all_doc_string + line\n", 149 | " \n", 150 | " # Append the new string to the list of all files\n", 151 | " all_texts.append(all_doc_string)\n", 152 | " \n", 153 | " # Done with this file; flush the buffer and close it\n", 154 | " file_text.flush()\n", 155 | " file_text.close()" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "collapsed": false 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "all_texts" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 362, 172 | "metadata": { 173 | "collapsed": false 174 | }, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "32" 180 | ] 181 | }, 182 | "execution_count": 362, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "len(all_texts)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 363, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "32" 202 | ] 203 | }, 204 | "execution_count": 363, 205 | "metadata": {}, 206 | "output_type": "execute_result" 207 | } 208 | ], 209 | "source": [ 210 | "len(text_files)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 364, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "# Hmmm. One off, but what the heck. Save it. 
\n", 222 | "pkl.dump(all_texts, open('all_texts_mealrec_litreview.pkl','wb'))" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 55, 228 | "metadata": { 229 | "collapsed": true 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "#####" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 365, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "df = pd.DataFrame(all_texts,columns=['orig_text'])" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 366, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "def preprocess_text(text):\n", 256 | " from nltk.corpus import stopwords\n", 257 | " from nltk.tokenize import LineTokenizer\n", 258 | " from nltk.tokenize import WhitespaceTokenizer\n", 259 | " from nltk.stem.porter import PorterStemmer\n", 260 | " \n", 261 | " # Create p_stemmer of class PorterStemmer\n", 262 | " p_stemmer = PorterStemmer()\n", 263 | " \n", 264 | " sw = list(stopwords.words())\n", 265 | " extra_stops = ['R', '', ' ', 'abstract', 'keywords', 'introduction', 'figure','morgan', 'harvey',\n", 266 | " 'david','elsweiler','northumbria','university','newcastle','united','kingdom','university',\n", 267 | " 'regensburg','germany', 'h', 'k', 'f', 'b', 'user', 'g', 'use']\n", 268 | " for word in extra_stops:\n", 269 | " sw.append(word)\n", 270 | " \n", 271 | " # Step 1 - Clean up unicode\n", 272 | " clean_string = ''\n", 273 | " doc = []\n", 274 | " #for x in text:\n", 275 | " # if ord(x) <= 128:\n", 276 | " # clean_string += x\n", 277 | " #clean_string = unicodedata.normalize('NFKD', clean_string.encode('utf-8', 'replace')).encode('ascii','replace')\n", 278 | " \n", 279 | " # Tokenize each line to get rid of the line carriages\n", 280 | " lines = LineTokenizer().tokenize(text.lower())\n", 281 | " \n", 282 | " clean_lines = []\n", 283 | " \n", 284 | " for line in lines:\n", 285 | " if line.startswith('e-mail') or line.startswith('doi') or line.startswith('For all other uses, contact') or line.find(' acm. 
isbn ') > 0:\n", 286 | " pass\n", 287 | " else:\n", 288 | " line_str = ''\n", 289 | " for char in line:\n", 290 | " #if ord(char) <= 127:\n", 291 | " if (char in string.ascii_letters) or char == ' ':\n", 292 | " line_str += char\n", 293 | " \n", 294 | " # Clean up other undesirable characters\n", 295 | " if line_str != ' ' and line_str.rstrip().lstrip() not in sw:\n", 296 | " clean_lines.append(line_str)\n", 297 | " \n", 298 | " # Tokenize the lines\n", 299 | " for clean_line in clean_lines:\n", 300 | " tokens = WhitespaceTokenizer().tokenize(clean_line)\n", 301 | " stopped_tokens = [i for i in tokens if not i in sw]\n", 302 | " stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]\n", 303 | " [doc.append(i) for i in stemmed_tokens]\n", 304 | " \n", 305 | " \n", 306 | " return doc" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 367, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "df.insert(df.shape[1], 'clean_text', df.orig_text.apply(preprocess_text))" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 369, 323 | "metadata": { 324 | "collapsed": true 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "# turn our tokenized documents into a id <-> term dictionary\n", 329 | "dictionary = corpora.Dictionary(df.clean_text)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 370, 335 | "metadata": { 336 | "collapsed": true 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "# convert tokenized documents into a document-term matrix\n", 341 | "corpus = [dictionary.doc2bow(text) for text in df.clean_text]" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 371, 347 | "metadata": { 348 | "collapsed": true 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "# generate LDA model\n", 353 | "ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary, passes=20)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 372, 359 | "metadata": { 360 | "collapsed": true 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "ldamodel4 = gensim.models.ldamodel.LdaModel(corpus, num_topics=4, id2word = dictionary, passes=20)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 373, 370 | "metadata": { 371 | "collapsed": true 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "ldamodel3 = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word = dictionary, passes=20)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 374, 381 | "metadata": { 382 | "collapsed": false 383 | }, 384 | "outputs": [ 385 | { 386 | "name": "stdout", 387 | "output_type": "stream", 388 | "text": [ 389 | "[(0, u'0.016*food + 0.013*algorithm + 0.012*cid + 0.009*r'), (1, u'0.000*food + 0.000*recip + 0.000*system + 0.000*recommend'), (2, u'0.028*recip + 0.023*menu + 0.015*recommend + 0.013*food'), (3, u'0.040*recip + 0.019*ingredi + 0.009*similar + 0.008*use'), (4, u'0.000*food + 0.000*recip + 0.000*use + 0.000*system'), (5, u'0.000*food + 0.000*recommend + 0.000*recip + 0.000*system'), (6, u'0.000*recip + 0.000*food + 0.000*system + 0.000*ingredi'), (7, u'0.048*food + 0.014*practic + 0.011*particip + 0.011*wast'), (8, u'0.017*system + 0.013*user + 0.012*product + 0.010*recip'), (9, u'0.024*recip + 0.022*recommend + 0.018*system + 0.016*user')]\n" 390 | ] 391 | } 392 | ], 393 | "source": [ 394 | "print(ldamodel.print_topics(num_topics=10, num_words=4))" 395 | ] 
396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 375, 400 | "metadata": { 401 | "collapsed": true 402 | }, 403 | "outputs": [], 404 | "source": [ 405 | "import pyLDAvis" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 376, 411 | "metadata": { 412 | "collapsed": true 413 | }, 414 | "outputs": [], 415 | "source": [ 416 | "pyLDAvis.enable_notebook()" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 377, 422 | "metadata": { 423 | "collapsed": false 424 | }, 425 | "outputs": [ 426 | { 427 | "data": { 428 | "text/html": [ 429 | "\n", 430 | "\n", 431 | "\n", 432 | "\n", 433 | "
\n", 434 | "" 470 | ], 471 | "text/plain": [ 472 | "PreparedData(topic_coordinates= Freq cluster topics x y\n", 473 | "topic \n", 474 | "7 29.504711 1 1 -0.106822 -0.173555\n", 475 | "9 27.416633 1 2 -0.168737 0.027879\n", 476 | "3 13.685156 1 3 -0.090105 0.050671\n", 477 | "8 11.325331 1 4 -0.130220 -0.122096\n", 478 | "2 9.346031 1 5 -0.140154 0.090575\n", 479 | "0 8.708459 1 6 -0.094952 0.128187\n", 480 | "5 0.003420 1 7 0.181959 -0.000537\n", 481 | "4 0.003420 1 8 0.182305 -0.000579\n", 482 | "6 0.003420 1 9 0.183362 -0.000277\n", 483 | "1 0.003420 1 10 0.183365 -0.000266, topic_info= Category Freq Term Total loglift logprob\n", 484 | "term \n", 485 | "1248 Default 1565.000000 recip 1565.000000 30.0000 30.0000\n", 486 | "300 Default 238.000000 menu 238.000000 29.0000 29.0000\n", 487 | "3434 Default 1945.000000 food 1945.000000 28.0000 28.0000\n", 488 | "7283 Default 402.000000 practic 402.000000 27.0000 27.0000\n", 489 | "4185 Default 595.000000 ingredi 595.000000 26.0000 26.0000\n", 490 | "4460 Default 902.000000 recommend 902.000000 25.0000 25.0000\n", 491 | "216 Default 292.000000 wast 292.000000 24.0000 24.0000\n", 492 | "1873 Default 895.000000 system 895.000000 23.0000 23.0000\n", 493 | "1621 Default 371.000000 algorithm 371.000000 22.0000 22.0000\n", 494 | "2435 Default 581.000000 rate 581.000000 21.0000 21.0000\n", 495 | "5238 Default 276.000000 shop 276.000000 20.0000 20.0000\n", 496 | "2845 Default 208.000000 product 208.000000 19.0000 19.0000\n", 497 | "3607 Default 415.000000 particip 415.000000 18.0000 18.0000\n", 498 | "1590 Default 183.000000 cid 183.000000 17.0000 17.0000\n", 499 | "4901 Default 414.000000 featur 414.000000 16.0000 16.0000\n", 500 | "4582 Default 824.000000 user 824.000000 15.0000 15.0000\n", 501 | "2738 Default 118.000000 graph 118.000000 14.0000 14.0000\n", 502 | "3804 Default 295.000000 similar 295.000000 13.0000 13.0000\n", 503 | "620 Default 355.000000 prefer 355.000000 12.0000 12.0000\n", 504 | "3531 Default 114.000000 groceri 114.000000 11.0000 11.0000\n", 505 | "7382 Default 212.000000 r 212.000000 10.0000 10.0000\n", 506 | "198 Default 110.000000 navig 110.000000 9.0000 9.0000\n", 507 | "3921 Default 134.000000 fridg 134.000000 8.0000 8.0000\n", 508 | "1162 Default 282.000000 social 282.000000 7.0000 7.0000\n", 509 | "101 Default 142.000000 household 142.000000 6.0000 6.0000\n", 510 | "4267 Default 157.000000 home 157.000000 5.0000 5.0000\n", 511 | "3642 Default 90.000000 imag 90.000000 4.0000 4.0000\n", 512 | "1343 Default 300.000000 plan 300.000000 3.0000 3.0000\n", 513 | "1385 Default 315.000000 set 315.000000 2.0000 2.0000\n", 514 | "304 Default 97.000000 busi 97.000000 1.0000 1.0000\n", 515 | "... ... ... ... ... ... 
...\n", 516 | "3434 Topic10 0.000497 food 1945.074469 -4.8957 -8.7684\n", 517 | "1248 Topic10 0.000486 recip 1565.882060 -4.7017 -8.7912\n", 518 | "4460 Topic10 0.000454 recommend 902.943697 -4.2194 -8.8594\n", 519 | "4582 Topic10 0.000437 user 824.684741 -4.1673 -8.8980\n", 520 | "4901 Topic10 0.000422 featur 414.842097 -3.5155 -8.9333\n", 521 | "2435 Topic10 0.000424 rate 581.066894 -3.8464 -8.9272\n", 522 | "7291 Topic10 0.000423 inform 513.501629 -3.7264 -8.9308\n", 523 | "4185 Topic10 0.000420 ingredi 595.032866 -3.8801 -8.9372\n", 524 | "3804 Topic10 0.000409 similar 295.426228 -3.2075 -8.9648\n", 525 | "8308 Topic10 0.000415 use 661.088914 -3.9983 -8.9501\n", 526 | "765 Topic10 0.000410 one 367.100892 -3.4229 -8.9629\n", 527 | "3607 Topic10 0.000405 particip 415.519210 -3.5588 -8.9750\n", 528 | "2782 Topic10 0.000402 gener 266.748068 -3.1219 -8.9813\n", 529 | "7283 Topic10 0.000403 practic 402.610950 -3.5301 -8.9778\n", 530 | "7178 Topic10 0.000399 cook 374.983324 -3.4695 -8.9883\n", 531 | "5238 Topic10 0.000397 shop 276.517659 -3.1709 -8.9944\n", 532 | "5053 Topic10 0.000397 predict 276.849378 -3.1727 -8.9949\n", 533 | "2295 Topic10 0.000397 work 312.192835 -3.2919 -8.9939\n", 534 | "552 Topic10 0.000396 time 269.623542 -3.1472 -8.9959\n", 535 | "2845 Topic10 0.000394 product 208.803040 -2.8968 -9.0011\n", 536 | "7739 Topic10 0.000395 design 268.366068 -3.1468 -9.0002\n", 537 | "620 Topic10 0.000395 prefer 355.391136 -3.4254 -8.9979\n", 538 | "4549 Topic10 0.000395 meal 300.193105 -3.2584 -8.9997\n", 539 | "1302 Topic10 0.000392 differ 189.344738 -2.8048 -9.0070\n", 540 | "5578 Topic10 0.000392 valu 216.263405 -2.9375 -9.0067\n", 541 | "2389 Topic10 0.000390 network 171.025698 -2.7071 -9.0110\n", 542 | "1385 Topic10 0.000392 set 315.306823 -3.3135 -9.0057\n", 543 | "636 Topic10 0.000390 peopl 212.878156 -2.9278 -9.0128\n", 544 | "7890 Topic10 0.000390 health 232.898215 -3.0170 -9.0121\n", 545 | "4975 Topic10 0.000390 item 436.024846 -3.6429 -9.0109\n", 546 | "\n", 547 | "[787 rows x 6 columns], token_table= Topic Freq Term\n", 548 | "term \n", 549 | "6866 6 0.688533 abowd\n", 550 | "3007 6 0.688674 abraham\n", 551 | "7294 1 0.136595 acceleromet\n", 552 | "7294 4 0.887865 acceleromet\n", 553 | "1021 1 0.386258 acm\n", 554 | "1021 2 0.335243 acm\n", 555 | "1021 3 0.054659 acm\n", 556 | "1021 4 0.109318 acm\n", 557 | "1021 5 0.065591 acm\n", 558 | "1021 6 0.047371 acm\n", 559 | "5183 2 0.996518 acquisit\n", 560 | "835 1 0.261871 activ\n", 561 | "835 2 0.411511 activ\n", 562 | "835 3 0.043645 activ\n", 563 | "835 4 0.255636 activ\n", 564 | "835 5 0.024940 activ\n", 565 | "6433 6 0.688616 agat\n", 566 | "7307 3 0.217493 ahn\n", 567 | "7307 4 0.739475 ahn\n", 568 | "7307 6 0.043499 ahn\n", 569 | "6285 6 0.688665 ahyoung\n", 570 | "7007 1 0.031176 aisl\n", 571 | "7007 4 0.935284 aisl\n", 572 | "7007 6 0.031176 aisl\n", 573 | "1071 5 0.867041 aj\n", 574 | "5530 6 0.688442 alberto\n", 575 | "1621 2 0.502817 algorithm\n", 576 | "1621 3 0.045711 algorithm\n", 577 | "1621 4 0.013444 algorithm\n", 578 | "1621 5 0.155954 algorithm\n", 579 | "... ... ... 
...\n", 580 | "7107 4 0.971178 wireless\n", 581 | "8475 6 0.688685 withreturn\n", 582 | "12 5 0.680205 wji\n", 583 | "7595 6 0.688667 woontack\n", 584 | "2295 1 0.333127 work\n", 585 | "2295 2 0.355549 work\n", 586 | "2295 3 0.140939 work\n", 587 | "2295 4 0.044844 work\n", 588 | "2295 5 0.076876 work\n", 589 | "2295 6 0.044844 work\n", 590 | "8774 6 0.688642 wpnc\n", 591 | "6666 6 0.998178 wrrcid\n", 592 | "3600 1 0.069357 x\n", 593 | "3600 2 0.219630 x\n", 594 | "3600 3 0.242749 x\n", 595 | "3600 5 0.069357 x\n", 596 | "3600 6 0.393023 x\n", 597 | "2944 6 0.847053 xr\n", 598 | "2942 3 0.051816 xx\n", 599 | "2942 6 0.932690 xx\n", 600 | "8261 6 0.688649 ycid\n", 601 | "7631 6 0.688653 ycui\n", 602 | "6606 5 0.876908 yelp\n", 603 | "6606 6 0.125273 yelp\n", 604 | "3432 3 0.071640 yi\n", 605 | "3432 6 0.931326 yi\n", 606 | "5505 6 0.688656 ylongqi\n", 607 | "3442 2 0.959197 ys\n", 608 | "3445 6 0.847022 yt\n", 609 | "5756 6 0.688657 za\n", 610 | "\n", 611 | "[1245 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[8, 10, 4, 9, 3, 1, 6, 5, 7, 2])" 612 | ] 613 | }, 614 | "execution_count": 377, 615 | "metadata": {}, 616 | "output_type": "execute_result" 617 | } 618 | ], 619 | "source": [ 620 | "import pyLDAvis.gensim\n", 621 | "\n", 622 | "pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 378, 628 | "metadata": { 629 | "collapsed": false 630 | }, 631 | "outputs": [ 632 | { 633 | "data": { 634 | "text/html": [ 635 | "\n", 636 | "\n", 637 | "\n", 638 | "\n", 639 | "
\n", 640 | "" 676 | ], 677 | "text/plain": [ 678 | "PreparedData(topic_coordinates= Freq cluster topics x y\n", 679 | "topic \n", 680 | "1 37.912012 1 1 0.136382 0.005009\n", 681 | "2 32.402903 1 2 -0.059993 -0.064994\n", 682 | "0 29.685085 1 3 -0.076389 0.059986, topic_info= Category Freq Term Total loglift logprob\n", 683 | "term \n", 684 | "1248 Default 1498.000000 recip 1498.000000 30.0000 30.0000\n", 685 | "4185 Default 572.000000 ingredi 572.000000 29.0000 29.0000\n", 686 | "4460 Default 869.000000 recommend 869.000000 28.0000 28.0000\n", 687 | "3434 Default 1875.000000 food 1875.000000 27.0000 27.0000\n", 688 | "7283 Default 384.000000 practic 384.000000 26.0000 26.0000\n", 689 | "2435 Default 560.000000 rate 560.000000 25.0000 25.0000\n", 690 | "216 Default 278.000000 wast 278.000000 24.0000 24.0000\n", 691 | "5238 Default 266.000000 shop 266.000000 23.0000 23.0000\n", 692 | "3607 Default 397.000000 particip 397.000000 22.0000 22.0000\n", 693 | "1621 Default 365.000000 algorithm 365.000000 21.0000 21.0000\n", 694 | "300 Default 232.000000 menu 232.000000 20.0000 20.0000\n", 695 | "1873 Default 869.000000 system 869.000000 19.0000 19.0000\n", 696 | "5053 Default 267.000000 predict 267.000000 18.0000 18.0000\n", 697 | "4901 Default 403.000000 featur 403.000000 17.0000 17.0000\n", 698 | "101 Default 135.000000 household 135.000000 16.0000 16.0000\n", 699 | "3921 Default 128.000000 fridg 128.000000 15.0000 15.0000\n", 700 | "4267 Default 150.000000 home 150.000000 14.0000 14.0000\n", 701 | "620 Default 344.000000 prefer 344.000000 13.0000 13.0000\n", 702 | "2389 Default 163.000000 network 163.000000 12.0000 12.0000\n", 703 | "5986 Default 95.000000 patient 95.000000 11.0000 11.0000\n", 704 | "2845 Default 203.000000 product 203.000000 10.0000 10.0000\n", 705 | "3531 Default 111.000000 groceri 111.000000 9.0000 9.0000\n", 706 | "1590 Default 182.000000 cid 182.000000 8.0000 8.0000\n", 707 | "4582 Default 797.000000 user 797.000000 7.0000 7.0000\n", 708 | "3804 Default 288.000000 similar 288.000000 6.0000 6.0000\n", 709 | "3464 Default 104.000000 sustain 104.000000 5.0000 5.0000\n", 710 | "1173 Default 221.000000 select 221.000000 4.0000 4.0000\n", 711 | "1198 Default 140.000000 learn 140.000000 3.0000 3.0000\n", 712 | "7244 Default 191.000000 share 191.000000 2.0000 2.0000\n", 713 | "2797 Default 136.000000 mobil 136.000000 1.0000 1.0000\n", 714 | "... ... ... ... ... ... 
...\n", 715 | "1590 Topic3 116.335648 cid 182.626755 0.7636 -5.4747\n", 716 | "453 Topic3 98.113598 measur 156.348428 0.7486 -5.6451\n", 717 | "4566 Topic3 77.992371 content 119.021315 0.7918 -5.8746\n", 718 | "2738 Topic3 76.394208 graph 117.468683 0.7843 -5.8953\n", 719 | "5231 Topic3 47.387690 pair 65.724372 0.8874 -6.3728\n", 720 | "5744 Topic3 41.331526 construct 55.671856 0.9167 -6.5096\n", 721 | "3804 Topic3 154.295558 similar 288.688616 0.5880 -5.1923\n", 722 | "1385 Topic3 149.667329 set 307.295368 0.4951 -5.2228\n", 723 | "4460 Topic3 303.947331 recommend 869.948741 0.1629 -4.5143\n", 724 | "5603 Topic3 106.617930 two 219.554988 0.4922 -5.5619\n", 725 | "7291 Topic3 186.738116 inform 494.933113 0.2398 -5.0015\n", 726 | "6183 Topic3 73.382947 onlin 135.766404 0.5993 -5.9355\n", 727 | "8308 Topic3 201.452628 use 639.866665 0.0588 -4.9256\n", 728 | "673 Topic3 98.696667 data 239.673706 0.3273 -5.6391\n", 729 | "1621 Topic3 126.615187 algorithm 365.635648 0.1540 -5.3900\n", 730 | "4582 Topic3 199.076254 user 797.078290 -0.1727 -4.9375\n", 731 | "7178 Topic3 121.607153 cook 359.243557 0.1313 -5.4304\n", 732 | "6946 Topic3 81.399526 approach 182.828855 0.4053 -5.8318\n", 733 | "3475 Topic3 98.825026 base 266.463716 0.2226 -5.6378\n", 734 | "765 Topic3 114.680630 one 355.089897 0.0843 -5.4890\n", 735 | "4901 Topic3 118.036182 featur 403.016392 -0.0135 -5.4602\n", 736 | "1369 Topic3 86.353819 factor 216.440362 0.2957 -5.7727\n", 737 | "2435 Topic3 129.170771 rate 560.993042 -0.2540 -5.3700\n", 738 | "620 Topic3 105.263072 prefer 344.756553 0.0281 -5.5747\n", 739 | "4975 Topic3 110.321902 item 422.067334 -0.1272 -5.5278\n", 740 | "3434 Topic3 177.327729 food 1875.919168 -1.1443 -5.0532\n", 741 | "2295 Topic3 95.667249 work 299.585219 0.0730 -5.6703\n", 742 | "1873 Topic3 116.053483 system 869.814765 -0.7997 -5.4771\n", 743 | "6169 Topic3 83.960101 tag 204.951830 0.3221 -5.8008\n", 744 | "2782 Topic3 82.808005 gener 256.777964 0.0828 -5.8147\n", 745 | "\n", 746 | "[260 rows x 6 columns], token_table= Topic Freq Term\n", 747 | "term \n", 748 | "6905 2 0.972988 absolut\n", 749 | "1897 1 0.186796 accuraci\n", 750 | "1897 2 0.648870 accuraci\n", 751 | "1897 3 0.157302 accuraci\n", 752 | "1021 1 0.471956 acm\n", 753 | "1021 2 0.377565 acm\n", 754 | "1021 3 0.151026 acm\n", 755 | "5183 2 0.956473 acquisit\n", 756 | "1163 1 0.811355 action\n", 757 | "1163 2 0.030426 action\n", 758 | "1163 3 0.152129 action\n", 759 | "835 1 0.289709 activ\n", 760 | "835 2 0.585857 activ\n", 761 | "835 3 0.122322 activ\n", 762 | "522 2 0.918927 adopt\n", 763 | "522 3 0.051052 adopt\n", 764 | "7007 1 0.995696 aisl\n", 765 | "1621 1 0.010940 algorithm\n", 766 | "1621 2 0.642716 algorithm\n", 767 | "1621 3 0.347340 algorithm\n", 768 | "2667 3 0.919761 allrecipescom\n", 769 | "4817 1 0.634265 applic\n", 770 | "4817 2 0.255599 applic\n", 771 | "4817 3 0.108866 applic\n", 772 | "6946 1 0.229723 approach\n", 773 | "6946 2 0.328176 approach\n", 774 | "6946 3 0.443037 approach\n", 775 | "967 1 0.846143 around\n", 776 | "967 2 0.072787 around\n", 777 | "967 3 0.081885 around\n", 778 | "... ... ... 
...\n", 779 | "552 1 0.515030 time\n", 780 | "552 2 0.215236 time\n", 781 | "552 3 0.269046 time\n", 782 | "6158 1 0.053032 train\n", 783 | "6158 2 0.777810 train\n", 784 | "6158 3 0.159097 train\n", 785 | "5603 1 0.250507 two\n", 786 | "5603 2 0.264171 two\n", 787 | "5603 3 0.487349 two\n", 788 | "4498 1 0.056519 updat\n", 789 | "4498 2 0.876042 updat\n", 790 | "4498 3 0.084778 updat\n", 791 | "8308 1 0.332882 use\n", 792 | "8308 2 0.353199 use\n", 793 | "8308 3 0.314128 use\n", 794 | "4582 1 0.228334 user\n", 795 | "4582 2 0.521906 user\n", 796 | "4582 3 0.249662 user\n", 797 | "1219 3 0.936934 variant\n", 798 | "7229 1 0.936945 visibl\n", 799 | "7229 2 0.038243 visibl\n", 800 | "7229 3 0.019121 visibl\n", 801 | "2116 2 0.980662 vsi\n", 802 | "216 1 0.997217 wast\n", 803 | "2295 1 0.360498 work\n", 804 | "2295 2 0.320443 work\n", 805 | "2295 3 0.320443 work\n", 806 | "6666 3 0.974974 wrrcid\n", 807 | "2949 3 0.919292 xt\n", 808 | "6606 2 0.950295 yelp\n", 809 | "\n", 810 | "[410 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[2, 3, 1])" 811 | ] 812 | }, 813 | "execution_count": 378, 814 | "metadata": {}, 815 | "output_type": "execute_result" 816 | } 817 | ], 818 | "source": [ 819 | "pyLDAvis.gensim.prepare(ldamodel3, corpus, dictionary)" 820 | ] 821 | }, 822 | { 823 | "cell_type": "code", 824 | "execution_count": null, 825 | "metadata": { 826 | "collapsed": true 827 | }, 828 | "outputs": [], 829 | "source": [] 830 | } 831 | ], 832 | "metadata": { 833 | "kernelspec": { 834 | "display_name": "Python 3", 835 | "language": "python", 836 | "name": "python3" 837 | }, 838 | "language_info": { 839 | "codemirror_mode": { 840 | "name": "ipython", 841 | "version": 3 842 | }, 843 | "file_extension": ".py", 844 | "mimetype": "text/x-python", 845 | "name": "python", 846 | "nbconvert_exporter": "python", 847 | "pygments_lexer": "ipython3", 848 | "version": "3.5.2" 849 | } 850 | }, 851 | "nbformat": 4, 852 | "nbformat_minor": 0 853 | } 854 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | bleach==1.5.0 3 | boto==2.46.1 4 | bz2file==0.98 5 | chardet==2.3.0 6 | decorator==4.0.11 7 | entrypoints==0.2.2 8 | gensim==1.0.1 9 | html5lib==0.999 10 | ipykernel==4.5.2 11 | ipython==5.3.0 12 | ipython-genutils==0.1.0 13 | ipywidgets==6.0.0 14 | Jinja2==2.9.5 15 | jsonschema==2.5.1 16 | jupyter==1.0.0 17 | jupyter-client==5.0.0 18 | jupyter-console==5.1.0 19 | jupyter-core==4.3.0 20 | Keras==2.0.2 21 | MarkupSafe==0.23 22 | mistune==0.7.4 23 | nbconvert==5.1.1 24 | nbformat==4.3.0 25 | nltk==3.2.2 26 | nose==1.3.7 27 | notebook==5.4.1 28 | numpy==1.12.1 29 | pandas==0.19.2 30 | pandocfilters==1.4.1 31 | pdfminer.six==20160614 32 | pexpect==4.2.1 33 | pickleshare==0.7.4 34 | prompt-toolkit==1.0.13 35 | ptyprocess==0.5.1 36 | Pygments==2.2.0 37 | python-dateutil==2.6.0 38 | pytz==2016.10 39 | PyYAML==3.12 40 | pyzmq==16.0.2 41 | qtconsole==4.2.1 42 | requests==2.13.0 43 | scipy==0.19.0 44 | simplegeneric==0.8.1 45 | six==1.10.0 46 | smart-open==1.5.1 47 | terminado==0.6 48 | testpath==0.3 49 | Theano==0.9.0 50 | tornado==4.4.2 51 | traitlets==4.3.2 52 | wcwidth==0.1.7 53 | widgetsnbextension==2.0.0 54 | -------------------------------------------------------------------------------- /teal_deer.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import argparse 3 
| import json 4 | 5 | 6 | def get_file_list(path, file_type='pdf'): 7 | """ 8 | Gets a list of files of a given type from a directory. 9 | :param path: Directory path (must end with a path separator). 10 | :param file_type: File extension to match. 11 | :return: List of matching file paths. 12 | """ 13 | path += ('*.' + file_type) # assumes path ends with a separator 14 | return glob.glob(path) 15 | 16 | 17 | def parse_args(): 18 | """ 19 | Returns the arguments passed at the command line as a dict. 20 | """ 21 | parser = argparse.ArgumentParser(description='Teal Deer: prioritizes and summarizes a directory of research papers.') 22 | parser.add_argument('-c', help="Config File Location", required=True, 23 | dest='config') 24 | args = vars(parser.parse_args()) 25 | return args 26 | 27 | 28 | def load_config(config_name): 29 | """ 30 | Loads a JSON config file and returns a config dictionary. 31 | """ 32 | with open(config_name) as config_file: 33 | config = json.load(config_file) 34 | return config 35 | 36 | 37 | def replace_file_type_in_file_name(file_path, input_type='pdf', output_type='txt'): 38 | """ 39 | Replace one file extension with another in the specified file name. 40 | :param file_path: Starting file path. 41 | :param input_type: Extension to replace. 42 | :param output_type: Replacement extension. 43 | :return: File name with the replacement extension. 44 | """ 45 | return file_path.replace('.' + input_type, '.' + output_type) -------------------------------------------------------------------------------- /test_tldr.py: -------------------------------------------------------------------------------- 1 | # Nose tests for teal_deer; run with nosetests. 2 | import teal_deer as tldr 3 | 4 | 5 | def test_get_configuration(): 6 | assert len(tldr.load_config('config.json')) == 3, "Configuration not loaded correctly." 7 | 8 | 9 | def test_get_file_list(): 10 | configuration = tldr.load_config('config.json') 11 | assert len(tldr.get_file_list(configuration['files_path'])) == 106, "Getting incorrect number of files." 12 | 13 | 14 | def test_replace_file_type_in_file_name(): 15 | configuration = tldr.load_config('config.json') 16 | file_list = tldr.get_file_list(configuration['files_path']) 17 | print(file_list[0]) 18 | assert tldr.replace_file_type_in_file_name(file_list[0]) == \ 19 | '/Users/saracollins/Documents/LiteratureReviews/chatbots/0310018.txt', \ 20 | "File type replacement not happening correctly." 21 | --------------------------------------------------------------------------------
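`config.json` is referenced by the tests but not checked in. The assertions above imply it holds exactly three keys, one of which is `files_path`; the other two key names below are placeholders, and `files_path` should end with a separator because `get_file_list` appends the glob pattern directly to it. A sketch of generating a compatible config:

```python
import json

# Hypothetical config.json shape; key names other than files_path are guesses.
config = {
    "files_path": "/path/to/pdfs/",  # must end with a path separator
    "placeholder_key_1": "...",
    "placeholder_key_2": "...",
}
with open('config.json', 'w') as config_file:
    json.dump(config, config_file, indent=2)
```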