├── tests.py ├── requirements.txt ├── MANIFEST.in ├── .gitignore ├── .travis.yml ├── setup.py ├── example.py ├── redditnlp │ ├── words │ │ ├── stopwords_english.txt │ │ └── swearwords_english.txt │ └── __init__.py ├── README.md └── ez_setup.py /tests.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | praw>=2.1.19 2 | nltk>=3.0.0 3 | numpy>=1.8.0 4 | scikit-learn>=0.15.2 -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include ez_setup.py 3 | include example.py 4 | recursive-include redditnlp/words * -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | *.zip 4 | *.egg 5 | .idea/ 6 | tfidf_corpus/ 7 | build/ 8 | dist/ 9 | redditnlp.egg-info/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.6" 4 | - "2.7" 5 | install: 6 | - pip install . 7 | - pip install -r requirements.txt 8 | script: "python tests.py" -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import ez_setup 2 | ez_setup.use_setuptools(version='0.7') 3 | 4 | from setuptools import setup 5 | import os 6 | 7 | PACKAGE_NAME = 'redditnlp' 8 | VERSION = '0.1.3' 9 | 10 | 11 | def read(filename): 12 | filepath = os.path.join(os.path.dirname(__file__), filename) 13 | try: 14 | # Convert GitHub markdown to restructured text (needed for upload to PyPI) 15 | from pypandoc import convert 16 | return convert(filepath, 'rst') 17 | except ImportError: 18 | return open(filepath).read() 19 | 20 | description = 'A tool to perform natural language processing of reddit content.'
21 | try: 22 | long_description = read('README.md') 23 | except IOError: 24 | long_description = description 25 | 26 | setup( 27 | name=PACKAGE_NAME, 28 | version=VERSION, 29 | author='Jai Juneja', 30 | author_email='jai.juneja@gmail.com', 31 | description=description, 32 | license='BSD', 33 | keywords='reddit, natural language processing, machine learning', 34 | url='https://github.com/jaijuneja/reddit-nlp', 35 | packages=[PACKAGE_NAME,], 36 | long_description=long_description, 37 | classifiers=[ 38 | 'Development Status :: 3 - Alpha', 39 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 40 | 'Intended Audience :: Developers', 41 | 'License :: OSI Approved :: BSD License', 42 | 'Operating System :: OS Independent', 43 | 'Programming Language :: Python' 44 | ], 45 | install_requires=[ 46 | 'praw>=2.1.19', 47 | 'nltk>=3.0.0', 48 | 'numpy>=1.8.0', 49 | 'scikit-learn>=0.15.2', 50 | ], 51 | include_package_data=True, 52 | package_data={PACKAGE_NAME: ['words/*.txt'], 53 | '': ['README.md', 'ez_setup.py', 'example.py']}, 54 | test_suite='tests' 55 | ) -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | from redditnlp import RedditWordCounter, TfidfCorpus 2 | import requests 3 | import os 4 | from collections import deque 5 | 6 | ###################### 7 | # SETTINGS (EDIT THIS) 8 | ###################### 9 | 10 | USERNAME = 'your_username' # Change this to your username 11 | SAVE_DIR = 'tfidf_corpus' 12 | CORPUS_FILE = 'corpus.json' 13 | COMMENTS_PER_SUBREDDIT = 1000 14 | SUBREDDITS = [ 15 | 'funny', 'pics', 'AskReddit', 'todayilearned', 'worldnews', 16 | 'science', 'blog', 'IAmA', 'videos', 'gaming', 17 | 'movies', 'Music', 'aww', 'technology', 'bestof', 18 | 'WTF', 'AdviceAnimals', 'news', 'gifs', 'askscience', 19 | 'explainlikeimfive', 'EarthPorn', 'books', 'television', 'politics' 20 | ] 21 | 22 | ########################### 23 | # VOCABULARY ANALYTICS DEMO 24 | ########################### 25 | 26 | 27 | def get_subreddit_vocabularies(): 28 | # Initialise Reddit word counter instance 29 | reddit_counter = RedditWordCounter(USERNAME) 30 | 31 | # Initialise tf-idf corpus instance 32 | corpus_path = os.path.join(SAVE_DIR, CORPUS_FILE) 33 | comment_corpus = TfidfCorpus(corpus_path) 34 | 35 | # Extract the vocabulary for each of the subreddits specified 36 | subreddit_queue = deque([subreddit for subreddit in SUBREDDITS]) 37 | while len(subreddit_queue) > 0: 38 | subreddit = subreddit_queue.popleft() 39 | 40 | try: 41 | vocabulary = reddit_counter.subreddit_comments(subreddit, limit=COMMENTS_PER_SUBREDDIT) 42 | except requests.exceptions.HTTPError as err: 43 | print err 44 | # Add subreddit back into queue 45 | subreddit_queue.append(subreddit) 46 | continue 47 | 48 | comment_corpus.add_document(vocabulary, subreddit) 49 | comment_corpus.save() 50 | 51 | return comment_corpus, corpus_path 52 | 53 | 54 | def save_subreddit_top_terms(corpus): 55 | # Save the top terms for each subreddit in a text file 56 | save_path = os.path.join(SAVE_DIR, 'top_words.txt') 57 | for document in corpus.get_document_list(): 58 | top_terms = corpus.get_top_terms(document, num_terms=50) 59 | top_terms = sorted(top_terms.items(), key=lambda x: x[1], reverse=True) 60 | with open(save_path, 'ab') as f: 61 | f.write(document.encode('utf-8') + '\n' + 62 | '\n'.join(['{0}, {1}'.format(term.encode('utf-8'), weight) for term, weight in top_terms]) 63 | + '\n\n') 64 | 65 | return save_path 
66 | 67 | 68 | def get_swearword_counts(corpus): 69 | with open('redditnlp/words/swearwords_english.txt', 'rb') as f: 70 | swearwords = [word.strip('\n') for word in f.readlines()] 71 | 72 | swearword_counts = dict() 73 | for document in corpus.get_document_list(): 74 | swearword_counts[document] = corpus.count_words_from_list(document, swearwords) 75 | return swearword_counts 76 | 77 | 78 | def get_vocabulary_sophistication(corpus): 79 | mean_word_lengths = dict() 80 | for document in corpus.get_document_list(): 81 | mean_word_lengths[document] = corpus.get_mean_word_length(document) 82 | return mean_word_lengths 83 | 84 | # Extract the word counts for each subreddit 85 | corpus, corpus_path = get_subreddit_vocabularies() 86 | print 'TF-IDF corpus saved to %s' % corpus_path 87 | 88 | # Get the top words by subreddit 89 | top_terms_path = save_subreddit_top_terms(corpus) 90 | print 'Top terms saved to %s' % top_terms_path 91 | 92 | # Get the swearword frequency 93 | swearword_frequency = get_swearword_counts(corpus) 94 | print 'Normalized swearword frequency:' 95 | for subreddit, frequency in swearword_frequency.items(): 96 | print '%s, %s' % (subreddit, frequency) 97 | 98 | # Get the average word length 99 | print '\nAverage word length by subreddit:' 100 | word_lengths = get_vocabulary_sophistication(corpus) 101 | for subreddit, frequency in word_lengths.items(): 102 | print '%s, %s' % (subreddit, frequency) 103 | 104 | ####################### 105 | # MACHINE LEARNING DEMO 106 | ####################### 107 | 108 | # Collect the comments for a particular user and determine which subreddit their comments best match up with 109 | counter = RedditWordCounter(USERNAME) 110 | corpus = TfidfCorpus(os.path.join(SAVE_DIR, CORPUS_FILE)) 111 | 112 | user_comments = counter.user_comments('way_fairer') 113 | corpus.train_classifier(classifier_type='LinearSVC', tfidf=True) 114 | print corpus.classify_document(user_comments) -------------------------------------------------------------------------------- /redditnlp/words/stopwords_english.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | above 5 | across 6 | actually 7 | after 8 | again 9 | against 10 | ago 11 | aint 12 | all 13 | almost 14 | alone 15 | along 16 | already 17 | also 18 | although 19 | always 20 | am 21 | among 22 | an 23 | and 24 | another 25 | any 26 | anybody 27 | anyone 28 | anything 29 | anywhere 30 | are 31 | area 32 | areas 33 | arent 34 | around 35 | as 36 | ask 37 | asked 38 | asking 39 | asks 40 | at 41 | away 42 | b 43 | back 44 | backed 45 | backing 46 | backs 47 | be 48 | became 49 | because 50 | become 51 | becomes 52 | been 53 | before 54 | began 55 | behind 56 | being 57 | beings 58 | below 59 | best 60 | better 61 | between 62 | big 63 | both 64 | but 65 | by 66 | c 67 | came 68 | can 69 | cannot 70 | cant 71 | case 72 | cases 73 | certain 74 | certainly 75 | clear 76 | clearly 77 | com 78 | come 79 | couk 80 | could 81 | couldnt 82 | couldve 83 | d 84 | dear 85 | did 86 | didnt 87 | differ 88 | different 89 | differently 90 | do 91 | does 92 | doesnt 93 | doing 94 | don 95 | done 96 | dont 97 | down 98 | downed 99 | downing 100 | downs 101 | during 102 | e 103 | each 104 | early 105 | eg 106 | either 107 | else 108 | end 109 | ended 110 | ending 111 | ends 112 | enough 113 | etc 114 | even 115 | evenly 116 | ever 117 | every 118 | everybody 119 | everyone 120 | everything 121 | everywhere 122 | f 123 | far 124 | few 125 | find 126 | finds 127 | first 128 | for 129 | four 130 
| from 131 | full 132 | fully 133 | further 134 | furthered 135 | furthering 136 | furthers 137 | g 138 | gave 139 | general 140 | generally 141 | get 142 | gets 143 | give 144 | given 145 | gives 146 | go 147 | going 148 | good 149 | got 150 | great 151 | greater 152 | greatest 153 | h 154 | had 155 | hadnt 156 | haha 157 | happen 158 | happened 159 | has 160 | hasnt 161 | have 162 | havent 163 | having 164 | he 165 | hed 166 | hell 167 | her 168 | here 169 | heres 170 | hers 171 | herself 172 | hes 173 | high 174 | higher 175 | highest 176 | him 177 | himself 178 | his 179 | how 180 | howd 181 | however 182 | howll 183 | hows 184 | http 185 | i 186 | id 187 | ie 188 | if 189 | ill 190 | im 191 | important 192 | in 193 | instead 194 | interested 195 | interesting 196 | into 197 | is 198 | isnt 199 | it 200 | itd 201 | itll 202 | its 203 | itself 204 | ive 205 | j 206 | just 207 | k 208 | keep 209 | keeps 210 | kind 211 | knew 212 | know 213 | known 214 | knows 215 | l 216 | large 217 | largely 218 | last 219 | later 220 | latest 221 | least 222 | less 223 | let 224 | lets 225 | like 226 | likely 227 | long 228 | longer 229 | longest 230 | looked 231 | looks 232 | lot 233 | m 234 | made 235 | make 236 | making 237 | many 238 | may 239 | maybe 240 | me 241 | might 242 | mightnt 243 | mightve 244 | more 245 | most 246 | mostly 247 | mr 248 | mrs 249 | much 250 | must 251 | mustnt 252 | mustve 253 | my 254 | myself 255 | n 256 | necessary 257 | need 258 | needed 259 | needing 260 | needs 261 | neither 262 | net 263 | never 264 | new 265 | newer 266 | newest 267 | next 268 | no 269 | nobody 270 | non 271 | noone 272 | nor 273 | not 274 | nothing 275 | now 276 | nowhere 277 | o 278 | of 279 | off 280 | often 281 | oh 282 | old 283 | older 284 | oldest 285 | on 286 | once 287 | one 288 | only 289 | open 290 | opened 291 | opening 292 | opens 293 | or 294 | order 295 | ordered 296 | ordering 297 | orders 298 | org 299 | other 300 | others 301 | ought 302 | our 303 | ours 304 | ourselves 305 | out 306 | over 307 | own 308 | p 309 | part 310 | parted 311 | parting 312 | parts 313 | per 314 | perhaps 315 | place 316 | places 317 | point 318 | pointed 319 | pointing 320 | points 321 | possible 322 | present 323 | presented 324 | presenting 325 | presents 326 | put 327 | puts 328 | q 329 | quite 330 | r 331 | rather 332 | really 333 | right 334 | room 335 | rooms 336 | s 337 | said 338 | same 339 | saw 340 | say 341 | says 342 | second 343 | seconds 344 | see 345 | seem 346 | seemed 347 | seeming 348 | seems 349 | sees 350 | several 351 | shall 352 | shant 353 | she 354 | shed 355 | shell 356 | shes 357 | should 358 | shouldnt 359 | shouldve 360 | show 361 | showed 362 | showing 363 | shows 364 | side 365 | sides 366 | since 367 | small 368 | smaller 369 | smallest 370 | so 371 | some 372 | somebody 373 | someone 374 | something 375 | somewhere 376 | state 377 | states 378 | still 379 | such 380 | sure 381 | t 382 | take 383 | taken 384 | teh 385 | than 386 | that 387 | thatll 388 | thats 389 | the 390 | their 391 | theirs 392 | them 393 | themselves 394 | then 395 | there 396 | therefore 397 | theres 398 | these 399 | they 400 | theyd 401 | theyll 402 | theyre 403 | theyve 404 | thing 405 | things 406 | think 407 | thinks 408 | this 409 | those 410 | though 411 | three 412 | through 413 | thus 414 | tis 415 | to 416 | today 417 | together 418 | too 419 | took 420 | toward 421 | turn 422 | turned 423 | turning 424 | turns 425 | twas 426 | two 427 | u 428 | under 429 | until 430 | up 431 | upon 432 | 
us 433 | use 434 | used 435 | uses 436 | using 437 | v 438 | very 439 | vs 440 | w 441 | want 442 | wanted 443 | wanting 444 | wants 445 | was 446 | wasnt 447 | way 448 | ways 449 | we 450 | wed 451 | well 452 | wells 453 | went 454 | were 455 | werent 456 | weve 457 | what 458 | whatd 459 | whats 460 | when 461 | whend 462 | whenll 463 | whens 464 | where 465 | whered 466 | wherell 467 | wheres 468 | whether 469 | which 470 | while 471 | who 472 | whod 473 | whole 474 | wholl 475 | whom 476 | whos 477 | whose 478 | why 479 | whyd 480 | whyll 481 | whys 482 | will 483 | with 484 | within 485 | without 486 | wont 487 | works 488 | would 489 | wouldnt 490 | wouldve 491 | www 492 | x 493 | y 494 | yes 495 | yet 496 | you 497 | youd 498 | youll 499 | young 500 | younger 501 | youngest 502 | your 503 | youre 504 | yours 505 | yourself 506 | yourselves 507 | youve 508 | z -------------------------------------------------------------------------------- /redditnlp/words/swearwords_english.txt: -------------------------------------------------------------------------------- 1 | 4r5e 2 | 5h1t 3 | 5hit 4 | a55 5 | anal 6 | anus 7 | ar5e 8 | arrse 9 | arse 10 | ass 11 | asses 12 | assfucker 13 | assfukka 14 | asshole 15 | assholes 16 | asswhole 17 | b00bs 18 | b17ch 19 | b1tch 20 | ballbag 21 | balls 22 | ballsack 23 | bastard 24 | beastial 25 | beastiality 26 | bellend 27 | bestial 28 | bestiality 29 | biatch 30 | bitch 31 | bitcher 32 | bitchers 33 | bitches 34 | bitchin 35 | bitching 36 | bloody 37 | blowjob 38 | blowjobs 39 | boiolas 40 | bollock 41 | bollok 42 | boner 43 | boob 44 | boobs 45 | booobs 46 | boooobs 47 | booooobs 48 | booooooobs 49 | breasts 50 | buceta 51 | bugger 52 | bum 53 | butt 54 | butthole 55 | buttmuch 56 | buttplug 57 | c0ck 58 | c0cksucker 59 | carpet muncher 60 | cawk 61 | chink 62 | cipa 63 | cl1t 64 | clit 65 | clitoris 66 | clits 67 | cnut 68 | cock 69 | cock-sucker 70 | cockface 71 | cockhead 72 | cockmunch 73 | cockmuncher 74 | cocks 75 | cocksuck 76 | cocksucked 77 | cocksucker 78 | cocksucking 79 | cocksucks 80 | cocksuka 81 | cocksukka 82 | cok 83 | cokmuncher 84 | coksucka 85 | coon 86 | cox 87 | crap 88 | cum 89 | cummer 90 | cumming 91 | cums 92 | cumshot 93 | cunilingus 94 | cunillingus 95 | cunnilingus 96 | cunt 97 | cuntlick 98 | cuntlicker 99 | cuntlicking 100 | cunts 101 | cyalis 102 | cyberfuc 103 | cyberfuck 104 | cyberfucked 105 | cyberfucker 106 | cyberfuckers 107 | cyberfucking 108 | d1ck 109 | damn 110 | dick 111 | dickhead 112 | dildo 113 | dildos 114 | dink 115 | dinks 116 | dirsa 117 | dlck 118 | doggin 119 | dogging 120 | donkeyribber 121 | doosh 122 | duche 123 | dyke 124 | ejaculate 125 | ejaculated 126 | ejaculates 127 | ejaculating 128 | ejaculatings 129 | ejaculation 130 | ejakulate 131 | f4nny 132 | fag 133 | fagging 134 | faggitt 135 | faggot 136 | faggs 137 | fagot 138 | fagots 139 | fags 140 | fanny 141 | fannyflaps 142 | fannyfucker 143 | fanyy 144 | fatass 145 | fcuk 146 | fcuker 147 | fcuking 148 | feck 149 | fecker 150 | felching 151 | fellate 152 | fellatio 153 | fingerfuck 154 | fingerfucked 155 | fingerfucker 156 | fingerfuckers 157 | fingerfucking 158 | fingerfucks 159 | fistfuck 160 | fistfucked 161 | fistfucker 162 | fistfuckers 163 | fistfucking 164 | fistfuckings 165 | fistfucks 166 | flange 167 | fook 168 | fooker 169 | fuck 170 | fucka 171 | fucked 172 | fucker 173 | fuckers 174 | fuckhead 175 | fuckheads 176 | fuckin 177 | fucking 178 | fuckings 179 | fuckingshitmotherfucker 180 | fuckme 181 | fucks 182 | fuckwhit 183 | 
fuckwit 184 | fudge packer 185 | fudgepacker 186 | fuk 187 | fuker 188 | fukker 189 | fukkin 190 | fuks 191 | fukwhit 192 | fukwit 193 | fux 194 | fux0r 195 | gangbang 196 | gangbanged 197 | gangbangs 198 | gaylord 199 | gaysex 200 | goatse 201 | God 202 | god-dam 203 | god-damned 204 | goddamn 205 | goddamned 206 | hardcoresex 207 | hell 208 | heshe 209 | hoar 210 | hoare 211 | hoer 212 | homo 213 | hore 214 | horniest 215 | horny 216 | hotsex 217 | jack-off 218 | jackoff 219 | jap 220 | jerk-off 221 | jism 222 | jiz 223 | jizm 224 | jizz 225 | kawk 226 | knob 227 | knobead 228 | knobed 229 | knobend 230 | knobhead 231 | knobjocky 232 | knobjokey 233 | kock 234 | kondum 235 | kondums 236 | kum 237 | kummer 238 | kumming 239 | kums 240 | kunilingus 241 | l3itch 242 | labia 243 | lmfao 244 | lust 245 | lusting 246 | m0f0 247 | m0fo 248 | m45terbate 249 | ma5terb8 250 | ma5terbate 251 | masochist 252 | master-bate 253 | masterb8 254 | masterbat3 255 | masterbate 256 | masterbation 257 | masterbations 258 | masturbate 259 | mof0 260 | mofo 261 | mothafuck 262 | mothafucka 263 | mothafuckas 264 | mothafuckaz 265 | mothafucked 266 | mothafucker 267 | mothafuckers 268 | mothafuckin 269 | mothafucking 270 | mothafuckings 271 | mothafucks 272 | motherfuck 273 | motherfucked 274 | motherfucker 275 | motherfuckers 276 | motherfuckin 277 | motherfucking 278 | motherfuckings 279 | motherfuckka 280 | motherfucks 281 | muff 282 | mutha 283 | muthafecker 284 | muthafuckker 285 | muther 286 | mutherfucker 287 | n1gga 288 | n1gger 289 | nazi 290 | nigg3r 291 | nigg4h 292 | nigga 293 | niggah 294 | niggas 295 | niggaz 296 | nigger 297 | niggers 298 | nob 299 | nobhead 300 | nobjocky 301 | nobjokey 302 | numbnuts 303 | nutsack 304 | orgasim 305 | orgasims 306 | orgasm 307 | orgasms 308 | p0rn 309 | pawn 310 | pecker 311 | penis 312 | penisfucker 313 | phonesex 314 | phuck 315 | phuk 316 | phuked 317 | phuking 318 | phukked 319 | phukking 320 | phuks 321 | phuq 322 | pigfucker 323 | pimpis 324 | piss 325 | pissed 326 | pisser 327 | pissers 328 | pisses 329 | pissflaps 330 | pissin 331 | pissing 332 | pissoff 333 | poop 334 | porn 335 | porno 336 | pornography 337 | pornos 338 | prick 339 | pricks 340 | pron 341 | pube 342 | pusse 343 | pussi 344 | pussies 345 | pussy 346 | pussys 347 | rectum 348 | retard 349 | rimjaw 350 | rimming 351 | sadist 352 | schlong 353 | screwing 354 | scroat 355 | scrote 356 | scrotum 357 | semen 358 | sex 359 | sh1t 360 | shag 361 | shagger 362 | shaggin 363 | shagging 364 | shemale 365 | shit 366 | shitdick 367 | shite 368 | shited 369 | shitey 370 | shitfuck 371 | shitfull 372 | shithead 373 | shiting 374 | shitings 375 | shits 376 | shitted 377 | shitter 378 | shitters 379 | shitting 380 | shittings 381 | shitty 382 | skank 383 | slut 384 | sluts 385 | smegma 386 | smut 387 | snatch 388 | spac 389 | spunk 390 | t1tt1e5 391 | t1tties 392 | teets 393 | teez 394 | testical 395 | testicle 396 | tit 397 | titfuck 398 | tits 399 | titt 400 | tittie5 401 | tittiefucker 402 | titties 403 | tittyfuck 404 | tittywank 405 | titwank 406 | tosser 407 | turd 408 | tw4t 409 | twat 410 | twathead 411 | twatty 412 | twunt 413 | twunter 414 | v14gra 415 | v1gra 416 | vagina 417 | viagra 418 | vulva 419 | w00se 420 | wang 421 | wank 422 | wanker 423 | wanky 424 | whoar 425 | whore 426 | willies 427 | willy 428 | xrated 429 | xxx1 -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Reddit NLP Package [![Build Status](https://travis-ci.org/jaijuneja/reddit-nlp.svg?branch=master)](https://travis-ci.org/jaijuneja/reddit-nlp) [![PyPI version](https://badge.fury.io/py/redditnlp.svg)](https://pypi.python.org/pypi/redditnlp) 2 | 3 | A lightweight Python module that performs tokenization and processing of text on Reddit. It allows you to analyze users, titles, comments and subreddits to understand their vocabulary. The module comes packaged with its own inverted index builder for storing vocabularies and word frequencies, such that you can generate and manipulate large corpora of tf-idf weighted words without worrying about implementation. This is especially useful if you're running scripts over long periods and wish to save intermediate results. 4 | 5 | ## License 6 | 7 | Copyright 2014 Jai Juneja. 8 | 9 | This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 10 | 11 | This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 12 | 13 | You should have received a copy of the GNU General Public License along with this program. If not, see [http://www.gnu.org/licenses/](http://www.gnu.org/licenses/). 14 | 15 | ## Installation 16 | 17 | ### Using pip or easy_install 18 | 19 | You can download the latest release version using `pip` or `easy_install`: 20 | 21 | ``` 22 | pip install redditnlp 23 | ``` 24 | 25 | ### Latest development version 26 | You can alternatively download the latest development version directly from GitHub: 27 | 28 | ``` 29 | git clone https://github.com/jaijuneja/reddit-nlp.git 30 | ``` 31 | 32 | Change into the root directory: 33 | 34 | ``` 35 | cd reddit-nlp 36 | ``` 37 | 38 | Then install the package: 39 | 40 | ``` 41 | python setup.py install 42 | ``` 43 | 44 | ### Error: the required version of setuptools is not available 45 | 46 | Upon running `pip install` or the `setup.py` script you might get a message like this: 47 | 48 | ``` 49 | The required version of setuptools (>=0.7) is not available, and can't be installed while this script is running. Please install a more recent version first, using 'easy_install -U setuptools'. 50 | ``` 51 | 52 | This appears because you have a very outdated version of the setuptools package. The redditnlp package typically bootstraps a newer version of setuptools during install, but it isn't working in this case. You need to update setuptools using `easy_install -U setuptools` (you may need to apply `sudo` to this command). 53 | 54 | If the above command doesn't do anything then it is likely that your version of setuptools was installed using a package manager such as yum, apt or pip. Check your package manager for a package called python-setuptools or try `pip install setuptools --upgrade` and then re-run the install. 55 | 56 | ## Usage 57 | 58 | A more complex sample program using the redditnlp module can be found at `https://github.com/jaijuneja/reddit-nlp/blob/master/example.py`. Here we outline a basic word counter application.
59 | 60 | The module consists of three classes: 61 | 62 | * A basic word counter class, `WordCounter`, which performs tokenization and counting on input strings 63 | * A Reddit word counter, `RedditWordCounter`, which extends the `WordCounter` class to allow interaction with the Reddit API 64 | * A tf-idf corpus builder, `TfidfCorpus`, which stores large word corpora in an inverted index 65 | 66 | These three classes can be instantiated as follows: 67 | 68 | ```python 69 | from redditnlp import WordCounter, RedditWordCounter, TfidfCorpus 70 | 71 | word_counter = WordCounter() 72 | reddit_counter = RedditWordCounter('your_username') 73 | corpus = TfidfCorpus() 74 | ``` 75 | 76 | To adhere to the Reddit API rules, please use your actual Reddit username in place of `'your_username'` above. 77 | 78 | For further information on the attributes and methods of these classes you can run: 79 | 80 | ```python 81 | help(WordCounter) 82 | help(RedditWordCounter) 83 | help(TfidfCorpus) 84 | ``` 85 | 86 | Next, we can tokenize 1000 comments from a selection of subreddits, extract the most common words and save all of our data to disk: 87 | 88 | ```python 89 | for subreddit in ['funny', 'aww', 'pics']: 90 | # Tokenize and count words for 1000 comments 91 | word_counts = reddit_counter.subreddit_comments(subreddit, limit=1000) 92 | 93 | # Add the word counts to our corpus 94 | corpus.add_document(word_counts, subreddit) 95 | 96 | # Save the corpus to a specified path (must be JSON) 97 | corpus.save(path='word_counts.json') 98 | 99 | # Save the top 50 words (by tf-idf score) from each subreddit to a text file 100 | for subreddit in corpus.get_document_list(): 101 | top_words = corpus.get_top_terms(subreddit, num_terms=50) 102 | with open('top_words.txt', 'ab') as f: 103 | f.write(subreddit + '\n' + '\n'.join(top_words.keys())) 104 | ``` 105 | 106 | ### Machine learning 107 | 108 | `redditnlp` now supports some of scikit-learn's machine learning capabilities. Several in-built functions enable the user to: 109 | 110 | * Convert a TfidfCorpus object into a scipy sparse feature matrix (using `build_feature_matrix()`) 111 | * Train a classifier using the documents contained in a TfidfCorpus (with `train_classifier()`) and thereafter classify new documents (with `classify_document()`) 112 | 113 | Below is an example of a simple machine learning application that loads a corpus of subreddit comment data, uses it to train a classifier and determines which subreddit a user's comments most closely match: 114 | 115 | ```python 116 | # Load the corpus of subreddit comment data and use it to train a classifier 117 | corpus = TfidfCorpus('path/to/subreddit_corpus.json') 118 | corpus.train_classifier(classifier_type='LinearSVC', tfidf=True) 119 | 120 | # Tokenize all of your comments 121 | counter = RedditWordCounter('your_username') 122 | user_comments = counter.user_comments('your_username') 123 | 124 | # Classify your comments against the documents in the corpus 125 | print corpus.classify_document(user_comments) 126 | ``` 127 | 128 | ### Multiprocessing 129 | 130 | `redditnlp` uses the [PRAW](https://github.com/praw-dev/praw) Reddit API wrapper. It supports multiprocessing, such that you can run multiple instances of `RedditWordCounter` without exceeding Reddit's rate limit. There is more information about this in the [PRAW documentation](https://praw.readthedocs.org/en/latest/pages/multiprocess.html) but for the sake of completeness an example is included below.
131 | 132 | First, you must initialise a request handling server on your local machine. This is done using the terminal/command line: 133 | 134 | ``` 135 | praw-multiprocess 136 | ``` 137 | 138 | Next, you can instantiate multiple `RedditWordCounter` objects and set the parameter `multiprocess=True` so that outgoing API calls are handled: 139 | 140 | ``` 141 | counter = RedditWordCounter('your_username', multiprocess=True) 142 | ``` 143 | 144 | ## Contact 145 | 146 | If you have any questions or have encountered an error, feel free to contact me at `jai -dot- juneja -at- gmail -dot- com`. -------------------------------------------------------------------------------- /ez_setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Bootstrap setuptools installation 3 | 4 | To use setuptools in your package's setup.py, include this 5 | file in the same directory and add this to the top of your setup.py:: 6 | 7 | from ez_setup import use_setuptools 8 | use_setuptools() 9 | 10 | To require a specific version of setuptools, set a download 11 | mirror, or use an alternate download directory, simply supply 12 | the appropriate options to ``use_setuptools()``. 13 | 14 | This file can also be run as a script to install or upgrade setuptools. 15 | """ 16 | import os 17 | import shutil 18 | import sys 19 | import tempfile 20 | import zipfile 21 | import optparse 22 | import subprocess 23 | import platform 24 | import textwrap 25 | import contextlib 26 | 27 | from distutils import log 28 | 29 | try: 30 | from urllib.request import urlopen 31 | except ImportError: 32 | from urllib2 import urlopen 33 | 34 | try: 35 | from site import USER_SITE 36 | except ImportError: 37 | USER_SITE = None 38 | 39 | DEFAULT_VERSION = "7.0" 40 | DEFAULT_URL = "https://pypi.python.org/packages/source/s/setuptools/" 41 | 42 | def _python_cmd(*args): 43 | """ 44 | Return True if the command succeeded. 
45 | """ 46 | args = (sys.executable,) + args 47 | return subprocess.call(args) == 0 48 | 49 | 50 | def _install(archive_filename, install_args=()): 51 | with archive_context(archive_filename): 52 | # installing 53 | log.warn('Installing Setuptools') 54 | if not _python_cmd('setup.py', 'install', *install_args): 55 | log.warn('Something went wrong during the installation.') 56 | log.warn('See the error message above.') 57 | # exitcode will be 2 58 | return 2 59 | 60 | 61 | def _build_egg(egg, archive_filename, to_dir): 62 | with archive_context(archive_filename): 63 | # building an egg 64 | log.warn('Building a Setuptools egg in %s', to_dir) 65 | _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir) 66 | # returning the result 67 | log.warn(egg) 68 | if not os.path.exists(egg): 69 | raise IOError('Could not build the egg.') 70 | 71 | 72 | class ContextualZipFile(zipfile.ZipFile): 73 | """ 74 | Supplement ZipFile class to support context manager for Python 2.6 75 | """ 76 | 77 | def __enter__(self): 78 | return self 79 | 80 | def __exit__(self, type, value, traceback): 81 | self.close() 82 | 83 | def __new__(cls, *args, **kwargs): 84 | """ 85 | Construct a ZipFile or ContextualZipFile as appropriate 86 | """ 87 | if hasattr(zipfile.ZipFile, '__exit__'): 88 | return zipfile.ZipFile(*args, **kwargs) 89 | return super(ContextualZipFile, cls).__new__(cls) 90 | 91 | 92 | @contextlib.contextmanager 93 | def archive_context(filename): 94 | # extracting the archive 95 | tmpdir = tempfile.mkdtemp() 96 | log.warn('Extracting in %s', tmpdir) 97 | old_wd = os.getcwd() 98 | try: 99 | os.chdir(tmpdir) 100 | with ContextualZipFile(filename) as archive: 101 | archive.extractall() 102 | 103 | # going in the directory 104 | subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) 105 | os.chdir(subdir) 106 | log.warn('Now working in %s', subdir) 107 | yield 108 | 109 | finally: 110 | os.chdir(old_wd) 111 | shutil.rmtree(tmpdir) 112 | 113 | 114 | def _do_download(version, download_base, to_dir, download_delay): 115 | egg = os.path.join(to_dir, 'setuptools-%s-py%d.%d.egg' 116 | % (version, sys.version_info[0], sys.version_info[1])) 117 | if not os.path.exists(egg): 118 | archive = download_setuptools(version, download_base, 119 | to_dir, download_delay) 120 | _build_egg(egg, archive, to_dir) 121 | sys.path.insert(0, egg) 122 | 123 | # Remove previously-imported pkg_resources if present (see 124 | # https://bitbucket.org/pypa/setuptools/pull-request/7/ for details). 125 | if 'pkg_resources' in sys.modules: 126 | del sys.modules['pkg_resources'] 127 | 128 | import setuptools 129 | setuptools.bootstrap_install_from = egg 130 | 131 | 132 | def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, 133 | to_dir=os.curdir, download_delay=15): 134 | to_dir = os.path.abspath(to_dir) 135 | rep_modules = 'pkg_resources', 'setuptools' 136 | imported = set(sys.modules).intersection(rep_modules) 137 | try: 138 | import pkg_resources 139 | except ImportError: 140 | return _do_download(version, download_base, to_dir, download_delay) 141 | try: 142 | pkg_resources.require("setuptools>=" + version) 143 | return 144 | except pkg_resources.DistributionNotFound: 145 | return _do_download(version, download_base, to_dir, download_delay) 146 | except pkg_resources.VersionConflict as VC_err: 147 | if imported: 148 | msg = textwrap.dedent(""" 149 | The required version of setuptools (>={version}) is not available, 150 | and can't be installed while this script is running. 
Please 151 | install a more recent version first, using 152 | 'easy_install -U setuptools'. 153 | 154 | (Currently using {VC_err.args[0]!r}) 155 | """).format(VC_err=VC_err, version=version) 156 | sys.stderr.write(msg) 157 | sys.exit(2) 158 | 159 | # otherwise, reload ok 160 | del pkg_resources, sys.modules['pkg_resources'] 161 | return _do_download(version, download_base, to_dir, download_delay) 162 | 163 | def _clean_check(cmd, target): 164 | """ 165 | Run the command to download target. If the command fails, clean up before 166 | re-raising the error. 167 | """ 168 | try: 169 | subprocess.check_call(cmd) 170 | except subprocess.CalledProcessError: 171 | if os.access(target, os.F_OK): 172 | os.unlink(target) 173 | raise 174 | 175 | def download_file_powershell(url, target): 176 | """ 177 | Download the file at url to target using Powershell (which will validate 178 | trust). Raise an exception if the command cannot complete. 179 | """ 180 | target = os.path.abspath(target) 181 | ps_cmd = ( 182 | "[System.Net.WebRequest]::DefaultWebProxy.Credentials = " 183 | "[System.Net.CredentialCache]::DefaultCredentials; " 184 | "(new-object System.Net.WebClient).DownloadFile(%(url)r, %(target)r)" 185 | % vars() 186 | ) 187 | cmd = [ 188 | 'powershell', 189 | '-Command', 190 | ps_cmd, 191 | ] 192 | _clean_check(cmd, target) 193 | 194 | def has_powershell(): 195 | if platform.system() != 'Windows': 196 | return False 197 | cmd = ['powershell', '-Command', 'echo test'] 198 | with open(os.path.devnull, 'wb') as devnull: 199 | try: 200 | subprocess.check_call(cmd, stdout=devnull, stderr=devnull) 201 | except Exception: 202 | return False 203 | return True 204 | 205 | download_file_powershell.viable = has_powershell 206 | 207 | def download_file_curl(url, target): 208 | cmd = ['curl', url, '--silent', '--output', target] 209 | _clean_check(cmd, target) 210 | 211 | def has_curl(): 212 | cmd = ['curl', '--version'] 213 | with open(os.path.devnull, 'wb') as devnull: 214 | try: 215 | subprocess.check_call(cmd, stdout=devnull, stderr=devnull) 216 | except Exception: 217 | return False 218 | return True 219 | 220 | download_file_curl.viable = has_curl 221 | 222 | def download_file_wget(url, target): 223 | cmd = ['wget', url, '--quiet', '--output-document', target] 224 | _clean_check(cmd, target) 225 | 226 | def has_wget(): 227 | cmd = ['wget', '--version'] 228 | with open(os.path.devnull, 'wb') as devnull: 229 | try: 230 | subprocess.check_call(cmd, stdout=devnull, stderr=devnull) 231 | except Exception: 232 | return False 233 | return True 234 | 235 | download_file_wget.viable = has_wget 236 | 237 | def download_file_insecure(url, target): 238 | """ 239 | Use Python to download the file, even though it cannot authenticate the 240 | connection. 241 | """ 242 | src = urlopen(url) 243 | try: 244 | # Read all the data in one block. 245 | data = src.read() 246 | finally: 247 | src.close() 248 | 249 | # Write all the data in one block to avoid creating a partial file. 
250 | with open(target, "wb") as dst: 251 | dst.write(data) 252 | 253 | download_file_insecure.viable = lambda: True 254 | 255 | def get_best_downloader(): 256 | downloaders = ( 257 | download_file_powershell, 258 | download_file_curl, 259 | download_file_wget, 260 | download_file_insecure, 261 | ) 262 | viable_downloaders = (dl for dl in downloaders if dl.viable()) 263 | return next(viable_downloaders, None) 264 | 265 | def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, 266 | to_dir=os.curdir, delay=15, downloader_factory=get_best_downloader): 267 | """ 268 | Download setuptools from a specified location and return its filename 269 | 270 | `version` should be a valid setuptools version number that is available 271 | as an sdist for download under the `download_base` URL (which should end 272 | with a '/'). `to_dir` is the directory where the egg will be downloaded. 273 | `delay` is the number of seconds to pause before an actual download 274 | attempt. 275 | 276 | ``downloader_factory`` should be a function taking no arguments and 277 | returning a function for downloading a URL to a target. 278 | """ 279 | # making sure we use the absolute path 280 | to_dir = os.path.abspath(to_dir) 281 | zip_name = "setuptools-%s.zip" % version 282 | url = download_base + zip_name 283 | saveto = os.path.join(to_dir, zip_name) 284 | if not os.path.exists(saveto): # Avoid repeated downloads 285 | log.warn("Downloading %s", url) 286 | downloader = downloader_factory() 287 | downloader(url, saveto) 288 | return os.path.realpath(saveto) 289 | 290 | def _build_install_args(options): 291 | """ 292 | Build the arguments to 'python setup.py install' on the setuptools package 293 | """ 294 | return ['--user'] if options.user_install else [] 295 | 296 | def _parse_args(): 297 | """ 298 | Parse the command line for options 299 | """ 300 | parser = optparse.OptionParser() 301 | parser.add_option( 302 | '--user', dest='user_install', action='store_true', default=False, 303 | help='install in user site package (requires Python 2.6 or later)') 304 | parser.add_option( 305 | '--download-base', dest='download_base', metavar="URL", 306 | default=DEFAULT_URL, 307 | help='alternative URL from where to download the setuptools package') 308 | parser.add_option( 309 | '--insecure', dest='downloader_factory', action='store_const', 310 | const=lambda: download_file_insecure, default=get_best_downloader, 311 | help='Use internal, non-validating downloader' 312 | ) 313 | parser.add_option( 314 | '--version', help="Specify which version to download", 315 | default=DEFAULT_VERSION, 316 | ) 317 | options, args = parser.parse_args() 318 | # positional arguments are ignored 319 | return options 320 | 321 | def main(): 322 | """Install or upgrade setuptools and EasyInstall""" 323 | options = _parse_args() 324 | archive = download_setuptools( 325 | version=options.version, 326 | download_base=options.download_base, 327 | downloader_factory=options.downloader_factory, 328 | ) 329 | return _install(archive, _build_install_args(options)) 330 | 331 | if __name__ == '__main__': 332 | sys.exit(main()) -------------------------------------------------------------------------------- /redditnlp/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | 4 | import os 5 | import math 6 | import errno 7 | import json 8 | import operator 9 | import numpy as np 10 | import praw 11 | import urllib2 12 | import 
nltk 13 | 14 | from nltk.stem.porter import PorterStemmer 15 | from collections import Counter, OrderedDict 16 | from time import time, sleep 17 | from string import punctuation 18 | from praw.handlers import MultiprocessHandler 19 | from sklearn.feature_extraction import DictVectorizer 20 | from sklearn.feature_extraction.text import TfidfTransformer 21 | from sklearn.svm import LinearSVC 22 | from sklearn.naive_bayes import MultinomialNB 23 | from sklearn.multiclass import OneVsRestClassifier 24 | 25 | 26 | class WordCounter(object): 27 | """Performs word counting given an input string. 28 | 29 | Data attributes: 30 | stemmer: Porter stemmer used optionally to perform stemming of extracted words 31 | stopwords (list): list of stop words used to reject common words such as 'and' 32 | 33 | Methods: 34 | tokenize 35 | get_word_count 36 | remove_punctuation 37 | remove_stopwords 38 | stem_tokens: perform Porter stemming on a list of words 39 | """ 40 | 41 | def __init__(self): 42 | self.stemmer = PorterStemmer() 43 | 44 | # Load stop-words 45 | application_root = os.path.dirname(__file__) 46 | stopwords = os.path.join(application_root, 'words/stopwords_english.txt') 47 | with open(stopwords, 'rb') as stopwords_file: 48 | self.stopwords = [word.strip('\n') for word in stopwords_file.readlines()] 49 | 50 | def tokenize(self, text): 51 | """Tokenize an input string into a list of words (with punctuation removed).""" 52 | text = text.lower() 53 | punctuation_removed = self.remove_punctuation(text) 54 | tokens = nltk.word_tokenize(punctuation_removed) 55 | return tokens 56 | 57 | def get_word_count(self, text, stop_words=True, stemming=False): 58 | """Return a dict (Counter) of words and corresponding counts given an input string.""" 59 | tokens = self.tokenize(text) 60 | 61 | # Remove stop words 62 | if stop_words: 63 | tokens = self.remove_stopwords(tokens) 64 | 65 | if stemming: 66 | tokens = self.stem_tokens(tokens) 67 | 68 | return Counter(tokens) 69 | 70 | @staticmethod 71 | def remove_punctuation(text, replacement=' ', exclude="'"): 72 | """Remove punctuation from an input string.""" 73 | text = text.replace("'", "") # Single quote always stripped out 74 | for p in set(list(punctuation)) - set(list(exclude)): 75 | text = text.replace(p, replacement) 76 | 77 | text = ' '.join(text.split()) # Remove excess whitespace 78 | return text 79 | 80 | def remove_stopwords(self, tokens): 81 | """Remove all stopwords from a list of word tokens.""" 82 | return [word for word in tokens if word not in self.stopwords] 83 | 84 | def stem_tokens(self, tokens): 85 | """Perform porter stemming on a list of word tokens.""" 86 | return [self.stemmer.stem(word) for word in tokens] 87 | 88 | def count_words_from_list(self, text, word_list, normalize=True): 89 | """Count the number of times the words from a given list appear in text.""" 90 | text = self.tokenize(text) 91 | count = sum([1 for word in text if word in word_list]) 92 | if normalize: 93 | count /= len(text) 94 | return count 95 | 96 | 97 | class RedditWordCounter(WordCounter): 98 | """Performs word counting of comments and titles in Reddit using the Reddit API. 99 | 100 | To initialise a new RedditWordCounter instance: 101 | >>> counter = RedditWordCounter('your_username') 102 | 103 | To adhere to the Reddit API rules, please provide your Reddit username in place of 'your_username' above. 104 | This will ensure that the app doesn't get banned from Reddit! 
105 | 106 | Data Attributes: 107 | user_agent (str): required to connect to Reddit 108 | reddit: instance of the Reddit API connection 109 | word_counter: WordCounter object used to perform word counting given input strings 110 | 111 | Methods: 112 | subreddit_comments: word count from comments of a given subreddit 113 | subreddit_titles: word count from titles of a given subreddit 114 | user_comments: word count from comments of a given user 115 | check_connection: check that there is a working connection to Reddit 116 | """ 117 | 118 | def __init__( 119 | self, 120 | user, 121 | multiprocess=False 122 | ): 123 | """Initialise a RedditWordCounter object. 124 | 125 | :param user: your Reddit username 126 | :param multiprocess: if True, will handle requests from multiple RedditWordCounter objects (False by default) 127 | :return: 128 | """ 129 | super(RedditWordCounter, self).__init__() # Initialise the WordCounter class 130 | handler = MultiprocessHandler() if multiprocess else None 131 | self.user_agent = 'redditvocab/0.1 bot by {0}'.format(user) 132 | self.reddit = praw.Reddit(user_agent=self.user_agent, handler=handler) 133 | 134 | def subreddit_comments(self, subreddit_name, limit=1000, stemming=False, get_all_comments=False): 135 | """Retrieve the vocabulary from the comments of a subreddit. 136 | 137 | :param subreddit_name: name of the subreddit excluding '/r/' 138 | :param limit: number of comments to retrieve (1000 by default) - note that at present the limit is approximate 139 | :param stemming: if True, performs stemming on tokenized words (False by default) 140 | :param get_all_comments: if True, retrieves all comments per submission. Note that this requires descending the 141 | comment tree, which drastically increases the number of API calls and reduces performance due to rate-limiting. 142 | :return: Counter (dict) of comment vocabulary in the form {'term1': freq, 'term2': freq, ...} 143 | """ 144 | 145 | def get_vocabulary(comments): 146 | 147 | vocab = Counter() 148 | num_comments = 0 149 | for comment in comments: 150 | if isinstance(comment, praw.objects.Comment): 151 | try: 152 | # Get the word counts for the comment 153 | vocab += self.get_word_count(comment.body, stemming=stemming) 154 | num_comments += 1 155 | 156 | except ValueError: 157 | pass 158 | elif isinstance(comment, praw.objects.MoreComments) and get_all_comments: 159 | new_vocab, num_new_comments = get_vocabulary(comment.comments) 160 | vocab += new_vocab 161 | num_comments += num_new_comments 162 | 163 | return vocab, num_comments 164 | 165 | subreddit = self.reddit.get_subreddit(subreddit_name) 166 | 167 | # Initialise loop variables 168 | vocabulary = Counter() 169 | comments_processed = 0 170 | 171 | for submission in subreddit.get_hot(limit=None): 172 | submission_comments = praw.helpers.flatten_tree(submission.comments) 173 | 174 | # Run over all comments 175 | submission_vocabulary, new_comments = get_vocabulary(submission_comments) 176 | vocabulary += submission_vocabulary 177 | comments_processed += new_comments 178 | 179 | print("Comments processed for subreddit '{0}': {1}".format(subreddit_name, comments_processed), end="\r") 180 | 181 | if limit and comments_processed >= limit: 182 | break 183 | 184 | print('\n') 185 | return vocabulary 186 | 187 | def subreddit_titles(self, subreddit_name, limit=1000, stemming=False): 188 | """Retrieve the vocabulary from the titles in a subreddit. 
189 | 190 | :param subreddit_name: name of the subreddit excluding '/r/' 191 | :param limit: number of submissions to process (1000 by default - note that this is the maximum) 192 | :param stemming: if True, performs stemming on tokenized words (False by default) 193 | :return: Counter (dict) of title vocabulary in the form {'term1': freq, 'term2': freq, ...} 194 | """ 195 | 196 | subreddit = self.reddit.get_subreddit(subreddit_name) 197 | 198 | # Initialise loop variables 199 | vocabulary = Counter() 200 | submissions_processed = 0 201 | 202 | for submission in subreddit.get_hot(limit=limit): 203 | try: 204 | # Update the word counter to include the comment 205 | vocabulary += self.get_word_count(submission.title, stemming=stemming) 206 | submissions_processed += 1 207 | 208 | if submissions_processed % 100 == 0 or submissions_processed >= limit: 209 | print("Titles processed for subreddit '{0}': {1}".format(subreddit_name, submissions_processed), 210 | end="\r") 211 | 212 | except ValueError: 213 | pass 214 | 215 | print('\n') 216 | return vocabulary 217 | 218 | def user_comments(self, username, limit=1000, stemming=False): 219 | """Retrieve the vocabulary of a user's comments. 220 | 221 | :param username: user's Reddit username excluding '/u/' 222 | :param limit: number of comments to process (1000 by default - note that this is the maxmimum) 223 | :param stemming: if True, performs stemming on tokenized words (False by default) 224 | :return: Counter (dict) of user's vocabulary in the form {'term1': freq, 'term2': freq, ...} 225 | """ 226 | user = self.reddit.get_redditor(username) 227 | 228 | vocabulary = Counter() 229 | comments_processed = 0 230 | for comment in user.get_comments(limit=limit): 231 | try: 232 | # Get the word counts for the comment 233 | vocabulary += self.get_word_count(comment.body, stemming=stemming) 234 | comments_processed += 1 235 | 236 | if comments_processed % 100 == 0 or comments_processed >= limit: 237 | print("Comments processed for user '{0}': {1}".format(username, comments_processed), end="\r") 238 | 239 | except ValueError: 240 | pass 241 | 242 | print('\n') 243 | return vocabulary 244 | 245 | def check_connection(self, timeout=10): 246 | """Wait for a server response.""" 247 | header = {'User-Agent': self.user_agent} 248 | start = time() 249 | while True: 250 | try: 251 | request = urllib2.Request("http://www.reddit.com/", headers=header) 252 | response = urllib2.urlopen(request) 253 | response.read() 254 | sleep(2) # Adhere to Reddit API rule of 30 requests per minute 255 | if response.getcode() == 200: 256 | return True 257 | except urllib2.HTTPError as err: 258 | print(err) 259 | finally: 260 | if time() - start > timeout: 261 | return False 262 | 263 | 264 | class TfidfCorpus(object): 265 | """Stores features (e.g. words) and their document frequencies in an inverted index. Useful for NLP and machine 266 | learning applications. 267 | 268 | To initialise a new TfidfCorpus instance: 269 | >>> corpus = TfidfCorpus() 270 | 271 | By default the corpus will save to 'tfidf_corpus/corpus.json'. 
You can specify an existing file to load 272 | or a specific save path as follows: 273 | >>> corpus = TfidfCorpus(corpus_path='path/to/corpus.json') 274 | 275 | Data Attributes: 276 | corpus_path (str): save/load path of the corpus 277 | document_list (list): list of strings indicating the documents stored in the corpus 278 | document_lengths (dict): sum of word frequencies contained in each document, takes the form: 279 | { 280 | "document1": int, 281 | "document2": int, 282 | ... 283 | } 284 | corpus (dict): dict of Counters that takes the form: 285 | { 286 | "term1": { 287 | "document1": int, 288 | "document2": int 289 | }, 290 | "term2": { 291 | "document1": int, 292 | "document2": int, 293 | }, 294 | ... 295 | } 296 | 297 | Methods: 298 | save 299 | load 300 | get_corpus_path 301 | get_document_list 302 | add_document 303 | get_document 304 | delete_document 305 | append_document 306 | get_idf 307 | get_tfidf 308 | get_document_tfidfs 309 | get_top_terms 310 | build_feature_matrix 311 | train_classifier 312 | classify_document 313 | count_words_from_list 314 | get_mean_word_length 315 | check_corpus_path 316 | """ 317 | 318 | def __init__(self, corpus_path='corpus.json'): 319 | 320 | # Check that the corpus path is valid 321 | self.check_corpus_path(corpus_path) 322 | self.corpus_path = corpus_path 323 | self.document_list = list() 324 | self.document_lengths = dict() 325 | self.corpus = dict() 326 | 327 | # Initialise scikit-learn attributes 328 | self.vectorizer = None 329 | self.tfidf_transformer = None 330 | self.feature_matrix = None 331 | self.classifier = None 332 | 333 | if os.path.isfile(corpus_path): 334 | self.load() 335 | 336 | def save(self, path=''): 337 | """Save the corpus to a JSON file at the path specified in self.corpus_path. 338 | 339 | :param path: you can specify a save path (must end in .json), which will change self.corpus_path 340 | """ 341 | if path: 342 | self.check_corpus_path(path) 343 | self.corpus_path = path 344 | 345 | with open(self.corpus_path, 'wb') as save_file: 346 | json.dump( 347 | { 348 | 'document_list': self.document_list, 349 | 'document_lengths': self.document_lengths, 350 | 'corpus': self.corpus 351 | }, 352 | save_file 353 | ) 354 | 355 | def load(self): 356 | """Load the corpus from a JSON file. 
File path defined in self.corpus_path.""" 357 | with open(self.corpus_path, 'rb') as load_file: 358 | data = json.load(load_file) 359 | 360 | try: 361 | self.document_list = data['document_list'] 362 | self.document_lengths = data['document_lengths'] 363 | self.corpus = data['corpus'] 364 | 365 | # Make sure that frequency dicts in corpus are Counter objects 366 | for term in self.corpus.iterkeys(): 367 | self.corpus[term] = Counter(self.corpus[term]) 368 | except KeyError as err: 369 | print('Provided file does not have expected structure') 370 | raise err 371 | 372 | def get_corpus_path(self): 373 | return self.corpus_path 374 | 375 | def set_corpus_path(self, path): 376 | if not path.lower().endswith('.json'): 377 | raise Exception('Corpus path must be a JSON file (.json extension).') 378 | self.corpus_path = path 379 | 380 | def get_document_list(self): 381 | return self.document_list 382 | 383 | def get_vocabulary(self): 384 | """Return the full list of terms in the corpus.""" 385 | return self.corpus.keys() 386 | 387 | def get_document(self, document_name): 388 | """Retrieve a document from the corpus.""" 389 | if document_name not in self.document_list: 390 | raise Exception("No document with name '{0}' found in corpus".format(document_name)) 391 | return Counter({ 392 | term: freqs[document_name] for term, freqs in self.corpus.iteritems() if freqs.get(document_name, 0) 393 | }) 394 | 395 | def add_document(self, document, document_name): 396 | """Load a document into the corpus. 397 | 398 | :param document: takes the form {'term1': freq1, 'term2', freq2, ...} 399 | :param document_name: string which uniquely identifies the document 400 | """ 401 | if document_name in self.document_list: 402 | print("Document with name '{0}' already exists in corpus." 403 | "Do you wish to replace it?".format(document_name)) 404 | while True: 405 | replace_doc = raw_input("Response (y/n): ") 406 | if replace_doc in ['y', 'yes', 'ye']: 407 | self.delete_document(document_name) 408 | break 409 | elif replace_doc in ['n', 'no']: 410 | return 411 | else: 412 | print('Could not interpret response. Try again.') 413 | 414 | for term, freq in document.iteritems(): 415 | if not self.corpus.get(term, False): 416 | self.corpus[term] = Counter() 417 | 418 | self.corpus[term][document_name] = freq 419 | 420 | self.document_list.append(document_name) 421 | self.document_lengths[document_name] = sum(document.itervalues()) 422 | 423 | def delete_document(self, document_name): 424 | """Delete a document from the corpus. 425 | 426 | :param document_name: string indicating document's name in the corpus - should exist in self.document_list 427 | """ 428 | if document_name not in self.document_list: 429 | return 430 | [freqs.pop(document_name) for term, freqs in self.corpus.iteritems() if freqs.get(document_name, 0)] 431 | self.document_list.remove(document_name) 432 | self.document_lengths.pop(document_name) 433 | 434 | def append_document(self, document, document_name): 435 | """Add new counts to an existing document. If the document doesn't exist in the corpus then it is added. 436 | 437 | :param document: dict or Counter of word counts, e.g. 
{'i': 1, 'like': 2, 'cheese': 1} 438 | :param document_name: string indicating document's name in the corpus - should exist in self.document_list 439 | """ 440 | if document_name not in self.document_list: 441 | self.add_document(document, document_name) 442 | else: 443 | for term, freq in document.iteritems(): 444 | if not self.corpus.get(term, False): 445 | self.corpus[term] = Counter() 446 | 447 | self.corpus[term][document_name] += freq 448 | 449 | self.document_lengths[document_name] += sum(document.itervalues()) 450 | 451 | def get_idf(self, term): 452 | """Get inverse document frequency of a given term in the corpus.""" 453 | num_documents = len(self.document_list) 454 | docs_containing_term = len(self.corpus[term]) 455 | return math.log(num_documents / (1 + docs_containing_term)) 456 | 457 | def get_tfidf(self, term, document_name): 458 | """Get tf-idf score given a term and document in the corpus.""" 459 | tf = self.corpus[term].get(document_name, '') / self.document_lengths[document_name] 460 | idf = self.get_idf(term) 461 | return tf * idf 462 | 463 | def get_document_tfidfs(self, document_name, l2_norm=True): 464 | """Get tf-idf scores for all terms in a document. 465 | 466 | :param document_name: string indicating document's name in the corpus - should exist in self.document_list 467 | :param l2_norm: if True, applies Euclidean normalization to tf-idf scores of the document 468 | :return: Counter of tf-idf scores for each term 469 | """ 470 | tfidfs = { 471 | term: self.get_tfidf(term, document_name) for term, freq in self.corpus.iteritems() 472 | if freq.get(document_name, '') 473 | } 474 | 475 | if l2_norm: 476 | normalization = np.linalg.norm(tfidfs.values(), axis=0) 477 | for key, value in tfidfs.items(): 478 | tfidfs[key] = value / normalization 479 | 480 | return Counter(tfidfs) 481 | 482 | def get_top_terms(self, document_name, num_terms=30): 483 | """Get the top terms for a given document by tf-idf score. 484 | 485 | :param document_name: string indicating document's name in the corpus - should exist in self.document_list 486 | :param num_terms: number of top terms to return (30 by default) 487 | :return: dict of top terms and their corresponding tf-idf scores 488 | """ 489 | tfidfs = self.get_document_tfidfs(document_name) 490 | sorted_tfidfs = sorted(tfidfs.items(), key=operator.itemgetter(1), reverse=True) 491 | return OrderedDict(sorted_tfidfs[:num_terms]) 492 | 493 | def build_feature_matrix(self, tfidf=True): 494 | """Transforms the corpus into a scikit-learn vectorizer object which can be used for machine learning. 495 | Used to set the object attributes self.vectorizer and self.feature_matrix. 
496 | 497 | :param tfidf (bool): if True, applies TfidfTransformer to vectorized features 498 | :return: scikit-learn vectorizer, scipy sparse feature matrix and its corresponding document labels 499 | """ 500 | 501 | train_data = [self.get_document(document) for document in self.document_list] 502 | labels = self.document_list 503 | vectorizer = DictVectorizer() 504 | feature_matrix = vectorizer.fit_transform(train_data) 505 | 506 | self.tfidf_transformer = None 507 | if tfidf: 508 | self.tfidf_transformer = TfidfTransformer() 509 | feature_matrix = self.tfidf_transformer.fit_transform(feature_matrix) 510 | 511 | self.vectorizer = vectorizer 512 | self.feature_matrix = feature_matrix 513 | return feature_matrix, labels, vectorizer 514 | 515 | def train_classifier(self, classifier_type='LinearSVC', tfidf=True): 516 | """Trains a document classifier using the vocabulary and documents contained in the corpus. Uses scikit-learn. 517 | 518 | :param classifier_type (str): 'LinearSVC' or 'MultinomialNB' (LinearSVC by default) 519 | :param tfidf (bool): if True, applies TfidfTransformer to vectorized features 520 | :return: classifier object 521 | """ 522 | self.build_feature_matrix(tfidf=tfidf) 523 | 524 | if classifier_type.lower() == 'linearsvc': 525 | classifier = OneVsRestClassifier(LinearSVC(random_state=0)) 526 | elif classifier_type.lower() == 'multinomialnb': 527 | classifier = OneVsRestClassifier(MultinomialNB()) 528 | else: 529 | raise Exception("Parameter classifier_type only accepts 'MultinomialNB', 'BernoulliNB' or 'LinearSVC'.") 530 | 531 | classifier.fit(self.feature_matrix, self.document_list) 532 | self.classifier = classifier 533 | return classifier 534 | 535 | def classify_document(self, document): 536 | """Classifies an input document using a bag-of-words approach with sparse features. 537 | 538 | :param document (dict): dict or Counter of the form {'word1': freq1, 'word2': freq2, ...} 539 | :return (str): label corresponding to the document's classification 540 | """ 541 | test_data = self.vectorizer.transform([document]) 542 | if self.tfidf_transformer: 543 | test_data = self.tfidf_transformer.transform(test_data) 544 | 545 | return self.classifier.predict(test_data) 546 | 547 | def count_words_from_list(self, document_name, word_list, normalize=True): 548 | """Given a list of input words, return the counts of these words in a specified document.""" 549 | document = self.get_document(document_name) 550 | word_counts = [document[word] for word in word_list] 551 | total_count = sum(word_counts) 552 | if normalize: 553 | total_count /= self.document_lengths[document_name] 554 | 555 | return total_count 556 | 557 | def get_mean_word_length(self, document_name, upper_limit=12): 558 | """Get the average word length for all words in a given document.""" 559 | document = self.get_document(document_name) 560 | return sum([len(term) * freq for term, freq in document.iteritems() 561 | if len(term) <= upper_limit]) / sum(document.itervalues()) 562 | 563 | @staticmethod 564 | def check_corpus_path(corpus_path): 565 | if not corpus_path.lower().endswith('.json'): 566 | raise Exception('corpus_path provided is not a valid JSON file.') 567 | make_path(corpus_path) 568 | 569 | 570 | def make_path(path): 571 | """Check if path exists. If it doesn't, create the necessary folders.""" 572 | 573 | # Remove file name from path 574 | base_name = os.path.basename(path) 575 | if '.' 
in base_name: 576 | path = path[:-len(base_name)] 577 | 578 | if not os.path.exists(path): 579 | try: 580 | os.makedirs(path) 581 | except OSError as exception: 582 | if exception.errno != errno.EEXIST: 583 | raise 584 | 585 | return path 586 | 587 | 588 | def get_word_corpora(): 589 | """Returns a list of paths to all word corpora installed in the module.""" 590 | application_root = get_root_dir() 591 | words_dir = os.path.join(application_root, 'words') 592 | return os.listdir(words_dir) 593 | 594 | 595 | def get_root_dir(): 596 | return os.path.dirname(__file__) --------------------------------------------------------------------------------