├── tests.py ├── requirements.txt ├── MANIFEST.in ├── .gitignore ├── .travis.yml ├── setup.py ├── example.py ├── redditnlp │ ├── words │ │ ├── stopwords_english.txt │ │ └── swearwords_english.txt │ └── __init__.py ├── README.md └── ez_setup.py /tests.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | praw>=2.1.19 2 | nltk>=3.0.0 3 | numpy>=1.8.0 4 | scikit-learn>=0.15.2 -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include ez_setup.py 3 | include example.py 4 | recursive-include redditnlp/words * -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | *.zip 4 | *.egg 5 | .idea/ 6 | tfidf_corpus/ 7 | build/ 8 | dist/ 9 | redditnlp.egg-info/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.6" 4 | - "2.7" 5 | install: 6 | - pip install . 7 | - pip install -r requirements.txt 8 | script: "python tests.py" -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import ez_setup 2 | ez_setup.use_setuptools(version='0.7') 3 | 4 | from setuptools import setup 5 | import os 6 | 7 | PACKAGE_NAME = 'redditnlp' 8 | VERSION = '0.1.3' 9 | 10 | 11 | def read(filename): 12 | filepath = os.path.join(os.path.dirname(__file__), filename) 13 | try: 14 | # Convert GitHub markdown to restructured text (needed for upload to PyPI) 15 | from pypandoc import convert 16 | return convert(filepath, 'rst') 17 | except ImportError: 18 | return open(filepath).read() 19 | 20 | description = 'A tool to perform natural language processing of reddit content.'
21 | try: 22 | long_description = read('README.md') 23 | except IOError: 24 | long_description = description 25 | 26 | setup( 27 | name=PACKAGE_NAME, 28 | version=VERSION, 29 | author='Jai Juneja', 30 | author_email='jai.juneja@gmail.com', 31 | description=description, 32 | license='BSD', 33 | keywords='reddit, natural language processing, machine learning', 34 | url='https://github.com/jaijuneja/reddit-nlp', 35 | packages=[PACKAGE_NAME,], 36 | long_description=long_description, 37 | classifiers=[ 38 | 'Development Status :: 3 - Alpha', 39 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 40 | 'Intended Audience :: Developers', 41 | 'License :: OSI Approved :: BSD License', 42 | 'Operating System :: OS Independent', 43 | 'Programming Language :: Python' 44 | ], 45 | install_requires=[ 46 | 'praw>=2.1.19', 47 | 'nltk>=3.0.0', 48 | 'numpy>=1.8.0', 49 | 'scikit-learn>=0.15.2', 50 | ], 51 | include_package_data=True, 52 | package_data={PACKAGE_NAME: ['words/*.txt'], 53 | '': ['README.md', 'ez_setup.py', 'example.py']}, 54 | test_suite='tests' 55 | ) -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | from redditnlp import RedditWordCounter, TfidfCorpus 2 | import requests 3 | import os 4 | from collections import deque 5 | 6 | ###################### 7 | # SETTINGS (EDIT THIS) 8 | ###################### 9 | 10 | USERNAME = 'your_username' # Change this to your username 11 | SAVE_DIR = 'tfidf_corpus' 12 | CORPUS_FILE = 'corpus.json' 13 | COMMENTS_PER_SUBREDDIT = 1000 14 | SUBREDDITS = [ 15 | 'funny', 'pics', 'AskReddit', 'todayilearned', 'worldnews', 16 | 'science', 'blog', 'IAmA', 'videos', 'gaming', 17 | 'movies', 'Music', 'aww', 'technology', 'bestof', 18 | 'WTF', 'AdviceAnimals', 'news', 'gifs', 'askscience', 19 | 'explainlikeimfive', 'EarthPorn', 'books', 'television', 'politics' 20 | ] 21 | 22 | ########################### 23 | # VOCABULARY ANALYTICS DEMO 24 | ########################### 25 | 26 | 27 | def get_subreddit_vocabularies(): 28 | # Initialise Reddit word counter instance 29 | reddit_counter = RedditWordCounter(USERNAME) 30 | 31 | # Initialise tf-idf corpus instance 32 | corpus_path = os.path.join(SAVE_DIR, CORPUS_FILE) 33 | comment_corpus = TfidfCorpus(corpus_path) 34 | 35 | # Extract the vocabulary for each of the subreddits specified 36 | subreddit_queue = deque([subreddit for subreddit in SUBREDDITS]) 37 | while len(subreddit_queue) > 0: 38 | subreddit = subreddit_queue.popleft() 39 | 40 | try: 41 | vocabulary = reddit_counter.subreddit_comments(subreddit, limit=COMMENTS_PER_SUBREDDIT) 42 | except requests.exceptions.HTTPError as err: 43 | print err 44 | # Add subreddit back into queue 45 | subreddit_queue.append(subreddit) 46 | continue 47 | 48 | comment_corpus.add_document(vocabulary, subreddit) 49 | comment_corpus.save() 50 | 51 | return comment_corpus, corpus_path 52 | 53 | 54 | def save_subreddit_top_terms(corpus): 55 | # Save the top terms for each subreddit in a text file 56 | save_path = os.path.join(SAVE_DIR, 'top_words.txt') 57 | for document in corpus.get_document_list(): 58 | top_terms = corpus.get_top_terms(document, num_terms=50) 59 | top_terms = sorted(top_terms.items(), key=lambda x: x[1], reverse=True) 60 | with open(save_path, 'ab') as f: 61 | f.write(document.encode('utf-8') + '\n' + 62 | '\n'.join(['{0}, {1}'.format(term.encode('utf-8'), weight) for term, weight in top_terms]) 63 | + '\n\n') 64 | 65 | return save_path 
66 | 67 | 68 | def get_swearword_counts(corpus): 69 | with open('redditnlp/words/swearwords_english.txt', 'rb') as f: 70 | swearwords = [word.strip('\n') for word in f.readlines()] 71 | 72 | swearword_counts = dict() 73 | for document in corpus.get_document_list(): 74 | swearword_counts[document] = corpus.count_words_from_list(document, swearwords) 75 | return swearword_counts 76 | 77 | 78 | def get_vocabulary_sophistication(corpus): 79 | mean_word_lengths = dict() 80 | for document in corpus.get_document_list(): 81 | mean_word_lengths[document] = corpus.get_mean_word_length(document) 82 | return mean_word_lengths 83 | 84 | # Extract the word counts for each subreddit 85 | corpus, corpus_path = get_subreddit_vocabularies() 86 | print 'TF-IDF corpus saved to %s' % corpus_path 87 | 88 | # Get the top words by subreddit 89 | top_terms_path = save_subreddit_top_terms(corpus) 90 | print 'Top terms saved to %s' % top_terms_path 91 | 92 | # Get the swearword frequency 93 | swearword_frequency = get_swearword_counts(corpus) 94 | print 'Normalized swearword frequency:' 95 | for subreddit, frequency in swearword_frequency.items(): 96 | print '%s, %s' % (subreddit, frequency) 97 | 98 | # Get the average word length 99 | print '\nAverage word length by subreddit:' 100 | word_lengths = get_vocabulary_sophistication(corpus) 101 | for subreddit, frequency in word_lengths.items(): 102 | print '%s, %s' % (subreddit, frequency) 103 | 104 | ####################### 105 | # MACHINE LEARNING DEMO 106 | ####################### 107 | 108 | # Collect the comments for a particular user and determine which subreddit their comments best match up with 109 | counter = RedditWordCounter(USERNAME) 110 | corpus = TfidfCorpus(os.path.join(SAVE_DIR, CORPUS_FILE)) 111 | 112 | user_comments = counter.user_comments('way_fairer') 113 | corpus.train_classifier(classifier_type='LinearSVC', tfidf=True) 114 | print corpus.classify_document(user_comments) -------------------------------------------------------------------------------- /redditnlp/words/stopwords_english.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | above 5 | across 6 | actually 7 | after 8 | again 9 | against 10 | ago 11 | aint 12 | all 13 | almost 14 | alone 15 | along 16 | already 17 | also 18 | although 19 | always 20 | am 21 | among 22 | an 23 | and 24 | another 25 | any 26 | anybody 27 | anyone 28 | anything 29 | anywhere 30 | are 31 | area 32 | areas 33 | arent 34 | around 35 | as 36 | ask 37 | asked 38 | asking 39 | asks 40 | at 41 | away 42 | b 43 | back 44 | backed 45 | backing 46 | backs 47 | be 48 | became 49 | because 50 | become 51 | becomes 52 | been 53 | before 54 | began 55 | behind 56 | being 57 | beings 58 | below 59 | best 60 | better 61 | between 62 | big 63 | both 64 | but 65 | by 66 | c 67 | came 68 | can 69 | cannot 70 | cant 71 | case 72 | cases 73 | certain 74 | certainly 75 | clear 76 | clearly 77 | com 78 | come 79 | couk 80 | could 81 | couldnt 82 | couldve 83 | d 84 | dear 85 | did 86 | didnt 87 | differ 88 | different 89 | differently 90 | do 91 | does 92 | doesnt 93 | doing 94 | don 95 | done 96 | dont 97 | down 98 | downed 99 | downing 100 | downs 101 | during 102 | e 103 | each 104 | early 105 | eg 106 | either 107 | else 108 | end 109 | ended 110 | ending 111 | ends 112 | enough 113 | etc 114 | even 115 | evenly 116 | ever 117 | every 118 | everybody 119 | everyone 120 | everything 121 | everywhere 122 | f 123 | far 124 | few 125 | find 126 | finds 127 | first 128 | for 129 | four 130 
| from 131 | full 132 | fully 133 | further 134 | furthered 135 | furthering 136 | furthers 137 | g 138 | gave 139 | general 140 | generally 141 | get 142 | gets 143 | give 144 | given 145 | gives 146 | go 147 | going 148 | good 149 | got 150 | great 151 | greater 152 | greatest 153 | h 154 | had 155 | hadnt 156 | haha 157 | happen 158 | happened 159 | has 160 | hasnt 161 | have 162 | havent 163 | having 164 | he 165 | hed 166 | hell 167 | her 168 | here 169 | heres 170 | hers 171 | herself 172 | hes 173 | high 174 | higher 175 | highest 176 | him 177 | himself 178 | his 179 | how 180 | howd 181 | however 182 | howll 183 | hows 184 | http 185 | i 186 | id 187 | ie 188 | if 189 | ill 190 | im 191 | important 192 | in 193 | instead 194 | interested 195 | interesting 196 | into 197 | is 198 | isnt 199 | it 200 | itd 201 | itll 202 | its 203 | itself 204 | ive 205 | j 206 | just 207 | k 208 | keep 209 | keeps 210 | kind 211 | knew 212 | know 213 | known 214 | knows 215 | l 216 | large 217 | largely 218 | last 219 | later 220 | latest 221 | least 222 | less 223 | let 224 | lets 225 | like 226 | likely 227 | long 228 | longer 229 | longest 230 | looked 231 | looks 232 | lot 233 | m 234 | made 235 | make 236 | making 237 | many 238 | may 239 | maybe 240 | me 241 | might 242 | mightnt 243 | mightve 244 | more 245 | most 246 | mostly 247 | mr 248 | mrs 249 | much 250 | must 251 | mustnt 252 | mustve 253 | my 254 | myself 255 | n 256 | necessary 257 | need 258 | needed 259 | needing 260 | needs 261 | neither 262 | net 263 | never 264 | new 265 | newer 266 | newest 267 | next 268 | no 269 | nobody 270 | non 271 | noone 272 | nor 273 | not 274 | nothing 275 | now 276 | nowhere 277 | o 278 | of 279 | off 280 | often 281 | oh 282 | old 283 | older 284 | oldest 285 | on 286 | once 287 | one 288 | only 289 | open 290 | opened 291 | opening 292 | opens 293 | or 294 | order 295 | ordered 296 | ordering 297 | orders 298 | org 299 | other 300 | others 301 | ought 302 | our 303 | ours 304 | ourselves 305 | out 306 | over 307 | own 308 | p 309 | part 310 | parted 311 | parting 312 | parts 313 | per 314 | perhaps 315 | place 316 | places 317 | point 318 | pointed 319 | pointing 320 | points 321 | possible 322 | present 323 | presented 324 | presenting 325 | presents 326 | put 327 | puts 328 | q 329 | quite 330 | r 331 | rather 332 | really 333 | right 334 | room 335 | rooms 336 | s 337 | said 338 | same 339 | saw 340 | say 341 | says 342 | second 343 | seconds 344 | see 345 | seem 346 | seemed 347 | seeming 348 | seems 349 | sees 350 | several 351 | shall 352 | shant 353 | she 354 | shed 355 | shell 356 | shes 357 | should 358 | shouldnt 359 | shouldve 360 | show 361 | showed 362 | showing 363 | shows 364 | side 365 | sides 366 | since 367 | small 368 | smaller 369 | smallest 370 | so 371 | some 372 | somebody 373 | someone 374 | something 375 | somewhere 376 | state 377 | states 378 | still 379 | such 380 | sure 381 | t 382 | take 383 | taken 384 | teh 385 | than 386 | that 387 | thatll 388 | thats 389 | the 390 | their 391 | theirs 392 | them 393 | themselves 394 | then 395 | there 396 | therefore 397 | theres 398 | these 399 | they 400 | theyd 401 | theyll 402 | theyre 403 | theyve 404 | thing 405 | things 406 | think 407 | thinks 408 | this 409 | those 410 | though 411 | three 412 | through 413 | thus 414 | tis 415 | to 416 | today 417 | together 418 | too 419 | took 420 | toward 421 | turn 422 | turned 423 | turning 424 | turns 425 | twas 426 | two 427 | u 428 | under 429 | until 430 | up 431 | upon 432 | 
us 433 | use 434 | used 435 | uses 436 | using 437 | v 438 | very 439 | vs 440 | w 441 | want 442 | wanted 443 | wanting 444 | wants 445 | was 446 | wasnt 447 | way 448 | ways 449 | we 450 | wed 451 | well 452 | wells 453 | went 454 | were 455 | werent 456 | weve 457 | what 458 | whatd 459 | whats 460 | when 461 | whend 462 | whenll 463 | whens 464 | where 465 | whered 466 | wherell 467 | wheres 468 | whether 469 | which 470 | while 471 | who 472 | whod 473 | whole 474 | wholl 475 | whom 476 | whos 477 | whose 478 | why 479 | whyd 480 | whyll 481 | whys 482 | will 483 | with 484 | within 485 | without 486 | wont 487 | works 488 | would 489 | wouldnt 490 | wouldve 491 | www 492 | x 493 | y 494 | yes 495 | yet 496 | you 497 | youd 498 | youll 499 | young 500 | younger 501 | youngest 502 | your 503 | youre 504 | yours 505 | yourself 506 | yourselves 507 | youve 508 | z -------------------------------------------------------------------------------- /redditnlp/words/swearwords_english.txt: -------------------------------------------------------------------------------- 1 | 4r5e 2 | 5h1t 3 | 5hit 4 | a55 5 | anal 6 | anus 7 | ar5e 8 | arrse 9 | arse 10 | ass 11 | asses 12 | assfucker 13 | assfukka 14 | asshole 15 | assholes 16 | asswhole 17 | b00bs 18 | b17ch 19 | b1tch 20 | ballbag 21 | balls 22 | ballsack 23 | bastard 24 | beastial 25 | beastiality 26 | bellend 27 | bestial 28 | bestiality 29 | biatch 30 | bitch 31 | bitcher 32 | bitchers 33 | bitches 34 | bitchin 35 | bitching 36 | bloody 37 | blowjob 38 | blowjobs 39 | boiolas 40 | bollock 41 | bollok 42 | boner 43 | boob 44 | boobs 45 | booobs 46 | boooobs 47 | booooobs 48 | booooooobs 49 | breasts 50 | buceta 51 | bugger 52 | bum 53 | butt 54 | butthole 55 | buttmuch 56 | buttplug 57 | c0ck 58 | c0cksucker 59 | carpet muncher 60 | cawk 61 | chink 62 | cipa 63 | cl1t 64 | clit 65 | clitoris 66 | clits 67 | cnut 68 | cock 69 | cock-sucker 70 | cockface 71 | cockhead 72 | cockmunch 73 | cockmuncher 74 | cocks 75 | cocksuck 76 | cocksucked 77 | cocksucker 78 | cocksucking 79 | cocksucks 80 | cocksuka 81 | cocksukka 82 | cok 83 | cokmuncher 84 | coksucka 85 | coon 86 | cox 87 | crap 88 | cum 89 | cummer 90 | cumming 91 | cums 92 | cumshot 93 | cunilingus 94 | cunillingus 95 | cunnilingus 96 | cunt 97 | cuntlick 98 | cuntlicker 99 | cuntlicking 100 | cunts 101 | cyalis 102 | cyberfuc 103 | cyberfuck 104 | cyberfucked 105 | cyberfucker 106 | cyberfuckers 107 | cyberfucking 108 | d1ck 109 | damn 110 | dick 111 | dickhead 112 | dildo 113 | dildos 114 | dink 115 | dinks 116 | dirsa 117 | dlck 118 | doggin 119 | dogging 120 | donkeyribber 121 | doosh 122 | duche 123 | dyke 124 | ejaculate 125 | ejaculated 126 | ejaculates 127 | ejaculating 128 | ejaculatings 129 | ejaculation 130 | ejakulate 131 | f4nny 132 | fag 133 | fagging 134 | faggitt 135 | faggot 136 | faggs 137 | fagot 138 | fagots 139 | fags 140 | fanny 141 | fannyflaps 142 | fannyfucker 143 | fanyy 144 | fatass 145 | fcuk 146 | fcuker 147 | fcuking 148 | feck 149 | fecker 150 | felching 151 | fellate 152 | fellatio 153 | fingerfuck 154 | fingerfucked 155 | fingerfucker 156 | fingerfuckers 157 | fingerfucking 158 | fingerfucks 159 | fistfuck 160 | fistfucked 161 | fistfucker 162 | fistfuckers 163 | fistfucking 164 | fistfuckings 165 | fistfucks 166 | flange 167 | fook 168 | fooker 169 | fuck 170 | fucka 171 | fucked 172 | fucker 173 | fuckers 174 | fuckhead 175 | fuckheads 176 | fuckin 177 | fucking 178 | fuckings 179 | fuckingshitmotherfucker 180 | fuckme 181 | fucks 182 | fuckwhit 183 | 
fuckwit 184 | fudge packer 185 | fudgepacker 186 | fuk 187 | fuker 188 | fukker 189 | fukkin 190 | fuks 191 | fukwhit 192 | fukwit 193 | fux 194 | fux0r 195 | gangbang 196 | gangbanged 197 | gangbangs 198 | gaylord 199 | gaysex 200 | goatse 201 | God 202 | god-dam 203 | god-damned 204 | goddamn 205 | goddamned 206 | hardcoresex 207 | hell 208 | heshe 209 | hoar 210 | hoare 211 | hoer 212 | homo 213 | hore 214 | horniest 215 | horny 216 | hotsex 217 | jack-off 218 | jackoff 219 | jap 220 | jerk-off 221 | jism 222 | jiz 223 | jizm 224 | jizz 225 | kawk 226 | knob 227 | knobead 228 | knobed 229 | knobend 230 | knobhead 231 | knobjocky 232 | knobjokey 233 | kock 234 | kondum 235 | kondums 236 | kum 237 | kummer 238 | kumming 239 | kums 240 | kunilingus 241 | l3itch 242 | labia 243 | lmfao 244 | lust 245 | lusting 246 | m0f0 247 | m0fo 248 | m45terbate 249 | ma5terb8 250 | ma5terbate 251 | masochist 252 | master-bate 253 | masterb8 254 | masterbat3 255 | masterbate 256 | masterbation 257 | masterbations 258 | masturbate 259 | mof0 260 | mofo 261 | mothafuck 262 | mothafucka 263 | mothafuckas 264 | mothafuckaz 265 | mothafucked 266 | mothafucker 267 | mothafuckers 268 | mothafuckin 269 | mothafucking 270 | mothafuckings 271 | mothafucks 272 | motherfuck 273 | motherfucked 274 | motherfucker 275 | motherfuckers 276 | motherfuckin 277 | motherfucking 278 | motherfuckings 279 | motherfuckka 280 | motherfucks 281 | muff 282 | mutha 283 | muthafecker 284 | muthafuckker 285 | muther 286 | mutherfucker 287 | n1gga 288 | n1gger 289 | nazi 290 | nigg3r 291 | nigg4h 292 | nigga 293 | niggah 294 | niggas 295 | niggaz 296 | nigger 297 | niggers 298 | nob 299 | nobhead 300 | nobjocky 301 | nobjokey 302 | numbnuts 303 | nutsack 304 | orgasim 305 | orgasims 306 | orgasm 307 | orgasms 308 | p0rn 309 | pawn 310 | pecker 311 | penis 312 | penisfucker 313 | phonesex 314 | phuck 315 | phuk 316 | phuked 317 | phuking 318 | phukked 319 | phukking 320 | phuks 321 | phuq 322 | pigfucker 323 | pimpis 324 | piss 325 | pissed 326 | pisser 327 | pissers 328 | pisses 329 | pissflaps 330 | pissin 331 | pissing 332 | pissoff 333 | poop 334 | porn 335 | porno 336 | pornography 337 | pornos 338 | prick 339 | pricks 340 | pron 341 | pube 342 | pusse 343 | pussi 344 | pussies 345 | pussy 346 | pussys 347 | rectum 348 | retard 349 | rimjaw 350 | rimming 351 | sadist 352 | schlong 353 | screwing 354 | scroat 355 | scrote 356 | scrotum 357 | semen 358 | sex 359 | sh1t 360 | shag 361 | shagger 362 | shaggin 363 | shagging 364 | shemale 365 | shit 366 | shitdick 367 | shite 368 | shited 369 | shitey 370 | shitfuck 371 | shitfull 372 | shithead 373 | shiting 374 | shitings 375 | shits 376 | shitted 377 | shitter 378 | shitters 379 | shitting 380 | shittings 381 | shitty 382 | skank 383 | slut 384 | sluts 385 | smegma 386 | smut 387 | snatch 388 | spac 389 | spunk 390 | t1tt1e5 391 | t1tties 392 | teets 393 | teez 394 | testical 395 | testicle 396 | tit 397 | titfuck 398 | tits 399 | titt 400 | tittie5 401 | tittiefucker 402 | titties 403 | tittyfuck 404 | tittywank 405 | titwank 406 | tosser 407 | turd 408 | tw4t 409 | twat 410 | twathead 411 | twatty 412 | twunt 413 | twunter 414 | v14gra 415 | v1gra 416 | vagina 417 | viagra 418 | vulva 419 | w00se 420 | wang 421 | wank 422 | wanker 423 | wanky 424 | whoar 425 | whore 426 | willies 427 | willy 428 | xrated 429 | xxx1 -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Reddit NLP Package [![Build Status](https://travis-ci.org/jaijuneja/reddit-nlp.svg?branch=master)](https://travis-ci.org/jaijuneja/reddit-nlp) [![PyPI version](https://badge.fury.io/py/redditnlp.svg)](https://pypi.python.org/pypi/redditnlp) 2 | 3 | A lightweight Python module that performs tokenization and processing of text on Reddit. It allows you to analyze users, titles, comments and subreddits to understand their vocabulary. The module comes packaged with its own inverted index builder for storing vocabularies and word frequencies, such that you can generate and manipulate large corpora of tf-idf weighted words without worrying about implementation. This is especially useful if you're running scripts over long periods and wish to save intermediate results. 4 | 5 | ## License 6 | 7 | Copyright 2014 Jai Juneja. 8 | 9 | This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 10 | 11 | This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 12 | 13 | You should have received a copy of the GNU General Public License along with this program. If not, see [http://www.gnu.org/licenses/](http://www.gnu.org/licenses/). 14 | 15 | ## Installation 16 | 17 | ### Using pip or easy_install 18 | 19 | You can download the latest release version using `pip` or `easy_install`: 20 | 21 | ``` 22 | pip install redditnlp 23 | ``` 24 | 25 | ### Latest development version 26 | You can alternatively download the latest development version directly from GitHub: 27 | 28 | ``` 29 | git clone https://github.com/jaijuneja/reddit-nlp.git 30 | ``` 31 | 32 | Change into the root directory: 33 | 34 | ``` 35 | cd reddit-nlp 36 | ``` 37 | 38 | Then install the package: 39 | 40 | ``` 41 | python setup.py install 42 | ``` 43 | 44 | ### Error: the required version of setuptools is not available 45 | 46 | Upon running `pip install` or the `setup.py` script you might get a message like this: 47 | 48 | ``` 49 | The required version of setuptools (>=0.7) is not available, and can't be installed while this script is running. Please install a more recent version first, using 'easy_install -U setuptools'. 50 | ``` 51 | 52 | This appears because you have a very outdated version of the setuptools package. The redditnlp package typically bootstraps a newer version of setuptools during install, but it isn't working in this case. You need to update setuptools using `easy_install -U setuptools` (you may need to apply `sudo` to this command). 53 | 54 | If the above command doesn't do anything then it is likely that your version of setuptools was installed using a package manager such as yum, apt or pip. Check your package manager for a package called python-setuptools or try `pip install setuptools --upgrade` and then re-run the install. 55 | 56 | ## Usage 57 | 58 | A more complex sample program using the redditnlp module can be found at `https://github.com/jaijuneja/reddit-nlp/blob/master/example.py`. Here we outline a basic word counter application.
59 | 60 | The module consists of three classes: 61 | 62 | * A basic word counter class, `WordCounter`, which performs tokenization and counting on input strings 63 | * A Reddit word counter, `RedditWordCounter`, which extends the `WordCounter` class to allow interaction with the Reddit API 64 | * A tf-idf corpus builder, `TfidfCorpus`, which stores large word corpora in an inverted index 65 | 66 | These three classes can be instantiated as follows: 67 | 68 | ```python 69 | from redditnlp import WordCounter, RedditWordCounter, TfidfCorpus 70 | 71 | word_counter = WordCounter() 72 | reddit_counter = RedditWordCounter('your_username') 73 | corpus = TfidfCorpus() 74 | ``` 75 | 76 | To adhere to the Reddit API rules, please use your actual Reddit username in place of `'your_username'` above. 77 | 78 | For further information on the attributes and methods of these classes you can run: 79 | 80 | ```python 81 | help(WordCounter) 82 | help(RedditWordCounter) 83 | help(TfidfCorpus) 84 | ``` 85 | 86 | Next, we can tokenize 1000 comments from a selection of subreddits, extract the most common words and save all of our data to disk: 87 | 88 | ```python 89 | for subreddit in ['funny', 'aww', 'pics']: 90 | # Tokenize and count words for 1000 comments 91 | word_counts = reddit_counter.subreddit_comments(subreddit, limit=1000) 92 | 93 | # Add the word counts to our corpus 94 | corpus.add_document(word_counts, subreddit) 95 | 96 | # Save the corpus to a specified path (must be JSON) 97 | corpus.save(path='word_counts.json') 98 | 99 | # Save the top 50 words (by tf-idf score) from each subreddit to a text file 100 | for subreddit in corpus.get_document_list(): 101 | top_words = corpus.get_top_terms(subreddit, num_terms=50) 102 | with open('top_words.txt', 'ab') as f: 103 | f.write(subreddit + '\n' + '\n'.join(top_words.keys())) 104 | ``` 105 | 106 | ### Machine learning 107 | 108 | `redditnlp` now supports some of scikit-learn's machine learning capabilities. Several in-built functions enable the user to: 109 | 110 | * Convert a TfidfCorpus object into a scipy sparse feature matrix (using `build_feature_matrix()`) 111 | * Train a classifier using the documents contained in a TfidfCorpus (with `train_classifier()`) and thereafter classify new documents (with `classify_document()`) 112 | 113 | Below is an example of a simple machine learning application that loads a corpus of subreddit comment data, uses it to train a classifier and determines which subreddit a user's comments most closely match: 114 | 115 | ```python 116 | # Load the corpus of subreddit comment data and use it to train a classifier 117 | corpus = TfidfCorpus('path/to/subreddit_corpus.json') 118 | corpus.train_classifier(classifier_type='LinearSVC', tfidf=True) 119 | 120 | # Tokenize all of your comments 121 | counter = RedditWordCounter('your_username') 122 | user_comments = counter.user_comments('your_username') 123 | 124 | # Classify your comments against the documents in the corpus 125 | print corpus.classify_document(user_comments) 126 | ``` 127 | 128 | ### Multiprocessing 129 | 130 | `redditnlp` uses the [PRAW](https://github.com/praw-dev/praw) Reddit API wrapper. It supports multiprocessing, such that you can run multiple instances of `RedditWordCounter` without exceeding Reddit's rate limit. There is more information about this in the [PRAW documentation](https://praw.readthedocs.org/en/latest/pages/multiprocess.html) but for the sake of completeness an example is included below.
131 | 132 | First, you must initialise a request handling server on your local machine. This is done using the terminal/command line: 133 | 134 | ``` 135 | praw-multiprocess 136 | ``` 137 | 138 | Next, you can instantiate multiple `RedditWordCounter` objects and set the parameter `multiprocess=True` so that outgoing API calls are handled: 139 | 140 | ``` 141 | counter = RedditWordCounter('your_username', multiprocess=True) 142 | ``` 143 | 144 | ## Contact 145 | 146 | If you have any questions or have encountered an error, feel free to contact me at `jai -dot- juneja -at- gmail -dot- com`. -------------------------------------------------------------------------------- /ez_setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Bootstrap setuptools installation 3 | 4 | To use setuptools in your package's setup.py, include this 5 | file in the same directory and add this to the top of your setup.py:: 6 | 7 | from ez_setup import use_setuptools 8 | use_setuptools() 9 | 10 | To require a specific version of setuptools, set a download 11 | mirror, or use an alternate download directory, simply supply 12 | the appropriate options to ``use_setuptools()``. 13 | 14 | This file can also be run as a script to install or upgrade setuptools. 15 | """ 16 | import os 17 | import shutil 18 | import sys 19 | import tempfile 20 | import zipfile 21 | import optparse 22 | import subprocess 23 | import platform 24 | import textwrap 25 | import contextlib 26 | 27 | from distutils import log 28 | 29 | try: 30 | from urllib.request import urlopen 31 | except ImportError: 32 | from urllib2 import urlopen 33 | 34 | try: 35 | from site import USER_SITE 36 | except ImportError: 37 | USER_SITE = None 38 | 39 | DEFAULT_VERSION = "7.0" 40 | DEFAULT_URL = "https://pypi.python.org/packages/source/s/setuptools/" 41 | 42 | def _python_cmd(*args): 43 | """ 44 | Return True if the command succeeded. 
45 | """ 46 | args = (sys.executable,) + args 47 | return subprocess.call(args) == 0 48 | 49 | 50 | def _install(archive_filename, install_args=()): 51 | with archive_context(archive_filename): 52 | # installing 53 | log.warn('Installing Setuptools') 54 | if not _python_cmd('setup.py', 'install', *install_args): 55 | log.warn('Something went wrong during the installation.') 56 | log.warn('See the error message above.') 57 | # exitcode will be 2 58 | return 2 59 | 60 | 61 | def _build_egg(egg, archive_filename, to_dir): 62 | with archive_context(archive_filename): 63 | # building an egg 64 | log.warn('Building a Setuptools egg in %s', to_dir) 65 | _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir) 66 | # returning the result 67 | log.warn(egg) 68 | if not os.path.exists(egg): 69 | raise IOError('Could not build the egg.') 70 | 71 | 72 | class ContextualZipFile(zipfile.ZipFile): 73 | """ 74 | Supplement ZipFile class to support context manager for Python 2.6 75 | """ 76 | 77 | def __enter__(self): 78 | return self 79 | 80 | def __exit__(self, type, value, traceback): 81 | self.close() 82 | 83 | def __new__(cls, *args, **kwargs): 84 | """ 85 | Construct a ZipFile or ContextualZipFile as appropriate 86 | """ 87 | if hasattr(zipfile.ZipFile, '__exit__'): 88 | return zipfile.ZipFile(*args, **kwargs) 89 | return super(ContextualZipFile, cls).__new__(cls) 90 | 91 | 92 | @contextlib.contextmanager 93 | def archive_context(filename): 94 | # extracting the archive 95 | tmpdir = tempfile.mkdtemp() 96 | log.warn('Extracting in %s', tmpdir) 97 | old_wd = os.getcwd() 98 | try: 99 | os.chdir(tmpdir) 100 | with ContextualZipFile(filename) as archive: 101 | archive.extractall() 102 | 103 | # going in the directory 104 | subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) 105 | os.chdir(subdir) 106 | log.warn('Now working in %s', subdir) 107 | yield 108 | 109 | finally: 110 | os.chdir(old_wd) 111 | shutil.rmtree(tmpdir) 112 | 113 | 114 | def _do_download(version, download_base, to_dir, download_delay): 115 | egg = os.path.join(to_dir, 'setuptools-%s-py%d.%d.egg' 116 | % (version, sys.version_info[0], sys.version_info[1])) 117 | if not os.path.exists(egg): 118 | archive = download_setuptools(version, download_base, 119 | to_dir, download_delay) 120 | _build_egg(egg, archive, to_dir) 121 | sys.path.insert(0, egg) 122 | 123 | # Remove previously-imported pkg_resources if present (see 124 | # https://bitbucket.org/pypa/setuptools/pull-request/7/ for details). 125 | if 'pkg_resources' in sys.modules: 126 | del sys.modules['pkg_resources'] 127 | 128 | import setuptools 129 | setuptools.bootstrap_install_from = egg 130 | 131 | 132 | def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, 133 | to_dir=os.curdir, download_delay=15): 134 | to_dir = os.path.abspath(to_dir) 135 | rep_modules = 'pkg_resources', 'setuptools' 136 | imported = set(sys.modules).intersection(rep_modules) 137 | try: 138 | import pkg_resources 139 | except ImportError: 140 | return _do_download(version, download_base, to_dir, download_delay) 141 | try: 142 | pkg_resources.require("setuptools>=" + version) 143 | return 144 | except pkg_resources.DistributionNotFound: 145 | return _do_download(version, download_base, to_dir, download_delay) 146 | except pkg_resources.VersionConflict as VC_err: 147 | if imported: 148 | msg = textwrap.dedent(""" 149 | The required version of setuptools (>={version}) is not available, 150 | and can't be installed while this script is running. 
Please 151 | install a more recent version first, using 152 | 'easy_install -U setuptools'. 153 | 154 | (Currently using {VC_err.args[0]!r}) 155 | """).format(VC_err=VC_err, version=version) 156 | sys.stderr.write(msg) 157 | sys.exit(2) 158 | 159 | # otherwise, reload ok 160 | del pkg_resources, sys.modules['pkg_resources'] 161 | return _do_download(version, download_base, to_dir, download_delay) 162 | 163 | def _clean_check(cmd, target): 164 | """ 165 | Run the command to download target. If the command fails, clean up before 166 | re-raising the error. 167 | """ 168 | try: 169 | subprocess.check_call(cmd) 170 | except subprocess.CalledProcessError: 171 | if os.access(target, os.F_OK): 172 | os.unlink(target) 173 | raise 174 | 175 | def download_file_powershell(url, target): 176 | """ 177 | Download the file at url to target using Powershell (which will validate 178 | trust). Raise an exception if the command cannot complete. 179 | """ 180 | target = os.path.abspath(target) 181 | ps_cmd = ( 182 | "[System.Net.WebRequest]::DefaultWebProxy.Credentials = " 183 | "[System.Net.CredentialCache]::DefaultCredentials; " 184 | "(new-object System.Net.WebClient).DownloadFile(%(url)r, %(target)r)" 185 | % vars() 186 | ) 187 | cmd = [ 188 | 'powershell', 189 | '-Command', 190 | ps_cmd, 191 | ] 192 | _clean_check(cmd, target) 193 | 194 | def has_powershell(): 195 | if platform.system() != 'Windows': 196 | return False 197 | cmd = ['powershell', '-Command', 'echo test'] 198 | with open(os.path.devnull, 'wb') as devnull: 199 | try: 200 | subprocess.check_call(cmd, stdout=devnull, stderr=devnull) 201 | except Exception: 202 | return False 203 | return True 204 | 205 | download_file_powershell.viable = has_powershell 206 | 207 | def download_file_curl(url, target): 208 | cmd = ['curl', url, '--silent', '--output', target] 209 | _clean_check(cmd, target) 210 | 211 | def has_curl(): 212 | cmd = ['curl', '--version'] 213 | with open(os.path.devnull, 'wb') as devnull: 214 | try: 215 | subprocess.check_call(cmd, stdout=devnull, stderr=devnull) 216 | except Exception: 217 | return False 218 | return True 219 | 220 | download_file_curl.viable = has_curl 221 | 222 | def download_file_wget(url, target): 223 | cmd = ['wget', url, '--quiet', '--output-document', target] 224 | _clean_check(cmd, target) 225 | 226 | def has_wget(): 227 | cmd = ['wget', '--version'] 228 | with open(os.path.devnull, 'wb') as devnull: 229 | try: 230 | subprocess.check_call(cmd, stdout=devnull, stderr=devnull) 231 | except Exception: 232 | return False 233 | return True 234 | 235 | download_file_wget.viable = has_wget 236 | 237 | def download_file_insecure(url, target): 238 | """ 239 | Use Python to download the file, even though it cannot authenticate the 240 | connection. 241 | """ 242 | src = urlopen(url) 243 | try: 244 | # Read all the data in one block. 245 | data = src.read() 246 | finally: 247 | src.close() 248 | 249 | # Write all the data in one block to avoid creating a partial file. 
250 | with open(target, "wb") as dst: 251 | dst.write(data) 252 | 253 | download_file_insecure.viable = lambda: True 254 | 255 | def get_best_downloader(): 256 | downloaders = ( 257 | download_file_powershell, 258 | download_file_curl, 259 | download_file_wget, 260 | download_file_insecure, 261 | ) 262 | viable_downloaders = (dl for dl in downloaders if dl.viable()) 263 | return next(viable_downloaders, None) 264 | 265 | def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, 266 | to_dir=os.curdir, delay=15, downloader_factory=get_best_downloader): 267 | """ 268 | Download setuptools from a specified location and return its filename 269 | 270 | `version` should be a valid setuptools version number that is available 271 | as an sdist for download under the `download_base` URL (which should end 272 | with a '/'). `to_dir` is the directory where the egg will be downloaded. 273 | `delay` is the number of seconds to pause before an actual download 274 | attempt. 275 | 276 | ``downloader_factory`` should be a function taking no arguments and 277 | returning a function for downloading a URL to a target. 278 | """ 279 | # making sure we use the absolute path 280 | to_dir = os.path.abspath(to_dir) 281 | zip_name = "setuptools-%s.zip" % version 282 | url = download_base + zip_name 283 | saveto = os.path.join(to_dir, zip_name) 284 | if not os.path.exists(saveto): # Avoid repeated downloads 285 | log.warn("Downloading %s", url) 286 | downloader = downloader_factory() 287 | downloader(url, saveto) 288 | return os.path.realpath(saveto) 289 | 290 | def _build_install_args(options): 291 | """ 292 | Build the arguments to 'python setup.py install' on the setuptools package 293 | """ 294 | return ['--user'] if options.user_install else [] 295 | 296 | def _parse_args(): 297 | """ 298 | Parse the command line for options 299 | """ 300 | parser = optparse.OptionParser() 301 | parser.add_option( 302 | '--user', dest='user_install', action='store_true', default=False, 303 | help='install in user site package (requires Python 2.6 or later)') 304 | parser.add_option( 305 | '--download-base', dest='download_base', metavar="URL", 306 | default=DEFAULT_URL, 307 | help='alternative URL from where to download the setuptools package') 308 | parser.add_option( 309 | '--insecure', dest='downloader_factory', action='store_const', 310 | const=lambda: download_file_insecure, default=get_best_downloader, 311 | help='Use internal, non-validating downloader' 312 | ) 313 | parser.add_option( 314 | '--version', help="Specify which version to download", 315 | default=DEFAULT_VERSION, 316 | ) 317 | options, args = parser.parse_args() 318 | # positional arguments are ignored 319 | return options 320 | 321 | def main(): 322 | """Install or upgrade setuptools and EasyInstall""" 323 | options = _parse_args() 324 | archive = download_setuptools( 325 | version=options.version, 326 | download_base=options.download_base, 327 | downloader_factory=options.downloader_factory, 328 | ) 329 | return _install(archive, _build_install_args(options)) 330 | 331 | if __name__ == '__main__': 332 | sys.exit(main()) -------------------------------------------------------------------------------- /redditnlp/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | 4 | import os 5 | import math 6 | import errno 7 | import json 8 | import operator 9 | import numpy as np 10 | import praw 11 | import urllib2 12 | import 
nltk 13 | 14 | from nltk.stem.porter import PorterStemmer 15 | from collections import Counter, OrderedDict 16 | from time import time, sleep 17 | from string import punctuation 18 | from praw.handlers import MultiprocessHandler 19 | from sklearn.feature_extraction import DictVectorizer 20 | from sklearn.feature_extraction.text import TfidfTransformer 21 | from sklearn.svm import LinearSVC 22 | from sklearn.naive_bayes import MultinomialNB 23 | from sklearn.multiclass import OneVsRestClassifier 24 | 25 | 26 | class WordCounter(object): 27 | """Performs word counting given an input string. 28 | 29 | Data attributes: 30 | stemmer: Porter stemmer used optionally to perform stemming of extracted words 31 | stopwords (list): list of stop words used to reject common words such as 'and' 32 | 33 | Methods: 34 | tokenize 35 | get_word_count 36 | remove_punctuation 37 | remove_stopwords 38 | stem_tokens: perform Porter stemming on a list of words 39 | """ 40 | 41 | def __init__(self): 42 | self.stemmer = PorterStemmer() 43 | 44 | # Load stop-words 45 | application_root = os.path.dirname(__file__) 46 | stopwords = os.path.join(application_root, 'words/stopwords_english.txt') 47 | with open(stopwords, 'rb') as stopwords_file: 48 | self.stopwords = [word.strip('\n') for word in stopwords_file.readlines()] 49 | 50 | def tokenize(self, text): 51 | """Tokenize an input string into a list of words (with punctuation removed).""" 52 | text = text.lower() 53 | punctuation_removed = self.remove_punctuation(text) 54 | tokens = nltk.word_tokenize(punctuation_removed) 55 | return tokens 56 | 57 | def get_word_count(self, text, stop_words=True, stemming=False): 58 | """Return a dict (Counter) of words and corresponding counts given an input string.""" 59 | tokens = self.tokenize(text) 60 | 61 | # Remove stop words 62 | if stop_words: 63 | tokens = self.remove_stopwords(tokens) 64 | 65 | if stemming: 66 | tokens = self.stem_tokens(tokens) 67 | 68 | return Counter(tokens) 69 | 70 | @staticmethod 71 | def remove_punctuation(text, replacement=' ', exclude="'"): 72 | """Remove punctuation from an input string.""" 73 | text = text.replace("'", "") # Single quote always stripped out 74 | for p in set(list(punctuation)) - set(list(exclude)): 75 | text = text.replace(p, replacement) 76 | 77 | text = ' '.join(text.split()) # Remove excess whitespace 78 | return text 79 | 80 | def remove_stopwords(self, tokens): 81 | """Remove all stopwords from a list of word tokens.""" 82 | return [word for word in tokens if word not in self.stopwords] 83 | 84 | def stem_tokens(self, tokens): 85 | """Perform porter stemming on a list of word tokens.""" 86 | return [self.stemmer.stem(word) for word in tokens] 87 | 88 | def count_words_from_list(self, text, word_list, normalize=True): 89 | """Count the number of times the words from a given list appear in text.""" 90 | text = self.tokenize(text) 91 | count = sum([1 for word in text if word in word_list]) 92 | if normalize: 93 | count /= len(text) 94 | return count 95 | 96 | 97 | class RedditWordCounter(WordCounter): 98 | """Performs word counting of comments and titles in Reddit using the Reddit API. 99 | 100 | To initialise a new RedditWordCounter instance: 101 | >>> counter = RedditWordCounter('your_username') 102 | 103 | To adhere to the Reddit API rules, please provide your Reddit username in place of 'your_username' above. 104 | This will ensure that the app doesn't get banned from Reddit! 
105 | 106 | Data Attributes: 107 | user_agent (str): required to connect to Reddit 108 | reddit: instance of the Reddit API connection 109 | word_counter: WordCounter object used to perform word counting given input strings 110 | 111 | Methods: 112 | subreddit_comments: word count from comments of a given subreddit 113 | subreddit_titles: word count from titles of a given subreddit 114 | user_comments: word count from comments of a given user 115 | check_connection: check that there is a working connection to Reddit 116 | """ 117 | 118 | def __init__( 119 | self, 120 | user, 121 | multiprocess=False 122 | ): 123 | """Initialise a RedditWordCounter object. 124 | 125 | :param user: your Reddit username 126 | :param multiprocess: if True, will handle requests from multiple RedditWordCounter objects (False by default) 127 | :return: 128 | """ 129 | super(RedditWordCounter, self).__init__() # Initialise the WordCounter class 130 | handler = MultiprocessHandler() if multiprocess else None 131 | self.user_agent = 'redditvocab/0.1 bot by {0}'.format(user) 132 | self.reddit = praw.Reddit(user_agent=self.user_agent, handler=handler) 133 | 134 | def subreddit_comments(self, subreddit_name, limit=1000, stemming=False, get_all_comments=False): 135 | """Retrieve the vocabulary from the comments of a subreddit. 136 | 137 | :param subreddit_name: name of the subreddit excluding '/r/' 138 | :param limit: number of comments to retrieve (1000 by default) - note that at present the limit is approximate 139 | :param stemming: if True, performs stemming on tokenized words (False by default) 140 | :param get_all_comments: if True, retrieves all comments per submission. Note that this requires descending the 141 | comment tree, which drastically increases the number of API calls and reduces performance due to rate-limiting. 142 | :return: Counter (dict) of comment vocabulary in the form {'term1': freq, 'term2': freq, ...} 143 | """ 144 | 145 | def get_vocabulary(comments): 146 | 147 | vocab = Counter() 148 | num_comments = 0 149 | for comment in comments: 150 | if isinstance(comment, praw.objects.Comment): 151 | try: 152 | # Get the word counts for the comment 153 | vocab += self.get_word_count(comment.body, stemming=stemming) 154 | num_comments += 1 155 | 156 | except ValueError: 157 | pass 158 | elif isinstance(comment, praw.objects.MoreComments) and get_all_comments: 159 | new_vocab, num_new_comments = get_vocabulary(comment.comments) 160 | vocab += new_vocab 161 | num_comments += num_new_comments 162 | 163 | return vocab, num_comments 164 | 165 | subreddit = self.reddit.get_subreddit(subreddit_name) 166 | 167 | # Initialise loop variables 168 | vocabulary = Counter() 169 | comments_processed = 0 170 | 171 | for submission in subreddit.get_hot(limit=None): 172 | submission_comments = praw.helpers.flatten_tree(submission.comments) 173 | 174 | # Run over all comments 175 | submission_vocabulary, new_comments = get_vocabulary(submission_comments) 176 | vocabulary += submission_vocabulary 177 | comments_processed += new_comments 178 | 179 | print("Comments processed for subreddit '{0}': {1}".format(subreddit_name, comments_processed), end="\r") 180 | 181 | if limit and comments_processed >= limit: 182 | break 183 | 184 | print('\n') 185 | return vocabulary 186 | 187 | def subreddit_titles(self, subreddit_name, limit=1000, stemming=False): 188 | """Retrieve the vocabulary from the titles in a subreddit. 
189 | 190 | :param subreddit_name: name of the subreddit excluding '/r/' 191 | :param limit: number of submissions to process (1000 by default - note that this is the maximum) 192 | :param stemming: if True, performs stemming on tokenized words (False by default) 193 | :return: Counter (dict) of title vocabulary in the form {'term1': freq, 'term2': freq, ...} 194 | """ 195 | 196 | subreddit = self.reddit.get_subreddit(subreddit_name) 197 | 198 | # Initialise loop variables 199 | vocabulary = Counter() 200 | submissions_processed = 0 201 | 202 | for submission in subreddit.get_hot(limit=limit): 203 | try: 204 | # Update the word counter to include the comment 205 | vocabulary += self.get_word_count(submission.title, stemming=stemming) 206 | submissions_processed += 1 207 | 208 | if submissions_processed % 100 == 0 or submissions_processed >= limit: 209 | print("Titles processed for subreddit '{0}': {1}".format(subreddit_name, submissions_processed), 210 | end="\r") 211 | 212 | except ValueError: 213 | pass 214 | 215 | print('\n') 216 | return vocabulary 217 | 218 | def user_comments(self, username, limit=1000, stemming=False): 219 | """Retrieve the vocabulary of a user's comments. 220 | 221 | :param username: user's Reddit username excluding '/u/' 222 | :param limit: number of comments to process (1000 by default - note that this is the maxmimum) 223 | :param stemming: if True, performs stemming on tokenized words (False by default) 224 | :return: Counter (dict) of user's vocabulary in the form {'term1': freq, 'term2': freq, ...} 225 | """ 226 | user = self.reddit.get_redditor(username) 227 | 228 | vocabulary = Counter() 229 | comments_processed = 0 230 | for comment in user.get_comments(limit=limit): 231 | try: 232 | # Get the word counts for the comment 233 | vocabulary += self.get_word_count(comment.body, stemming=stemming) 234 | comments_processed += 1 235 | 236 | if comments_processed % 100 == 0 or comments_processed >= limit: 237 | print("Comments processed for user '{0}': {1}".format(username, comments_processed), end="\r") 238 | 239 | except ValueError: 240 | pass 241 | 242 | print('\n') 243 | return vocabulary 244 | 245 | def check_connection(self, timeout=10): 246 | """Wait for a server response.""" 247 | header = {'User-Agent': self.user_agent} 248 | start = time() 249 | while True: 250 | try: 251 | request = urllib2.Request("http://www.reddit.com/", headers=header) 252 | response = urllib2.urlopen(request) 253 | response.read() 254 | sleep(2) # Adhere to Reddit API rule of 30 requests per minute 255 | if response.getcode() == 200: 256 | return True 257 | except urllib2.HTTPError as err: 258 | print(err) 259 | finally: 260 | if time() - start > timeout: 261 | return False 262 | 263 | 264 | class TfidfCorpus(object): 265 | """Stores features (e.g. words) and their document frequencies in an inverted index. Useful for NLP and machine 266 | learning applications. 267 | 268 | To initialise a new TfidfCorpus instance: 269 | >>> corpus = TfidfCorpus() 270 | 271 | By default the corpus will save to 'tfidf_corpus/corpus.json'. 
You can specify an existing file to load 272 | or a specific save path as follows: 273 | >>> corpus = TfidfCorpus(corpus_path='path/to/corpus.json') 274 | 275 | Data Attributes: 276 | corpus_path (str): save/load path of the corpus 277 | document_list (list): list of strings indicating the documents stored in the corpus 278 | document_lengths (dict): sum of word frequencies contained in each document, takes the form: 279 | { 280 | "document1": int, 281 | "document2": int, 282 | ... 283 | } 284 | corpus (dict): dict of Counters that takes the form: 285 | { 286 | "term1": { 287 | "document1": int, 288 | "document2": int 289 | }, 290 | "term2": { 291 | "document1": int, 292 | "document2": int, 293 | }, 294 | ... 295 | } 296 | 297 | Methods: 298 | save 299 | load 300 | get_corpus_path 301 | get_document_list 302 | add_document 303 | get_document 304 | delete_document 305 | append_document 306 | get_idf 307 | get_tfidf 308 | get_document_tfidfs 309 | get_top_terms 310 | build_feature_matrix 311 | train_classifier 312 | classify_document 313 | count_words_from_list 314 | get_mean_word_length 315 | check_corpus_path 316 | """ 317 | 318 | def __init__(self, corpus_path='corpus.json'): 319 | 320 | # Check that the corpus path is valid 321 | self.check_corpus_path(corpus_path) 322 | self.corpus_path = corpus_path 323 | self.document_list = list() 324 | self.document_lengths = dict() 325 | self.corpus = dict() 326 | 327 | # Initialise scikit-learn attributes 328 | self.vectorizer = None 329 | self.tfidf_transformer = None 330 | self.feature_matrix = None 331 | self.classifier = None 332 | 333 | if os.path.isfile(corpus_path): 334 | self.load() 335 | 336 | def save(self, path=''): 337 | """Save the corpus to a JSON file at the path specified in self.corpus_path. 338 | 339 | :param path: you can specify a save path (must end in .json), which will change self.corpus_path 340 | """ 341 | if path: 342 | self.check_corpus_path(path) 343 | self.corpus_path = path 344 | 345 | with open(self.corpus_path, 'wb') as save_file: 346 | json.dump( 347 | { 348 | 'document_list': self.document_list, 349 | 'document_lengths': self.document_lengths, 350 | 'corpus': self.corpus 351 | }, 352 | save_file 353 | ) 354 | 355 | def load(self): 356 | """Load the corpus from a JSON file. 
File path defined in self.corpus_path.""" 357 | with open(self.corpus_path, 'rb') as load_file: 358 | data = json.load(load_file) 359 | 360 | try: 361 | self.document_list = data['document_list'] 362 | self.document_lengths = data['document_lengths'] 363 | self.corpus = data['corpus'] 364 | 365 | # Make sure that frequency dicts in corpus are Counter objects 366 | for term in self.corpus.iterkeys(): 367 | self.corpus[term] = Counter(self.corpus[term]) 368 | except KeyError as err: 369 | print('Provided file does not have expected structure') 370 | raise err 371 | 372 | def get_corpus_path(self): 373 | return self.corpus_path 374 | 375 | def set_corpus_path(self, path): 376 | if not path.lower().endswith('.json'): 377 | raise Exception('Corpus path must be a JSON file (.json extension).') 378 | self.corpus_path = path 379 | 380 | def get_document_list(self): 381 | return self.document_list 382 | 383 | def get_vocabulary(self): 384 | """Return the full list of terms in the corpus.""" 385 | return self.corpus.keys() 386 | 387 | def get_document(self, document_name): 388 | """Retrieve a document from the corpus.""" 389 | if document_name not in self.document_list: 390 | raise Exception("No document with name '{0}' found in corpus".format(document_name)) 391 | return Counter({ 392 | term: freqs[document_name] for term, freqs in self.corpus.iteritems() if freqs.get(document_name, 0) 393 | }) 394 | 395 | def add_document(self, document, document_name): 396 | """Load a document into the corpus. 397 | 398 | :param document: takes the form {'term1': freq1, 'term2', freq2, ...} 399 | :param document_name: string which uniquely identifies the document 400 | """ 401 | if document_name in self.document_list: 402 | print("Document with name '{0}' already exists in corpus." 403 | "Do you wish to replace it?".format(document_name)) 404 | while True: 405 | replace_doc = raw_input("Response (y/n): ") 406 | if replace_doc in ['y', 'yes', 'ye']: 407 | self.delete_document(document_name) 408 | break 409 | elif replace_doc in ['n', 'no']: 410 | return 411 | else: 412 | print('Could not interpret response. Try again.') 413 | 414 | for term, freq in document.iteritems(): 415 | if not self.corpus.get(term, False): 416 | self.corpus[term] = Counter() 417 | 418 | self.corpus[term][document_name] = freq 419 | 420 | self.document_list.append(document_name) 421 | self.document_lengths[document_name] = sum(document.itervalues()) 422 | 423 | def delete_document(self, document_name): 424 | """Delete a document from the corpus. 425 | 426 | :param document_name: string indicating document's name in the corpus - should exist in self.document_list 427 | """ 428 | if document_name not in self.document_list: 429 | return 430 | [freqs.pop(document_name) for term, freqs in self.corpus.iteritems() if freqs.get(document_name, 0)] 431 | self.document_list.remove(document_name) 432 | self.document_lengths.pop(document_name) 433 | 434 | def append_document(self, document, document_name): 435 | """Add new counts to an existing document. If the document doesn't exist in the corpus then it is added. 436 | 437 | :param document: dict or Counter of word counts, e.g. 
{'i': 1, 'like': 2, 'cheese': 1} 438 | :param document_name: string indicating document's name in the corpus - should exist in self.document_list 439 | """ 440 | if document_name not in self.document_list: 441 | self.add_document(document, document_name) 442 | else: 443 | for term, freq in document.iteritems(): 444 | if not self.corpus.get(term, False): 445 | self.corpus[term] = Counter() 446 | 447 | self.corpus[term][document_name] += freq 448 | 449 | self.document_lengths[document_name] += sum(document.itervalues()) 450 | 451 | def get_idf(self, term): 452 | """Get inverse document frequency of a given term in the corpus.""" 453 | num_documents = len(self.document_list) 454 | docs_containing_term = len(self.corpus[term]) 455 | return math.log(num_documents / (1 + docs_containing_term)) 456 | 457 | def get_tfidf(self, term, document_name): 458 | """Get tf-idf score given a term and document in the corpus.""" 459 | tf = self.corpus[term].get(document_name, '') / self.document_lengths[document_name] 460 | idf = self.get_idf(term) 461 | return tf * idf 462 | 463 | def get_document_tfidfs(self, document_name, l2_norm=True): 464 | """Get tf-idf scores for all terms in a document. 465 | 466 | :param document_name: string indicating document's name in the corpus - should exist in self.document_list 467 | :param l2_norm: if True, applies Euclidean normalization to tf-idf scores of the document 468 | :return: Counter of tf-idf scores for each term 469 | """ 470 | tfidfs = { 471 | term: self.get_tfidf(term, document_name) for term, freq in self.corpus.iteritems() 472 | if freq.get(document_name, '') 473 | } 474 | 475 | if l2_norm: 476 | normalization = np.linalg.norm(tfidfs.values(), axis=0) 477 | for key, value in tfidfs.items(): 478 | tfidfs[key] = value / normalization 479 | 480 | return Counter(tfidfs) 481 | 482 | def get_top_terms(self, document_name, num_terms=30): 483 | """Get the top terms for a given document by tf-idf score. 484 | 485 | :param document_name: string indicating document's name in the corpus - should exist in self.document_list 486 | :param num_terms: number of top terms to return (30 by default) 487 | :return: dict of top terms and their corresponding tf-idf scores 488 | """ 489 | tfidfs = self.get_document_tfidfs(document_name) 490 | sorted_tfidfs = sorted(tfidfs.items(), key=operator.itemgetter(1), reverse=True) 491 | return OrderedDict(sorted_tfidfs[:num_terms]) 492 | 493 | def build_feature_matrix(self, tfidf=True): 494 | """Transforms the corpus into a scikit-learn vectorizer object which can be used for machine learning. 495 | Used to set the object attributes self.vectorizer and self.feature_matrix. 
496 | 497 | :param tfidf (bool): if True, applies TfidfTransformer to vectorized features 498 | :return: scikit-learn vectorizer, scipy sparse feature matrix and its corresponding document labels 499 | """ 500 | 501 | train_data = [self.get_document(document) for document in self.document_list] 502 | labels = self.document_list 503 | vectorizer = DictVectorizer() 504 | feature_matrix = vectorizer.fit_transform(train_data) 505 | 506 | self.tfidf_transformer = None 507 | if tfidf: 508 | self.tfidf_transformer = TfidfTransformer() 509 | feature_matrix = self.tfidf_transformer.fit_transform(feature_matrix) 510 | 511 | self.vectorizer = vectorizer 512 | self.feature_matrix = feature_matrix 513 | return feature_matrix, labels, vectorizer 514 | 515 | def train_classifier(self, classifier_type='LinearSVC', tfidf=True): 516 | """Trains a document classifier using the vocabulary and documents contained in the corpus. Uses scikit-learn. 517 | 518 | :param classifier_type (str): 'LinearSVC' or 'MultinomialNB' (LinearSVC by default) 519 | :param tfidf (bool): if True, applies TfidfTransformer to vectorized features 520 | :return: classifier object 521 | """ 522 | self.build_feature_matrix(tfidf=tfidf) 523 | 524 | if classifier_type.lower() == 'linearsvc': 525 | classifier = OneVsRestClassifier(LinearSVC(random_state=0)) 526 | elif classifier_type.lower() == 'multinomialnb': 527 | classifier = OneVsRestClassifier(MultinomialNB()) 528 | else: 529 | raise Exception("Parameter classifier_type only accepts 'MultinomialNB', 'BernoulliNB' or 'LinearSVC'.") 530 | 531 | classifier.fit(self.feature_matrix, self.document_list) 532 | self.classifier = classifier 533 | return classifier 534 | 535 | def classify_document(self, document): 536 | """Classifies an input document using a bag-of-words approach with sparse features. 537 | 538 | :param document (dict): dict or Counter of the form {'word1': freq1, 'word2': freq2, ...} 539 | :return (str): label corresponding to the document's classification 540 | """ 541 | test_data = self.vectorizer.transform([document]) 542 | if self.tfidf_transformer: 543 | test_data = self.tfidf_transformer.transform(test_data) 544 | 545 | return self.classifier.predict(test_data) 546 | 547 | def count_words_from_list(self, document_name, word_list, normalize=True): 548 | """Given a list of input words, return the counts of these words in a specified document.""" 549 | document = self.get_document(document_name) 550 | word_counts = [document[word] for word in word_list] 551 | total_count = sum(word_counts) 552 | if normalize: 553 | total_count /= self.document_lengths[document_name] 554 | 555 | return total_count 556 | 557 | def get_mean_word_length(self, document_name, upper_limit=12): 558 | """Get the average word length for all words in a given document.""" 559 | document = self.get_document(document_name) 560 | return sum([len(term) * freq for term, freq in document.iteritems() 561 | if len(term) <= upper_limit]) / sum(document.itervalues()) 562 | 563 | @staticmethod 564 | def check_corpus_path(corpus_path): 565 | if not corpus_path.lower().endswith('.json'): 566 | raise Exception('corpus_path provided is not a valid JSON file.') 567 | make_path(corpus_path) 568 | 569 | 570 | def make_path(path): 571 | """Check if path exists. If it doesn't, create the necessary folders.""" 572 | 573 | # Remove file name from path 574 | base_name = os.path.basename(path) 575 | if '.' 
in base_name: 576 | path = path[:-len(base_name)] 577 | 578 | if not os.path.exists(path): 579 | try: 580 | os.makedirs(path) 581 | except OSError as exception: 582 | if exception.errno != errno.EEXIST: 583 | raise 584 | 585 | return path 586 | 587 | 588 | def get_word_corpora(): 589 | """Returns a list of paths to all word corpora installed in the module.""" 590 | application_root = get_root_dir() 591 | words_dir = os.path.join(application_root, 'words') 592 | return os.listdir(words_dir) 593 | 594 | 595 | def get_root_dir(): 596 | return os.path.dirname(__file__) --------------------------------------------------------------------------------