├── .gitignore
├── icml2016.txt
├── morph
│   ├── morpha.exe
│   ├── verbstem.list
│   └── README
├── clouds
│   ├── topics_icml_3.png
│   ├── topics_icml_5.png
│   ├── topics_trump.png
│   ├── topics_hillary.png
│   ├── topics_hillary2.png
│   ├── topics_hillary3.png
│   ├── topics_sanders.png
│   ├── topiccloud-aamas.png
│   ├── topiccloud-ijcai.png
│   ├── topics_drugstory.png
│   ├── wordcloud_drugstory.png
│   └── topics_drugstory_kmeans.png
├── README.md
├── morpha.py
├── drugstory.txt
├── TopicCloud.py
└── gencloud.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | commit.bat
2 | *.pyc
--------------------------------------------------------------------------------
/icml2016.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/icml2016.txt
--------------------------------------------------------------------------------
/morph/morpha.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/morph/morpha.exe
--------------------------------------------------------------------------------
/clouds/topics_icml_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_icml_3.png
--------------------------------------------------------------------------------
/clouds/topics_icml_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_icml_5.png
--------------------------------------------------------------------------------
/clouds/topics_trump.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_trump.png
--------------------------------------------------------------------------------
/clouds/topics_hillary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_hillary.png
--------------------------------------------------------------------------------
/clouds/topics_hillary2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_hillary2.png
--------------------------------------------------------------------------------
/clouds/topics_hillary3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_hillary3.png
--------------------------------------------------------------------------------
/clouds/topics_sanders.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_sanders.png
--------------------------------------------------------------------------------
/clouds/topiccloud-aamas.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topiccloud-aamas.png
--------------------------------------------------------------------------------
/clouds/topiccloud-ijcai.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topiccloud-ijcai.png
--------------------------------------------------------------------------------
/clouds/topics_drugstory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_drugstory.png
--------------------------------------------------------------------------------
/clouds/wordcloud_drugstory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/wordcloud_drugstory.png
--------------------------------------------------------------------------------
/clouds/topics_drugstory_kmeans.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_drugstory_kmeans.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Topic Cloud generation:
2 | 
3 | This Python toolkit generates a Topic Cloud, a visual representation of the topics derived from a document (or a collection of documents).
4 | 
5 | Please see the images in the `clouds/` directory for example topic clouds; `gencloud.py` shows an example of generating them.
6 | 
7 | # Prerequisites:
8 | * [wordcloud](https://github.com/amueller/word_cloud/) ([windows binary](http://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud))
9 | * Pillow ([windows binary](http://www.lfd.uci.edu/~gohlke/pythonlibs/#pillow))
10 | * matplotlib
--------------------------------------------------------------------------------
/morpha.py:
--------------------------------------------------------------------------------
1 | """
2 | Wrapper around morpha from
3 | http://www.informatics.sussex.ac.uk/research/groups/nlp/carroll/morph.html
4 | 
5 | Vaguely follows edu.stanford.nlp.Morphology except we implement with a pipe.
6 | hacky. Would be nice to use cython/swig/ctypes to directly embed morpha.yy.c
7 | as a python extension.
8 | 
9 | TODO compare linguistic quality to lemmatizer in python's "pattern" package
10 | 
11 | By Brendan O'Connor (http://brenocon.com), at https://gist.github.com/brendano/6008945
12 | """
13 | 
14 | import os,subprocess
15 | 
16 | #MorphaDir = os.path.join(os.path.dirname(__file__), 'morph')
17 | MorphaDir = 'morph'
18 | MorphaCmd = os.path.join(MorphaDir, 'morpha')
19 | MorphaArgs = ['-f', os.path.join(MorphaDir, 'verbstem.list')]
20 | 
21 | _pipe = None
22 | 
23 | def get_pipe():
24 |     global _pipe
25 |     if _pipe is None:
26 |         open_pipe()
27 |     elif _pipe.returncode is not None:
28 |         print "Pipe seems to have died, restarting"
29 |         open_pipe()
30 |     return _pipe
31 | 
32 | def open_pipe():
33 |     global _pipe
34 |     print "Opening morpha pipe"
35 |     _pipe = subprocess.Popen([MorphaCmd] + MorphaArgs, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
36 | 
37 | def process(input):
38 |     input = input.strip()
39 |     output = None
40 |     for retry in range(3):
41 |         try:
42 |             pipe = get_pipe()
43 |             print>>pipe.stdin, input
44 |             pipe.stdin.flush()
45 |             output = pipe.stdout.readline(); break  # success, don't retry
46 |         except IOError:
47 |             if retry==2: raise
48 |             print "Retry on pipe breakage"
49 |             open_pipe()
50 |     return output.rstrip('\n')
51 | 
52 | 
53 | ## From morph/doc.txt....
54 | 
55 | #Where the -u option is not used, each input token is expected to be of
56 | #the form <word>_<tag>. For example:
57 | #
58 | # A_AT1 move_NN1 to_TO stop_VV0 Mr._NNS Gaitskell_NP1 from_II nominating_VVG
59 | #
60 | #Contractions and punctuation must have been separated out into separate
61 | #tokens. The tagset is assumed to resemble CLAWS-2, in the following
62 | #respects:
63 | #
64 | # V... all verbs
65 | # NP... all proper names
66 | # N[^P]... all common nouns
67 | #
68 | #and for specific cases of ambiguous lexical items:
69 | #
70 | # 'd_VH... root is 'have'
71 | # 'd_VM... root is 'would'
72 | # 's_VBZ... root is 'be'
73 | # 's_VHZ... root is 'have'
74 | # 's_$... possessive morpheme (also _POS for CLAWS-5)
75 | # ai_VB... root is 'be'
76 | # ai_VH... root is 'have'
77 | # ca_VM... root is 'can'
78 | # sha_VM... root is 'shall'
79 | # wo_VM... root is 'will'
80 | # n't_XX... root is 'not'
81 | 
82 | def ptb_is_proper(ptb):
83 |     return ptb in ('NP','NNP','NNPS')
84 | 
85 | def ptb2morphtag(ptb):
86 |     ptb = ptb.upper()
87 |     if ptb.startswith('V'):
88 |         return 'V'
89 |     if ptb_is_proper(ptb):
90 |         return 'NP'
91 |     if ptb.startswith('N'):
92 |         return 'N'
93 |     if ptb == 'MD':
94 |         return 'V' # um is this right? it looks like it can take incomplete versions...
95 |     if ptb == 'POS':
96 |         return '$'
97 |     return ''
98 | 
99 | def lemmatize_seq(words_and_pos, tagset='PENN'):
100 |     """List of (word,pos) pairs. Words are Unicode strings.
101 |     Returns list of lemma strings."""
102 |     assert tagset=='PENN', "don't support different tagsets yet"
103 | 
104 |     # Decorate the input pairs into one big string that morpha wants,
105 |     # Run morpha,
106 |     # Then undecorate the output.
107 | 
108 |     goods = [i for i in range(len(words_and_pos)) if words_and_pos[i][0]]
109 |     escape_str = '..axsxdxfxqxwxexr..'
110 |     new_pairs = []
111 |     #for word,pos in words_and_pos:
112 |     for i in goods:
113 |         word,pos = words_and_pos[i]
114 |         assert ' ' not in word
115 |         word = word.replace('_', escape_str)
116 |         morph_tag = ptb2morphtag(pos)
117 |         new_pairs.append((word, morph_tag))
118 |     decorated_input = u' '.join(u'{}_{}'.format(word,tag) if tag else word for word,tag in new_pairs)
119 |     decorated_input = decorated_input.encode('utf8') # TODO is morpha utf8 safe?
120 |     #print "INPUT", decorated_input
121 |     result = process(decorated_input)
122 |     #print "RESULT", result
123 | 
124 |     lemma_results = []
125 |     result_tokens = result.split()
126 |     assert len(result_tokens) == len(new_pairs)
127 |     for i,lemma in enumerate(result_tokens):
128 |         lemma = lemma.split('_')[0] # Rare. I think this is a bug in morpha
129 |         #assert '_' not in lemma
130 |         lemma = lemma.decode('utf-8','replace') # TODO is morpha utf8 safe?
131 |         lemma = lemma.replace(escape_str, '_')  # undo the '_' escaping applied before piping to morpha
132 |         if not ptb_is_proper(words_and_pos[goods[i]][1]):  # index via goods[i]: result_tokens aligns with the filtered pairs
133 |             lemma = lemma.lower()
134 |         lemma_results.append(lemma)
135 | 
136 |     # juxtapose it back in
137 |     final_results = ['' for x in range(len(words_and_pos))]
138 |     for i,lemma in enumerate(lemma_results):
139 |         final_results[goods[i]] = lemma
140 |     return final_results
141 | 
142 | def lemmatize(word,pos, tagset='PENN'):
143 |     seq = [(word,pos)]
144 |     result = lemmatize_seq(seq, tagset=tagset)
145 |     return result[0]
--------------------------------------------------------------------------------
/morph/verbstem.list:
--------------------------------------------------------------------------------
1 | abat abet abhor abut accur acquit adlib admit aerobat aerosol
2 | agendaset allot alot anagram annul appal apparel armbar aver babysit
3 | airdrop appal blackleg bobsled bur chum confab counterplot curet dib
4 | backdrop backfil backflip backlog backpedal backslap backstab bag
5 | balfun ballot ban bar barbel bareleg barrel bat bayonet becom bed
6 | bedevil bedwet beenhop befit befog beg beget begin bejewel bemedal
7 | benefit benum beset besot bestir bet betassel bevel bewig bib bid
8 | billet bin bip bit bitmap blab blag blam blan blat bles blim blip blob
9 | bloodlet blot blub blur bob bodypop bog booby-trap boobytrap booksel
10 | bootleg bop bot bowel bracket brag brig brim bud buffet bug bullshit
11 | bum bun bus but cab cabal cam can cancel cap caracol caravan carburet
12 | carnap carol carpetbag castanet cat catcal catnap cavil chan chanel
13 | channel chap char chargecap chat chin chip chir chirrup chisel chop
14 | chug chur clam clap clearcut clip clodhop clog clop closet clot club
15 | co-occur co-program co-refer co-run co-star cob cobweb cod coif com
16 | combat comit commit compel con concur confer confiscat control cop
17 | coquet coral corbel corral cosset cotransmit councel council counsel
18 | court-martial crab cram crap crib crop crossleg cub cudgel cum cun cup
19 | cut dab dag dam dan dap daysit de-control de-gazet de-hul de-instal
20 | de-mob de-program de-rig de-skil deadpan debag debar debug decommit
21 | decontrol defer defog deg degas deinstal demit demob demur den denet
22 | depig depip depit der deskil deter devil diagram dial dig dim din dip
23 | disbar disbud discomfit disembed disembowel dishevel disinter dispel
24 | disprefer distil dog dognap don doorstep dot dowel drag drat driftnet
25 | distil egotrip enrol enthral extol fulfil gaffe golliwog idyl inspan
26 | drip drivel drop drub drug drum dub duel dun dybbuk earwig eavesdrop
27 | ecolabel eitherspigot electroblot embed emit empanel enamel endlabel
28 | endtrim enrol enthral entrammel entrap enwrap equal equip estop
29 | exaggerat excel expel extol fag fan farewel fat featherbed feget fet
30 | fib fig fin fingerspel fingertip fit flab flag flap flip flit flog
31 | flop fob focus fog footbal footslog fop forbid forget format
32 | fortunetel fot foxtrot frag freefal fret frig frip frog frug fuel
33 | fufil fulfil fullyfit fun funnel fur furpul gab gad gag gam gambol gap
34 | garot garrot gas gat gel gen get giftwrap gig gimbal gin glam glenden
35 | glendin globetrot glug glut gob goldpan goostep gossip grab gravel
36 | grid grin grip grit groundhop grovel grub gum gun gunrun gut gyp haircut
37 | ham han handbag handicap handknit handset hap hareleg hat headbut
38 | hedgehop hem hen hiccup highwal hip hit hobnob hog hop horsewhip
39 | hostel hot hotdog hovel hug hum humbug hup hushkit hut illfit imbed
40 |
immunblot immunoblot impannel impel imperil incur infer infil inflam 41 | initial input inset instil inter interbed intercrop intercut interfer 42 | instal instil intermit japan jug kris manumit mishit mousse mud 43 | interwar jab jag jam jar jawdrop jet jetlag jewel jib jig jitterbug 44 | job jog jog-trot jot jut ken kennel kid kidnap kip kissogram kit knap 45 | kneecap knit knob knot kor label lag lam lap lavel leafcut leapfrog 46 | leg lem lep let level libel lid lig lip lob log lok lollop longleg lop 47 | lowbal lug mackerel mahom man map mar marshal marvel mat matchwin 48 | metal micro-program microplan microprogram milksop mis-cal mis-club 49 | mis-spel miscal mishit mislabel mit mob mod model mohmam monogram mop 50 | mothbal mug multilevel mum nab nag nan nap net nightclub nightsit nip 51 | nod nonplus norkop nostril not nut nutmeg occur ocur offput offset 52 | omit ommit onlap out-general out-gun out-jab out-plan out-pol out-pul 53 | out-put out-run out-sel outbid outcrop outfit outgas outgun outhit 54 | outjab outpol output outrun outship outshop outsin outstrip outswel 55 | outspan overcrop pettifog photostat pouf preset prim pug ret rosin 56 | outwit over-commit over-control over-fil over-fit over-lap over-model 57 | over-pedal over-pet over-run over-sel over-step over-tip over-top 58 | overbid overcal overcommit overcontrol overcrap overdub overfil 59 | overhat overhit overlap overman overplot overrun overshop overstep 60 | overtip overtop overwet overwil pad paintbal pan panel paperclip par 61 | parallel parcel partiescal pat patrol pedal peewit peg pen pencil pep 62 | permit pet petal photoset phototypeset phut picket pig pilot pin 63 | pinbal pip pipefit pipet pit plan plit plod plop plot plug plumet 64 | plummet pod policyset polyfil ponytrek pop pot pram prebag predistil 65 | predril prefer prefil preinstal prep preplan preprogram prizewin prod 66 | profer prog program prop propel pub pummel pun pup pushfit put quarel 67 | quarrel quickskim quickstep quickwit quip quit quivertip quiz rabbit 68 | rabit radiolabel rag ram ramrod rap rat ratecap ravel re-admit re-cal 69 | re-cap re-channel re-dig re-dril re-emit re-fil re-fit re-flag 70 | re-format re-fret re-hab re-instal re-inter re-lap re-let re-map 71 | re-metal re-model re-pastel re-plan re-plot re-plug re-pot re-program 72 | re-refer re-rig re-rol re-run re-sel re-set re-skin re-stal re-submit 73 | re-tel re-top re-transmit re-trim re-wrap readmit reallot rebel rebid 74 | rebin rebut recap rechannel recommit recrop recur recut red redril 75 | refer refit reformat refret refuel reget regret reinter rejig rekit 76 | reknot relabel relet rem remap remetal remit remodel reoccur rep repel 77 | repin replan replot repol repot reprogram rerun reset resignal resit 78 | reskil resubmit retransfer retransmit retro-fit retrofit rev revel 79 | revet rewrap rib richochet ricochet rid rig rim ringlet rip rit rival 80 | rivet roadrun rob rocket rod roset rot rowel rub run runnel rut sab 81 | sad sag sandbag sap scab scalpel scam scan scar scat schlep scrag 82 | scram shall sled smut stet sulfuret trepan unrip unstop whir whop wig 83 | scrap scrat scrub scrum scud scum scur semi-control semi-skil 84 | semi-skim semiskil sentinel set shag sham shed shim shin ship shir 85 | shit shlap shop shopfit shortfal shot shovel shred shrinkwrap shrivel 86 | shrug shun shut side-step sideslip sidestep signal sin sinbin sip sit 87 | skid skim skin skip skir skrag slab slag slam slap slim slip slit slob 88 | slog slop slot slowclap slug slum slur 
smit snag snap snip snivel snog 89 | snorkel snowcem snub snug sob sod softpedal son sop spam span spar 90 | spat spiderweb spin spiral spit splat split spot sprag spraygun sprig 91 | springtip spud spur squat squirrel stab stag star stem sten stencil 92 | step stir stop storytel strap strim strip strop strug strum strut stub 93 | stud stun sub subcrop sublet submit subset suedetrim sum summit sun 94 | suntan sup super-chil superad swab swag swan swap swat swig swim 95 | swivel swot tab tag tan tansfer tap tar tassel tat tefer teleshop 96 | tendril terschel th'strip thermal thermostat thin throb thrum thud 97 | thug tightlip tin tinsel tip tittup toecap tog tom tomorrow top tot 98 | total towel traget trainspot tram trammel transfer tranship transit 99 | transmit transship trap travel trek trendset trim trip tripod trod 100 | trog trot trousseaushop trowel trup tub tug tunnel tup tut twat twig 101 | twin twit typeset tyset un-man unban unbar unbob uncap unclip uncompel 102 | undam under-bil under-cut under-fit under-pin under-skil underbid 103 | undercut underlet underman underpin unfit unfulfil unknot unlip 104 | unlywil unman unpad unpeg unpin unplug unravel unrol unscrol unsnap 105 | unstal unstep unstir untap unwrap unzip up upset upskil upwel ven 106 | verbal vet victual vignet wad wag wainscot wan war water-log waterfal 107 | waterfil waterlog weasel web wed wet wham whet whip whir whiteskin 108 | whiz whup wildcat win windmil wit woodchop woodcut wor worship wrap 109 | will wiretap yen yak yap yarnspin yip yodel zag zap zig zig-zag zigzag 110 | zip ztrip 111 | -------------------------------------------------------------------------------- /morph/README: -------------------------------------------------------------------------------- 1 | University of Sussex 8 Sep 2003 2 | 3 | This directory contains software for morphological processing of English 4 | as developed by Kevin Humphreys , John Carroll 5 | and Guido Minnen. 6 | 7 | To be used for research purposes only (see section 4 below). If you make 8 | any changes, the authors would appreciate it if you sent them details of 9 | what you have done. 10 | 11 | Covers the English inflectional suffixes: 12 | 13 | -s plural of nouns, 3rd person singular present of verbs 14 | -ed past tense 15 | -en past participle 16 | -ing progressive of verbs 17 | 18 | 1. Usage 19 | -------- 20 | 21 | morpha [-a] [-c] [-t] [-u] [-f verbstem-file] 22 | morphg [-c] [-t] [-u] [-f verbstem-file] 23 | 24 | The commands operate as filters, reading from the standard input and 25 | writing to the standard output. 26 | 27 | They may be invoked with the following command-line options: 28 | 29 | -a Output affixes (morpha only). 30 | 31 | -c Preserve case distinctions wherever possible. 32 | 33 | -t Output part-of-speech tags if they are in the input. 34 | 35 | -u Indicate that the words in the input are not tagged with 36 | part-of-speech labels. N.B. This mode of use is not recommended 37 | since the resulting ambiguity in the input is likely to lead to 38 | incorrect output. 39 | 40 | -f By default, the commands attempt to read a file called 41 | 'verbstem.list' in the user's current directory which is expected 42 | to contain a list of stems of verbs that undergo doubling of 43 | their final consonant, as occurs in British English spelling. 44 | This option allows the user to specify a different file of verb 45 | stems (for example if American English behaviour is required). 
46 | If this option is specified then it must be the last one on 47 | the command-line. 48 | 49 | See the file doc.txt for specifications of input and output formats, 50 | and examples of usage. 51 | 52 | 2. Files 53 | -------- 54 | 55 | Makefile makefile for compiling the flex sources; can be 56 | used for compiling both flex descriptions by 57 | the command `make flex-description-file' 58 | README this file 59 | doc.txt specifications of input/output formats, and usage 60 | examples 61 | gpost postamble file used in deriving morphg.lex 62 | gpre preamble file used in deriving morphg.lex 63 | invert.sh unix shell program that derives morphg.lex from 64 | morpha.lex 65 | minnen.pdf pre-final PDF version of the NLE article by Minnen, 66 | Carroll and Pearce (2001) 67 | morpha.{ix86_linux|ppc_darwin|sun4_sunos} 68 | executables for the morphological analyser; for 69 | details of usage see above 70 | morpha.lex flex description constituting the source of the 71 | morphological analyser 72 | morphg.{ix86_linux|ppc_darwin|sun4_sunos} 73 | executables for the morphological generator; for 74 | details of usage see above 75 | morphg.lex flex description constituting the source of the 76 | morphological generator 77 | verbstem.list list of verb stems that allow for consonant doubling 78 | in British English 79 | 80 | The file morphg.lex is derived automatically from the file morpha.lex 81 | using invert.sh, as described in the paper by Minnen, Carroll and 82 | Pearce (2001) -- full reference below. 83 | 84 | 3. Compilation 85 | -------------- 86 | 87 | To recompile the morph tools, either type the following commands 88 | (making sure that you use the 2.5.4a version of flex recompiled with 89 | larger internal limits -- see below), or (more conveniently) use the 90 | Makefile in this directory by typing `make morpha' or `make morphg'. 91 | 92 | flex -i -Cfe -8 -omorpha.yy.c morpha.lex 93 | gcc -o morpha morpha.yy.c 94 | 95 | or 96 | 97 | flex -i -Cfe -8 -omorphg.yy.c morphg.lex 98 | gcc -o morphg morphg.yy.c 99 | 100 | The executables included in this release were built omitting the 101 | Flex options -Cfe -8, resulting in a reduction in binary file size 102 | of two thirds (and a reduction in processing speed of around 20%). 103 | These options also have to be left out and the option -Dinteractive 104 | added to gcc (resulting in a further decrease in throughput) in order 105 | to get the morph tools to return results immediately when used via 106 | unix pipes inside other programs. 107 | 108 | N.B. Recompiling the morph tools requires an adapted version of Flex. 109 | The Flex source code is freely available from: 110 | 111 | http://www.go.dlr.de/fresh/unix/src/misc/.warix/flex-2.5.4a.tar.gz.html 112 | 113 | The Flex source should be changed to allow for more internal states by 114 | increasing the definitions in flexdef.h of: 115 | 116 | #define JAMSTATE -32766 117 | ... 118 | #define MAXIMUM_MNS 31999 119 | ... 120 | #define BAD_SUBSCRIPT -32767 121 | 122 | to: 123 | 124 | #define JAMSTATE -800000 125 | ... 126 | #define MAXIMUM_MNS 800000 127 | ... 128 | #define BAD_SUBSCRIPT -800000 129 | 130 | and recompiling Flex. When recompiling the morph tools ensure that the 131 | Makefile points to the new version of Flex. 132 | 133 | 4. Acknowledgements, copyrights etc. 134 | ------------------------------------ 135 | 136 | Copyright (c) 1995-2000 University of Sheffield, University of Sussex 137 | All rights reserved. 
138 | 139 | Redistribution and use of source and derived binary forms are 140 | permitted without fee provided that: 141 | 142 | - they are not used in commercial products 143 | - the above copyright notice and this paragraph are duplicated in 144 | all such forms 145 | - any documentation, advertising materials, and other materials 146 | related to such distribution and use acknowledge that the software 147 | was developed by Kevin Humphreys , John 148 | Carroll and Guido Minnen 149 | and refer to the following related publication: 150 | 151 | Guido Minnen, John Carroll and Darren Pearce. 2001. `Applied 152 | morphological processing of English'. Natural Language Engineering, 153 | 7(3). 207-223. 154 | 155 | The name of University of Sheffield may not be used to endorse or 156 | promote products derived from this software without specific prior 157 | written permission. 158 | 159 | This software is provided "as is" and without any express or implied 160 | warranties, including, without limitation, the implied warranties of 161 | merchantibility and fitness for a particular purpose. 162 | 163 | The exception lists were derived semi-automatically from WordNet 1.5, 164 | and various other corpora and MRDs. 165 | 166 | Many thanks to Tim Baldwin, Chris Brew, Bill Fisher, Gerald Gazdar, 167 | Dale Gerdemann, Adam Kilgarriff and Ehud Reiter for suggested 168 | improvements. 169 | 170 | WordNet 1.5 Copyright 1995 by Princeton University. 171 | All rights reseved. 172 | 173 | THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON 174 | UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. 175 | BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON UNIVERSITY MAKES NO 176 | REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY 177 | PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE 178 | OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, 179 | COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. 180 | 181 | The name of Princeton University or Princeton may not be used in 182 | advertising or publicity pertaining to distribution of the software 183 | and/or database. Title to copyright in this software, database and 184 | any associated documentation shall at all times remain with Princeton 185 | University and LICENSEE agrees to preserve same. 
186 | -------------------------------------------------------------------------------- /drugstory.txt: -------------------------------------------------------------------------------- 1 | Topic 4 (3.00): 26.7% 2 | Most relevant words: 3 | drugs (3528,10): 2.028/6.412/0.735/7.070 drug (1942,15): 1.722/6.668/0.691/6.910 use (176,7): 1.555/4.115/0.345/2.396 prescriptions (34657,3): 1.314/2.277/0.586/7.272 generic (7602,4): 1.307/2.613/0.310/2.722 patients (2468,9): 1.257/3.771/0.660/5.960 hospitals (4777,5): 1.163/2.601/0.443/4.253 turing (18313,9): 1.130/3.390/0.067/0.502 treatment (1460,5): 1.061/2.373/0.617/5.572 used (82,4): 1.037/2.074/0.174/1.317 pharmaceutical (8897,2): 0.919/1.300/0.558/5.198 pharmaceuticals (16906,3): 0.908/1.572/0.476/4.394 health (502,3): 0.898/1.556/0.468/3.955 medicine (1528,2): 0.891/1.259/0.447/4.153 controlled (2257,3): 0.880/1.524/0.234/2.163 treat (5814,3): 0.858/1.485/0.556/4.734 therapies (21018,2): 0.831/1.175/0.566/4.696 certain (1095,4): 0.825/1.650/0.297/2.043 dr (570,5): 0.825/1.845/0.158/1.255 treatments (9357,2): 0.791/1.118/0.642/5.847 doctors (4958,2): 0.776/1.098/0.455/4.300 generics (85144,1): 0.750/0.750/0.407/6.914 care (1069,2): 0.738/1.044/0.502/4.284 dangerous (3584,2): 0.713/1.008/0.253/2.339 sinai (13035,3): 0.709/1.228/0.147/1.184 patient (3777,2): 0.703/0.995/0.571/5.049 tablet (10396,2): 0.679/0.960/0.158/1.565 effects (1380,2): 0.665/0.941/0.326/2.985 pills (18688,1): 0.660/0.660/0.454/3.698 antibiotic (24098,2): 0.652/0.922/0.502/3.627 4 | Most similar words in vocab: 5 | drugs: 0.735/7.070 medication: 0.724/6.284 medications: 0.715/6.052 drug: 0.691/6.910 antiretroviral: 0.667/7.589 prescription: 0.664/5.763 patients: 0.660/5.960 treatments: 0.642/5.847 methadone: 0.636/7.626 medicines: 0.632/5.483 antipsychotic: 0.631/7.077 antiepileptic: 0.623/7.670 ivig: 0.623/7.779 prophylactic: 0.622/7.108 treatment: 0.617/5.572 regimens: 0.610/7.109 therapeutic: 0.600/5.231 contraindicated: 0.600/6.283 analgesics: 0.597/6.625 dosages: 0.590/6.364 prescriptions: 0.586/7.272 warfarin: 0.575/6.138 prescribing: 0.572/7.143 patient: 0.571/5.049 therapies: 0.566/4.696 nsaids: 0.565/6.234 diazepam: 0.564/5.971 antidepressant: 0.564/6.338 opiates: 0.563/6.488 clonazepam: 0.563/7.389 6 | 7 | Topic 1 (3.00): 21.9% 8 | Most relevant words: 9 | increase (1265,8): 2.380/6.732/0.634/4.916 million (357,6): 2.331/5.710/0.637/6.217 increases (4242,4): 1.378/2.756/0.490/4.213 raised (1286,4): 1.303/2.606/0.289/2.576 dollars (4434,2): 1.262/1.784/0.665/5.687 year (54,6): 1.221/2.991/0.144/1.050 percent (1651,2): 1.163/1.645/0.515/4.703 sales (1485,3): 1.151/1.993/0.469/4.523 huge (3131,2): 1.066/1.508/0.338/2.961 money (808,2): 1.013/1.433/0.478/4.058 pay (1653,2): 0.988/1.397/0.503/4.325 cost (1178,2): 0.980/1.386/0.583/4.606 millions (5429,1): 0.882/0.882/0.538/4.874 hundreds (3443,2): 0.851/1.204/0.297/2.429 amount (1588,1): 0.834/0.834/0.549/4.450 rebates (60162,1): 0.815/0.815/0.595/6.684 clamoring (148365,1): 0.800/0.800/0.362/5.470 increased (1167,1): 0.791/0.791/0.534/4.307 shrank (46494,1): 0.787/0.787/0.363/4.175 month (1066,2): 0.758/1.072/0.294/2.159 fund (2014,3): 0.743/1.287/0.422/3.718 thousands (2861,1): 0.717/0.717/0.359/3.126 income (874,1): 0.685/0.685/0.473/3.421 annual (895,1): 0.680/0.680/0.269/2.632 lawmakers (27291,2): 0.679/0.960/0.291/3.278 tens (11057,1): 0.655/0.655/0.362/2.360 sharply (12378,2): 0.640/0.905/0.224/1.914 less (615,2): 0.639/0.903/0.331/2.294 ago (3139,2): 0.633/0.895/0.175/1.418 shorting (108317,1): 
0.627/0.627/0.348/6.101 10 | Most similar words in vocab: 11 | dollars: 0.665/5.687 million: 0.637/6.217 increase: 0.634/4.916 billion: 0.632/6.304 profits: 0.605/4.967 rebates: 0.595/6.684 costs: 0.591/4.978 revenues: 0.583/5.086 cost: 0.583/4.606 outlay: 0.565/6.000 recouping: 0.561/8.163 reinvest: 0.552/7.343 shortfall: 0.549/6.193 amount: 0.549/4.450 billions: 0.546/3.705 euros: 0.544/4.286 exorbitant: 0.543/6.478 repayments: 0.542/5.822 tenfold: 0.540/6.259 millions: 0.538/4.874 surtax: 0.535/7.351 increased: 0.534/4.307 refunds: 0.529/6.098 tripling: 0.527/6.139 refinancing: 0.523/5.726 quadrupled: 0.523/5.553 disburse: 0.521/6.913 disbursed: 0.518/5.408 trillion: 0.518/4.273 skyrocketed: 0.517/5.861 12 | 13 | Topic 12 (3.00): 16.4% 14 | Most relevant words: 15 | company (151,8): 2.758/7.800/0.724/5.842 acquired (1447,5): 2.038/4.557/0.544/4.705 companies (939,4): 1.687/3.374/0.649/5.130 mr (757,5): 1.105/2.471/0.218/1.613 manager (830,2): 1.087/1.537/0.285/2.496 hedge (14848,3): 1.063/1.842/0.366/3.228 business (354,2): 0.990/1.400/0.436/3.313 filed (3883,2): 0.949/1.342/0.391/3.739 now (169,6): 0.903/2.213/0.182/1.043 icahn (83210,1): 0.883/0.883/0.422/8.258 sold (614,2): 0.845/1.194/0.408/3.373 august (149,3): 0.828/1.434/0.113/0.856 investors (5976,2): 0.825/1.167/0.529/4.703 founder (1298,1): 0.820/0.820/0.362/2.904 bank (640,1): 0.767/0.767/0.369/3.361 chief (508,2): 0.737/1.043/0.161/1.342 acquisition (4356,1): 0.721/0.721/0.484/4.211 executive (876,1): 0.716/0.716/0.329/2.594 fund (2014,3): 0.713/1.235/0.449/3.955 president (211,1): 0.695/0.695/0.237/2.133 stock (1878,2): 0.695/0.983/0.396/3.792 directors (2573,1): 0.651/0.651/0.343/2.746 director (302,1): 0.639/0.639/0.228/1.749 glaxo (101914,1): 0.636/0.636/0.394/7.630 former (185,1): 0.622/0.622/0.157/1.254 private (597,1): 0.614/0.614/0.348/2.993 atlanta (2271,2): 0.609/0.861/0.177/1.510 announced (568,1): 0.608/0.608/0.321/2.392 university (61,2): 0.595/0.841/0.050/0.472 marketing (2693,1): 0.567/0.567/0.408/3.683 16 | Most similar words in vocab: 17 | company: 0.724/5.842 companies: 0.649/5.130 corporation: 0.595/5.294 interpublic: 0.588/7.334 subsidiaries: 0.587/4.875 unitedhealth: 0.581/8.112 corp: 0.572/5.563 company's: 0.564/4.652 shareholder: 0.561/4.272 inc: 0.557/5.278 ceo: 0.555/5.131 investments: 0.555/4.823 shareholders: 0.550/4.495 subsidiary: 0.548/5.308 ameriprise: 0.547/7.710 acquired: 0.544/4.705 firm: 0.535/4.619 wellpoint: 0.535/8.259 holdings: 0.531/5.067 venture: 0.530/4.646 investment: 0.529/4.560 investors: 0.529/4.703 acquisitions: 0.528/4.164 smithkline: 0.526/7.536 cendant: 0.524/7.746 vornado: 0.521/7.003 purchased: 0.519/4.289 investor: 0.517/4.443 buyout: 0.515/3.424 sungard: 0.515/7.334 18 | 19 | Topic 7 (3.00): 10.5% 20 | Most relevant words: 21 | patients (2468,9): 1.261/3.783/0.764/6.903 treatment (1460,5): 0.813/1.818/0.692/6.245 hospitals (4777,5): 0.731/1.635/0.492/4.728 treat (5814,3): 0.515/0.893/0.607/5.164 patient (3777,2): 0.506/0.716/0.640/5.660 health (502,3): 0.470/0.814/0.503/4.247 care (1069,2): 0.448/0.633/0.554/4.723 doctors (4958,2): 0.446/0.631/0.495/4.685 treatments (9357,2): 0.426/0.602/0.677/6.167 use (176,7): 0.368/0.973/0.272/1.894 therapies (21018,2): 0.366/0.517/0.580/4.815 dr (570,5): 0.353/0.789/0.170/1.345 cancer (1871,2): 0.348/0.493/0.514/4.970 inpatient (33848,1): 0.338/0.338/0.465/5.812 turing (18313,9): 0.332/0.997/0.029/0.218 medicine (1528,2): 0.330/0.466/0.441/4.098 hospital (723,1): 0.324/0.324/0.394/3.612 sinai (13035,3): 0.317/0.549/0.163/1.318 aids 
(4693,2): 0.309/0.437/0.470/4.507 certain (1095,4): 0.303/0.607/0.288/1.983 medically (28191,1): 0.302/0.302/0.421/6.117 treating (10124,1): 0.296/0.296/0.613/4.943 drugs (3528,10): 0.293/0.926/0.632/6.074 threatening (7927,2): 0.278/0.393/0.248/2.211 medical (679,1): 0.277/0.277/0.549/4.698 serious (2375,2): 0.268/0.379/0.276/2.461 used (82,4): 0.266/0.531/0.118/0.894 neglected (12251,2): 0.264/0.374/0.193/1.493 effects (1380,2): 0.262/0.370/0.327/2.991 antibiotic (24098,2): 0.261/0.369/0.505/3.650 22 | Most similar words in vocab: 23 | patients: 0.764/6.903 medication: 0.705/6.118 medications: 0.693/5.863 treatment: 0.692/6.245 treatments: 0.677/6.167 antiretroviral: 0.646/7.345 patient: 0.640/5.660 drugs: 0.632/6.074 prophylactic: 0.625/7.142 ivig: 0.619/7.731 contraindicated: 0.614/6.435 treating: 0.613/4.943 regimens: 0.610/7.113 antiepileptic: 0.608/7.483 treat: 0.607/5.164 diabetes: 0.597/5.192 therapeutic: 0.593/5.167 methadone: 0.593/7.109 drug: 0.590/5.897 clinical: 0.584/5.297 chemotherapy: 0.583/4.497 antipsychotic: 0.583/6.538 therapies: 0.580/4.815 medicines: 0.579/5.023 treatable: 0.578/6.294 diarrheal: 0.577/7.264 prophylaxis: 0.576/6.743 prescription: 0.572/4.964 therapy: 0.571/5.616 chronic: 0.570/5.024 24 | 25 | Topic 18 (3.00): 9.9% 26 | Most relevant words: 27 | price (1535,17): 4.027/16.602/0.962/8.408 prices (4345,4): 1.786/3.572/0.821/8.028 gouge (81038,1): 0.840/0.840/0.227/4.774 priced (14909,1): 0.715/0.715/0.549/4.614 high (104,3): 0.479/0.830/0.165/1.473 sharply (12378,2): 0.389/0.550/0.248/2.118 year (54,6): 0.322/0.789/0.058/0.421 stock (1878,2): 0.315/0.445/0.357/3.424 rose (1336,1): 0.313/0.313/0.180/1.313 sales (1485,3): 0.271/0.469/0.392/3.777 mr (757,5): 0.267/0.597/0.083/0.617 mount (1283,3): 0.262/0.453/0.009/0.096 old (204,5): 0.260/0.582/-0.037/-0.259 according (331,4): 0.253/0.507/0.083/0.495 turing (18313,9): 0.253/0.758/-0.008/-0.061 low (612,1): 0.246/0.246/0.303/2.538 august (149,3): 0.241/0.418/0.006/0.048 made (94,3): 0.237/0.410/0.072/0.516 last (237,2): 0.227/0.322/-0.035/-0.198 jumped (11157,2): 0.223/0.315/0.118/0.849 scott (1062,1): 0.223/0.223/0.102/0.653 piggy (34457,1): 0.221/0.221/0.147/2.015 cost (1178,2): 0.215/0.305/0.480/3.794 martin (778,1): 0.214/0.214/0.048/0.307 swindle (42243,1): 0.213/0.213/0.193/2.690 long (195,3): 0.213/0.369/0.024/0.201 spencer (4173,1): 0.213/0.213/0.117/0.880 raised (1286,4): 0.209/0.418/0.163/1.449 standard (712,2): 0.208/0.294/0.112/0.937 shortages (16485,1): 0.204/0.204/0.268/2.213 28 | Most similar words in vocab: 29 | price: 0.962/8.408 prices: 0.821/8.028 pricing: 0.602/5.262 priced: 0.549/4.614 inflation: 0.535/4.948 commodity: 0.497/4.148 discount: 0.490/4.234 exorbitant: 0.489/5.836 cost: 0.480/3.794 discounted: 0.478/3.183 resale: 0.467/5.345 purchases: 0.464/3.577 costs: 0.463/3.895 rates: 0.439/4.075 purchasing: 0.438/3.479 buyers: 0.435/3.705 commodities: 0.431/3.727 buying: 0.429/3.555 undervalued: 0.426/4.792 demand: 0.426/3.511 premium: 0.424/4.245 skyrocketed: 0.423/4.790 asset's: 0.420/5.837 repayments: 0.419/4.495 market: 0.418/3.411 tariff: 0.416/3.359 volatility: 0.415/5.309 valuations: 0.415/4.747 cents: 0.414/3.823 buy: 0.414/3.396 30 | -------------------------------------------------------------------------------- /TopicCloud.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from random import Random 3 | from os import path 4 | from wordcloud.wordcloud import WordCloud, IntegralOccupancyMap 5 | from operator import 
itemgetter 6 | import numpy as np 7 | import pdb 8 | import colorsys 9 | from nltk.stem.wordnet import WordNetLemmatizer 10 | from morpha import lemmatize 11 | import re 12 | 13 | from PIL import Image 14 | from PIL import ImageColor 15 | from PIL import ImageDraw 16 | from PIL import ImageFont 17 | 18 | def str2dict(s): 19 | wordlist = re.split( "\s+", s ) 20 | worddict = {} 21 | for w in wordlist: 22 | worddict[w] = 1 23 | return worddict 24 | 25 | #lmtzr = WordNetLemmatizer() 26 | random_state = Random() 27 | img_padding = 15 28 | 29 | specialNounsStr = "embeddings" 30 | specialVerbsStr = "" 31 | specialNounDict = str2dict(specialNounsStr) 32 | specialVerbDict = str2dict(specialVerbsStr) 33 | originalStr = "embedding turing sinai saudi data" 34 | originalDict = str2dict(originalStr) 35 | 36 | def clockwise(start_angle, stop_angle): 37 | start_angle = start_angle % 360 38 | stop_angle = stop_angle % 360 39 | # clockwise (90 degree at bottom, as the custom of pillow), start is the first and stop is the second 40 | # so start_angle < stop_angle 41 | if stop_angle < start_angle: 42 | if start_angle - stop_angle < 180: 43 | start_angle, stop_angle = stop_angle, start_angle 44 | else: 45 | stop_angle += 360 46 | 47 | return start_angle, stop_angle 48 | 49 | def genSectorMask( width, height, start_angle, stop_angle ): 50 | 51 | start_angle, stop_angle = clockwise(start_angle, stop_angle) 52 | sector_mask = np.ones( (height, width) ) 53 | origin_x = width / 2 54 | origin_y = height / 2 55 | sin1 = np.sin( start_angle * np.pi / 180 ) 56 | cos1 = np.cos( start_angle * np.pi / 180 ) 57 | sin2 = np.sin( stop_angle * np.pi / 180 ) 58 | cos2 = np.cos( stop_angle * np.pi / 180 ) 59 | reservedCenterRadius = 5 60 | maxRadius = min(width, height) * 0.5 - img_padding 61 | for y in xrange(height): 62 | for x in xrange(width): 63 | x2 = x - origin_x 64 | y2 = (height - y) - origin_y 65 | radius = np.sqrt(x2*x2 + y2*y2) 66 | if radius >= reservedCenterRadius and radius <= maxRadius and sin1 * x2 <= -cos1 * y2 and sin2 * x2 >= -cos2 * y2: 67 | sector_mask[y,x] = 0 68 | 69 | return sector_mask 70 | 71 | def d3_category20_rand(topicID): 72 | d3_category20 = [ # "#aec7e8", "#ffbb78", "#98df8a", 73 | # "#d62728", too striking red, "#ff7f0e", orange is alerting; 74 | # "#bcbd22", ugly; "#e377c2", striking 75 | "#2ca02c", "#9467bd", "#1f77b4", "#ff9896", 76 | "#17becf", "#7f7f7f", "#8c564b", "#c49c94" ] 77 | # ""#c5b0d5", "#c49c94", 78 | # "#f7b6d2", "#c7c7c7", "#dbdb8d", "#9edae5" 79 | colorID = topicID % len(d3_category20) 80 | basecolor = d3_category20[colorID] 81 | r, g, b = ImageColor.getrgb(basecolor) 82 | fluc = 60 83 | r += random_state.randint( 0, fluc ) - fluc/2 84 | g += random_state.randint( 0, fluc ) - fluc/2 85 | b += random_state.randint( 0, fluc ) - fluc/2 86 | r = min( max(r, 0), 255 ) 87 | g = min( max(g, 0), 255 ) 88 | b = min( max(b, 0), 255 ) 89 | return "rgb(%d, %d, %d)" %(r, g, b) 90 | 91 | def lemmatize2(word): 92 | if word in originalDict: 93 | return word 94 | 95 | candidatePOSs = ('n', 'v') 96 | 97 | if word in specialNounDict: 98 | candidatePOSs = [ 'n' ] 99 | if word in specialVerbDict: 100 | candidatePOSs = [ 'v' ] 101 | 102 | for pos in candidatePOSs: 103 | #w2 = lmtzr.lemmatize(word, pos) 104 | w2 = lemmatize(word, pos) 105 | if w2 != word: 106 | return w2 107 | return word 108 | 109 | class TopicCloud(WordCloud): 110 | def __init__(self, min_sector_padding=0, max_topic_num=10, max_sector_angle=150, max_topic_prop_ratio=6, 111 | min_sector_angle=20, max_topic_words=10, 
min_word_topic_prop=0.5, **kwargs): 112 | super(TopicCloud, self).__init__(**kwargs) 113 | self.min_sector_padding = min_sector_padding 114 | self.max_topic_num = max_topic_num 115 | self.max_sector_angle = max_sector_angle 116 | self.min_sector_angle = min_sector_angle 117 | self.max_topic_prop_ratio = max_topic_prop_ratio 118 | self.max_topic_words = max_topic_words 119 | self.min_word_topic_prop = min_word_topic_prop 120 | self.margin = 4 121 | self.font_path = "C:/Windows/fonts/impact.ttf" 122 | self.background_color = "white" 123 | self.prefer_horizontal = 1 124 | 125 | def generate_from_topics(self, topics): 126 | """Create a topic_cloud from topics. 127 | 128 | Parameters 129 | ---------- 130 | topics : array of tuples 131 | Each topic: (proportion in the document, [ (word1, freq1), (word2, freq2), ... ] ) 132 | 133 | Returns 134 | ------- 135 | self 136 | 137 | """ 138 | 139 | # lemmatizing 140 | for topic in topics: 141 | words_freq = topic[1] 142 | words_freq2 = [] 143 | word2idx = {} 144 | idx = 0 145 | for word, freq in words_freq: 146 | word2 = lemmatize2(word) 147 | if word2 in word2idx: 148 | wid = word2idx[word2] 149 | words_freq2[wid][1] += freq 150 | else: 151 | words_freq2.append( [word2, freq] ) 152 | word2idx[word2] = idx 153 | idx += 1 154 | 155 | words_freq2 = sorted(words_freq2, key=itemgetter(1), reverse=True) 156 | for i in xrange( len(words_freq2)-1, -1, -1 ): 157 | if words_freq2[i][1] >= self.min_word_topic_prop: 158 | break 159 | words_freq2 = words_freq2[:i+1] 160 | 161 | topic[1] = words_freq2[:self.max_topic_words] 162 | # topic_mass = sum( [ len(w) for (w,f) in topic[1] ] ) 163 | # topic_masses.append(topic_mass) 164 | #topic[0] *= topic[1][0][1] * sum( [ word_freq[1] for word_freq in topic[1] ] ) 165 | 166 | # make sure topics are sorted and normalized 167 | topics = sorted( topics, key=itemgetter(0), reverse=True ) 168 | if len(topics) > self.max_topic_num: 169 | topics = topics[:self.max_topic_num] 170 | min_topic_prop = topics[0][0] / self.max_topic_prop_ratio 171 | for i in xrange( len(topics)-1, 0, -1 ): 172 | if topics[i][0] >= min_topic_prop: 173 | break 174 | topics = topics[:i+1] 175 | T = len(topics) 176 | 177 | #topic_masses = [] 178 | topic_masses = np.ones(T) 179 | 180 | # sqrt for smoothing 181 | total_props = sum( [ np.power(topics[i][0] * topic_masses[i],0.8) for i in xrange(len(topics)) ] ) 182 | for i in xrange(len(topics)): 183 | topics[i][0] = np.power(topics[i][0] * topic_masses[i],0.8) / total_props 184 | 185 | avail_angles = 360 - T * self.min_sector_padding 186 | max_angle = avail_angles * topics[0][0] 187 | angle_scale = 1 188 | if max_angle > self.max_sector_angle: 189 | angle_scale = self.max_sector_angle / max_angle 190 | topic_angles = [] 191 | for topic in topics: 192 | topic_angles.append( avail_angles * topic[0] * angle_scale ) 193 | sector_padding = ( 360 - sum(topic_angles) ) / T 194 | topic_angles = np.array(topic_angles) 195 | 196 | height, width = self.height, self.width 197 | # create image 198 | img_grey = Image.new("L", (width, height)) 199 | draw = ImageDraw.Draw(img_grey) 200 | img_array = np.asarray(img_grey) 201 | total_freqs, font_sizes, positions, orientations, colors = [], [], [], [], [] 202 | 203 | if self.random_state is not None: 204 | random_state = self.random_state 205 | else: 206 | random_state = Random() 207 | 208 | sector_masks = [] 209 | sector_angles = [] 210 | 211 | for i,topic in enumerate(topics): 212 | width = self.width 213 | height = self.height 214 | last_freq = 1. 
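                # seed font size for this topic: max_font_size scaled by the square root of
                # the topic's top word weight relative to the top topic's top word, capped at 2x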
215 | font_size = self.max_font_size * min( np.sqrt(topic[1][0][1] / topics[0][1][0][1]), 2 ) 216 | 217 | if i == 0: 218 | # initial angle starts from the symmetric left side of the y-axis 219 | # to ensure first sector always at right above of the canvas 220 | start_angle = 270 - topic_angles[0]/2 221 | stop_angle = 270 + topic_angles[0]/2 222 | else: 223 | start_angle = stop_angle + sector_padding 224 | stop_angle += sector_padding + topic_angles[i] 225 | 226 | # reverse sign to conform with pillow's measurement of angles 227 | sector_angles.append( clockwise(start_angle, stop_angle) ) 228 | #print "%.1f - %.1f =>" %( start_angle % 360, stop_angle % 360), 229 | #print "%.1f - %.1f" %( clockwise(start_angle, stop_angle) ) 230 | 231 | sector_mask = genSectorMask( width, height, start_angle, stop_angle ) 232 | sector_masks.append(sector_mask) 233 | occupancy = IntegralOccupancyMap(height, width, sector_mask) 234 | 235 | frequencies = topic[1][:self.max_words] 236 | frequencies = sorted( frequencies, key=itemgetter(1), reverse=True ) 237 | 238 | # largest entry will be 1 239 | max_frequency = float(frequencies[0][1]) 240 | 241 | frequencies = [ (word, freq / max_frequency) for word, freq in frequencies ] 242 | 243 | if len(frequencies) == 0: 244 | print("We need at least 1 word to plot a word cloud, got 0.") 245 | continue 246 | 247 | total_freqs += frequencies 248 | drawn_words = [] 249 | 250 | # start drawing grey image 251 | for word, freq in frequencies: 252 | # select the font size 253 | rs = self.relative_scaling 254 | if rs != 0: 255 | font_size = int(round((rs * (freq / float(last_freq)) + (1 - rs)) * font_size)) 256 | while True: 257 | # try to find a position 258 | font = ImageFont.truetype(self.font_path, font_size) 259 | # transpose font optionally 260 | if random_state.random() < self.prefer_horizontal: 261 | orientation = None 262 | else: 263 | orientation = Image.ROTATE_90 264 | transposed_font = ImageFont.TransposedFont(font, 265 | orientation=orientation) 266 | # get size of resulting text 267 | box_size = draw.textsize(word, font=transposed_font) 268 | # find possible places using integral image: 269 | result = occupancy.sample_position(box_size[1] + 2 * self.margin, 270 | box_size[0] + 2 * self.margin, 271 | random_state) 272 | if result is not None or font_size == 0: 273 | break 274 | # if we didn't find a place, make font smaller 275 | font_size -= self.font_step 276 | 277 | if font_size < self.min_font_size: 278 | # we were unable to draw any more 279 | font_size = self.min_font_size 280 | drawn_words.append(word) 281 | 282 | x, y = np.array(result) + self.margin // 2 283 | # actually draw the text 284 | draw.text((y, x), word, fill="white", font=transposed_font) 285 | positions.append((x, y)) 286 | orientations.append(orientation) 287 | font_sizes.append(font_size) 288 | colors.append(d3_category20_rand(i)) 289 | 290 | # recompute integral image 291 | img_array = ( np.asarray(img_grey) + sector_mask ) > 0 292 | # recompute bottom right 293 | # the order of the cumsum's is important for speed ?! 
294 |                 occupancy.update(img_array, x, y)
295 |                 last_freq = freq
296 | 
297 |             print "Topic %d (%.1f):" %(i+1, topic_angles[i])
298 |             print drawn_words
299 | 
300 |         # for i in xrange(len(sector_masks)):
301 |         #     for j in xrange(i):
302 |         #         if np.any( (1-sector_masks[i]) * (1-sector_masks[j]) ):
303 |         #             pdb.set_trace()
304 | 
305 |         self.layout_ = list(zip(total_freqs, font_sizes, positions, orientations, colors))
306 |         self.sector_angles = sector_angles
307 |         return self
308 | 
309 |     def to_image(self):
310 |         self._check_generated()
311 |         height, width = self.height, self.width
312 | 
313 |         img = Image.new(self.mode, (int(width * self.scale), int(height * self.scale)),
314 |                         self.background_color)
315 | 
316 |         draw = ImageDraw.Draw(img)
317 |         bbox = (img_padding, img_padding, width-img_padding, height-img_padding)  # use width for the x-extent; passing height twice only worked for square canvases
318 | 
319 |         colors = [ "rgb(255,255,242)", "rgb(255,242,255)", "rgb(242,255,255)", "rgb(242,242,242)" ]
320 |         i = 0
321 |         if len(self.sector_angles) % len(colors) == 1:
322 |             modulus = len(colors) - 1
323 |         else:
324 |             modulus = len(colors)
325 | 
326 |         for (start_angle, stop_angle) in self.sector_angles:
327 |             draw.pieslice(bbox, start_angle, stop_angle, fill = colors[i%modulus])
328 |             i += 1
329 |             #print "%d-%d: %s" %(start_angle, stop_angle, colors[i%3])
330 | 
331 |         for (word, count), font_size, position, orientation, color in self.layout_:
332 |             font = ImageFont.truetype(self.font_path, int(font_size * self.scale))
333 |             transposed_font = ImageFont.TransposedFont(font,
334 |                                                        orientation=orientation)
335 |             pos = (int(position[1] * self.scale), int(position[0] * self.scale))
336 |             draw.text(pos, word, fill=color, font=transposed_font)
337 |         return img
--------------------------------------------------------------------------------
/gencloud.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | 
3 | # an example of using TopicCloud
4 | 
5 | from os import path
6 | from wordcloud import WordCloud
7 | import sys
8 | #sys.path.append("C:/Dropbox/topicvec/visualization/")
9 | from TopicCloud import TopicCloud
10 | 
11 | topics_drugstory = [
12 |     [ 26.7, [ ('drugs', 2.028), ('drug', 1.722), ('use', 1.555),
13 |               ('prescriptions', 1.314), ('generic', 1.307), ('patients', 1.257),
14 |               ('hospitals', 1.163), ('turing', 1.130), ('treatment', 1.061),
15 |               ('used', 1.037), ('pharmaceutical', 0.919), ('pharmaceuticals', 0.908),
16 |               ('health', 0.898), ('medicine', 0.891) ] ],
17 |     [ 21.9, [ ('increase', 2.380), ('million', 2.331), ('increases', 1.378),
18 |               ('raised', 1.303), ('dollars', 1.262), ('year', 1.221),
19 |               ('percent', 1.163), ('sales', 1.151), ('huge', 1.066),
20 |               ('money', 1.013), ('pay', 0.988), ('cost', 0.980),
21 |               ('millions', 0.882), ('hundreds', 0.851) ] ],
22 |     [ 16.4, [ ('company', 2.758), ('acquired', 2.038), ('companies', 1.687),
23 |               ('mr', 1.105), ('manager', 1.087), ('hedge', 1.063),
24 |               ('business', 0.990), ('filed', 0.949), ('now', 0.903),
25 |               ('icahn', 0.883), ('sold', 0.845), ('august', 0.828),
26 |               ('investors', 0.825), ('founder', 0.820) ] ],
27 |     [ 10.5, [ ('patients', 1.261), ('treatment', 0.813), ('hospitals', 0.731),
28 |               ('treat', 0.515), ('patient', 0.506), ('health', 0.470),
29 |               ('care', 0.448), ('doctors', 0.446), ('treatments', 0.426),
30 |               ('use', 0.368), ('therapies', 0.366), ('dr', 0.353),
31 |               ('cancer', 0.348), ('inpatient', 0.338) ] ],
32 |     [ 9.9, [ ('price', 4.027), ('prices', 1.786), ('gouge', 0.840),
33 |              ('priced', 0.715), ('high', 0.479), ('sharply', 0.389),
34 |              ('year', 0.322),
('stock', 0.315), ('rose', 0.313), 35 | ('sales', 0.271), ('mr', 0.267), ('mount', 0.262), 36 | ('according', 0.253), ('low', 0.246) ] ] 37 | ] 38 | 39 | topics_drugstory_8 = [ 40 | [ 27.1, [ ('turing', 3.00), ('year', 2.35), ('use', 2.31), 41 | ('old', 2.15), ('mr', 2.14), ('generic', 1.99), 42 | ('dr', 1.96), ('used', 1.89), ('now', 1.79), 43 | ('first', 1.69), ('two', 1.59), ('called', 1.58), 44 | ('standard', 1.38), ('manager', 1.38) ] ], 45 | [ 22.8, [ ('drug', 3.52), ('drugs', 2.94), ('patients', 2.85), 46 | ('diseases', 2.39), ('hospitals', 2.13), ('toxoplasmosis', 2.00), 47 | ('infectious', 1.96), ('treatment', 1.93), ('certain', 1.77), 48 | ('treat', 1.61), ('controlled', 1.43), ('effects', 1.39), 49 | ('infection', 1.37), ('dangerous', 1.37) ] ], 50 | [ 22.8, [ ('raised', 1.99), ('mount', 1.72), ('sinai', 1.71), 51 | ('center', 1.67), ('hedge', 1.66), ('fund', 1.64), 52 | ('high', 1.54), ('lawmakers', 1.40), ('marathon', 1.35), 53 | ('atlanta', 1.35), ('investors', 1.32), ('sharply', 1.21), 54 | ('according', 1.16), ('jumped', 0.97) ] ], 55 | [ 22.1, [ ('price', 4.08), ('increase', 2.73), ('increases', 1.96), 56 | ('prices', 1.78), ('distribution', 1.64), ('better', 1.62), 57 | ('sales', 1.52), ('prescriptions', 1.47), ('make', 1.45), 58 | ('million', 1.40), ('huge', 1.34), ('made', 1.27), 59 | ('pay', 1.22), ('cost', 1.19) ] ], 60 | [ 5.2, [ ('acquired', 2.22), ('company', 2.00), ('pharmaceuticals', 1.65), 61 | ('companies', 1.52), ('pharmaceutical', 1.17), ('therapeutics', 0.97), 62 | ('glaxo', 0.95), ('drugstores', 0.94), ('acquisition', 0.88), 63 | ('glaxosmithkline', 0.82), ('sold', 0.72), ('products', 0.59), 64 | ('laboratories', 0.39), ('develop', 0.18) ] ] 65 | ] 66 | 67 | topics_drugstory_kmeans = [ 68 | [ 10.0, [ ('company', 2.83), ('year', 2.45), ('dr', 2.24), 69 | ('acquired', 2.24), ('raised', 2.00), ('first', 1.73), 70 | ('august', 1.73), ('sinai', 1.73), ('center', 1.73), 71 | ('mount', 1.73), ('time', 1.41), ('university', 1.41), 72 | ('life', 1.41), ('same', 1.41) ] ], 73 | [ 9.0, [ ('now', 2.45), ('mr', 2.24), ('old', 2.24), 74 | ('called', 1.73), ('even', 1.73), ('make', 1.73), 75 | ('better', 1.73), ('think', 1.41), ('like', 1.41), 76 | ('side', 1.41), ('jumped', 1.41), ('ago', 1.41), 77 | ('threatening', 1.41), ('trying', 1.41) ] ], 78 | [ 8.0, [ ('federal', 1.41), ('lawmakers', 1.41), ('filed', 1.41), 79 | ('chief', 1.41), ('claim', 1.00), ('state', 1.00), 80 | ('united', 1.00), ('states', 1.00), ('general', 1.00), 81 | ('senator', 1.00), ('district', 1.00), ('public', 1.00), 82 | ('former', 1.00), ('president', 1.00) ] ], 83 | [ 5.0, [ ('price', 4.12), ('million', 2.45), ('prices', 2.00), 84 | ('companies', 2.00), ('hedge', 1.73), ('sales', 1.73), 85 | ('fund', 1.73), ('dollars', 1.41), ('business', 1.41), 86 | ('sold', 1.41), ('money', 1.41), ('cost', 1.41), 87 | ('pay', 1.41), ('stock', 1.41) ] ], 88 | [ 5.0, [ ('turing', 3.00), ('use', 2.65), ('used', 2.00), 89 | ('generic', 2.00), ('controlled', 1.73), ('tablet', 1.41), 90 | ('standard', 1.41), ('strategy', 1.41), ('system', 1.00), 91 | ('patents', 1.00), ('example', 1.00), ('data', 1.00), 92 | ('switch', 1.00), ('systems', 1.00) ] ], 93 | [ 5.0, [ ('made', 1.73), ('long', 1.73), ('huge', 1.41), 94 | ('attention', 1.41), ('mainstays', 1.00), ('criticism', 1.00), 95 | ('led', 1.00), ('further', 1.00), ('making', 1.00), 96 | ('despite', 1.00), ('controversy', 1.00), ('drew', 1.00), 97 | ('previous', 1.00), ('overnight', 1.00) ] ], 98 | [ 4.0, [ ('drug', 3.87), ('drugs', 3.16), ('patients', 3.00), 99 | 
        ('treatment', 2.24), ('hospitals', 2.24), ('health', 1.73),
        ('pharmaceuticals', 1.73), ('prescriptions', 1.73), ('therapies', 1.41),
        ('pharmaceutical', 1.41), ('doctors', 1.41), ('care', 1.41),
        ('treatments', 1.41), ('medicine', 1.41) ] ],
    [ 4.0, [ ('increase', 2.83), ('increases', 2.00), ('certain', 2.00),
        ('serious', 1.41), ('less', 1.41), ('required', 1.41),
        ('need', 1.41), ('effects', 1.41), ('dangerous', 1.41),
        ('discourage', 1.00), ('shortages', 1.00), ('rarely', 1.00),
        ('potentially', 1.00), ('possibly', 1.00) ] ],
    [ 4.0, [ ('according', 2.00), ('two', 1.73), ('high', 1.73),
        ('distribution', 1.73), ('sharply', 1.41), ('many', 1.41),
        ('hundreds', 1.41), ('percent', 1.41), ('number', 1.00),
        ('list', 1.00), ('include', 1.00), ('differently', 1.00),
        ('copies', 1.00), ('added', 1.00) ] ]
]

topics_sigir = [
    [ 15.6, [ ('user', 0.904), ('web', 0.621), ('document', 0.589),
        ('query', 0.435), ('cursor', 0.429), ('queries', 0.421),
        ('retrieval', 0.407), ('text', 0.404), ('information', 0.399),
        ('online', 0.397), ('knowledge', 0.375), ('interface', 0.363),
        ('click', 0.342), ('collaborative', 0.319) ] ],
    [ 13.8, [ ('user', 0.482), ('document', 0.439), ('web', 0.403),
        ('knowledge', 0.386), ('entity', 0.379), ('retrieval', 0.376),
        ('information', 0.342), ('collaborative', 0.312), ('queries', 0.308),
        ('leveraging', 0.306), ('query', 0.302), ('text', 0.291),
        ('online', 0.285), ('relevance', 0.285) ] ],
    [ 13.4, [ ('knowledge', 0.379), ('retrieval', 0.369), ('user', 0.345),
        ('document', 0.326), ('web', 0.313), ('information', 0.302),
        ('leveraging', 0.294), ('collaborative', 0.290), ('relevance', 0.290),
        ('queries', 0.261), ('query', 0.253), ('summarization', 0.247),
        ('models', 0.242), ('evaluation', 0.239) ] ],
    [ 12.9, [ ('fast', 0.459), ('neural', 0.318), ('efficient', 0.316),
        ('models', 0.310), ('retrieval', 0.304), ('knowledge', 0.248),
        ('model', 0.237), ('user', 0.233), ('modeling', 0.227),
        ('relevance', 0.225), ('leveraging', 0.224), ('random', 0.219),
        ('networks', 0.218), ('selection', 0.217) ] ],
    [ 12.8, [ ('embeddings', 1.047), ('graphs', 0.438), ('subspace', 0.366),
        ('hamming', 0.363), ('random', 0.351), ('quantization', 0.312),
        ('factorization', 0.297), ('graph', 0.296), ('discrete', 0.291),
        ('parameterized', 0.290), ('generalized', 0.285), ('math', 0.282),
        ('neural', 0.275), ('models', 0.269) ] ],
    [ 12.3, [ ('retrieval', 0.346), ('neural', 0.279), ('knowledge', 0.274),
        ('models', 0.272), ('user', 0.250), ('document', 0.245),
        ('relevance', 0.240), ('information', 0.238), ('queries', 0.236),
        ('web', 0.235), ('modeling', 0.232), ('query', 0.232),
        ('summarization', 0.224), ('efficient', 0.222) ] ],
    [ 9.4, [ ('retrieval', 0.267), ('user', 0.248), ('web', 0.215),
        ('knowledge', 0.213), ('efficient', 0.203), ('queries', 0.196),
        ('query', 0.193), ('neural', 0.193), ('information', 0.192),
        ('leveraging', 0.180), ('document', 0.180), ('models', 0.175),
        ('relevance', 0.170), ('collaborative', 0.167) ] ],
    [ 7.2, [ ('search', 3.558), ('searching', 0.675), ('finding', 0.278),
        ('investigation', 0.203), ('retrieving', 0.190), ('exploration', 0.171),
        ('click', 0.146), ('web', 0.146), ('exploring', 0.124),
        ('online', 0.113), ('answers', 0.107), ('query', 0.096),
        ('cache', 0.095), ('knowledge', 0.092) ] ]
]

topics_icml = [
    [ 14.4, [ ('models', 1.037), ('neural', 1.033), ('data', 0.966),
        ('optimization', 0.912), ('efficient', 0.875), ('model', 0.683),
        ('inference', 0.675), ('analysis', 0.673), ('sampling', 0.601),
        ('bayesian', 0.594), ('stochastic', 0.572), ('clustering', 0.549),
        ('estimation', 0.546), ('structured', 0.521) ] ],
    [ 14.4, [ ('neural', 1.169), ('models', 1.063), ('data', 0.990),
        ('efficient', 0.942), ('optimization', 0.882), ('model', 0.692),
        ('inference', 0.656), ('analysis', 0.648), ('memory', 0.625),
        ('sampling', 0.563), ('bayesian', 0.559), ('structured', 0.549),
        ('clustering', 0.540), ('stochastic', 0.527) ] ],
    [ 13.9, [ ('optimization', 0.914), ('models', 0.891), ('neural', 0.853),
        ('data', 0.814), ('efficient', 0.707), ('stochastic', 0.668),
        ('inference', 0.662), ('analysis', 0.656), ('sampling', 0.623),
        ('bayesian', 0.615), ('estimation', 0.610), ('model', 0.602),
        ('clustering', 0.525), ('sparse', 0.517) ] ],
    [ 12.9, [ ('convex', 1.166), ('embeddings', 1.128), ('matrix', 1.086),
        ('tensor', 0.965), ('factorization', 0.825), ('riemannian', 0.770),
        ('gaussian', 0.762), ('linear', 0.675), ('dimensional', 0.672),
        ('matrices', 0.665), ('subspace', 0.621), ('nonconvex', 0.582),
        ('kernel', 0.581), ('gradient', 0.580) ] ],
    [ 12.3, [ ('stochastic', 0.772), ('optimization', 0.740), ('estimation', 0.577),
        ('models', 0.558), ('regression', 0.534), ('sampling', 0.521),
        ('neural', 0.518), ('sparse', 0.507), ('analysis', 0.493),
        ('linear', 0.493), ('data', 0.488), ('gaussian', 0.488),
        ('matrix', 0.485), ('inference', 0.482) ] ],
    [ 12.2, [ ('matrix', 0.817), ('convex', 0.702), ('gaussian', 0.692),
        ('stochastic', 0.652), ('linear', 0.611), ('tensor', 0.580),
        ('factorization', 0.547), ('gradient', 0.539), ('sparse', 0.535),
        ('embeddings', 0.534), ('optimization', 0.526), ('variational', 0.523),
        ('kernel', 0.488), ('dimensional', 0.460) ] ],
    [ 11.9, [ ('stochastic', 0.740), ('optimization', 0.685), ('estimation', 0.534),
        ('matrix', 0.515), ('gaussian', 0.510), ('regression', 0.506),
        ('models', 0.503), ('linear', 0.498), ('sparse', 0.498),
        ('sampling', 0.479), ('neural', 0.467), ('variational', 0.453),
        ('analysis', 0.452), ('gradient', 0.446) ] ],
    [ 4.4, [ ('deep', 4.408), ('convolutional', 2.100), ('fast', 0.334),
        ('exploration', 0.297), ('inner', 0.227), ('memory', 0.192),
        ('reconstructive', 0.188), ('squeezing', 0.187), ('rectifier', 0.184),
        ('streams', 0.175), ('faster', 0.167), ('neural', 0.163),
        ('layers', 0.157), ('hidden', 0.154) ] ]
]

topics_icml_3 = [
    [ 14.2, [ ('convex', 1.84), ('embeddings', 1.81), ('rank', 1.66),
        ('tensor', 1.58), ('matrix', 1.55), ('riemannian', 1.26),
        ('factorization', 1.25), ('matrices', 1.01), ('dimensional', 1.01),
        ('subspace', 0.96), ('gaussian', 0.89), ('nonconvex', 0.87),
        ('doubly', 0.82), ('metric', 0.78) ] ],
    [ 13.1, [ ('models', 0.85), ('optimization', 0.85), ('data', 0.78),
        ('efficient', 0.73), ('neural', 0.71), ('stochastic', 0.63),
        ('analysis', 0.60), ('inference', 0.59), ('model', 0.58),
        ('sampling', 0.56), ('estimation', 0.54), ('bayesian', 0.53),
        ('sparse', 0.51), ('clustering', 0.51) ] ],
    [ 13.0, [ ('optimization', 0.84), ('models', 0.82), ('data', 0.74),
        ('neural', 0.69), ('efficient', 0.68), ('stochastic', 0.65),
        ('analysis', 0.58), ('inference', 0.58), ('model', 0.56),
        ('sampling', 0.55), ('estimation', 0.55), ('bayesian', 0.53),
        ('sparse', 0.51), ('clustering', 0.50) ] ],
    [ 12.3, [ ('optimization', 0.78), ('stochastic', 0.70), ('models', 0.69),
        ('data', 0.61), ('neural', 0.57), ('efficient', 0.56),
        ('estimation', 0.55), ('analysis', 0.53), ('sampling', 0.53),
        ('inference', 0.52), ('bayesian', 0.50), ('regression', 0.50),
        ('sparse', 0.49), ('model', 0.48) ] ],
    [ 12.3, [ ('optimization', 0.78), ('models', 0.71), ('stochastic', 0.68),
        ('data', 0.63), ('neural', 0.59), ('efficient', 0.58),
        ('estimation', 0.55), ('analysis', 0.54), ('inference', 0.53),
        ('sampling', 0.53), ('bayesian', 0.50), ('model', 0.49),
        ('sparse', 0.49), ('regression', 0.49) ] ],
    [ 12.0, [ ('optimization', 0.75), ('stochastic', 0.69), ('models', 0.65),
        ('data', 0.58), ('neural', 0.55), ('estimation', 0.54),
        ('efficient', 0.53), ('analysis', 0.51), ('sampling', 0.51),
        ('inference', 0.50), ('regression', 0.49), ('sparse', 0.49),
        ('bayesian', 0.48), ('model', 0.46) ] ],
    [ 11.6, [ ('optimization', 0.72), ('stochastic', 0.69), ('models', 0.62),
        ('data', 0.54), ('estimation', 0.53), ('neural', 0.51),
        ('efficient', 0.49), ('sampling', 0.49), ('analysis', 0.49),
        ('regression', 0.48), ('inference', 0.48), ('sparse', 0.47),
        ('bayesian', 0.46), ('linear', 0.44) ] ],
    [ 7.8, [ ('deep', 4.22), ('convolutional', 2.50), ('neural', 0.97),
        ('memory', 0.62), ('fast', 0.61), ('faster', 0.38),
        ('brain', 0.33), ('reconstructive', 0.32), ('rectifier', 0.31),
        ('efficient', 0.27), ('data', 0.26), ('generative', 0.26),
        ('simple', 0.25), ('squeezing', 0.25) ] ]
]

topics_icml_5 = [
    [ 21.8, [ ('stochastic', 3.88), ('optimization', 3.82), ('rank', 3.21),
        ('estimation', 2.95), ('gradient', 2.71), ('monte', 2.38),
        ('gaussian', 2.37), ('variational', 2.35), ('carlo', 2.35),
        ('regression', 2.27), ('optimal', 2.27), ('approximate', 2.18),
        ('descent', 1.96), ('approximation', 1.91) ] ],
    [ 17.7, [ ('networks', 5.51), ('deep', 4.51), ('efficient', 3.88),
        ('fast', 3.20), ('bandits', 2.66), ('faster', 2.36),
        ('search', 2.24), ('online', 2.19), ('network', 2.14),
        ('bandit', 1.42), ('nystrom', 1.41), ('dueling', 1.37),
        ('simple', 1.36), ('anytime', 1.36) ] ],
    [ 17.3, [ ('inference', 3.05), ('reinforcement', 2.73), ('hierarchical', 2.20),
        ('generative', 2.13), ('data', 1.98), ('bayesian', 1.56),
        ('contextual', 1.56), ('clustering', 1.54), ('recurrent', 1.41),
        ('structured', 1.40), ('conditional', 1.40), ('graphical', 1.35),
        ('empirical', 1.31), ('analysis', 1.30) ] ],
    [ 14.8, [ ('matrix', 3.48), ('convex', 3.04), ('embeddings', 2.42),
        ('factorization', 2.38), ('kernel', 2.32), ('tensor', 2.21),
        ('doubly', 1.97), ('dimensional', 1.76), ('matrices', 1.75),
        ('riemannian', 1.66), ('nonconvex', 1.61), ('subspace', 1.61),
        ('decomposition', 1.59), ('dual', 1.43) ] ],
    [ 11.5, [ ('classification', 2.65), ('policy', 2.47), ('supervised', 2.14),
        ('evaluation', 2.09), ('training', 1.89), ('cca', 1.73),
        ('correcting', 1.65), ('testing', 1.47), ('unsupervised', 1.32),
        ('dropout', 1.21), ('pca', 1.21), ('analysis', 1.19),
        ('test', 1.17), ('objectives', 1.13) ] ],
    [ 8.9, [ ('sparse', 3.69), ('sampling', 3.03), ('low', 2.65),
        ('noisy', 1.40), ('high', 1.27), ('large', 1.14),
        ('heavy', 1.10), ('noise', 1.08), ('sample', 1.02),
        ('mixed', 1.00), ('mixture', 0.96), ('mixing', 0.94),
        ('variable', 0.92), ('samples', 0.89) ] ],
    [ 8.0, [ ('neural', 4.23), ('models', 4.22), ('convolutional', 2.72),
        ('memory', 2.21), ('model', 1.96), ('block', 1.05),
        ('data', 0.89), ('architectures', 0.74), ('rectifier', 0.68),
        ('brain', 0.63), ('motor', 0.58), ('activation', 0.57),
        ('unlabeled', 0.54), ('processes', 0.52) ] ]
]

topics_ijcai = [
    [ 14.7, [ ('logic', 2.911), ('semantics', 2.332), ('logics', 2.042),
        ('modal', 1.667), ('semantic', 1.534), ('symbolic', 1.471),
        ('convolutional', 1.394), ('language', 1.384), ('representation', 1.290),
        ('object', 1.248), ('representations', 1.247), ('reasoning', 1.241),
        ('abstraction', 1.188), ('calculus', 1.179) ] ],
    [ 14.7, [ ('neural', 3.449), ('deep', 2.619), ('networks', 1.994),
        ('robot', 1.704), ('human', 1.648), ('network', 1.485),
        ('models', 1.415), ('systems', 1.190), ('model', 1.009),
        ('machine', 0.969), ('robust', 0.884), ('simulation', 0.830),
        ('interactive', 0.741), ('facial', 0.697) ] ],
    [ 12.0, [ ('planning', 4.458), ('efficient', 2.270), ('task', 2.109),
        ('plan', 1.939), ('improving', 1.491), ('joint', 1.490),
        ('strategy', 1.480), ('supervised', 1.259), ('citywide', 1.184),
        ('recommendations', 1.072), ('policy', 1.071), ('transfer', 1.035),
        ('scheduling', 1.033), ('repositioning', 1.022) ] ],
    [ 11.5, [ ('search', 1.508), ('detection', 1.116), ('recognition', 1.029),
        ('information', 0.920), ('data', 0.812), ('knowledge', 0.808),
        ('tracking', 0.783), ('online', 0.736), ('prediction', 0.634),
        ('identification', 0.515), ('selection', 0.497), ('automatic', 0.490),
        ('networks', 0.477), ('robust', 0.476) ] ],
    [ 11.1, [ ('embeddings', 2.786), ('embedding', 2.720), ('factorization', 2.460),
        ('matrix', 1.998), ('kernel', 1.758), ('subspace', 1.731),
        ('graph', 1.645), ('generalized', 1.642), ('norm', 1.620),
        ('metric', 1.591), ('graphs', 1.535), ('hashing', 1.347),
        ('convex', 1.258), ('modulo', 1.257) ] ],
    [ 9.9, [ ('knowledge', 1.085), ('recognition', 0.814), ('information', 0.705),
        ('search', 0.592), ('data', 0.591), ('preference', 0.533),
        ('reasoning', 0.519), ('probabilistic', 0.499), ('query', 0.485),
        ('selection', 0.476), ('text', 0.455), ('representation', 0.439),
        ('classification', 0.435), ('elicitation', 0.428) ] ],
    [ 9.6, [ ('search', 0.778), ('selection', 0.657), ('clustering', 0.653),
        ('data', 0.550), ('prediction', 0.549), ('optimization', 0.524),
        ('information', 0.509), ('preference', 0.508), ('robust', 0.506),
        ('detection', 0.499), ('efficient', 0.490), ('tracking', 0.465),
        ('recognition', 0.455), ('optimal', 0.447) ] ]
]

topics_aamas = [
    [ 23.4, [ ('social', 2.99), ('cooperation', 2.22), ('security', 1.74),
        ('voting', 1.61), ('policies', 1.61), ('cooperative', 1.45),
        ('mechanism', 1.43), ('mechanisms', 1.35), ('networks', 1.30),
        ('systems', 1.23), ('policy', 1.14), ('behavior', 1.07),
        ('preferences', 1.04), ('strategy', 0.98) ] ],
    [ 22.0, [ ('reinforcement', 2.18), ('networks', 1.38), ('task', 1.29),
        ('distributed', 1.23), ('modeling', 1.21), ('systems', 1.12),
        ('scheduling', 1.09), ('online', 1.02), ('planning', 0.88),
        ('efficient', 0.83), ('dynamic', 0.82), ('decision', 0.79),
        ('automated', 0.77), ('simulation', 0.76) ] ],
    [ 16.9, [ ('equilibria', 2.52), ('nash', 1.58), ('optimal', 1.52),
        ('equilibrium', 1.27), ('maximization', 1.16), ('stochastic', 1.02),
        ('models', 0.98), ('matching', 0.95), ('markov', 0.93),
        ('inverse', 0.91), ('model', 0.89), ('efficient', 0.81),
        ('constrained', 0.78), ('continuous', 0.77) ] ],
    [ 14.2, [ ('argumentation', 2.53), ('logic', 2.03), ('theoretic', 1.89),
        ('truthful', 1.88), ('proof', 1.80), ('BDI', 1.71),
        ('hedonic', 1.65), ('reasoning', 1.61), ('epistemic', 1.14),
        ('boolean', 1.14), ('judgment', 1.07), ('abstract', 0.99),
        ('empirical', 0.94), ('propositional', 0.92) ] ],
    [ 7.8, [ ('stackelberg', 1.68), ('reinforcement', 0.99), ('optimal', 0.32),
        ('maximization', 0.30), ('continuous', 0.29), ('model', 0.28),
        ('matching', 0.28), ('decision', 0.27), ('behavior', 0.26),
        ('dynamic', 0.26), ('modeling', 0.26), ('task', 0.25),
        ('models', 0.25), ('strategy', 0.24) ] ],
    [ 6.2, [ ('robot', 3.13), ('human', 2.96), ('robots', 2.19),
        ('robotic', 1.97), ('autonomous', 0.99), ('humanoid', 0.97),
        ('humans', 0.76), ('vehicle', 0.71), ('swarms', 0.71),
        ('automated', 0.45), ('wheeled', 0.43),
        ('poachers', 0.42), ('body', 0.38) ] ],
    [ 4.3, [ ('games', 5.13), ('game', 1.93), ('teams', 1.62),
        ('team', 1.05), ('playing', 0.58), ('player', 0.54),
        ('multiplayer', 0.39), ('players', 0.39), ('atari', 0.34),
        ('scoring', 0.31), ('competitions', 0.31), ('pac', 0.28),
        ('points', 0.26), ('winning', 0.25) ] ],
    [ 3.5, [ ('agent', 5.52), ('agents', 3.53), ('sobe', 0.40),
        ('investigating', 0.09), ('intelligence', 0.06), ('customs', 0.05),
        ('poachers', 0.05), ('assignment', 0.05), ('dealers', 0.05),
        ('contact', 0.04), ('collusion', 0.04), ('security', 0.04),
        ('mdp', 0.04), ('anti', 0.04) ] ]
]

topics_trump = [
    [ 19.1, [ ('know', 5.49), ('say', 4.62), ('think', 2.94),
        ('believe', 2.90), ('tell', 2.60), ('happen', 2.39),
        ('hear', 1.90), ('stupid', 1.75), ('want', 1.75),
        ('cheerleader', 1.71), ('gonna', 1.65), ('heard', 1.60),
        ('answer', 1.60), ('never', 1.52) ] ],
    [ 17.6, [ ('going', 3.72), ('back', 3.71), ('right', 3.16),
        ('send', 2.26), ('over', 2.03), ('take', 1.98),
        ('sending', 1.70), ('money', 1.52), ('running', 1.40),
        ('bring', 1.37), ('run', 1.29), ('stop', 1.23),
        ('go', 1.22), ('deal', 1.05) ] ],
    [ 16.3, [ ('jobs', 3.27), ('need', 3.11), ('job', 2.08),
        ('lobbyists', 1.76), ('obamacare', 1.71), ('care', 1.43),
        ('money', 1.37), ('problems', 1.34), ('vets', 1.30),
        ('politicians', 1.16), ('going', 1.08), ('problem', 1.08),
        ('needs', 1.04), ('bring', 1.02) ] ],
    [ 11.4, [ ('very', 5.03), ('good', 3.65), ('nice', 3.27),
        ('big', 2.64), ('like', 2.26), ('rich', 2.02),
        ('make', 1.36), ('highly', 1.31), ('wonderful', 1.30),
        ('talented', 1.12), ('kind', 1.09), ('makes', 1.09),
        ('bad', 1.08), ('proud', 1.08) ] ],
    [ 10.0, [ ('building', 3.02), ('build', 2.92), ('ford', 2.80),
        ('equipment', 2.21), ('built', 2.04), ('car', 1.81),
        ('manufacturer', 1.49), ('hotel', 1.48), ('tower', 1.32),
        ('manufacturing', 1.21), ('rebuild', 1.20), ('trucks', 1.20),
        ('truck', 1.18), ('cars', 1.16) ] ],
    [ 9.5, [ ('country', 4.82), ('us', 3.47), ('iraq', 2.96),
        ('president', 2.21), ('iran', 2.17), ('military', 2.03),
        ('border', 1.92), ('saudi', 1.82), ('arabia', 1.78),
        ('mexico', 1.66), ('yemen', 1.55), ('united', 1.49),
        ('states', 1.44), ('airports', 1.41) ] ],
    [ 8.8, [ ('china', 4.31), ('billion', 3.38), ('trillion', 2.60),
        ('trade', 1.87), ('mexico', 1.51), ('oil', 1.44),
        ('world', 1.21), ('debt', 1.21), ('worth', 1.09),
        ('currency', 1.08), ('million', 1.07), ('net', 1.06),
        ('over', 1.05), ('japan', 1.05) ] ],
    [ 4.7, [ ('people', 6.59), ('thousands', 1.89), ('person', 0.60),
        ('hundreds', 0.57), ('killing', 0.50), ('many', 0.47),
        ('number', 0.42), ('leaders', 0.37), ('wounded', 0.36),
        ('millions', 0.31), ('tens', 0.27), ('ago', 0.25),
        ('crowd', 0.24), ('soldiers', 0.24) ] ]
]

topics_hillary = [
    [ 22.9, [ ('people', 2.48), ('million', 2.46), ('now', 2.13),
        ('years', 2.10), ('country', 1.78), ('back', 1.69),
        ('working', 1.48), ('time', 1.45), ('today', 1.38),
        ('thank', 1.36), ('millions', 1.36), ('stop', 1.30),
        ('decades', 1.26), ('make', 1.24) ] ],
    [ 17.6, [ ('right', 2.44), ('rightly', 1.16), ('respect', 1.09),
        ('fight', 1.05), ('mean', 1.02), ('extremist', 0.91),
        ('righted', 0.91), ('think', 0.91), ('bemoans', 0.89),
        ('tarnish', 0.89), ('starker', 0.88), ('hard', 0.88),
        ('progressive', 0.85), ('statesmanship', 0.84) ] ],
    [ 14.3, [ ('corporations', 1.67), ('powerful', 1.46), ('people', 1.09),
        ('women', 0.89), ('make', 0.77), ('citizens', 0.75),
        ('care', 0.73), ('interests', 0.72), ('unions', 0.71),
        ('affordable', 0.70), ('rich', 0.65), ('corporate', 0.65),
        ('america', 0.64), ('want', 0.64) ] ],
    [ 13.5, [ ('court', 6.53), ('supreme', 3.18), ('legal', 2.61),
        ('case', 2.58), ('justice', 2.28), ('law', 2.23),
        ('justices', 1.94), ('cases', 1.88), ('ruled', 1.77),
        ('judge', 1.66), ('courts', 1.63), ('hearing', 1.23),
        ('decisions', 1.10), ('lawyer', 1.06) ] ],
    [ 9.5, [ ('president', 4.90), ('senator', 3.06), ('obama', 2.06),
        ('senate', 2.04), ('republican', 1.65), ('republicans', 1.41),
        ('grassley', 1.33), ('election', 1.32), ('john', 1.22),
        ('barack', 1.17), ('presidency', 1.11), ('former', 1.07),
        ('united', 1.05), ('governor', 1.01) ] ],
    [ 8.7, [ ('women', 0.61), ('people', 0.50), ('care', 0.48),
        ('citizens', 0.48), ('country', 0.48), ('corporations', 0.47),
        ('unions', 0.45), ('america', 0.44), ('americans', 0.39),
        ('act', 0.37), ('health', 0.37), ('nation', 0.37),
        ('politics', 0.35), ('interests', 0.35) ] ],
    [ 6.6, [ ('trump', 1.80), ('vote', 0.86), ('elections', 0.45),
        ('party', 0.44), ('choose', 0.39), ('voting', 0.39),
        ('politics', 0.39), ('republicans', 0.37), ('election', 0.34),
        ('voted', 0.33), ('decided', 0.31), ('constitution', 0.30),
        ('progressive', 0.30), ('votes', 0.30) ] ]
]

topics_hillary2 = [
    [ 20.4, [ ('let', 2.64), ('go', 1.63), ('hard', 1.52),
        ('want', 1.19), ('going', 1.19), ('tonight', 1.09),
        ('live', 1.02), ('back', 0.95), ('know', 0.93),
        ('say', 0.90), ('tomorrow', 0.90), ('unselfish', 0.80),
        ('thank', 0.78), ('good', 0.73) ] ],
    [ 18.8, [ ('great', 1.63), ('like', 0.91), ('grandparent', 0.78),
        ('people', 0.74), ('lives', 0.62), ('trust', 0.59),
        ('going', 0.58), ('lot', 0.55), ('remarkable', 0.54),
        ('way', 0.53), ('children', 0.53), ('good', 0.52),
        ('know', 0.52), ('child', 0.50) ] ],
    [ 11.3, [ ('new', 1.80), ('state', 1.65), ('york', 1.62),
        ('president', 1.40), ('roosevelt', 1.07), ('senate', 0.93),
        ('governor', 0.77), ('presidents', 0.73), ('stuyvesant', 0.72),
        ('mayor', 0.68), ('island', 0.67), ('election', 0.66),
        ('senator', 0.66), ('members', 0.66) ] ],
    [ 11.3, [ ('barriers', 0.85), ('people', 0.49), ('back', 0.47),
        ('going', 0.44), ('like', 0.43), ('way', 0.42),
        ('jobs', 0.40), ('hard', 0.40), ('americans', 0.33),
        ('crumbling', 0.29), ('make', 0.29), ('need', 0.27),
        ('problems', 0.27), ('stop', 0.27) ] ],
    [ 8.9, [ ('rights', 2.29), ('people', 0.64), ('diversity', 0.60),
        ('lgbt', 0.59), ('discrimination', 0.56), ('dignity', 0.54),
        ('equal', 0.46), ('women', 0.45), ('americans', 0.44),
        ('advocate', 0.43), ('empowerment', 0.39), ('values', 0.39),
        ('racism', 0.35), ('families', 0.34) ] ],
    [ 8.6, [ ('responders', 1.10), ('yorkers', 0.99), ('rikers', 0.99),
        ('inaudible', 0.98), ('trayvon', 0.97), ('fdny', 0.91),
        ('firefighter', 0.79), ('ladders', 0.46), ('firefighters', 0.41),
        ('people', 0.40), ('heard', 0.38), ('emergency', 0.38),
        ('survivors', 0.36), ('officers', 0.35) ] ],
    [ 8.2, [ ('campaign', 2.35), ('progressive', 1.03), ('reform', 0.89),
        ('election', 0.63), ('democratic', 0.59), ('votes', 0.52),
        ('divisive', 0.50), ('voters', 0.49), ('voting', 0.42),
        ('supported', 0.42), ('congressional', 0.37), ('specter', 0.36),
        ('elections', 0.36), ('supporters', 0.36) ] ],
    [ 7.6, [ ('country', 3.32), ('america', 2.63), ('us', 1.27),
        ('american', 0.55), ('world', 0.50), ('continent', 0.49),
        ('region', 0.40), ('nation', 0.38), ('cities', 0.36),
        ('south', 0.35), ('million', 0.24), ('places', 0.20),
        ('today', 0.19), ('americans', 0.18) ] ]
]

topics_hillary3 = [
    [ 19.5, [ ('back', 3.24), ('let', 2.66), ('stop', 2.61),
        ('going', 2.54), ('hard', 2.53), ('go', 2.44),
        ('right', 2.23), ('fight', 2.00), ('keep', 1.98),
        ('take', 1.90), ('hold', 1.85), ('break', 1.79),
        ('make', 1.79), ('single', 1.69) ] ],
    [ 17.5, [ ('people', 4.54), ('million', 2.84), ('years', 2.30),
        ('children', 2.26), ('working', 2.10), ('families', 1.94),
        ('americans', 1.91), ('many', 1.80), ('millions', 1.77),
        ('women', 1.74), ('worked', 1.56), ('today', 1.52),
        ('workers', 1.50), ('lives', 1.38) ] ],
    [ 14.3, [ ('powerful', 2.61), ('great', 1.71), ('like', 1.30),
        ('very', 1.27), ('dangerous', 1.15), ('kind', 1.04),
        ('make', 1.04), ('remarkable', 0.99), ('good', 0.95),
        ('respect', 0.93), ('mean', 0.93), ('humbling', 0.93),
        ('strong', 0.92), ('know', 0.92) ] ],
    [ 9.8, [ ('care', 1.09), ('barriers', 0.88), ('affordable', 0.75),
        ('equality', 0.74), ('health', 0.71), ('protecting', 0.69),
        ('protect', 0.66), ('discrimination', 0.64), ('women', 0.61),
        ('know', 0.60), ('rights', 0.59), ('want', 0.55),
        ('fair', 0.55), ('equal', 0.53) ] ],
    [ 9.0, [ ('campaign', 2.82), ('corporations', 2.30), ('progressive', 2.17),
        ('politics', 1.53), ('pacs', 1.23), ('unions', 1.06),
        ('reform', 1.04), ('corporate', 0.96), ('party', 0.89),
        ('interests', 0.82), ('voters', 0.75), ('elections', 0.75),
        ('voting', 0.74), ('ads', 0.71) ] ],
    [ 8.5, [ ('legal', 3.32), ('trump', 2.80), ('law', 1.98),
        ('rights', 1.83), ('issues', 1.55), ('issue', 1.24),
        ('decisions', 1.01), ('marriage', 0.96), ('matter', 0.89),
        ('lawyer', 0.81), ('matters', 0.78), ('constitutional', 0.74),
        ('laws', 0.64), ('constitution', 0.63) ] ],
    [ 8.5, [ ('president', 5.21), ('senator', 3.24), ('senate', 2.56),
        ('obama', 2.34), ('republican', 1.91), ('election', 1.81),
        ('republicans', 1.78), ('john', 1.60), ('nomination', 1.52),
        ('barack', 1.50), ('grassley', 1.37), ('vote', 1.36),
        ('governor', 1.34), ('presidency', 1.19) ] ],
    [ 7.9, [ ('court', 6.73), ('supreme', 3.30), ('case', 2.46),
        ('justice', 2.28), ('justices', 2.14), ('ruled', 1.98),
        ('cases', 1.76), ('courts', 1.69), ('judge', 1.68),
        ('hearing', 1.21), ('judges', 1.06), ('decisions', 1.02),
        ('judiciary', 0.92), ('scalia', 0.85) ] ]
]

topics_sanders = [
    [ 18.0, [ ('people', 4.73), ('world', 4.08), ('country', 2.89),
        ('american', 2.12), ('us', 2.02), ('time', 1.88),
        ('nation', 1.75), ('america', 1.71), ('states', 1.70),
        ('united', 1.66), ('americans', 1.65), ('new', 1.48),
        ('young', 1.46), ('women', 1.33) ] ],
    [ 17.8, [ ('wall', 2.36), ('going', 1.93), ('street', 1.71),
        ('bottom', 1.21), ('right', 1.20), ('let', 1.05),
        ('good', 1.02), ('protect', 0.99), ('back', 0.92),
        ('way', 0.89), ('fight', 0.87), ('bring', 0.86),
        ('top', 0.86), ('continue', 0.85) ] ],
    [ 16.4, [ ('financial', 3.03), ('wealth', 2.80), ('economy', 1.53),
        ('profits', 1.51), ('banks', 1.32), ('money', 1.23),
        ('tax', 1.21), ('economic', 1.15), ('energy', 1.10),
        ('corporations', 1.09), ('investments', 1.03), ('income', 1.02),
        ('huge', 1.01), ('enormous', 0.96) ] ],
    [ 15.2, [ ('political', 2.78), ('moral', 2.67), ('excesses', 1.59),
        ('deeply', 1.49), ('powerfully', 1.32), ('morality', 1.16),
        ('politics', 1.14), ('social', 1.09), ('sense', 0.97),
        ('indifference', 0.95), ('cynicism', 0.95), ('misguided', 0.93),
        ('recklessness', 0.90), ('disgrace', 0.90) ] ],
    [ 14.4, [ ('system', 1.43), ('care', 1.14), ('poor', 0.97),
        ('pay', 0.88), ('protect', 0.84), ('workers', 0.79),
        ('adequate', 0.76), ('healthcare', 0.76), ('rights', 0.76),
        ('good', 0.70), ('wage', 0.69), ('need', 0.69),
        ('allow', 0.69), ('fair', 0.68) ] ],
    [ 6.2, [ ('economy', 2.30), ('market', 0.46), ('economic', 0.44),
        ('climate', 0.38), ('industry', 0.29), ('globalization', 0.29),
        ('system', 0.29), ('workers', 0.28), ('inequality', 0.26),
        ('jobs', 0.25), ('working', 0.25), ('today', 0.24),
        ('change', 0.24), ('trade', 0.23) ] ],
    [ 5.7, [ ('billionaires', 1.85), ('us', 0.68), ('billionaire', 0.47),
        ('top', 0.46), ('wealthiest', 0.35), ('million', 0.33),
        ('dollars', 0.32), ('millions', 0.31), ('class', 0.28),
        ('percent', 0.26), ('hampshire', 0.25), ('billion', 0.24),
        ('today', 0.24), ('country', 0.24) ] ]
]

topicCloud = TopicCloud(max_topic_words=8, max_topic_num=7, min_word_topic_prop=0.25, max_words=50,
                        height=1000, width=1000, relative_scaling=0.7, max_font_size=80,
                        min_font_size=30).generate_from_topics(topics_icml_5)

# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
plt.imshow(topicCloud)
plt.axis("off")
plt.show()
topicCloud.to_file("clouds/topics_icml_5.png")
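
# A minimal usage sketch (added here for illustration; it is not part of the
# original script). Every topics_* list above shares the same shape:
#     [ [ topic_strength, [ (word, word_weight), ... ] ], ... ]
# so the same TopicCloud settings can be reused to render one cloud per corpus.
# The output paths below are illustrative; everything else relies only on names
# already defined in this file and on the generate_from_topics()/to_file()
# calls demonstrated above.
all_topics = {
    'sigir'   : topics_sigir,
    'icml'    : topics_icml,
    'icml_3'  : topics_icml_3,
    'icml_5'  : topics_icml_5,
    'ijcai'   : topics_ijcai,
    'aamas'   : topics_aamas,
    'trump'   : topics_trump,
    'hillary' : topics_hillary,
    'hillary2': topics_hillary2,
    'hillary3': topics_hillary3,
    'sanders' : topics_sanders,
}
for name, topics in all_topics.items():
    cloud = TopicCloud(max_topic_words=8, max_topic_num=7, min_word_topic_prop=0.25,
                       max_words=50, height=1000, width=1000, relative_scaling=0.7,
                       max_font_size=80, min_font_size=30).generate_from_topics(topics)
    cloud.to_file("clouds/topics_%s.png" % name)

--------------------------------------------------------------------------------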