├── .gitignore
├── icml2016.txt
├── morph
│   ├── morpha.exe
│   ├── verbstem.list
│   └── README
├── clouds
│   ├── topics_icml_3.png
│   ├── topics_icml_5.png
│   ├── topics_trump.png
│   ├── topics_hillary.png
│   ├── topics_hillary2.png
│   ├── topics_hillary3.png
│   ├── topics_sanders.png
│   ├── topiccloud-aamas.png
│   ├── topiccloud-ijcai.png
│   ├── topics_drugstory.png
│   ├── wordcloud_drugstory.png
│   └── topics_drugstory_kmeans.png
├── README.md
├── morpha.py
├── drugstory.txt
├── TopicCloud.py
└── gencloud.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | commit.bat
2 | *.pyc
--------------------------------------------------------------------------------
/icml2016.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/icml2016.txt
--------------------------------------------------------------------------------
/morph/morpha.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/morph/morpha.exe
--------------------------------------------------------------------------------
/clouds/topics_icml_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_icml_3.png
--------------------------------------------------------------------------------
/clouds/topics_icml_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_icml_5.png
--------------------------------------------------------------------------------
/clouds/topics_trump.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_trump.png
--------------------------------------------------------------------------------
/clouds/topics_hillary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_hillary.png
--------------------------------------------------------------------------------
/clouds/topics_hillary2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_hillary2.png
--------------------------------------------------------------------------------
/clouds/topics_hillary3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_hillary3.png
--------------------------------------------------------------------------------
/clouds/topics_sanders.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_sanders.png
--------------------------------------------------------------------------------
/clouds/topiccloud-aamas.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topiccloud-aamas.png
--------------------------------------------------------------------------------
/clouds/topiccloud-ijcai.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topiccloud-ijcai.png
--------------------------------------------------------------------------------
/clouds/topics_drugstory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_drugstory.png
--------------------------------------------------------------------------------
/clouds/wordcloud_drugstory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/wordcloud_drugstory.png
--------------------------------------------------------------------------------
/clouds/topics_drugstory_kmeans.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topiccloud/HEAD/clouds/topics_drugstory_kmeans.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Topic Cloud generation:
2 | 
3 | This Python toolkit generates a Topic Cloud, a visual representation of the topics derived from a document (or a collection of documents).
4 | 
5 | Please see the images in the `clouds/` directory for example topic clouds; `gencloud.py` shows an example of generating them.
6 | 
7 | # Prerequisites:
8 | * [wordcloud](https://github.com/amueller/word_cloud/) ([windows binary](http://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud))
9 | * Pillow ([windows binary](http://www.lfd.uci.edu/~gohlke/pythonlibs/#pillow))
10 | * matplotlib
--------------------------------------------------------------------------------
/morpha.py:
--------------------------------------------------------------------------------
1 | """
2 | Wrapper around morpha from
3 | http://www.informatics.sussex.ac.uk/research/groups/nlp/carroll/morph.html
4 | 
5 | Vaguely follows edu.stanford.nlp.Morphology except we implement with a pipe.
6 | hacky. Would be nice to use cython/swig/ctypes to directly embed morpha.yy.c
7 | as a python extension.
8 | 
9 | TODO compare linguistic quality to lemmatizer in python's "pattern" package
10 | 
11 | By Brendan O'Connor (http://brenocon.com), at https://gist.github.com/brendano/6008945
12 | """
13 | 
14 | import os,subprocess
15 | 
16 | #MorphaDir = os.path.join(os.path.dirname(__file__), 'morph')
17 | MorphaDir = 'morph'
18 | MorphaCmd = os.path.join(MorphaDir, 'morpha')
19 | MorphaArgs = ['-f', os.path.join(MorphaDir, 'verbstem.list')]
20 | 
21 | _pipe = None
22 | 
23 | def get_pipe():
24 |     global _pipe
25 |     if _pipe is None:
26 |         open_pipe()
27 |     elif _pipe.returncode is not None:
28 |         print "Pipe seems to have died, restarting"
29 |         open_pipe()
30 |     return _pipe
31 | 
32 | def open_pipe():
33 |     global _pipe
34 |     print "Opening morpha pipe"
35 |     _pipe = subprocess.Popen([MorphaCmd] + MorphaArgs, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
36 | 
37 | def process(input):
38 |     input = input.strip()
39 |     output = None
40 |     for retry in range(3):
41 |         try:
42 |             pipe = get_pipe()
43 |             print>>pipe.stdin, input
44 |             pipe.stdin.flush()
45 |             output = pipe.stdout.readline(); break  # success, don't retry
46 |         except IOError:
47 |             if retry==2: raise
48 |             print "Retry on pipe breakage"
49 |             open_pipe()
50 |     return output.rstrip('\n')
51 | 
52 | 
53 | ## From morph/doc.txt....
54 | 
55 | #Where the -u option is not used, each input token is expected to be of
56 | #the form <word>_<tag>. For example:
57 | #
58 | # A_AT1 move_NN1 to_TO stop_VV0 Mr._NNS Gaitskell_NP1 from_II nominating_VVG
59 | #
60 | #Contractions and punctuation must have been separated out into separate
61 | #tokens. The tagset is assumed to resemble CLAWS-2, in the following
62 | #respects:
63 | #
64 | # V... all verbs
65 | # NP... all proper names
66 | # N[^P]... all common nouns
67 | #
68 | #and for specific cases of ambiguous lexical items:
69 | #
70 | # 'd_VH... root is 'have'
71 | # 'd_VM... root is 'would'
72 | # 's_VBZ... root is 'be'
73 | # 's_VHZ... root is 'have'
74 | # 's_$... possessive morpheme (also _POS for CLAWS-5)
75 | # ai_VB... root is 'be'
76 | # ai_VH... root is 'have'
77 | # ca_VM... root is 'can'
78 | # sha_VM... root is 'shall'
79 | # wo_VM... root is 'will'
80 | # n't_XX... root is 'not'
81 | 
82 | def ptb_is_proper(ptb):
83 |     return ptb in ('NP','NNP','NNPS')
84 | 
85 | def ptb2morphtag(ptb):
86 |     ptb = ptb.upper()
87 |     if ptb.startswith('V'):
88 |         return 'V'
89 |     if ptb_is_proper(ptb):
90 |         return 'NP'
91 |     if ptb.startswith('N'):
92 |         return 'N'
93 |     if ptb == 'MD':
94 |         return 'V' # um is this right? it looks like it can take incomplete versions...
95 |     if ptb == 'POS':
96 |         return '$'
97 |     return ''
98 | 
99 | def lemmatize_seq(words_and_pos, tagset='PENN'):
100 |     """List of (word,pos) pairs. Words are Unicode strings.
101 |     Returns list of lemma strings."""
102 |     assert tagset=='PENN', "don't support different tagsets yet"
103 | 
104 |     # Decorate the input pairs into one big string that morpha wants,
105 |     # Run morpha,
106 |     # Then undecorate the output.
107 | 
108 |     goods = [i for i in range(len(words_and_pos)) if words_and_pos[i][0]]
109 |     escape_str = '..axsxdxfxqxwxexr..'
110 |     new_pairs = []
111 |     #for word,pos in words_and_pos:
112 |     for i in goods:
113 |         word,pos = words_and_pos[i]
114 |         assert ' ' not in word
115 |         word = word.replace('_', escape_str)
116 |         morph_tag = ptb2morphtag(pos)
117 |         new_pairs.append((word, morph_tag))
118 |     decorated_input = u' '.join(u'{}_{}'.format(word,tag) if tag else word for word,tag in new_pairs)
119 |     decorated_input = decorated_input.encode('utf8') # TODO is morpha utf8 safe?
120 |     #print "INPUT", decorated_input
121 |     result = process(decorated_input)
122 |     #print "RESULT", result
123 | 
124 |     lemma_results = []
125 |     result_tokens = result.split()
126 |     assert len(result_tokens) == len(new_pairs)
127 |     for i,lemma in enumerate(result_tokens):
128 |         lemma = lemma.split('_')[0] # Rare. I think this is a bug in morpha
129 |         #assert '_' not in lemma
130 |         lemma = lemma.decode('utf-8','replace') # TODO is morpha utf8 safe?
131 |         lemma = lemma.replace(escape_str, '_')  # undo the '_' escaping applied before piping to morpha
132 |         if not ptb_is_proper(words_and_pos[goods[i]][1]):  # index via goods[i]: result_tokens aligns with the filtered pairs
133 |             lemma = lemma.lower()
134 |         lemma_results.append(lemma)
135 | 
136 |     # juxtapose it back in
137 |     final_results = ['' for x in range(len(words_and_pos))]
138 |     for i,lemma in enumerate(lemma_results):
139 |         final_results[goods[i]] = lemma
140 |     return final_results
141 | 
142 | def lemmatize(word,pos, tagset='PENN'):
143 |     seq = [(word,pos)]
144 |     result = lemmatize_seq(seq, tagset=tagset)
145 |     return result[0]
--------------------------------------------------------------------------------
/morph/verbstem.list:
--------------------------------------------------------------------------------
1 | abat abet abhor abut accur acquit adlib admit aerobat aerosol
2 | agendaset allot alot anagram annul appal apparel armbar aver babysit
3 | airdrop appal blackleg bobsled bur chum confab counterplot curet dib
4 | backdrop backfil backflip backlog backpedal backslap backstab bag
5 | balfun ballot ban bar barbel bareleg barrel bat bayonet becom bed
6 | bedevil bedwet beenhop befit befog beg beget begin bejewel bemedal
7 | benefit benum beset besot bestir bet betassel bevel bewig bib bid
8 | billet bin bip bit bitmap blab blag blam blan blat bles blim blip blob
9 | bloodlet blot blub blur bob bodypop bog booby-trap boobytrap booksel
10 | bootleg bop bot bowel bracket brag brig brim bud buffet bug bullshit
11 | bum bun bus but cab cabal cam can cancel cap caracol caravan carburet
12 | carnap carol carpetbag castanet cat catcal catnap cavil chan chanel
13 | channel chap char chargecap chat chin chip chir chirrup chisel chop
14 | chug chur clam clap clearcut clip clodhop clog clop closet clot club
15 | co-occur co-program co-refer co-run co-star cob cobweb cod coif com
16 | combat comit commit compel con concur confer confiscat control cop
17 | coquet coral corbel corral cosset cotransmit councel council counsel
18 | court-martial crab cram crap crib crop crossleg cub cudgel cum cun cup
19 | cut dab dag dam dan dap daysit de-control de-gazet de-hul de-instal
20 | de-mob de-program de-rig de-skil deadpan debag debar debug decommit
21 | decontrol defer defog deg degas deinstal demit demob demur den denet
22 | depig depip depit der deskil deter devil diagram dial dig dim din dip
23 | disbar disbud discomfit disembed disembowel dishevel disinter dispel
24 | disprefer distil dog dognap don doorstep dot dowel drag drat driftnet
25 | distil egotrip enrol enthral extol fulfil gaffe golliwog idyl inspan
26 | drip drivel drop drub drug drum dub duel dun dybbuk earwig eavesdrop
27 | ecolabel eitherspigot electroblot embed emit empanel enamel endlabel
28 | endtrim enrol enthral entrammel entrap enwrap equal equip estop
29 | exaggerat excel expel extol fag fan farewel fat featherbed feget fet
30 | fib fig fin fingerspel fingertip fit flab flag flap flip flit flog
31 | flop fob focus fog footbal footslog fop forbid forget format
32 | fortunetel fot foxtrot frag freefal fret frig frip frog frug fuel
33 | fufil fulfil fullyfit fun funnel fur furpul gab gad gag gam gambol gap
34 | garot garrot gas gat gel gen get giftwrap gig gimbal gin glam glenden
35 | glendin globetrot glug glut gob goldpan goostep gossip grab gravel
36 | grid grin grip grit groundhop grovel grub gum gun gunrun gut gyp haircut
37 | ham han handbag handicap handknit handset hap hareleg hat headbut
38 | hedgehop hem hen hiccup highwal hip hit hobnob hog hop horsewhip
39 | hostel hot hotdog hovel hug hum humbug hup hushkit hut illfit imbed
40 |
immunblot immunoblot impannel impel imperil incur infer infil inflam 41 | initial input inset instil inter interbed intercrop intercut interfer 42 | instal instil intermit japan jug kris manumit mishit mousse mud 43 | interwar jab jag jam jar jawdrop jet jetlag jewel jib jig jitterbug 44 | job jog jog-trot jot jut ken kennel kid kidnap kip kissogram kit knap 45 | kneecap knit knob knot kor label lag lam lap lavel leafcut leapfrog 46 | leg lem lep let level libel lid lig lip lob log lok lollop longleg lop 47 | lowbal lug mackerel mahom man map mar marshal marvel mat matchwin 48 | metal micro-program microplan microprogram milksop mis-cal mis-club 49 | mis-spel miscal mishit mislabel mit mob mod model mohmam monogram mop 50 | mothbal mug multilevel mum nab nag nan nap net nightclub nightsit nip 51 | nod nonplus norkop nostril not nut nutmeg occur ocur offput offset 52 | omit ommit onlap out-general out-gun out-jab out-plan out-pol out-pul 53 | out-put out-run out-sel outbid outcrop outfit outgas outgun outhit 54 | outjab outpol output outrun outship outshop outsin outstrip outswel 55 | outspan overcrop pettifog photostat pouf preset prim pug ret rosin 56 | outwit over-commit over-control over-fil over-fit over-lap over-model 57 | over-pedal over-pet over-run over-sel over-step over-tip over-top 58 | overbid overcal overcommit overcontrol overcrap overdub overfil 59 | overhat overhit overlap overman overplot overrun overshop overstep 60 | overtip overtop overwet overwil pad paintbal pan panel paperclip par 61 | parallel parcel partiescal pat patrol pedal peewit peg pen pencil pep 62 | permit pet petal photoset phototypeset phut picket pig pilot pin 63 | pinbal pip pipefit pipet pit plan plit plod plop plot plug plumet 64 | plummet pod policyset polyfil ponytrek pop pot pram prebag predistil 65 | predril prefer prefil preinstal prep preplan preprogram prizewin prod 66 | profer prog program prop propel pub pummel pun pup pushfit put quarel 67 | quarrel quickskim quickstep quickwit quip quit quivertip quiz rabbit 68 | rabit radiolabel rag ram ramrod rap rat ratecap ravel re-admit re-cal 69 | re-cap re-channel re-dig re-dril re-emit re-fil re-fit re-flag 70 | re-format re-fret re-hab re-instal re-inter re-lap re-let re-map 71 | re-metal re-model re-pastel re-plan re-plot re-plug re-pot re-program 72 | re-refer re-rig re-rol re-run re-sel re-set re-skin re-stal re-submit 73 | re-tel re-top re-transmit re-trim re-wrap readmit reallot rebel rebid 74 | rebin rebut recap rechannel recommit recrop recur recut red redril 75 | refer refit reformat refret refuel reget regret reinter rejig rekit 76 | reknot relabel relet rem remap remetal remit remodel reoccur rep repel 77 | repin replan replot repol repot reprogram rerun reset resignal resit 78 | reskil resubmit retransfer retransmit retro-fit retrofit rev revel 79 | revet rewrap rib richochet ricochet rid rig rim ringlet rip rit rival 80 | rivet roadrun rob rocket rod roset rot rowel rub run runnel rut sab 81 | sad sag sandbag sap scab scalpel scam scan scar scat schlep scrag 82 | scram shall sled smut stet sulfuret trepan unrip unstop whir whop wig 83 | scrap scrat scrub scrum scud scum scur semi-control semi-skil 84 | semi-skim semiskil sentinel set shag sham shed shim shin ship shir 85 | shit shlap shop shopfit shortfal shot shovel shred shrinkwrap shrivel 86 | shrug shun shut side-step sideslip sidestep signal sin sinbin sip sit 87 | skid skim skin skip skir skrag slab slag slam slap slim slip slit slob 88 | slog slop slot slowclap slug slum slur 
smit snag snap snip snivel snog 89 | snorkel snowcem snub snug sob sod softpedal son sop spam span spar 90 | spat spiderweb spin spiral spit splat split spot sprag spraygun sprig 91 | springtip spud spur squat squirrel stab stag star stem sten stencil 92 | step stir stop storytel strap strim strip strop strug strum strut stub 93 | stud stun sub subcrop sublet submit subset suedetrim sum summit sun 94 | suntan sup super-chil superad swab swag swan swap swat swig swim 95 | swivel swot tab tag tan tansfer tap tar tassel tat tefer teleshop 96 | tendril terschel th'strip thermal thermostat thin throb thrum thud 97 | thug tightlip tin tinsel tip tittup toecap tog tom tomorrow top tot 98 | total towel traget trainspot tram trammel transfer tranship transit 99 | transmit transship trap travel trek trendset trim trip tripod trod 100 | trog trot trousseaushop trowel trup tub tug tunnel tup tut twat twig 101 | twin twit typeset tyset un-man unban unbar unbob uncap unclip uncompel 102 | undam under-bil under-cut under-fit under-pin under-skil underbid 103 | undercut underlet underman underpin unfit unfulfil unknot unlip 104 | unlywil unman unpad unpeg unpin unplug unravel unrol unscrol unsnap 105 | unstal unstep unstir untap unwrap unzip up upset upskil upwel ven 106 | verbal vet victual vignet wad wag wainscot wan war water-log waterfal 107 | waterfil waterlog weasel web wed wet wham whet whip whir whiteskin 108 | whiz whup wildcat win windmil wit woodchop woodcut wor worship wrap 109 | will wiretap yen yak yap yarnspin yip yodel zag zap zig zig-zag zigzag 110 | zip ztrip 111 | -------------------------------------------------------------------------------- /morph/README: -------------------------------------------------------------------------------- 1 | University of Sussex 8 Sep 2003 2 | 3 | This directory contains software for morphological processing of English 4 | as developed by Kevin Humphreys , John Carroll 5 | and Guido Minnen. 6 | 7 | To be used for research purposes only (see section 4 below). If you make 8 | any changes, the authors would appreciate it if you sent them details of 9 | what you have done. 10 | 11 | Covers the English inflectional suffixes: 12 | 13 | -s plural of nouns, 3rd person singular present of verbs 14 | -ed past tense 15 | -en past participle 16 | -ing progressive of verbs 17 | 18 | 1. Usage 19 | -------- 20 | 21 | morpha [-a] [-c] [-t] [-u] [-f verbstem-file] 22 | morphg [-c] [-t] [-u] [-f verbstem-file] 23 | 24 | The commands operate as filters, reading from the standard input and 25 | writing to the standard output. 26 | 27 | They may be invoked with the following command-line options: 28 | 29 | -a Output affixes (morpha only). 30 | 31 | -c Preserve case distinctions wherever possible. 32 | 33 | -t Output part-of-speech tags if they are in the input. 34 | 35 | -u Indicate that the words in the input are not tagged with 36 | part-of-speech labels. N.B. This mode of use is not recommended 37 | since the resulting ambiguity in the input is likely to lead to 38 | incorrect output. 39 | 40 | -f By default, the commands attempt to read a file called 41 | 'verbstem.list' in the user's current directory which is expected 42 | to contain a list of stems of verbs that undergo doubling of 43 | their final consonant, as occurs in British English spelling. 44 | This option allows the user to specify a different file of verb 45 | stems (for example if American English behaviour is required). 
46 | If this option is specified then it must be the last one on 47 | the command-line. 48 | 49 | See the file doc.txt for specifications of input and output formats, 50 | and examples of usage. 51 | 52 | 2. Files 53 | -------- 54 | 55 | Makefile makefile for compiling the flex sources; can be 56 | used for compiling both flex descriptions by 57 | the command `make flex-description-file' 58 | README this file 59 | doc.txt specifications of input/output formats, and usage 60 | examples 61 | gpost postamble file used in deriving morphg.lex 62 | gpre preamble file used in deriving morphg.lex 63 | invert.sh unix shell program that derives morphg.lex from 64 | morpha.lex 65 | minnen.pdf pre-final PDF version of the NLE article by Minnen, 66 | Carroll and Pearce (2001) 67 | morpha.{ix86_linux|ppc_darwin|sun4_sunos} 68 | executables for the morphological analyser; for 69 | details of usage see above 70 | morpha.lex flex description constituting the source of the 71 | morphological analyser 72 | morphg.{ix86_linux|ppc_darwin|sun4_sunos} 73 | executables for the morphological generator; for 74 | details of usage see above 75 | morphg.lex flex description constituting the source of the 76 | morphological generator 77 | verbstem.list list of verb stems that allow for consonant doubling 78 | in British English 79 | 80 | The file morphg.lex is derived automatically from the file morpha.lex 81 | using invert.sh, as described in the paper by Minnen, Carroll and 82 | Pearce (2001) -- full reference below. 83 | 84 | 3. Compilation 85 | -------------- 86 | 87 | To recompile the morph tools, either type the following commands 88 | (making sure that you use the 2.5.4a version of flex recompiled with 89 | larger internal limits -- see below), or (more conveniently) use the 90 | Makefile in this directory by typing `make morpha' or `make morphg'. 91 | 92 | flex -i -Cfe -8 -omorpha.yy.c morpha.lex 93 | gcc -o morpha morpha.yy.c 94 | 95 | or 96 | 97 | flex -i -Cfe -8 -omorphg.yy.c morphg.lex 98 | gcc -o morphg morphg.yy.c 99 | 100 | The executables included in this release were built omitting the 101 | Flex options -Cfe -8, resulting in a reduction in binary file size 102 | of two thirds (and a reduction in processing speed of around 20%). 103 | These options also have to be left out and the option -Dinteractive 104 | added to gcc (resulting in a further decrease in throughput) in order 105 | to get the morph tools to return results immediately when used via 106 | unix pipes inside other programs. 107 | 108 | N.B. Recompiling the morph tools requires an adapted version of Flex. 109 | The Flex source code is freely available from: 110 | 111 | http://www.go.dlr.de/fresh/unix/src/misc/.warix/flex-2.5.4a.tar.gz.html 112 | 113 | The Flex source should be changed to allow for more internal states by 114 | increasing the definitions in flexdef.h of: 115 | 116 | #define JAMSTATE -32766 117 | ... 118 | #define MAXIMUM_MNS 31999 119 | ... 120 | #define BAD_SUBSCRIPT -32767 121 | 122 | to: 123 | 124 | #define JAMSTATE -800000 125 | ... 126 | #define MAXIMUM_MNS 800000 127 | ... 128 | #define BAD_SUBSCRIPT -800000 129 | 130 | and recompiling Flex. When recompiling the morph tools ensure that the 131 | Makefile points to the new version of Flex. 132 | 133 | 4. Acknowledgements, copyrights etc. 134 | ------------------------------------ 135 | 136 | Copyright (c) 1995-2000 University of Sheffield, University of Sussex 137 | All rights reserved. 
138 | 139 | Redistribution and use of source and derived binary forms are 140 | permitted without fee provided that: 141 | 142 | - they are not used in commercial products 143 | - the above copyright notice and this paragraph are duplicated in 144 | all such forms 145 | - any documentation, advertising materials, and other materials 146 | related to such distribution and use acknowledge that the software 147 | was developed by Kevin Humphreys , John 148 | Carroll and Guido Minnen 149 | and refer to the following related publication: 150 | 151 | Guido Minnen, John Carroll and Darren Pearce. 2001. `Applied 152 | morphological processing of English'. Natural Language Engineering, 153 | 7(3). 207-223. 154 | 155 | The name of University of Sheffield may not be used to endorse or 156 | promote products derived from this software without specific prior 157 | written permission. 158 | 159 | This software is provided "as is" and without any express or implied 160 | warranties, including, without limitation, the implied warranties of 161 | merchantibility and fitness for a particular purpose. 162 | 163 | The exception lists were derived semi-automatically from WordNet 1.5, 164 | and various other corpora and MRDs. 165 | 166 | Many thanks to Tim Baldwin, Chris Brew, Bill Fisher, Gerald Gazdar, 167 | Dale Gerdemann, Adam Kilgarriff and Ehud Reiter for suggested 168 | improvements. 169 | 170 | WordNet 1.5 Copyright 1995 by Princeton University. 171 | All rights reseved. 172 | 173 | THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON 174 | UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. 175 | BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON UNIVERSITY MAKES NO 176 | REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY 177 | PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE 178 | OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, 179 | COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. 180 | 181 | The name of Princeton University or Princeton may not be used in 182 | advertising or publicity pertaining to distribution of the software 183 | and/or database. Title to copyright in this software, database and 184 | any associated documentation shall at all times remain with Princeton 185 | University and LICENSEE agrees to preserve same. 
186 | -------------------------------------------------------------------------------- /drugstory.txt: -------------------------------------------------------------------------------- 1 | Topic 4 (3.00): 26.7% 2 | Most relevant words: 3 | drugs (3528,10): 2.028/6.412/0.735/7.070 drug (1942,15): 1.722/6.668/0.691/6.910 use (176,7): 1.555/4.115/0.345/2.396 prescriptions (34657,3): 1.314/2.277/0.586/7.272 generic (7602,4): 1.307/2.613/0.310/2.722 patients (2468,9): 1.257/3.771/0.660/5.960 hospitals (4777,5): 1.163/2.601/0.443/4.253 turing (18313,9): 1.130/3.390/0.067/0.502 treatment (1460,5): 1.061/2.373/0.617/5.572 used (82,4): 1.037/2.074/0.174/1.317 pharmaceutical (8897,2): 0.919/1.300/0.558/5.198 pharmaceuticals (16906,3): 0.908/1.572/0.476/4.394 health (502,3): 0.898/1.556/0.468/3.955 medicine (1528,2): 0.891/1.259/0.447/4.153 controlled (2257,3): 0.880/1.524/0.234/2.163 treat (5814,3): 0.858/1.485/0.556/4.734 therapies (21018,2): 0.831/1.175/0.566/4.696 certain (1095,4): 0.825/1.650/0.297/2.043 dr (570,5): 0.825/1.845/0.158/1.255 treatments (9357,2): 0.791/1.118/0.642/5.847 doctors (4958,2): 0.776/1.098/0.455/4.300 generics (85144,1): 0.750/0.750/0.407/6.914 care (1069,2): 0.738/1.044/0.502/4.284 dangerous (3584,2): 0.713/1.008/0.253/2.339 sinai (13035,3): 0.709/1.228/0.147/1.184 patient (3777,2): 0.703/0.995/0.571/5.049 tablet (10396,2): 0.679/0.960/0.158/1.565 effects (1380,2): 0.665/0.941/0.326/2.985 pills (18688,1): 0.660/0.660/0.454/3.698 antibiotic (24098,2): 0.652/0.922/0.502/3.627 4 | Most similar words in vocab: 5 | drugs: 0.735/7.070 medication: 0.724/6.284 medications: 0.715/6.052 drug: 0.691/6.910 antiretroviral: 0.667/7.589 prescription: 0.664/5.763 patients: 0.660/5.960 treatments: 0.642/5.847 methadone: 0.636/7.626 medicines: 0.632/5.483 antipsychotic: 0.631/7.077 antiepileptic: 0.623/7.670 ivig: 0.623/7.779 prophylactic: 0.622/7.108 treatment: 0.617/5.572 regimens: 0.610/7.109 therapeutic: 0.600/5.231 contraindicated: 0.600/6.283 analgesics: 0.597/6.625 dosages: 0.590/6.364 prescriptions: 0.586/7.272 warfarin: 0.575/6.138 prescribing: 0.572/7.143 patient: 0.571/5.049 therapies: 0.566/4.696 nsaids: 0.565/6.234 diazepam: 0.564/5.971 antidepressant: 0.564/6.338 opiates: 0.563/6.488 clonazepam: 0.563/7.389 6 | 7 | Topic 1 (3.00): 21.9% 8 | Most relevant words: 9 | increase (1265,8): 2.380/6.732/0.634/4.916 million (357,6): 2.331/5.710/0.637/6.217 increases (4242,4): 1.378/2.756/0.490/4.213 raised (1286,4): 1.303/2.606/0.289/2.576 dollars (4434,2): 1.262/1.784/0.665/5.687 year (54,6): 1.221/2.991/0.144/1.050 percent (1651,2): 1.163/1.645/0.515/4.703 sales (1485,3): 1.151/1.993/0.469/4.523 huge (3131,2): 1.066/1.508/0.338/2.961 money (808,2): 1.013/1.433/0.478/4.058 pay (1653,2): 0.988/1.397/0.503/4.325 cost (1178,2): 0.980/1.386/0.583/4.606 millions (5429,1): 0.882/0.882/0.538/4.874 hundreds (3443,2): 0.851/1.204/0.297/2.429 amount (1588,1): 0.834/0.834/0.549/4.450 rebates (60162,1): 0.815/0.815/0.595/6.684 clamoring (148365,1): 0.800/0.800/0.362/5.470 increased (1167,1): 0.791/0.791/0.534/4.307 shrank (46494,1): 0.787/0.787/0.363/4.175 month (1066,2): 0.758/1.072/0.294/2.159 fund (2014,3): 0.743/1.287/0.422/3.718 thousands (2861,1): 0.717/0.717/0.359/3.126 income (874,1): 0.685/0.685/0.473/3.421 annual (895,1): 0.680/0.680/0.269/2.632 lawmakers (27291,2): 0.679/0.960/0.291/3.278 tens (11057,1): 0.655/0.655/0.362/2.360 sharply (12378,2): 0.640/0.905/0.224/1.914 less (615,2): 0.639/0.903/0.331/2.294 ago (3139,2): 0.633/0.895/0.175/1.418 shorting (108317,1): 
0.627/0.627/0.348/6.101 10 | Most similar words in vocab: 11 | dollars: 0.665/5.687 million: 0.637/6.217 increase: 0.634/4.916 billion: 0.632/6.304 profits: 0.605/4.967 rebates: 0.595/6.684 costs: 0.591/4.978 revenues: 0.583/5.086 cost: 0.583/4.606 outlay: 0.565/6.000 recouping: 0.561/8.163 reinvest: 0.552/7.343 shortfall: 0.549/6.193 amount: 0.549/4.450 billions: 0.546/3.705 euros: 0.544/4.286 exorbitant: 0.543/6.478 repayments: 0.542/5.822 tenfold: 0.540/6.259 millions: 0.538/4.874 surtax: 0.535/7.351 increased: 0.534/4.307 refunds: 0.529/6.098 tripling: 0.527/6.139 refinancing: 0.523/5.726 quadrupled: 0.523/5.553 disburse: 0.521/6.913 disbursed: 0.518/5.408 trillion: 0.518/4.273 skyrocketed: 0.517/5.861 12 | 13 | Topic 12 (3.00): 16.4% 14 | Most relevant words: 15 | company (151,8): 2.758/7.800/0.724/5.842 acquired (1447,5): 2.038/4.557/0.544/4.705 companies (939,4): 1.687/3.374/0.649/5.130 mr (757,5): 1.105/2.471/0.218/1.613 manager (830,2): 1.087/1.537/0.285/2.496 hedge (14848,3): 1.063/1.842/0.366/3.228 business (354,2): 0.990/1.400/0.436/3.313 filed (3883,2): 0.949/1.342/0.391/3.739 now (169,6): 0.903/2.213/0.182/1.043 icahn (83210,1): 0.883/0.883/0.422/8.258 sold (614,2): 0.845/1.194/0.408/3.373 august (149,3): 0.828/1.434/0.113/0.856 investors (5976,2): 0.825/1.167/0.529/4.703 founder (1298,1): 0.820/0.820/0.362/2.904 bank (640,1): 0.767/0.767/0.369/3.361 chief (508,2): 0.737/1.043/0.161/1.342 acquisition (4356,1): 0.721/0.721/0.484/4.211 executive (876,1): 0.716/0.716/0.329/2.594 fund (2014,3): 0.713/1.235/0.449/3.955 president (211,1): 0.695/0.695/0.237/2.133 stock (1878,2): 0.695/0.983/0.396/3.792 directors (2573,1): 0.651/0.651/0.343/2.746 director (302,1): 0.639/0.639/0.228/1.749 glaxo (101914,1): 0.636/0.636/0.394/7.630 former (185,1): 0.622/0.622/0.157/1.254 private (597,1): 0.614/0.614/0.348/2.993 atlanta (2271,2): 0.609/0.861/0.177/1.510 announced (568,1): 0.608/0.608/0.321/2.392 university (61,2): 0.595/0.841/0.050/0.472 marketing (2693,1): 0.567/0.567/0.408/3.683 16 | Most similar words in vocab: 17 | company: 0.724/5.842 companies: 0.649/5.130 corporation: 0.595/5.294 interpublic: 0.588/7.334 subsidiaries: 0.587/4.875 unitedhealth: 0.581/8.112 corp: 0.572/5.563 company's: 0.564/4.652 shareholder: 0.561/4.272 inc: 0.557/5.278 ceo: 0.555/5.131 investments: 0.555/4.823 shareholders: 0.550/4.495 subsidiary: 0.548/5.308 ameriprise: 0.547/7.710 acquired: 0.544/4.705 firm: 0.535/4.619 wellpoint: 0.535/8.259 holdings: 0.531/5.067 venture: 0.530/4.646 investment: 0.529/4.560 investors: 0.529/4.703 acquisitions: 0.528/4.164 smithkline: 0.526/7.536 cendant: 0.524/7.746 vornado: 0.521/7.003 purchased: 0.519/4.289 investor: 0.517/4.443 buyout: 0.515/3.424 sungard: 0.515/7.334 18 | 19 | Topic 7 (3.00): 10.5% 20 | Most relevant words: 21 | patients (2468,9): 1.261/3.783/0.764/6.903 treatment (1460,5): 0.813/1.818/0.692/6.245 hospitals (4777,5): 0.731/1.635/0.492/4.728 treat (5814,3): 0.515/0.893/0.607/5.164 patient (3777,2): 0.506/0.716/0.640/5.660 health (502,3): 0.470/0.814/0.503/4.247 care (1069,2): 0.448/0.633/0.554/4.723 doctors (4958,2): 0.446/0.631/0.495/4.685 treatments (9357,2): 0.426/0.602/0.677/6.167 use (176,7): 0.368/0.973/0.272/1.894 therapies (21018,2): 0.366/0.517/0.580/4.815 dr (570,5): 0.353/0.789/0.170/1.345 cancer (1871,2): 0.348/0.493/0.514/4.970 inpatient (33848,1): 0.338/0.338/0.465/5.812 turing (18313,9): 0.332/0.997/0.029/0.218 medicine (1528,2): 0.330/0.466/0.441/4.098 hospital (723,1): 0.324/0.324/0.394/3.612 sinai (13035,3): 0.317/0.549/0.163/1.318 aids 
(4693,2): 0.309/0.437/0.470/4.507 certain (1095,4): 0.303/0.607/0.288/1.983 medically (28191,1): 0.302/0.302/0.421/6.117 treating (10124,1): 0.296/0.296/0.613/4.943 drugs (3528,10): 0.293/0.926/0.632/6.074 threatening (7927,2): 0.278/0.393/0.248/2.211 medical (679,1): 0.277/0.277/0.549/4.698 serious (2375,2): 0.268/0.379/0.276/2.461 used (82,4): 0.266/0.531/0.118/0.894 neglected (12251,2): 0.264/0.374/0.193/1.493 effects (1380,2): 0.262/0.370/0.327/2.991 antibiotic (24098,2): 0.261/0.369/0.505/3.650 22 | Most similar words in vocab: 23 | patients: 0.764/6.903 medication: 0.705/6.118 medications: 0.693/5.863 treatment: 0.692/6.245 treatments: 0.677/6.167 antiretroviral: 0.646/7.345 patient: 0.640/5.660 drugs: 0.632/6.074 prophylactic: 0.625/7.142 ivig: 0.619/7.731 contraindicated: 0.614/6.435 treating: 0.613/4.943 regimens: 0.610/7.113 antiepileptic: 0.608/7.483 treat: 0.607/5.164 diabetes: 0.597/5.192 therapeutic: 0.593/5.167 methadone: 0.593/7.109 drug: 0.590/5.897 clinical: 0.584/5.297 chemotherapy: 0.583/4.497 antipsychotic: 0.583/6.538 therapies: 0.580/4.815 medicines: 0.579/5.023 treatable: 0.578/6.294 diarrheal: 0.577/7.264 prophylaxis: 0.576/6.743 prescription: 0.572/4.964 therapy: 0.571/5.616 chronic: 0.570/5.024 24 | 25 | Topic 18 (3.00): 9.9% 26 | Most relevant words: 27 | price (1535,17): 4.027/16.602/0.962/8.408 prices (4345,4): 1.786/3.572/0.821/8.028 gouge (81038,1): 0.840/0.840/0.227/4.774 priced (14909,1): 0.715/0.715/0.549/4.614 high (104,3): 0.479/0.830/0.165/1.473 sharply (12378,2): 0.389/0.550/0.248/2.118 year (54,6): 0.322/0.789/0.058/0.421 stock (1878,2): 0.315/0.445/0.357/3.424 rose (1336,1): 0.313/0.313/0.180/1.313 sales (1485,3): 0.271/0.469/0.392/3.777 mr (757,5): 0.267/0.597/0.083/0.617 mount (1283,3): 0.262/0.453/0.009/0.096 old (204,5): 0.260/0.582/-0.037/-0.259 according (331,4): 0.253/0.507/0.083/0.495 turing (18313,9): 0.253/0.758/-0.008/-0.061 low (612,1): 0.246/0.246/0.303/2.538 august (149,3): 0.241/0.418/0.006/0.048 made (94,3): 0.237/0.410/0.072/0.516 last (237,2): 0.227/0.322/-0.035/-0.198 jumped (11157,2): 0.223/0.315/0.118/0.849 scott (1062,1): 0.223/0.223/0.102/0.653 piggy (34457,1): 0.221/0.221/0.147/2.015 cost (1178,2): 0.215/0.305/0.480/3.794 martin (778,1): 0.214/0.214/0.048/0.307 swindle (42243,1): 0.213/0.213/0.193/2.690 long (195,3): 0.213/0.369/0.024/0.201 spencer (4173,1): 0.213/0.213/0.117/0.880 raised (1286,4): 0.209/0.418/0.163/1.449 standard (712,2): 0.208/0.294/0.112/0.937 shortages (16485,1): 0.204/0.204/0.268/2.213 28 | Most similar words in vocab: 29 | price: 0.962/8.408 prices: 0.821/8.028 pricing: 0.602/5.262 priced: 0.549/4.614 inflation: 0.535/4.948 commodity: 0.497/4.148 discount: 0.490/4.234 exorbitant: 0.489/5.836 cost: 0.480/3.794 discounted: 0.478/3.183 resale: 0.467/5.345 purchases: 0.464/3.577 costs: 0.463/3.895 rates: 0.439/4.075 purchasing: 0.438/3.479 buyers: 0.435/3.705 commodities: 0.431/3.727 buying: 0.429/3.555 undervalued: 0.426/4.792 demand: 0.426/3.511 premium: 0.424/4.245 skyrocketed: 0.423/4.790 asset's: 0.420/5.837 repayments: 0.419/4.495 market: 0.418/3.411 tariff: 0.416/3.359 volatility: 0.415/5.309 valuations: 0.415/4.747 cents: 0.414/3.823 buy: 0.414/3.396 30 | -------------------------------------------------------------------------------- /TopicCloud.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from random import Random 3 | from os import path 4 | from wordcloud.wordcloud import WordCloud, IntegralOccupancyMap 5 | from operator import 
itemgetter 6 | import numpy as np 7 | import pdb 8 | import colorsys 9 | from nltk.stem.wordnet import WordNetLemmatizer 10 | from morpha import lemmatize 11 | import re 12 | 13 | from PIL import Image 14 | from PIL import ImageColor 15 | from PIL import ImageDraw 16 | from PIL import ImageFont 17 | 18 | def str2dict(s): 19 | wordlist = re.split( "\s+", s ) 20 | worddict = {} 21 | for w in wordlist: 22 | worddict[w] = 1 23 | return worddict 24 | 25 | #lmtzr = WordNetLemmatizer() 26 | random_state = Random() 27 | img_padding = 15 28 | 29 | specialNounsStr = "embeddings" 30 | specialVerbsStr = "" 31 | specialNounDict = str2dict(specialNounsStr) 32 | specialVerbDict = str2dict(specialVerbsStr) 33 | originalStr = "embedding turing sinai saudi data" 34 | originalDict = str2dict(originalStr) 35 | 36 | def clockwise(start_angle, stop_angle): 37 | start_angle = start_angle % 360 38 | stop_angle = stop_angle % 360 39 | # clockwise (90 degree at bottom, as the custom of pillow), start is the first and stop is the second 40 | # so start_angle < stop_angle 41 | if stop_angle < start_angle: 42 | if start_angle - stop_angle < 180: 43 | start_angle, stop_angle = stop_angle, start_angle 44 | else: 45 | stop_angle += 360 46 | 47 | return start_angle, stop_angle 48 | 49 | def genSectorMask( width, height, start_angle, stop_angle ): 50 | 51 | start_angle, stop_angle = clockwise(start_angle, stop_angle) 52 | sector_mask = np.ones( (height, width) ) 53 | origin_x = width / 2 54 | origin_y = height / 2 55 | sin1 = np.sin( start_angle * np.pi / 180 ) 56 | cos1 = np.cos( start_angle * np.pi / 180 ) 57 | sin2 = np.sin( stop_angle * np.pi / 180 ) 58 | cos2 = np.cos( stop_angle * np.pi / 180 ) 59 | reservedCenterRadius = 5 60 | maxRadius = min(width, height) * 0.5 - img_padding 61 | for y in xrange(height): 62 | for x in xrange(width): 63 | x2 = x - origin_x 64 | y2 = (height - y) - origin_y 65 | radius = np.sqrt(x2*x2 + y2*y2) 66 | if radius >= reservedCenterRadius and radius <= maxRadius and sin1 * x2 <= -cos1 * y2 and sin2 * x2 >= -cos2 * y2: 67 | sector_mask[y,x] = 0 68 | 69 | return sector_mask 70 | 71 | def d3_category20_rand(topicID): 72 | d3_category20 = [ # "#aec7e8", "#ffbb78", "#98df8a", 73 | # "#d62728", too striking red, "#ff7f0e", orange is alerting; 74 | # "#bcbd22", ugly; "#e377c2", striking 75 | "#2ca02c", "#9467bd", "#1f77b4", "#ff9896", 76 | "#17becf", "#7f7f7f", "#8c564b", "#c49c94" ] 77 | # ""#c5b0d5", "#c49c94", 78 | # "#f7b6d2", "#c7c7c7", "#dbdb8d", "#9edae5" 79 | colorID = topicID % len(d3_category20) 80 | basecolor = d3_category20[colorID] 81 | r, g, b = ImageColor.getrgb(basecolor) 82 | fluc = 60 83 | r += random_state.randint( 0, fluc ) - fluc/2 84 | g += random_state.randint( 0, fluc ) - fluc/2 85 | b += random_state.randint( 0, fluc ) - fluc/2 86 | r = min( max(r, 0), 255 ) 87 | g = min( max(g, 0), 255 ) 88 | b = min( max(b, 0), 255 ) 89 | return "rgb(%d, %d, %d)" %(r, g, b) 90 | 91 | def lemmatize2(word): 92 | if word in originalDict: 93 | return word 94 | 95 | candidatePOSs = ('n', 'v') 96 | 97 | if word in specialNounDict: 98 | candidatePOSs = [ 'n' ] 99 | if word in specialVerbDict: 100 | candidatePOSs = [ 'v' ] 101 | 102 | for pos in candidatePOSs: 103 | #w2 = lmtzr.lemmatize(word, pos) 104 | w2 = lemmatize(word, pos) 105 | if w2 != word: 106 | return w2 107 | return word 108 | 109 | class TopicCloud(WordCloud): 110 | def __init__(self, min_sector_padding=0, max_topic_num=10, max_sector_angle=150, max_topic_prop_ratio=6, 111 | min_sector_angle=20, max_topic_words=10, 
min_word_topic_prop=0.5, **kwargs): 112 | super(TopicCloud, self).__init__(**kwargs) 113 | self.min_sector_padding = min_sector_padding 114 | self.max_topic_num = max_topic_num 115 | self.max_sector_angle = max_sector_angle 116 | self.min_sector_angle = min_sector_angle 117 | self.max_topic_prop_ratio = max_topic_prop_ratio 118 | self.max_topic_words = max_topic_words 119 | self.min_word_topic_prop = min_word_topic_prop 120 | self.margin = 4 121 | self.font_path = "C:/Windows/fonts/impact.ttf" 122 | self.background_color = "white" 123 | self.prefer_horizontal = 1 124 | 125 | def generate_from_topics(self, topics): 126 | """Create a topic_cloud from topics. 127 | 128 | Parameters 129 | ---------- 130 | topics : array of tuples 131 | Each topic: (proportion in the document, [ (word1, freq1), (word2, freq2), ... ] ) 132 | 133 | Returns 134 | ------- 135 | self 136 | 137 | """ 138 | 139 | # lemmatizing 140 | for topic in topics: 141 | words_freq = topic[1] 142 | words_freq2 = [] 143 | word2idx = {} 144 | idx = 0 145 | for word, freq in words_freq: 146 | word2 = lemmatize2(word) 147 | if word2 in word2idx: 148 | wid = word2idx[word2] 149 | words_freq2[wid][1] += freq 150 | else: 151 | words_freq2.append( [word2, freq] ) 152 | word2idx[word2] = idx 153 | idx += 1 154 | 155 | words_freq2 = sorted(words_freq2, key=itemgetter(1), reverse=True) 156 | for i in xrange( len(words_freq2)-1, -1, -1 ): 157 | if words_freq2[i][1] >= self.min_word_topic_prop: 158 | break 159 | words_freq2 = words_freq2[:i+1] 160 | 161 | topic[1] = words_freq2[:self.max_topic_words] 162 | # topic_mass = sum( [ len(w) for (w,f) in topic[1] ] ) 163 | # topic_masses.append(topic_mass) 164 | #topic[0] *= topic[1][0][1] * sum( [ word_freq[1] for word_freq in topic[1] ] ) 165 | 166 | # make sure topics are sorted and normalized 167 | topics = sorted( topics, key=itemgetter(0), reverse=True ) 168 | if len(topics) > self.max_topic_num: 169 | topics = topics[:self.max_topic_num] 170 | min_topic_prop = topics[0][0] / self.max_topic_prop_ratio 171 | for i in xrange( len(topics)-1, 0, -1 ): 172 | if topics[i][0] >= min_topic_prop: 173 | break 174 | topics = topics[:i+1] 175 | T = len(topics) 176 | 177 | #topic_masses = [] 178 | topic_masses = np.ones(T) 179 | 180 | # sqrt for smoothing 181 | total_props = sum( [ np.power(topics[i][0] * topic_masses[i],0.8) for i in xrange(len(topics)) ] ) 182 | for i in xrange(len(topics)): 183 | topics[i][0] = np.power(topics[i][0] * topic_masses[i],0.8) / total_props 184 | 185 | avail_angles = 360 - T * self.min_sector_padding 186 | max_angle = avail_angles * topics[0][0] 187 | angle_scale = 1 188 | if max_angle > self.max_sector_angle: 189 | angle_scale = self.max_sector_angle / max_angle 190 | topic_angles = [] 191 | for topic in topics: 192 | topic_angles.append( avail_angles * topic[0] * angle_scale ) 193 | sector_padding = ( 360 - sum(topic_angles) ) / T 194 | topic_angles = np.array(topic_angles) 195 | 196 | height, width = self.height, self.width 197 | # create image 198 | img_grey = Image.new("L", (width, height)) 199 | draw = ImageDraw.Draw(img_grey) 200 | img_array = np.asarray(img_grey) 201 | total_freqs, font_sizes, positions, orientations, colors = [], [], [], [], [] 202 | 203 | if self.random_state is not None: 204 | random_state = self.random_state 205 | else: 206 | random_state = Random() 207 | 208 | sector_masks = [] 209 | sector_angles = [] 210 | 211 | for i,topic in enumerate(topics): 212 | width = self.width 213 | height = self.height 214 | last_freq = 1. 
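                # seed font size for this topic: max_font_size scaled by the square root of
                # the topic's top word weight relative to the top topic's top word, capped at 2x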
215 | font_size = self.max_font_size * min( np.sqrt(topic[1][0][1] / topics[0][1][0][1]), 2 ) 216 | 217 | if i == 0: 218 | # initial angle starts from the symmetric left side of the y-axis 219 | # to ensure first sector always at right above of the canvas 220 | start_angle = 270 - topic_angles[0]/2 221 | stop_angle = 270 + topic_angles[0]/2 222 | else: 223 | start_angle = stop_angle + sector_padding 224 | stop_angle += sector_padding + topic_angles[i] 225 | 226 | # reverse sign to conform with pillow's measurement of angles 227 | sector_angles.append( clockwise(start_angle, stop_angle) ) 228 | #print "%.1f - %.1f =>" %( start_angle % 360, stop_angle % 360), 229 | #print "%.1f - %.1f" %( clockwise(start_angle, stop_angle) ) 230 | 231 | sector_mask = genSectorMask( width, height, start_angle, stop_angle ) 232 | sector_masks.append(sector_mask) 233 | occupancy = IntegralOccupancyMap(height, width, sector_mask) 234 | 235 | frequencies = topic[1][:self.max_words] 236 | frequencies = sorted( frequencies, key=itemgetter(1), reverse=True ) 237 | 238 | # largest entry will be 1 239 | max_frequency = float(frequencies[0][1]) 240 | 241 | frequencies = [ (word, freq / max_frequency) for word, freq in frequencies ] 242 | 243 | if len(frequencies) == 0: 244 | print("We need at least 1 word to plot a word cloud, got 0.") 245 | continue 246 | 247 | total_freqs += frequencies 248 | drawn_words = [] 249 | 250 | # start drawing grey image 251 | for word, freq in frequencies: 252 | # select the font size 253 | rs = self.relative_scaling 254 | if rs != 0: 255 | font_size = int(round((rs * (freq / float(last_freq)) + (1 - rs)) * font_size)) 256 | while True: 257 | # try to find a position 258 | font = ImageFont.truetype(self.font_path, font_size) 259 | # transpose font optionally 260 | if random_state.random() < self.prefer_horizontal: 261 | orientation = None 262 | else: 263 | orientation = Image.ROTATE_90 264 | transposed_font = ImageFont.TransposedFont(font, 265 | orientation=orientation) 266 | # get size of resulting text 267 | box_size = draw.textsize(word, font=transposed_font) 268 | # find possible places using integral image: 269 | result = occupancy.sample_position(box_size[1] + 2 * self.margin, 270 | box_size[0] + 2 * self.margin, 271 | random_state) 272 | if result is not None or font_size == 0: 273 | break 274 | # if we didn't find a place, make font smaller 275 | font_size -= self.font_step 276 | 277 | if font_size < self.min_font_size: 278 | # we were unable to draw any more 279 | font_size = self.min_font_size 280 | drawn_words.append(word) 281 | 282 | x, y = np.array(result) + self.margin // 2 283 | # actually draw the text 284 | draw.text((y, x), word, fill="white", font=transposed_font) 285 | positions.append((x, y)) 286 | orientations.append(orientation) 287 | font_sizes.append(font_size) 288 | colors.append(d3_category20_rand(i)) 289 | 290 | # recompute integral image 291 | img_array = ( np.asarray(img_grey) + sector_mask ) > 0 292 | # recompute bottom right 293 | # the order of the cumsum's is important for speed ?! 
294 |                 occupancy.update(img_array, x, y)
295 |                 last_freq = freq
296 | 
297 |             print "Topic %d (%.1f):" %(i+1, topic_angles[i])
298 |             print drawn_words
299 | 
300 |         # for i in xrange(len(sector_masks)):
301 |         #     for j in xrange(i):
302 |         #         if np.any( (1-sector_masks[i]) * (1-sector_masks[j]) ):
303 |         #             pdb.set_trace()
304 | 
305 |         self.layout_ = list(zip(total_freqs, font_sizes, positions, orientations, colors))
306 |         self.sector_angles = sector_angles
307 |         return self
308 | 
309 |     def to_image(self):
310 |         self._check_generated()
311 |         height, width = self.height, self.width
312 | 
313 |         img = Image.new(self.mode, (int(width * self.scale), int(height * self.scale)),
314 |                         self.background_color)
315 | 
316 |         draw = ImageDraw.Draw(img)
317 |         bbox = (img_padding, img_padding, width-img_padding, height-img_padding)  # use width for the x-extent; passing height twice only worked for square canvases
318 | 
319 |         colors = [ "rgb(255,255,242)", "rgb(255,242,255)", "rgb(242,255,255)", "rgb(242,242,242)" ]
320 |         i = 0
321 |         if len(self.sector_angles) % len(colors) == 1:
322 |             modulus = len(colors) - 1
323 |         else:
324 |             modulus = len(colors)
325 | 
326 |         for (start_angle, stop_angle) in self.sector_angles:
327 |             draw.pieslice(bbox, start_angle, stop_angle, fill = colors[i%modulus])
328 |             i += 1
329 |             #print "%d-%d: %s" %(start_angle, stop_angle, colors[i%3])
330 | 
331 |         for (word, count), font_size, position, orientation, color in self.layout_:
332 |             font = ImageFont.truetype(self.font_path, int(font_size * self.scale))
333 |             transposed_font = ImageFont.TransposedFont(font,
334 |                                                        orientation=orientation)
335 |             pos = (int(position[1] * self.scale), int(position[0] * self.scale))
336 |             draw.text(pos, word, fill=color, font=transposed_font)
337 |         return img
--------------------------------------------------------------------------------
/gencloud.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | 
3 | # an example of using TopicCloud
4 | 
5 | from os import path
6 | from wordcloud import WordCloud
7 | import sys
8 | #sys.path.append("C:/Dropbox/topicvec/visualization/")
9 | from TopicCloud import TopicCloud
10 | 
11 | topics_drugstory = [
12 |     [ 26.7, [ ('drugs', 2.028), ('drug', 1.722), ('use', 1.555),
13 |               ('prescriptions', 1.314), ('generic', 1.307), ('patients', 1.257),
14 |               ('hospitals', 1.163), ('turing', 1.130), ('treatment', 1.061),
15 |               ('used', 1.037), ('pharmaceutical', 0.919), ('pharmaceuticals', 0.908),
16 |               ('health', 0.898), ('medicine', 0.891) ] ],
17 |     [ 21.9, [ ('increase', 2.380), ('million', 2.331), ('increases', 1.378),
18 |               ('raised', 1.303), ('dollars', 1.262), ('year', 1.221),
19 |               ('percent', 1.163), ('sales', 1.151), ('huge', 1.066),
20 |               ('money', 1.013), ('pay', 0.988), ('cost', 0.980),
21 |               ('millions', 0.882), ('hundreds', 0.851) ] ],
22 |     [ 16.4, [ ('company', 2.758), ('acquired', 2.038), ('companies', 1.687),
23 |               ('mr', 1.105), ('manager', 1.087), ('hedge', 1.063),
24 |               ('business', 0.990), ('filed', 0.949), ('now', 0.903),
25 |               ('icahn', 0.883), ('sold', 0.845), ('august', 0.828),
26 |               ('investors', 0.825), ('founder', 0.820) ] ],
27 |     [ 10.5, [ ('patients', 1.261), ('treatment', 0.813), ('hospitals', 0.731),
28 |               ('treat', 0.515), ('patient', 0.506), ('health', 0.470),
29 |               ('care', 0.448), ('doctors', 0.446), ('treatments', 0.426),
30 |               ('use', 0.368), ('therapies', 0.366), ('dr', 0.353),
31 |               ('cancer', 0.348), ('inpatient', 0.338) ] ],
32 |     [ 9.9, [ ('price', 4.027), ('prices', 1.786), ('gouge', 0.840),
33 |              ('priced', 0.715), ('high', 0.479), ('sharply', 0.389),
34 |              ('year', 0.322),
('stock', 0.315), ('rose', 0.313), 35 | ('sales', 0.271), ('mr', 0.267), ('mount', 0.262), 36 | ('according', 0.253), ('low', 0.246) ] ] 37 | ] 38 | 39 | topics_drugstory_8 = [ 40 | [ 27.1, [ ('turing', 3.00), ('year', 2.35), ('use', 2.31), 41 | ('old', 2.15), ('mr', 2.14), ('generic', 1.99), 42 | ('dr', 1.96), ('used', 1.89), ('now', 1.79), 43 | ('first', 1.69), ('two', 1.59), ('called', 1.58), 44 | ('standard', 1.38), ('manager', 1.38) ] ], 45 | [ 22.8, [ ('drug', 3.52), ('drugs', 2.94), ('patients', 2.85), 46 | ('diseases', 2.39), ('hospitals', 2.13), ('toxoplasmosis', 2.00), 47 | ('infectious', 1.96), ('treatment', 1.93), ('certain', 1.77), 48 | ('treat', 1.61), ('controlled', 1.43), ('effects', 1.39), 49 | ('infection', 1.37), ('dangerous', 1.37) ] ], 50 | [ 22.8, [ ('raised', 1.99), ('mount', 1.72), ('sinai', 1.71), 51 | ('center', 1.67), ('hedge', 1.66), ('fund', 1.64), 52 | ('high', 1.54), ('lawmakers', 1.40), ('marathon', 1.35), 53 | ('atlanta', 1.35), ('investors', 1.32), ('sharply', 1.21), 54 | ('according', 1.16), ('jumped', 0.97) ] ], 55 | [ 22.1, [ ('price', 4.08), ('increase', 2.73), ('increases', 1.96), 56 | ('prices', 1.78), ('distribution', 1.64), ('better', 1.62), 57 | ('sales', 1.52), ('prescriptions', 1.47), ('make', 1.45), 58 | ('million', 1.40), ('huge', 1.34), ('made', 1.27), 59 | ('pay', 1.22), ('cost', 1.19) ] ], 60 | [ 5.2, [ ('acquired', 2.22), ('company', 2.00), ('pharmaceuticals', 1.65), 61 | ('companies', 1.52), ('pharmaceutical', 1.17), ('therapeutics', 0.97), 62 | ('glaxo', 0.95), ('drugstores', 0.94), ('acquisition', 0.88), 63 | ('glaxosmithkline', 0.82), ('sold', 0.72), ('products', 0.59), 64 | ('laboratories', 0.39), ('develop', 0.18) ] ] 65 | ] 66 | 67 | topics_drugstory_kmeans = [ 68 | [ 10.0, [ ('company', 2.83), ('year', 2.45), ('dr', 2.24), 69 | ('acquired', 2.24), ('raised', 2.00), ('first', 1.73), 70 | ('august', 1.73), ('sinai', 1.73), ('center', 1.73), 71 | ('mount', 1.73), ('time', 1.41), ('university', 1.41), 72 | ('life', 1.41), ('same', 1.41) ] ], 73 | [ 9.0, [ ('now', 2.45), ('mr', 2.24), ('old', 2.24), 74 | ('called', 1.73), ('even', 1.73), ('make', 1.73), 75 | ('better', 1.73), ('think', 1.41), ('like', 1.41), 76 | ('side', 1.41), ('jumped', 1.41), ('ago', 1.41), 77 | ('threatening', 1.41), ('trying', 1.41) ] ], 78 | [ 8.0, [ ('federal', 1.41), ('lawmakers', 1.41), ('filed', 1.41), 79 | ('chief', 1.41), ('claim', 1.00), ('state', 1.00), 80 | ('united', 1.00), ('states', 1.00), ('general', 1.00), 81 | ('senator', 1.00), ('district', 1.00), ('public', 1.00), 82 | ('former', 1.00), ('president', 1.00) ] ], 83 | [ 5.0, [ ('price', 4.12), ('million', 2.45), ('prices', 2.00), 84 | ('companies', 2.00), ('hedge', 1.73), ('sales', 1.73), 85 | ('fund', 1.73), ('dollars', 1.41), ('business', 1.41), 86 | ('sold', 1.41), ('money', 1.41), ('cost', 1.41), 87 | ('pay', 1.41), ('stock', 1.41) ] ], 88 | [ 5.0, [ ('turing', 3.00), ('use', 2.65), ('used', 2.00), 89 | ('generic', 2.00), ('controlled', 1.73), ('tablet', 1.41), 90 | ('standard', 1.41), ('strategy', 1.41), ('system', 1.00), 91 | ('patents', 1.00), ('example', 1.00), ('data', 1.00), 92 | ('switch', 1.00), ('systems', 1.00) ] ], 93 | [ 5.0, [ ('made', 1.73), ('long', 1.73), ('huge', 1.41), 94 | ('attention', 1.41), ('mainstays', 1.00), ('criticism', 1.00), 95 | ('led', 1.00), ('further', 1.00), ('making', 1.00), 96 | ('despite', 1.00), ('controversy', 1.00), ('drew', 1.00), 97 | ('previous', 1.00), ('overnight', 1.00) ] ], 98 | [ 4.0, [ ('drug', 3.87), ('drugs', 3.16), ('patients', 3.00), 99 | 
        ('treatment', 2.24), ('hospitals', 2.24), ('health', 1.73),
        ('pharmaceuticals', 1.73), ('prescriptions', 1.73), ('therapies', 1.41),
        ('pharmaceutical', 1.41), ('doctors', 1.41), ('care', 1.41),
        ('treatments', 1.41), ('medicine', 1.41) ] ],
    [ 4.0, [ ('increase', 2.83), ('increases', 2.00), ('certain', 2.00),
        ('serious', 1.41), ('less', 1.41), ('required', 1.41),
        ('need', 1.41), ('effects', 1.41), ('dangerous', 1.41),
        ('discourage', 1.00), ('shortages', 1.00), ('rarely', 1.00),
        ('potentially', 1.00), ('possibly', 1.00) ] ],
    [ 4.0, [ ('according', 2.00), ('two', 1.73), ('high', 1.73),
        ('distribution', 1.73), ('sharply', 1.41), ('many', 1.41),
        ('hundreds', 1.41), ('percent', 1.41), ('number', 1.00),
        ('list', 1.00), ('include', 1.00), ('differently', 1.00),
        ('copies', 1.00), ('added', 1.00) ] ]
]

topics_sigir = [
    [ 15.6, [ ('user', 0.904), ('web', 0.621), ('document', 0.589),
        ('query', 0.435), ('cursor', 0.429), ('queries', 0.421),
        ('retrieval', 0.407), ('text', 0.404), ('information', 0.399),
        ('online', 0.397), ('knowledge', 0.375), ('interface', 0.363),
        ('click', 0.342), ('collaborative', 0.319) ] ],
    [ 13.8, [ ('user', 0.482), ('document', 0.439), ('web', 0.403),
        ('knowledge', 0.386), ('entity', 0.379), ('retrieval', 0.376),
        ('information', 0.342), ('collaborative', 0.312), ('queries', 0.308),
        ('leveraging', 0.306), ('query', 0.302), ('text', 0.291),
        ('online', 0.285), ('relevance', 0.285) ] ],
    [ 13.4, [ ('knowledge', 0.379), ('retrieval', 0.369), ('user', 0.345),
        ('document', 0.326), ('web', 0.313), ('information', 0.302),
        ('leveraging', 0.294), ('collaborative', 0.290), ('relevance', 0.290),
        ('queries', 0.261), ('query', 0.253), ('summarization', 0.247),
        ('models', 0.242), ('evaluation', 0.239) ] ],
    [ 12.9, [ ('fast', 0.459), ('neural', 0.318), ('efficient', 0.316),
        ('models', 0.310), ('retrieval', 0.304), ('knowledge', 0.248),
        ('model', 0.237), ('user', 0.233), ('modeling', 0.227),
        ('relevance', 0.225), ('leveraging', 0.224), ('random', 0.219),
        ('networks', 0.218), ('selection', 0.217) ] ],
    [ 12.8, [ ('embeddings', 1.047), ('graphs', 0.438), ('subspace', 0.366),
        ('hamming', 0.363), ('random', 0.351), ('quantization', 0.312),
        ('factorization', 0.297), ('graph', 0.296), ('discrete', 0.291),
        ('parameterized', 0.290), ('generalized', 0.285), ('math', 0.282),
        ('neural', 0.275), ('models', 0.269) ] ],
    [ 12.3, [ ('retrieval', 0.346), ('neural', 0.279), ('knowledge', 0.274),
        ('models', 0.272), ('user', 0.250), ('document', 0.245),
        ('relevance', 0.240), ('information', 0.238), ('queries', 0.236),
        ('web', 0.235), ('modeling', 0.232), ('query', 0.232),
        ('summarization', 0.224), ('efficient', 0.222) ] ],
    [ 9.4, [ ('retrieval', 0.267), ('user', 0.248), ('web', 0.215),
        ('knowledge', 0.213), ('efficient', 0.203), ('queries', 0.196),
        ('query', 0.193), ('neural', 0.193), ('information', 0.192),
        ('leveraging', 0.180), ('document', 0.180), ('models', 0.175),
        ('relevance', 0.170), ('collaborative', 0.167) ] ],
    [ 7.2, [ ('search', 3.558), ('searching', 0.675), ('finding', 0.278),
        ('investigation', 0.203), ('retrieving', 0.190), ('exploration', 0.171),
        ('click', 0.146), ('web', 0.146), ('exploring', 0.124),
        ('online', 0.113), ('answers', 0.107), ('query', 0.096),
        ('cache', 0.095), ('knowledge', 0.092) ] ]
]

topics_icml = [
    [ 14.4, [ ('models', 1.037), ('neural', 1.033), ('data', 0.966),
        ('optimization', 0.912), ('efficient', 0.875), ('model', 0.683),
        ('inference', 0.675), ('analysis', 0.673), ('sampling', 0.601),
        ('bayesian', 0.594), ('stochastic', 0.572), ('clustering', 0.549),
        ('estimation', 0.546), ('structured', 0.521) ] ],
    [ 14.4, [ ('neural', 1.169), ('models', 1.063), ('data', 0.990),
        ('efficient', 0.942), ('optimization', 0.882), ('model', 0.692),
        ('inference', 0.656), ('analysis', 0.648), ('memory', 0.625),
        ('sampling', 0.563), ('bayesian', 0.559), ('structured', 0.549),
        ('clustering', 0.540), ('stochastic', 0.527) ] ],
    [ 13.9, [ ('optimization', 0.914), ('models', 0.891), ('neural', 0.853),
        ('data', 0.814), ('efficient', 0.707), ('stochastic', 0.668),
        ('inference', 0.662), ('analysis', 0.656), ('sampling', 0.623),
        ('bayesian', 0.615), ('estimation', 0.610), ('model', 0.602),
        ('clustering', 0.525), ('sparse', 0.517) ] ],
    [ 12.9, [ ('convex', 1.166), ('embeddings', 1.128), ('matrix', 1.086),
        ('tensor', 0.965), ('factorization', 0.825), ('riemannian', 0.770),
        ('gaussian', 0.762), ('linear', 0.675), ('dimensional', 0.672),
        ('matrices', 0.665), ('subspace', 0.621), ('nonconvex', 0.582),
        ('kernel', 0.581), ('gradient', 0.580) ] ],
    [ 12.3, [ ('stochastic', 0.772), ('optimization', 0.740), ('estimation', 0.577),
        ('models', 0.558), ('regression', 0.534), ('sampling', 0.521),
        ('neural', 0.518), ('sparse', 0.507), ('analysis', 0.493),
        ('linear', 0.493), ('data', 0.488), ('gaussian', 0.488),
        ('matrix', 0.485), ('inference', 0.482) ] ],
    [ 12.2, [ ('matrix', 0.817), ('convex', 0.702), ('gaussian', 0.692),
        ('stochastic', 0.652), ('linear', 0.611), ('tensor', 0.580),
        ('factorization', 0.547), ('gradient', 0.539), ('sparse', 0.535),
        ('embeddings', 0.534), ('optimization', 0.526), ('variational', 0.523),
        ('kernel', 0.488), ('dimensional', 0.460) ] ],
    [ 11.9, [ ('stochastic', 0.740), ('optimization', 0.685), ('estimation', 0.534),
        ('matrix', 0.515), ('gaussian', 0.510), ('regression', 0.506),
        ('models', 0.503), ('linear', 0.498), ('sparse', 0.498),
        ('sampling', 0.479), ('neural', 0.467), ('variational', 0.453),
        ('analysis', 0.452), ('gradient', 0.446) ] ],
    [ 4.4, [ ('deep', 4.408), ('convolutional', 2.100), ('fast', 0.334),
        ('exploration', 0.297), ('inner', 0.227), ('memory', 0.192),
        ('reconstructive', 0.188), ('squeezing', 0.187), ('rectifier', 0.184),
        ('streams', 0.175), ('faster', 0.167), ('neural', 0.163),
        ('layers', 0.157), ('hidden', 0.154) ] ]
]

topics_icml_3 = [
    [ 14.2, [ ('convex', 1.84), ('embeddings', 1.81), ('rank', 1.66),
        ('tensor', 1.58), ('matrix', 1.55), ('riemannian', 1.26),
        ('factorization', 1.25), ('matrices', 1.01), ('dimensional', 1.01),
        ('subspace', 0.96), ('gaussian', 0.89), ('nonconvex', 0.87),
        ('doubly', 0.82), ('metric', 0.78) ] ],
    [ 13.1, [ ('models', 0.85), ('optimization', 0.85), ('data', 0.78),
        ('efficient', 0.73), ('neural', 0.71), ('stochastic', 0.63),
        ('analysis', 0.60), ('inference', 0.59), ('model', 0.58),
        ('sampling', 0.56), ('estimation', 0.54), ('bayesian', 0.53),
        ('sparse', 0.51), ('clustering', 0.51) ] ],
    [ 13.0, [ ('optimization', 0.84), ('models', 0.82), ('data', 0.74),
        ('neural', 0.69), ('efficient', 0.68), ('stochastic', 0.65),
        ('analysis', 0.58), ('inference', 0.58), ('model', 0.56),
        ('sampling', 0.55), ('estimation', 0.55), ('bayesian', 0.53),
        ('sparse', 0.51), ('clustering', 0.50) ] ],
    [ 12.3, [ ('optimization', 0.78), ('stochastic', 0.70), ('models', 0.69),
        ('data', 0.61), ('neural', 0.57), ('efficient', 0.56),
        ('estimation', 0.55), ('analysis', 0.53), ('sampling', 0.53),
        ('inference', 0.52), ('bayesian', 0.50), ('regression', 0.50),
        ('sparse', 0.49), ('model', 0.48) ] ],
    [ 12.3, [ ('optimization', 0.78), ('models', 0.71), ('stochastic', 0.68),
        ('data', 0.63), ('neural', 0.59), ('efficient', 0.58),
        ('estimation', 0.55), ('analysis', 0.54), ('inference', 0.53),
        ('sampling', 0.53), ('bayesian', 0.50), ('model', 0.49),
        ('sparse', 0.49), ('regression', 0.49) ] ],
    [ 12.0, [ ('optimization', 0.75), ('stochastic', 0.69), ('models', 0.65),
        ('data', 0.58), ('neural', 0.55), ('estimation', 0.54),
        ('efficient', 0.53), ('analysis', 0.51), ('sampling', 0.51),
        ('inference', 0.50), ('regression', 0.49), ('sparse', 0.49),
        ('bayesian', 0.48), ('model', 0.46) ] ],
    [ 11.6, [ ('optimization', 0.72), ('stochastic', 0.69), ('models', 0.62),
        ('data', 0.54), ('estimation', 0.53), ('neural', 0.51),
        ('efficient', 0.49), ('sampling', 0.49), ('analysis', 0.49),
        ('regression', 0.48), ('inference', 0.48), ('sparse', 0.47),
        ('bayesian', 0.46), ('linear', 0.44) ] ],
    [ 7.8, [ ('deep', 4.22), ('convolutional', 2.50), ('neural', 0.97),
        ('memory', 0.62), ('fast', 0.61), ('faster', 0.38),
        ('brain', 0.33), ('reconstructive', 0.32), ('rectifier', 0.31),
        ('efficient', 0.27), ('data', 0.26), ('generative', 0.26),
        ('simple', 0.25), ('squeezing', 0.25) ] ]
]

topics_icml_5 = [
    [ 21.8, [ ('stochastic', 3.88), ('optimization', 3.82), ('rank', 3.21),
        ('estimation', 2.95), ('gradient', 2.71), ('monte', 2.38),
        ('gaussian', 2.37), ('variational', 2.35), ('carlo', 2.35),
        ('regression', 2.27), ('optimal', 2.27), ('approximate', 2.18),
        ('descent', 1.96), ('approximation', 1.91) ] ],
    [ 17.7, [ ('networks', 5.51), ('deep', 4.51), ('efficient', 3.88),
        ('fast', 3.20), ('bandits', 2.66), ('faster', 2.36),
        ('search', 2.24), ('online', 2.19), ('network', 2.14),
        ('bandit', 1.42), ('nystrom', 1.41), ('dueling', 1.37),
        ('simple', 1.36), ('anytime', 1.36) ] ],
    [ 17.3, [ ('inference', 3.05), ('reinforcement', 2.73), ('hierarchical', 2.20),
        ('generative', 2.13), ('data', 1.98), ('bayesian', 1.56),
        ('contextual', 1.56), ('clustering', 1.54), ('recurrent', 1.41),
        ('structured', 1.40), ('conditional', 1.40), ('graphical', 1.35),
        ('empirical', 1.31), ('analysis', 1.30) ] ],
    [ 14.8, [ ('matrix', 3.48), ('convex', 3.04), ('embeddings', 2.42),
        ('factorization', 2.38), ('kernel', 2.32), ('tensor', 2.21),
        ('doubly', 1.97), ('dimensional', 1.76), ('matrices', 1.75),
        ('riemannian', 1.66), ('nonconvex', 1.61), ('subspace', 1.61),
        ('decomposition', 1.59), ('dual', 1.43) ] ],
    [ 11.5, [ ('classification', 2.65), ('policy', 2.47), ('supervised', 2.14),
        ('evaluation', 2.09), ('training', 1.89), ('cca', 1.73),
        ('correcting', 1.65), ('testing', 1.47), ('unsupervised', 1.32),
        ('dropout', 1.21), ('pca', 1.21), ('analysis', 1.19),
        ('test', 1.17), ('objectives', 1.13) ] ],
    [ 8.9, [ ('sparse', 3.69), ('sampling', 3.03), ('low', 2.65),
        ('noisy', 1.40), ('high', 1.27), ('large', 1.14),
        ('heavy', 1.10), ('noise', 1.08), ('sample', 1.02),
        ('mixed', 1.00), ('mixture', 0.96), ('mixing', 0.94),
        ('variable', 0.92), ('samples', 0.89) ] ],
    [ 8.0, [ ('neural', 4.23), ('models', 4.22), ('convolutional', 2.72),
        ('memory', 2.21), ('model', 1.96), ('block', 1.05),
        ('data', 0.89), ('architectures', 0.74), ('rectifier', 0.68),
        ('brain', 0.63), ('motor', 0.58), ('activation', 0.57),
        ('unlabeled', 0.54), ('processes', 0.52) ] ]
]

topics_ijcai = [
    [ 14.7, [ ('logic', 2.911), ('semantics', 2.332), ('logics', 2.042),
        ('modal', 1.667), ('semantic', 1.534), ('symbolic', 1.471),
        ('convolutional', 1.394), ('language', 1.384), ('representation', 1.290),
        ('object', 1.248), ('representations', 1.247), ('reasoning', 1.241),
        ('abstraction', 1.188), ('calculus', 1.179) ] ],
    [ 14.7, [ ('neural', 3.449), ('deep', 2.619), ('networks', 1.994),
        ('robot', 1.704), ('human', 1.648), ('network', 1.485),
        ('models', 1.415), ('systems', 1.190), ('model', 1.009),
        ('machine', 0.969), ('robust', 0.884), ('simulation', 0.830),
        ('interactive', 0.741), ('facial', 0.697) ] ],
    [ 12.0, [ ('planning', 4.458), ('efficient', 2.270), ('task', 2.109),
        ('plan', 1.939), ('improving', 1.491), ('joint', 1.490),
        ('strategy', 1.480), ('supervised', 1.259), ('citywide', 1.184),
        ('recommendations', 1.072), ('policy', 1.071), ('transfer', 1.035),
        ('scheduling', 1.033), ('repositioning', 1.022) ] ],
    [ 11.5, [ ('search', 1.508), ('detection', 1.116), ('recognition', 1.029),
        ('information', 0.920), ('data', 0.812), ('knowledge', 0.808),
        ('tracking', 0.783), ('online', 0.736), ('prediction', 0.634),
        ('identification', 0.515), ('selection', 0.497), ('automatic', 0.490),
        ('networks', 0.477), ('robust', 0.476) ] ],
    [ 11.1, [ ('embeddings', 2.786), ('embedding', 2.720), ('factorization', 2.460),
        ('matrix', 1.998), ('kernel', 1.758), ('subspace', 1.731),
        ('graph', 1.645), ('generalized', 1.642), ('norm', 1.620),
        ('metric', 1.591), ('graphs', 1.535), ('hashing', 1.347),
        ('convex', 1.258), ('modulo', 1.257) ] ],
    [ 9.9, [ ('knowledge', 1.085), ('recognition', 0.814), ('information', 0.705),
        ('search', 0.592), ('data', 0.591), ('preference', 0.533),
        ('reasoning', 0.519), ('probabilistic', 0.499), ('query', 0.485),
        ('selection', 0.476), ('text', 0.455), ('representation', 0.439),
        ('classification', 0.435), ('elicitation', 0.428) ] ],
    [ 9.6, [ ('search', 0.778), ('selection', 0.657), ('clustering', 0.653),
        ('data', 0.550), ('prediction', 0.549), ('optimization', 0.524),
        ('information', 0.509), ('preference', 0.508), ('robust', 0.506),
        ('detection', 0.499), ('efficient', 0.490), ('tracking', 0.465),
        ('recognition', 0.455), ('optimal', 0.447) ] ]
]

topics_aamas = [
    [ 23.4, [ ('social', 2.99), ('cooperation', 2.22), ('security', 1.74),
        ('voting', 1.61), ('policies', 1.61), ('cooperative', 1.45),
        ('mechanism', 1.43), ('mechanisms', 1.35), ('networks', 1.30),
        ('systems', 1.23), ('policy', 1.14), ('behavior', 1.07),
        ('preferences', 1.04), ('strategy', 0.98) ] ],
    [ 22.0, [ ('reinforcement', 2.18), ('networks', 1.38), ('task', 1.29),
        ('distributed', 1.23), ('modeling', 1.21), ('systems', 1.12),
        ('scheduling', 1.09), ('online', 1.02), ('planning', 0.88),
        ('efficient', 0.83), ('dynamic', 0.82), ('decision', 0.79),
        ('automated', 0.77), ('simulation', 0.76) ] ],
    [ 16.9, [ ('equilibria', 2.52), ('nash', 1.58), ('optimal', 1.52),
        ('equilibrium', 1.27), ('maximization', 1.16), ('stochastic', 1.02),
        ('models', 0.98), ('matching', 0.95), ('markov', 0.93),
        ('inverse', 0.91), ('model', 0.89), ('efficient', 0.81),
        ('constrained', 0.78), ('continuous', 0.77) ] ],
    [ 14.2, [ ('argumentation', 2.53), ('logic', 2.03), ('theoretic', 1.89),
        ('truthful', 1.88), ('proof', 1.80), ('BDI', 1.71),
        ('hedonic', 1.65), ('reasoning', 1.61), ('epistemic', 1.14),
        ('boolean', 1.14), ('judgment', 1.07), ('abstract', 0.99),
        ('empirical', 0.94), ('propositional', 0.92) ] ],
    [ 7.8, [ ('stackelberg', 1.68), ('reinforcement', 0.99), ('optimal', 0.32),
        ('maximization', 0.30), ('continuous', 0.29), ('model', 0.28),
        ('matching', 0.28), ('decision', 0.27), ('behavior', 0.26),
        ('dynamic', 0.26), ('modeling', 0.26), ('task', 0.25),
        ('models', 0.25), ('strategy', 0.24) ] ],
    [ 6.2, [ ('robot', 3.13), ('human', 2.96), ('robots', 2.19),
        ('robotic', 1.97), ('autonomous', 0.99), ('humanoid', 0.97),
        ('humans', 0.76), ('vehicle', 0.71), ('swarms', 0.71),
        ('automated', 0.45), ('wheeled', 0.43),
        ('poachers', 0.42), ('body', 0.38) ] ],
    [ 4.3, [ ('games', 5.13), ('game', 1.93), ('teams', 1.62),
        ('team', 1.05), ('playing', 0.58), ('player', 0.54),
        ('multiplayer', 0.39), ('players', 0.39), ('atari', 0.34),
        ('scoring', 0.31), ('competitions', 0.31), ('pac', 0.28),
        ('points', 0.26), ('winning', 0.25) ] ],
    [ 3.5, [ ('agent', 5.52), ('agents', 3.53), ('sobe', 0.40),
        ('investigating', 0.09), ('intelligence', 0.06), ('customs', 0.05),
        ('poachers', 0.05), ('assignment', 0.05), ('dealers', 0.05),
        ('contact', 0.04), ('collusion', 0.04), ('security', 0.04),
        ('mdp', 0.04), ('anti', 0.04) ] ]
]

topics_trump = [
    [ 19.1, [ ('know', 5.49), ('say', 4.62), ('think', 2.94),
        ('believe', 2.90), ('tell', 2.60), ('happen', 2.39),
        ('hear', 1.90), ('stupid', 1.75), ('want', 1.75),
        ('cheerleader', 1.71), ('gonna', 1.65), ('heard', 1.60),
        ('answer', 1.60), ('never', 1.52) ] ],
    [ 17.6, [ ('going', 3.72), ('back', 3.71), ('right', 3.16),
        ('send', 2.26), ('over', 2.03), ('take', 1.98),
        ('sending', 1.70), ('money', 1.52), ('running', 1.40),
        ('bring', 1.37), ('run', 1.29), ('stop', 1.23),
        ('go', 1.22), ('deal', 1.05) ] ],
    [ 16.3, [ ('jobs', 3.27), ('need', 3.11), ('job', 2.08),
        ('lobbyists', 1.76), ('obamacare', 1.71), ('care', 1.43),
        ('money', 1.37), ('problems', 1.34), ('vets', 1.30),
        ('politicians', 1.16), ('going', 1.08), ('problem', 1.08),
        ('needs', 1.04), ('bring', 1.02) ] ],
    [ 11.4, [ ('very', 5.03), ('good', 3.65), ('nice', 3.27),
        ('big', 2.64), ('like', 2.26), ('rich', 2.02),
        ('make', 1.36), ('highly', 1.31), ('wonderful', 1.30),
        ('talented', 1.12), ('kind', 1.09), ('makes', 1.09),
        ('bad', 1.08), ('proud', 1.08) ] ],
    [ 10.0, [ ('building', 3.02), ('build', 2.92), ('ford', 2.80),
        ('equipment', 2.21), ('built', 2.04), ('car', 1.81),
        ('manufacturer', 1.49), ('hotel', 1.48), ('tower', 1.32),
        ('manufacturing', 1.21), ('rebuild', 1.20), ('trucks', 1.20),
        ('truck', 1.18), ('cars', 1.16) ] ],
    [ 9.5, [ ('country', 4.82), ('us', 3.47), ('iraq', 2.96),
        ('president', 2.21), ('iran', 2.17), ('military', 2.03),
        ('border', 1.92), ('saudi', 1.82), ('arabia', 1.78),
        ('mexico', 1.66), ('yemen', 1.55), ('united', 1.49),
        ('states', 1.44), ('airports', 1.41) ] ],
    [ 8.8, [ ('china', 4.31), ('billion', 3.38), ('trillion', 2.60),
        ('trade', 1.87), ('mexico', 1.51), ('oil', 1.44),
        ('world', 1.21), ('debt', 1.21), ('worth', 1.09),
        ('currency', 1.08), ('million', 1.07), ('net', 1.06),
        ('over', 1.05), ('japan', 1.05) ] ],
    [ 4.7, [ ('people', 6.59), ('thousands', 1.89), ('person', 0.60),
        ('hundreds', 0.57), ('killing', 0.50), ('many', 0.47),
        ('number', 0.42), ('leaders', 0.37), ('wounded', 0.36),
        ('millions', 0.31), ('tens', 0.27), ('ago', 0.25),
        ('crowd', 0.24), ('soldiers', 0.24) ] ]
]

topics_hillary = [
    [ 22.9, [ ('people', 2.48), ('million', 2.46), ('now', 2.13),
        ('years', 2.10), ('country', 1.78), ('back', 1.69),
        ('working', 1.48), ('time', 1.45), ('today', 1.38),
        ('thank', 1.36), ('millions', 1.36), ('stop', 1.30),
        ('decades', 1.26), ('make', 1.24) ] ],
    [ 17.6, [ ('right', 2.44), ('rightly', 1.16), ('respect', 1.09),
        ('fight', 1.05), ('mean', 1.02), ('extremist', 0.91),
        ('righted', 0.91), ('think', 0.91), ('bemoans', 0.89),
        ('tarnish', 0.89), ('starker', 0.88), ('hard', 0.88),
        ('progressive', 0.85), ('statesmanship', 0.84) ] ],
    [ 14.3, [ ('corporations', 1.67), ('powerful', 1.46), ('people', 1.09),
        ('women', 0.89), ('make', 0.77), ('citizens', 0.75),
        ('care', 0.73), ('interests', 0.72), ('unions', 0.71),
        ('affordable', 0.70), ('rich', 0.65), ('corporate', 0.65),
        ('america', 0.64), ('want', 0.64) ] ],
    [ 13.5, [ ('court', 6.53), ('supreme', 3.18), ('legal', 2.61),
        ('case', 2.58), ('justice', 2.28), ('law', 2.23),
        ('justices', 1.94), ('cases', 1.88), ('ruled', 1.77),
        ('judge', 1.66), ('courts', 1.63), ('hearing', 1.23),
        ('decisions', 1.10), ('lawyer', 1.06) ] ],
    [ 9.5, [ ('president', 4.90), ('senator', 3.06), ('obama', 2.06),
        ('senate', 2.04), ('republican', 1.65), ('republicans', 1.41),
        ('grassley', 1.33), ('election', 1.32), ('john', 1.22),
        ('barack', 1.17), ('presidency', 1.11), ('former', 1.07),
        ('united', 1.05), ('governor', 1.01) ] ],
    [ 8.7, [ ('women', 0.61), ('people', 0.50), ('care', 0.48),
        ('citizens', 0.48), ('country', 0.48), ('corporations', 0.47),
        ('unions', 0.45), ('america', 0.44), ('americans', 0.39),
        ('act', 0.37), ('health', 0.37), ('nation', 0.37),
        ('politics', 0.35), ('interests', 0.35) ] ],
    [ 6.6, [ ('trump', 1.80), ('vote', 0.86), ('elections', 0.45),
        ('party', 0.44), ('choose', 0.39), ('voting', 0.39),
        ('politics', 0.39), ('republicans', 0.37), ('election', 0.34),
        ('voted', 0.33), ('decided', 0.31), ('constitution', 0.30),
        ('progressive', 0.30), ('votes', 0.30) ] ]
]

topics_hillary2 = [
    [ 20.4, [ ('let', 2.64), ('go', 1.63), ('hard', 1.52),
        ('want', 1.19), ('going', 1.19), ('tonight', 1.09),
        ('live', 1.02), ('back', 0.95), ('know', 0.93),
        ('say', 0.90), ('tomorrow', 0.90), ('unselfish', 0.80),
        ('thank', 0.78), ('good', 0.73) ] ],
    [ 18.8, [ ('great', 1.63), ('like', 0.91), ('grandparent', 0.78),
        ('people', 0.74), ('lives', 0.62), ('trust', 0.59),
        ('going', 0.58), ('lot', 0.55), ('remarkable', 0.54),
        ('way', 0.53), ('children', 0.53), ('good', 0.52),
        ('know', 0.52), ('child', 0.50) ] ],
    [ 11.3, [ ('new', 1.80), ('state', 1.65), ('york', 1.62),
        ('president', 1.40), ('roosevelt', 1.07), ('senate', 0.93),
        ('governor', 0.77), ('presidents', 0.73), ('stuyvesant', 0.72),
        ('mayor', 0.68), ('island', 0.67), ('election', 0.66),
        ('senator', 0.66), ('members', 0.66) ] ],
    [ 11.3, [ ('barriers', 0.85), ('people', 0.49), ('back', 0.47),
        ('going', 0.44), ('like', 0.43), ('way', 0.42),
        ('jobs', 0.40), ('hard', 0.40), ('americans', 0.33),
        ('crumbling', 0.29), ('make', 0.29), ('need', 0.27),
        ('problems', 0.27), ('stop', 0.27) ] ],
    [ 8.9, [ ('rights', 2.29), ('people', 0.64), ('diversity', 0.60),
        ('lgbt', 0.59), ('discrimination', 0.56), ('dignity', 0.54),
        ('equal', 0.46), ('women', 0.45), ('americans', 0.44),
        ('advocate', 0.43), ('empowerment', 0.39), ('values', 0.39),
        ('racism', 0.35), ('families', 0.34) ] ],
    [ 8.6, [ ('responders', 1.10), ('yorkers', 0.99), ('rikers', 0.99),
        ('inaudible', 0.98), ('trayvon', 0.97), ('fdny', 0.91),
        ('firefighter', 0.79), ('ladders', 0.46), ('firefighters', 0.41),
        ('people', 0.40), ('heard', 0.38), ('emergency', 0.38),
        ('survivors', 0.36), ('officers', 0.35) ] ],
    [ 8.2, [ ('campaign', 2.35), ('progressive', 1.03), ('reform', 0.89),
        ('election', 0.63), ('democratic', 0.59), ('votes', 0.52),
        ('divisive', 0.50), ('voters', 0.49), ('voting', 0.42),
        ('supported', 0.42), ('congressional', 0.37), ('specter', 0.36),
        ('elections', 0.36), ('supporters', 0.36) ] ],
    [ 7.6, [ ('country', 3.32), ('america', 2.63), ('us', 1.27),
        ('american', 0.55), ('world', 0.50), ('continent', 0.49),
        ('region', 0.40), ('nation', 0.38), ('cities', 0.36),
        ('south', 0.35), ('million', 0.24), ('places', 0.20),
        ('today', 0.19), ('americans', 0.18) ] ]
]

topics_hillary3 = [
    [ 19.5, [ ('back', 3.24), ('let', 2.66), ('stop', 2.61),
        ('going', 2.54), ('hard', 2.53), ('go', 2.44),
        ('right', 2.23), ('fight', 2.00), ('keep', 1.98),
        ('take', 1.90), ('hold', 1.85), ('break', 1.79),
        ('make', 1.79), ('single', 1.69) ] ],
    [ 17.5, [ ('people', 4.54), ('million', 2.84), ('years', 2.30),
        ('children', 2.26), ('working', 2.10), ('families', 1.94),
        ('americans', 1.91), ('many', 1.80), ('millions', 1.77),
        ('women', 1.74), ('worked', 1.56), ('today', 1.52),
        ('workers', 1.50), ('lives', 1.38) ] ],
    [ 14.3, [ ('powerful', 2.61), ('great', 1.71), ('like', 1.30),
        ('very', 1.27), ('dangerous', 1.15), ('kind', 1.04),
        ('make', 1.04), ('remarkable', 0.99), ('good', 0.95),
        ('respect', 0.93), ('mean', 0.93), ('humbling', 0.93),
        ('strong', 0.92), ('know', 0.92) ] ],
    [ 9.8, [ ('care', 1.09), ('barriers', 0.88), ('affordable', 0.75),
        ('equality', 0.74), ('health', 0.71), ('protecting', 0.69),
        ('protect', 0.66), ('discrimination', 0.64), ('women', 0.61),
        ('know', 0.60), ('rights', 0.59), ('want', 0.55),
        ('fair', 0.55), ('equal', 0.53) ] ],
    [ 9.0, [ ('campaign', 2.82), ('corporations', 2.30), ('progressive', 2.17),
        ('politics', 1.53), ('pacs', 1.23), ('unions', 1.06),
        ('reform', 1.04), ('corporate', 0.96), ('party', 0.89),
        ('interests', 0.82), ('voters', 0.75), ('elections', 0.75),
        ('voting', 0.74), ('ads', 0.71) ] ],
    [ 8.5, [ ('legal', 3.32), ('trump', 2.80), ('law', 1.98),
        ('rights', 1.83), ('issues', 1.55), ('issue', 1.24),
        ('decisions', 1.01), ('marriage', 0.96), ('matter', 0.89),
        ('lawyer', 0.81), ('matters', 0.78), ('constitutional', 0.74),
        ('laws', 0.64), ('constitution', 0.63) ] ],
    [ 8.5, [ ('president', 5.21), ('senator', 3.24), ('senate', 2.56),
        ('obama', 2.34), ('republican', 1.91), ('election', 1.81),
        ('republicans', 1.78), ('john', 1.60), ('nomination', 1.52),
        ('barack', 1.50), ('grassley', 1.37), ('vote', 1.36),
        ('governor', 1.34), ('presidency', 1.19) ] ],
    [ 7.9, [ ('court', 6.73), ('supreme', 3.30), ('case', 2.46),
        ('justice', 2.28), ('justices', 2.14), ('ruled', 1.98),
        ('cases', 1.76), ('courts', 1.69), ('judge', 1.68),
        ('hearing', 1.21), ('judges', 1.06), ('decisions', 1.02),
        ('judiciary', 0.92), ('scalia', 0.85) ] ]
]

topics_sanders = [
    [ 18.0, [ ('people', 4.73), ('world', 4.08), ('country', 2.89),
        ('american', 2.12), ('us', 2.02), ('time', 1.88),
        ('nation', 1.75), ('america', 1.71), ('states', 1.70),
        ('united', 1.66), ('americans', 1.65), ('new', 1.48),
        ('young', 1.46), ('women', 1.33) ] ],
    [ 17.8, [ ('wall', 2.36), ('going', 1.93), ('street', 1.71),
        ('bottom', 1.21), ('right', 1.20), ('let', 1.05),
        ('good', 1.02), ('protect', 0.99), ('back', 0.92),
        ('way', 0.89), ('fight', 0.87), ('bring', 0.86),
        ('top', 0.86), ('continue', 0.85) ] ],
    [ 16.4, [ ('financial', 3.03), ('wealth', 2.80), ('economy', 1.53),
        ('profits', 1.51), ('banks', 1.32), ('money', 1.23),
        ('tax', 1.21), ('economic', 1.15), ('energy', 1.10),
        ('corporations', 1.09), ('investments', 1.03), ('income', 1.02),
        ('huge', 1.01), ('enormous', 0.96) ] ],
    [ 15.2, [ ('political', 2.78), ('moral', 2.67), ('excesses', 1.59),
        ('deeply', 1.49), ('powerfully', 1.32), ('morality', 1.16),
        ('politics', 1.14), ('social', 1.09), ('sense', 0.97),
        ('indifference', 0.95), ('cynicism', 0.95), ('misguided', 0.93),
        ('recklessness', 0.90), ('disgrace', 0.90) ] ],
    [ 14.4, [ ('system', 1.43), ('care', 1.14), ('poor', 0.97),
        ('pay', 0.88), ('protect', 0.84), ('workers', 0.79),
        ('adequate', 0.76), ('healthcare', 0.76), ('rights', 0.76),
        ('good', 0.70), ('wage', 0.69), ('need', 0.69),
        ('allow', 0.69), ('fair', 0.68) ] ],
    [ 6.2, [ ('economy', 2.30), ('market', 0.46), ('economic', 0.44),
        ('climate', 0.38), ('industry', 0.29), ('globalization', 0.29),
        ('system', 0.29), ('workers', 0.28), ('inequality', 0.26),
        ('jobs', 0.25), ('working', 0.25), ('today', 0.24),
        ('change', 0.24), ('trade', 0.23) ] ],
    [ 5.7, [ ('billionaires', 1.85), ('us', 0.68), ('billionaire', 0.47),
        ('top', 0.46), ('wealthiest', 0.35), ('million', 0.33),
        ('dollars', 0.32), ('millions', 0.31), ('class', 0.28),
        ('percent', 0.26), ('hampshire', 0.25), ('billion', 0.24),
        ('today', 0.24), ('country', 0.24) ] ]
]

topicCloud = TopicCloud(max_topic_words=8, max_topic_num=7, min_word_topic_prop=0.25, max_words=50,
                        height=1000, width=1000, relative_scaling=0.7, max_font_size=80,
                        min_font_size=30).generate_from_topics(topics_icml_5)

# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
plt.imshow(topicCloud)
plt.axis("off")
plt.show()
topicCloud.to_file("clouds/topics_icml_5.png")
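
# A minimal usage sketch (added here for illustration; it is not part of the
# original script). Every topics_* list above shares the same shape:
#     [ [ topic_strength, [ (word, word_weight), ... ] ], ... ]
# so the same TopicCloud settings can be reused to render one cloud per corpus.
# The output paths below are illustrative; everything else relies only on names
# already defined in this file and on the generate_from_topics()/to_file()
# calls demonstrated above.
all_topics = {
    'sigir'   : topics_sigir,
    'icml'    : topics_icml,
    'icml_3'  : topics_icml_3,
    'icml_5'  : topics_icml_5,
    'ijcai'   : topics_ijcai,
    'aamas'   : topics_aamas,
    'trump'   : topics_trump,
    'hillary' : topics_hillary,
    'hillary2': topics_hillary2,
    'hillary3': topics_hillary3,
    'sanders' : topics_sanders,
}
for name, topics in all_topics.items():
    cloud = TopicCloud(max_topic_words=8, max_topic_num=7, min_word_topic_prop=0.25,
                       max_words=50, height=1000, width=1000, relative_scaling=0.7,
                       max_font_size=80, min_font_size=30).generate_from_topics(topics)
    cloud.to_file("clouds/topics_%s.png" % name)

--------------------------------------------------------------------------------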