"""Packaging script for the ``word_cloud`` library."""
from setuptools import setup, find_packages
import io
import os
import sys


def tag():
    """Return the release tag taken from the ``version`` environment variable.

    Returns:
        The value of ``version``, or ``None`` when the variable is not set.
    """
    return os.getenv("version")


def read_text_lines(fname):
    """Return the full content of *fname* as a single string.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding, and the handle is closed on exit.
    """
    with io.open(fname, encoding="utf-8") as fd:
        return fd.read()


_REPO_URL = 'https://github.com/kavgan/word_cloud'
_version = tag()

# Only advertise a download URL when a release tag is actually known;
# otherwise the URL would point at ".../archive/None.tar.gz".
_release_kwargs = {}
if _version:
    _release_kwargs['download_url'] = '{0}/archive/{1}.tar.gz'.format(
        _REPO_URL, _version)

setup(
    name="word_cloud",
    version=_version,
    packages=find_packages(),
    description='Word cloud of data scientist',
    # Reuse the helper instead of a bare open().read(), which never closed
    # the file handle.
    long_description=read_text_lines("README.md"),
    long_description_content_type='text/markdown',
    classifiers=[
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'Topic :: Text Processing :: Linguistic',
    ],
    author='kavgan',
    author_email='ganesan.kavita@gmail.com',
    license='Apache',
    url=_REPO_URL,
    keywords=['word cloud', 'visualization', 'text mining'],
    install_requires=[
        'scikit-learn>=0.19.1',
        'pandas>=0.20.3',
    ],
    include_package_data=True,
    entry_points={},
    **_release_kwargs
)
25 | 26 | ``` python 27 | from word_cloud.word_cloud_generator import WordCloud 28 | from IPython.core.display import HTML 29 | 30 | ENGLISH_STOP_WORDS = frozenset([ 31 | "a", "about", "above", "across", "after", "afterwards", "again", "against", 32 | "all", "almost", "alone", "along", "already", "also", "although", "always", 33 | "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", 34 | "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", 35 | "around", "as", "at", "back", "be", "became", "because", "become", 36 | "becomes", "becoming", "been", "before", "beforehand", "behind", "being", 37 | "below", "beside", "besides", "between", "beyond", "bill", "both", 38 | "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", 39 | "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", 40 | "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", 41 | "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", 42 | "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill", 43 | "find", "fire", "first", "five", "for", "former", "formerly", "forty", 44 | "found", "four", "from", "front", "full", "further", "get", "give", "go", 45 | "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", 46 | "who", "whoever", "whole", "whom", "whose", "why", "will", "with", 47 | "within", "without", "would", "yet", "said","you", "your", "yours", "yourself", 48 | "yourselves"]) 49 | 50 | # list of documents 51 | texts=['MEXICO CITY — Newly formed Hurricane Willa rapidly intensified off Mexico\'s Pacific coast','MEXICO CITY — Newly formed Hurricane Willa rapidly intensified off Mexico\'s Pacific coast Sunday and early Monday and became a major Category 5 storm, the U.S. National Hurricane Center said. As of 11 a.m. ET., Willa had maximum sustained winds of 160 mph -- just 3 mph over the threshold for a Category 5. 
Willa was "potentially catastrophic," forecasters warned. The hurricane center said it could make landfall along Mexico\'s southwestern coast Tuesday afternoon or evening and bring with it a life-threatening storm surge -- especially near and to the south of where the center of Willa makes landfall. Near the coast, the surge will be accompanied by large and destructive waves. Willa is also forecast to bring high winds and heavy rainfall. "Slight weakening is forecast to begin on Tuesday, but Willa is expected to be an extremely dangerous major hurricane when it reaches the coast of Mexico," the center said. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. NATIONAL HURRICANE CENTER The center said Willa was about 175 miles south-southwest of Las Islas Marias, Mexico, and some 135 miles southwest of Cabo Corrientes, Mexico, and was moving north at about 7 mph. Hurricane-force winds extended outward up to 30 miles from the center and tropical-storm-force winds extended outward up to 105 miles. A hurricane warning was posted for a stretch of shore between San Blas and Mazatlan. A tropical storm warning was in effect for Playa Perula to San Blas and north of Mazatlan to Bahia Tempehuaya. Forecasters said Willa is expected to produce storm total rainfall accumulations of 6 to 12 inches, with local amounts up to 18 inches, across portions of western Jalisco, western Nayarit, and southern Sinaloa in Mexico. The rainfall could cause life-threatening flash flooding and landslides. Farther inland, Willa is expected to produce rainfall amounts of 2 to 4 inches across portions of Zacateca, Durango, southeast Chihuahua, and Coahuila in Mexico, with local amounts up to 6 inches possible. That could cause life-threatening flash flooding. 
After Willa makes its way across Mexico, it could drop between 1 and 3 inches of rain on central and southern Texas during the middle of the week, CBS News contributing meteorologist Jeff Berardelli reports. The additional rainfall could cause additional flooding in already saturated areas.','early Monday and became a major Category 5 storm, the U.S. National Hurricane Center said. As of 11 a.m. ET., Willa had maximum sustained winds of 160 mph -- just 3 mph over the threshold for a Category 5. Willa was "potentially catastrophic," forecasters warned. The hurricane center said it could make landfall along Mexico\'s southwestern coast Tuesday afternoon or evening and bring with it a life-threatening storm surge -- especially near and to the south of where the center of Willa makes landfall. Near the coast, the surge will be accompanied by large and destructive waves. Willa is also forecast to bring high winds and heavy rainfall. "Slight weakening is forecast to begin on Tuesday, but Willa is expected to be an extremely dangerous major hurricane when it reaches the coast of Mexico," the center said. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. NATIONAL HURRICANE CENTER The center said Willa was about 175 miles south-southwest of Las Islas Marias, Mexico, and some 135 miles southwest of Cabo Corrientes, Mexico, and was moving north at about 7 mph. Hurricane-force winds extended outward up to 30 miles from the center and tropical-storm-force winds extended outward up to 105 miles. A hurricane warning was posted for a stretch of shore between San Blas and Mazatlan. A tropical storm warning was in effect for Playa Perula to San Blas and north of Mazatlan to Bahia Tempehuaya. 
Forecasters said Willa is expected to produce storm total rainfall accumulations of 6 to 12 inches, with local amounts up to 18 inches, across portions of western Jalisco, western Nayarit, and southern Sinaloa in Mexico. The rainfall could cause life-threatening flash flooding and landslides. Farther inland, Willa is expected to produce rainfall amounts of 2 to 4 inches across portions of Zacateca, Durango, southeast Chihuahua, and Coahuila in Mexico, with local amounts up to 6 inches possible. That could cause life-threatening flash flooding. After Willa makes its way across Mexico, it could drop between 1 and 3 inches of rain on central and southern Texas during the middle of the week, CBS News contributing meteorologist Jeff Berardelli reports. The additional rainfall could cause additional flooding in already saturated areas.'] 52 | 53 | # initialize WordCloud 54 | wc=WordCloud(stopwords=ENGLISH_STOP_WORDS) 55 | 56 | # get html code 57 | embed_code=wc.get_embed_code(text=texts,random_color=True,topn=40) 58 | 59 | # display 60 | HTML(embed_code) 61 | 62 | ``` 63 | 64 | ## More Examples 65 | - [Checkout Jupyter Notebook from this Repo](https://github.com/kavgan/word_cloud/blob/master/Example%20word%20clouds.ipynb) (word cloud only renders if your server is running) 66 | - [Jupyter Notebook on Google's Colaboratory](https://colab.research.google.com/drive/1AkdUKEFmaYom77r6KPh18jdQrplIQbKQ) 67 | - Article about this [Python word cloud](http://kavita-ganesan.com/word-cloud-for-data-scientists/#.W86v6RNKj64) module 68 | -------------------------------------------------------------------------------- /word_cloud/word_cloud_generator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | Generate HTML code for word cloud 4 | 5 | ''' 6 | 7 | import pandas as pd 8 | import logging 9 | import numpy as np 10 | import random 11 | from sklearn.feature_extraction.text import CountVectorizer 12 | from 
class WordCloud:
    """Generate embeddable HTML for a word cloud.

    Terms are scored either from raw documents (normalised term frequency by
    default, tf-idf when ``use_tfidf=True``) or from a caller-supplied table of
    scores. Each score selects a font scale (``get_font_size``) and a colour
    (``get_color_code``); the resulting items are shuffled and joined into one
    HTML string.
    """

    # Default colour palette; every instance receives its own copy.
    DEFAULT_COLORS = (
        '#b82c2c', '#a55571', '#bc72d0', '#8000FF', '#3498DB', '#FF5733',
        '#223AE6', '#2ECC71', '#5F6A6A', '#6C22E6', '#CE22E6', '#ACB02E',
        '#B18904', '#848484', '#04B404', '#5882FA', '#FF0080', '#0489B1',
        '#FA5858', '#DBA901', '#00b4ff', '#008080', '#003366', '#725394',
    )

    # NOTE(review): in the copy of this file under review the HTML tags had
    # been stripped out of the string literals -- only the surrounding
    # whitespace and the "{2}" placeholder survived, although three format
    # arguments (colour, font scale, word) are passed. The templates below are
    # a minimal reconstruction driven by those arguments; restore the exact
    # original attributes from version control if they differ.
    _CONTAINER_OPEN = "<div style='text-align:center'>\n"
    _CONTAINER_CLOSE = "\n</div>"
    _ITEM_TEMPLATE = "<span style='color:{0};font-size:{1}em'> {2} </span>"

    def __init__(self, stopwords=None, use_tfidf=False):
        """Create a generator.

        Args:
            stopwords: Collection of words to ignore (or the string
                ``'english'``). Defaults to an empty list.
            use_tfidf: Score words by tf-idf instead of normalised counts.
        """
        self.use_tfidf = use_tfidf
        self.data = []
        # Own copy, so changing one instance's palette never affects another.
        self.color_choices = list(self.DEFAULT_COLORS)
        # A fresh list per instance instead of a shared mutable default.
        self.stopwords = [] if stopwords is None else stopwords

    def get_color_code(self, score, palette=None):
        """Return the colour for *score*.

        Thresholds grow from 0 in steps of 0.05; the colour at the index of
        the first threshold that is >= *score* is returned. Scores above every
        threshold below 1 fall back to the first colour.

        Args:
            score: Word weight, expected on a 0..1 scale.
            palette: Optional sequence of colours to use instead of
                ``self.color_choices``.
        """
        colors = self.color_choices if palette is None else palette
        step = 0.05
        threshold = 0
        idx = 0

        while threshold < 1:
            if score <= threshold:
                # Wrap around so a short custom palette cannot overflow.
                return colors[idx % len(colors)]
            idx += 1
            threshold = threshold + step

        return colors[0]

    def get_font_size(self, score: float):
        """Return the font scale (in ``em``) for *score*.

        The scale starts at 0.5 and grows by 0.15 for every 0.05 of score, so
        larger scores give larger fonts.
        """
        scale = 0.5
        scale_step = 0.15
        score_step = 0.05
        threshold = 0

        while threshold < 1:
            if score <= threshold:
                return scale
            threshold = threshold + score_step
            scale += scale_step

        return scale

    def get_embed_code(self, text_scores: pd.DataFrame = None, text: list = None, topn=100, random_color=True):
        """Build the HTML snippet for the word cloud.

        Args:
            text_scores: Optional two-column DataFrame (word, score). When
                given it takes precedence over *text*. It is not modified.
            text: Optional list of documents to extract the top words from.
            topn: Number of top-scoring words to keep when *text* is used.
            random_color: Shuffle the palette for this call only.

        Returns:
            The HTML string, or ``None`` (after logging an error) when neither
            *text_scores* nor a non-empty *text* was supplied.
        """
        from html import escape

        if text_scores is not None:
            # Work on a copy: renaming columns must not leak to the caller.
            text_df = text_scores.copy()
            text_df.columns = ['words', 'score']
        elif text is not None and len(text) > 0:
            items = self.extract_topn_from_vector(text, topn=topn)
            text_df = pd.DataFrame(items, columns=['words', 'score'])
        else:
            logging.error(
                "There is a problem with your input text. Did you provide any?")
            return None

        # Shuffle a per-call copy so the instance palette keeps its order and
        # a later random_color=False call still uses the default colours.
        palette = list(self.color_choices)
        if random_color:
            random.shuffle(palette)

        word_cloud_items = [
            self._ITEM_TEMPLATE.format(
                self.get_color_code(score, palette),
                self.get_font_size(score),
                # Multi-word phrases are shown hyphenated; escape the text
                # because it is inserted into markup.
                escape(str(word).replace(" ", "-")),
            )
            for word, score in zip(text_df['words'], text_df['score'])
        ]
        random.shuffle(word_cloud_items)

        return ''.join(
            [self._CONTAINER_OPEN] + word_cloud_items + [self._CONTAINER_CLOSE])

    def sort_coo(self, coo_matrix):
        """Return ``(column, value)`` pairs sorted by value, then column, descending."""
        pairs = zip(coo_matrix.col, coo_matrix.data)
        return sorted(pairs, key=lambda pair: (pair[1], pair[0]), reverse=True)

    def get_ranks(self, word_vector):
        """Scale *word_vector* so that its largest value becomes 1."""
        peak = np.max(word_vector)
        if not peak:
            # Nothing to scale; avoids dividing by zero.
            return word_vector
        return np.multiply(word_vector, 1 / peak)

    def get_normalized_tf(self, cv: "CountVectorizer", text: list):
        """Return term counts over all documents, divided by the largest count.

        The documents are joined into one text before counting, and *cv* is
        fitted on that combined text.
        """
        big_text = ' '.join(text)
        word_count_vector = cv.fit_transform([big_text])
        peak = word_count_vector.max()
        if not peak:
            return word_count_vector
        # Scalar multiplication keeps the result sparse.
        return word_count_vector * (1.0 / peak)

    def get_tfidf_scores(self, cv: "CountVectorizer", text: list):
        """Return tf-idf scores of the combined text.

        The idf weights are fitted on the individual documents; the scores are
        then computed for all documents joined into one text.
        """
        from sklearn.feature_extraction.text import TfidfTransformer

        word_count_vector = cv.fit_transform(text)
        big_text = ' '.join(text)

        tfidf_transformer = TfidfTransformer(
            smooth_idf=False, use_idf=True, norm='l2')
        tfidf_transformer.fit(word_count_vector)
        return tfidf_transformer.transform(cv.transform([big_text]))

    def extract_topn_from_vector(self, text: list, topn=10):
        """Return the *topn* highest scoring words as ``[word, score]`` lists.

        Scores are tf-idf values when ``self.use_tfidf`` is set, otherwise
        normalised term frequencies.
        """
        from sklearn.feature_extraction.text import CountVectorizer

        # Recent scikit-learn releases validate ``stop_words`` and accept only
        # a string, a list or None, so other collections (e.g. a frozenset)
        # are converted to a list.
        if isinstance(self.stopwords, str):
            stop_words = self.stopwords
        else:
            stop_words = list(self.stopwords) or None
        cv = CountVectorizer(stop_words=stop_words)

        if self.use_tfidf:
            word_scores_vector = self.get_tfidf_scores(cv, text)
        else:
            word_scores_vector = self.get_normalized_tf(cv, text)

        # Highest scores first.
        sorted_items = self.sort_coo(word_scores_vector.tocoo())[:topn]

        # Look the vocabulary up once (not once per word) and support both
        # the current and the legacy scikit-learn accessor.
        if hasattr(cv, 'get_feature_names_out'):
            feature_names = cv.get_feature_names_out()
        else:
            feature_names = cv.get_feature_names()

        return [[feature_names[idx], score] for idx, score in sorted_items]
near  forecast  rainfall  forecasters  center  cause  coast  oct  path  life  miles  projected  mazatlan  makes  hurricane  produce  shows  willa  category  south  warning  map  surge  mph  flooding  tuesday  local  north  winds  landfall  national  san  southern  outward  portions  11  southwest  mexico  storm  threatening  flash  expected  inches  extended  et  tropical  force  western  amounts  major 
" 12 | ], 13 | "text/plain": [ 14 | "" 15 | ] 16 | }, 17 | "execution_count": 4, 18 | "metadata": {}, 19 | "output_type": "execute_result" 20 | } 21 | ], 22 | "source": [ 23 | "from word_cloud.word_cloud import WordCloud\n", 24 | "from IPython.core.display import HTML\n", 25 | "\n", 26 | "ENGLISH_STOP_WORDS = frozenset([\n", 27 | " \"a\", \"about\", \"above\", \"across\", \"after\", \"afterwards\", \"again\", \"against\",\n", 28 | " \"all\", \"almost\", \"alone\", \"along\", \"already\", \"also\", \"although\", \"always\",\n", 29 | " \"am\", \"among\", \"amongst\", \"amoungst\", \"amount\", \"an\", \"and\", \"another\",\n", 30 | " \"any\", \"anyhow\", \"anyone\", \"anything\", \"anyway\", \"anywhere\", \"are\",\n", 31 | " \"around\", \"as\", \"at\", \"back\", \"be\", \"became\", \"because\", \"become\",\n", 32 | " \"becomes\", \"becoming\", \"been\", \"before\", \"beforehand\", \"behind\", \"being\",\n", 33 | " \"below\", \"beside\", \"besides\", \"between\", \"beyond\", \"bill\", \"both\",\n", 34 | " \"bottom\", \"but\", \"by\", \"call\", \"can\", \"cannot\", \"cant\", \"co\", \"con\",\n", 35 | " \"could\", \"couldnt\", \"cry\", \"de\", \"describe\", \"detail\", \"do\", \"done\",\n", 36 | " \"down\", \"due\", \"during\", \"each\", \"eg\", \"eight\", \"either\", \"eleven\", \"else\",\n", 37 | " \"elsewhere\", \"empty\", \"enough\", \"etc\", \"even\", \"ever\", \"every\", \"everyone\",\n", 38 | " \"everything\", \"everywhere\", \"except\", \"few\", \"fifteen\", \"fifty\", \"fill\",\n", 39 | " \"find\", \"fire\", \"first\", \"five\", \"for\", \"former\", \"formerly\", \"forty\",\n", 40 | " \"found\", \"four\", \"from\", \"front\", \"full\", \"further\", \"get\", \"give\", \"go\",\n", 41 | " \"had\", \"has\", \"hasnt\", \"have\", \"he\", \"hence\", \"her\", \"here\", \"hereafter\",\n", 42 | " \"hereby\", \"herein\", \"hereupon\", \"hers\", \"herself\", \"him\", \"himself\", \"his\",\n", 43 | " \"how\", \"however\", \"hundred\", \"i\", \"ie\", \"if\", \"in\", \"inc\", 
\"indeed\",\n", 44 | " \"interest\", \"into\", \"is\", \"it\", \"its\", \"itself\", \"keep\", \"last\", \"latter\",\n", 45 | " \"latterly\", \"least\", \"less\", \"ltd\", \"made\", \"many\", \"may\", \"me\",\n", 46 | " \"meanwhile\", \"might\", \"mill\", \"mine\", \"more\", \"moreover\", \"most\", \"mostly\",\n", 47 | " \"move\", \"much\", \"must\", \"my\", \"myself\", \"name\", \"namely\", \"neither\",\n", 48 | " \"never\", \"nevertheless\", \"next\", \"nine\", \"no\", \"nobody\", \"none\", \"noone\",\n", 49 | " \"nor\", \"not\", \"nothing\", \"now\", \"nowhere\", \"of\", \"off\", \"often\", \"on\",\n", 50 | " \"once\", \"one\", \"only\", \"onto\", \"or\", \"other\", \"others\", \"otherwise\", \"our\",\n", 51 | " \"ours\", \"ourselves\", \"out\", \"over\", \"own\", \"part\", \"per\", \"perhaps\",\n", 52 | " \"please\", \"put\", \"rather\", \"re\", \"same\", \"see\", \"seem\", \"seemed\",\n", 53 | " \"seeming\", \"seems\", \"serious\", \"several\", \"she\", \"should\", \"show\", \"side\",\n", 54 | " \"since\", \"sincere\", \"six\", \"sixty\", \"so\", \"some\", \"somehow\", \"someone\",\n", 55 | " \"something\", \"sometime\", \"sometimes\", \"somewhere\", \"still\", \"such\",\n", 56 | " \"system\", \"take\", \"ten\", \"than\", \"that\", \"the\", \"their\", \"them\",\n", 57 | " \"themselves\", \"then\", \"thence\", \"there\", \"thereafter\", \"thereby\",\n", 58 | " \"therefore\", \"therein\", \"thereupon\", \"these\", \"they\", \"thick\", \"thin\",\n", 59 | " \"third\", \"this\", \"those\", \"though\", \"three\", \"through\", \"throughout\",\n", 60 | " \"thru\", \"thus\", \"to\", \"together\", \"too\", \"top\", \"toward\", \"towards\",\n", 61 | " \"twelve\", \"twenty\", \"two\", \"un\", \"under\", \"until\", \"up\", \"upon\", \"us\",\n", 62 | " \"very\", \"via\", \"was\", \"we\", \"well\", \"were\", \"what\", \"whatever\", \"when\",\n", 63 | " \"whence\", \"whenever\", \"where\", \"whereafter\", \"whereas\", \"whereby\",\n", 64 | " \"wherein\", \"whereupon\", 
\"wherever\", \"whether\", \"which\", \"while\", \"whither\",\n", 65 | " \"who\", \"whoever\", \"whole\", \"whom\", \"whose\", \"why\", \"will\", \"with\",\n", 66 | " \"within\", \"without\", \"would\", \"yet\", \"said\",\"you\", \"your\", \"yours\", \"yourself\",\n", 67 | " \"yourselves\"])\n", 68 | "\n", 69 | "wc=WordCloud(use_tfidf=False,stopwords=ENGLISH_STOP_WORDS)\n", 70 | "texts=['MEXICO CITY — Newly formed Hurricane Willa rapidly intensified off Mexico\\'s Pacific coast','MEXICO CITY — Newly formed Hurricane Willa rapidly intensified off Mexico\\'s Pacific coast Sunday and early Monday and became a major Category 5 storm, the U.S. National Hurricane Center said. As of 11 a.m. ET., Willa had maximum sustained winds of 160 mph -- just 3 mph over the threshold for a Category 5. Willa was \"potentially catastrophic,\" forecasters warned. The hurricane center said it could make landfall along Mexico\\'s southwestern coast Tuesday afternoon or evening and bring with it a life-threatening storm surge -- especially near and to the south of where the center of Willa makes landfall. Near the coast, the surge will be accompanied by large and destructive waves. Willa is also forecast to bring high winds and heavy rainfall. \"Slight weakening is forecast to begin on Tuesday, but Willa is expected to be an extremely dangerous major hurricane when it reaches the coast of Mexico,\" the center said. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. NATIONAL HURRICANE CENTER The center said Willa was about 175 miles south-southwest of Las Islas Marias, Mexico, and some 135 miles southwest of Cabo Corrientes, Mexico, and was moving north at about 7 mph. 
Hurricane-force winds extended outward up to 30 miles from the center and tropical-storm-force winds extended outward up to 105 miles. A hurricane warning was posted for a stretch of shore between San Blas and Mazatlan. A tropical storm warning was in effect for Playa Perula to San Blas and north of Mazatlan to Bahia Tempehuaya. Forecasters said Willa is expected to produce storm total rainfall accumulations of 6 to 12 inches, with local amounts up to 18 inches, across portions of western Jalisco, western Nayarit, and southern Sinaloa in Mexico. The rainfall could cause life-threatening flash flooding and landslides. Farther inland, Willa is expected to produce rainfall amounts of 2 to 4 inches across portions of Zacateca, Durango, southeast Chihuahua, and Coahuila in Mexico, with local amounts up to 6 inches possible. That could cause life-threatening flash flooding. After Willa makes its way across Mexico, it could drop between 1 and 3 inches of rain on central and southern Texas during the middle of the week, CBS News contributing meteorologist Jeff Berardelli reports. The additional rainfall could cause additional flooding in already saturated areas.','early Monday and became a major Category 5 storm, the U.S. National Hurricane Center said. As of 11 a.m. ET., Willa had maximum sustained winds of 160 mph -- just 3 mph over the threshold for a Category 5. Willa was \"potentially catastrophic,\" forecasters warned. The hurricane center said it could make landfall along Mexico\\'s southwestern coast Tuesday afternoon or evening and bring with it a life-threatening storm surge -- especially near and to the south of where the center of Willa makes landfall. Near the coast, the surge will be accompanied by large and destructive waves. Willa is also forecast to bring high winds and heavy rainfall. 
\"Slight weakening is forecast to begin on Tuesday, but Willa is expected to be an extremely dangerous major hurricane when it reaches the coast of Mexico,\" the center said. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. NATIONAL HURRICANE CENTER The center said Willa was about 175 miles south-southwest of Las Islas Marias, Mexico, and some 135 miles southwest of Cabo Corrientes, Mexico, and was moving north at about 7 mph. Hurricane-force winds extended outward up to 30 miles from the center and tropical-storm-force winds extended outward up to 105 miles. A hurricane warning was posted for a stretch of shore between San Blas and Mazatlan. A tropical storm warning was in effect for Playa Perula to San Blas and north of Mazatlan to Bahia Tempehuaya. Forecasters said Willa is expected to produce storm total rainfall accumulations of 6 to 12 inches, with local amounts up to 18 inches, across portions of western Jalisco, western Nayarit, and southern Sinaloa in Mexico. The rainfall could cause life-threatening flash flooding and landslides. Farther inland, Willa is expected to produce rainfall amounts of 2 to 4 inches across portions of Zacateca, Durango, southeast Chihuahua, and Coahuila in Mexico, with local amounts up to 6 inches possible. That could cause life-threatening flash flooding. After Willa makes its way across Mexico, it could drop between 1 and 3 inches of rain on central and southern Texas during the middle of the week, CBS News contributing meteorologist Jeff Berardelli reports. 
The additional rainfall could cause additional flooding in already saturated areas.']\n", 71 | "embed_code=wc.get_embed_code(text=texts,random_color=False,topn=50)\n", 72 | "HTML(embed_code)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [] 81 | } 82 | ], 83 | "metadata": { 84 | "kernelspec": { 85 | "display_name": "Python 3", 86 | "language": "python", 87 | "name": "python3" 88 | }, 89 | "language_info": { 90 | "codemirror_mode": { 91 | "name": "ipython", 92 | "version": 3 93 | }, 94 | "file_extension": ".py", 95 | "mimetype": "text/x-python", 96 | "name": "python", 97 | "nbconvert_exporter": "python", 98 | "pygments_lexer": "ipython3", 99 | "version": "3.6.5" 100 | } 101 | }, 102 | "nbformat": 4, 103 | "nbformat_minor": 1 104 | } 105 | -------------------------------------------------------------------------------- /Example word clouds.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from word_cloud.word_cloud_generator import WordCloud\n", 10 | "from IPython.core.display import HTML\n", 11 | "from nltk.corpus import reuters\n", 12 | "import nltk\n", 13 | "import pandas as pd" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "ENGLISH_STOP_WORDS = frozenset([\n", 23 | " \"a\", \"about\", \"above\", \"across\", \"after\", \"afterwards\", \"again\", \"against\",\n", 24 | " \"all\", \"almost\", \"alone\", \"along\", \"already\", \"also\", \"although\", \"always\",\n", 25 | " \"am\", \"among\", \"amongst\", \"amoungst\", \"amount\", \"an\", \"and\", \"another\",\n", 26 | " \"any\", \"anyhow\", \"anyone\", \"anything\", \"anyway\", \"anywhere\", \"are\",\n", 27 | " \"around\", \"as\", \"at\", \"back\", \"be\", \"became\", 
\"because\", \"become\",\n", 28 | " \"becomes\", \"becoming\", \"been\", \"before\", \"beforehand\", \"behind\", \"being\",\n", 29 | " \"below\", \"beside\", \"besides\", \"between\", \"beyond\", \"bill\", \"both\",\n", 30 | " \"bottom\", \"but\", \"by\", \"call\", \"can\", \"cannot\", \"cant\", \"co\", \"con\",\n", 31 | " \"could\", \"couldnt\", \"cry\", \"de\", \"describe\", \"detail\", \"do\", \"done\",\n", 32 | " \"down\", \"due\", \"during\", \"each\", \"eg\", \"eight\", \"either\", \"eleven\", \"else\",\n", 33 | " \"elsewhere\", \"empty\", \"enough\", \"etc\", \"even\", \"ever\", \"every\", \"everyone\",\n", 34 | " \"everything\", \"everywhere\", \"except\", \"few\", \"fifteen\", \"fifty\", \"fill\",\n", 35 | " \"find\", \"fire\", \"first\", \"five\", \"for\", \"former\", \"formerly\", \"forty\",\n", 36 | " \"found\", \"four\", \"from\", \"front\", \"full\", \"further\", \"get\", \"give\", \"go\",\n", 37 | " \"had\", \"has\", \"hasnt\", \"have\", \"he\", \"hence\", \"her\", \"here\", \"hereafter\",\n", 38 | " \"hereby\", \"herein\", \"hereupon\", \"hers\", \"herself\", \"him\", \"himself\", \"his\",\n", 39 | " \"how\", \"however\", \"hundred\", \"i\", \"ie\", \"if\", \"in\", \"inc\", \"indeed\",\n", 40 | " \"interest\", \"into\", \"is\", \"it\", \"its\", \"itself\", \"keep\", \"last\", \"latter\",\n", 41 | " \"latterly\", \"least\", \"less\", \"ltd\", \"made\", \"many\", \"may\", \"me\",\n", 42 | " \"meanwhile\", \"might\", \"mill\", \"mine\", \"more\", \"moreover\", \"most\", \"mostly\",\n", 43 | " \"move\", \"much\", \"must\", \"my\", \"myself\", \"name\", \"namely\", \"neither\",\n", 44 | " \"never\", \"nevertheless\", \"next\", \"nine\", \"no\", \"nobody\", \"none\", \"noone\",\n", 45 | " \"nor\", \"not\", \"nothing\", \"now\", \"nowhere\", \"of\", \"off\", \"often\", \"on\",\n", 46 | " \"once\", \"one\", \"only\", \"onto\", \"or\", \"other\", \"others\", \"otherwise\", \"our\",\n", 47 | " \"ours\", \"ourselves\", \"out\", \"over\", \"own\", \"part\", 
\"per\", \"perhaps\",\n", 48 | " \"please\", \"put\", \"rather\", \"re\", \"same\", \"see\", \"seem\", \"seemed\",\n", 49 | " \"seeming\", \"seems\", \"serious\", \"several\", \"she\", \"should\", \"show\", \"side\",\n", 50 | " \"since\", \"sincere\", \"six\", \"sixty\", \"so\", \"some\", \"somehow\", \"someone\",\n", 51 | " \"something\", \"sometime\", \"sometimes\", \"somewhere\", \"still\", \"such\",\n", 52 | " \"system\", \"take\", \"ten\", \"than\", \"that\", \"the\", \"their\", \"them\",\n", 53 | " \"themselves\", \"then\", \"thence\", \"there\", \"thereafter\", \"thereby\",\n", 54 | " \"therefore\", \"therein\", \"thereupon\", \"these\", \"they\", \"thick\", \"thin\",\n", 55 | " \"third\", \"this\", \"those\", \"though\", \"three\", \"through\", \"throughout\",\n", 56 | " \"thru\", \"thus\", \"to\", \"together\", \"too\", \"top\", \"toward\", \"towards\",\n", 57 | " \"twelve\", \"twenty\", \"two\", \"un\", \"under\", \"until\", \"up\", \"upon\", \"us\",\n", 58 | " \"very\", \"via\", \"was\", \"we\", \"well\", \"were\", \"what\", \"whatever\", \"when\",\n", 59 | " \"whence\", \"whenever\", \"where\", \"whereafter\", \"whereas\", \"whereby\",\n", 60 | " \"wherein\", \"whereupon\", \"wherever\", \"whether\", \"which\", \"while\", \"whither\",\n", 61 | " \"who\", \"whoever\", \"whole\", \"whom\", \"whose\", \"why\", \"will\", \"with\",\n", 62 | " \"within\", \"without\", \"would\", \"yet\", \"said\",\"you\", \"your\", \"yours\", \"yourself\",\n", 63 | " \"yourselves\"])\n" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## Generate word clouds with a single text document\n", 71 | "\n", 72 | "This example show cases how you can generate word clouds with just one document. While the colors can be randomized, in this example, the colors are based on the default color settings. By default, the words are weighted by word counts unless you explicitly ask for `tfidf` weighting. 
Tfidf weighting makes sense only if you have a lot of documents to start with, otherwise the `idf` values would be incorrect." 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 19, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/html": [ 83 | "
inches  willa  center  national  south  cause  tropical  hurricane  11  path  mazatlan  miles  winds  threatening  mexico  san  warning  outward  southern  southwest  tuesday  storm  mph  et  surge  coast  oct  shows  rainfall  flooding  projected  expected  life  produce  western  north  portions  amounts  near  map 
" 84 | ], 85 | "text/plain": [ 86 | "" 87 | ] 88 | }, 89 | "execution_count": 19, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "#only one news article here\n", 96 | "texts=['MEXICO CITY — Newly formed Hurricane Willa rapidly intensified off Mexico\\'s Pacific coast','MEXICO CITY — Newly formed Hurricane Willa rapidly intensified off Mexico\\'s Pacific coast Sunday and early Monday and became a major Category 5 storm, the U.S. National Hurricane Center said. As of 11 a.m. ET., Willa had maximum sustained winds of 160 mph -- just 3 mph over the threshold for a Category 5. Willa was \"potentially catastrophic,\" forecasters warned. The hurricane center said it could make landfall along Mexico\\'s southwestern coast Tuesday afternoon or evening and bring with it a life-threatening storm surge -- especially near and to the south of where the center of Willa makes landfall. Near the coast, the surge will be accompanied by large and destructive waves. Willa is also forecast to bring high winds and heavy rainfall. \"Slight weakening is forecast to begin on Tuesday, but Willa is expected to be an extremely dangerous major hurricane when it reaches the coast of Mexico,\" the center said. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. NATIONAL HURRICANE CENTER The center said Willa was about 175 miles south-southwest of Las Islas Marias, Mexico, and some 135 miles southwest of Cabo Corrientes, Mexico, and was moving north at about 7 mph. Hurricane-force winds extended outward up to 30 miles from the center and tropical-storm-force winds extended outward up to 105 miles. A hurricane warning was posted for a stretch of shore between San Blas and Mazatlan. 
A tropical storm warning was in effect for Playa Perula to San Blas and north of Mazatlan to Bahia Tempehuaya. Forecasters said Willa is expected to produce storm total rainfall accumulations of 6 to 12 inches, with local amounts up to 18 inches, across portions of western Jalisco, western Nayarit, and southern Sinaloa in Mexico. The rainfall could cause life-threatening flash flooding and landslides. Farther inland, Willa is expected to produce rainfall amounts of 2 to 4 inches across portions of Zacateca, Durango, southeast Chihuahua, and Coahuila in Mexico, with local amounts up to 6 inches possible. That could cause life-threatening flash flooding. After Willa makes its way across Mexico, it could drop between 1 and 3 inches of rain on central and southern Texas during the middle of the week, CBS News contributing meteorologist Jeff Berardelli reports. The additional rainfall could cause additional flooding in already saturated areas.','early Monday and became a major Category 5 storm, the U.S. National Hurricane Center said. As of 11 a.m. ET., Willa had maximum sustained winds of 160 mph -- just 3 mph over the threshold for a Category 5. Willa was \"potentially catastrophic,\" forecasters warned. The hurricane center said it could make landfall along Mexico\\'s southwestern coast Tuesday afternoon or evening and bring with it a life-threatening storm surge -- especially near and to the south of where the center of Willa makes landfall. Near the coast, the surge will be accompanied by large and destructive waves. Willa is also forecast to bring high winds and heavy rainfall. \"Slight weakening is forecast to begin on Tuesday, but Willa is expected to be an extremely dangerous major hurricane when it reaches the coast of Mexico,\" the center said. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. A map made by the U.S. 
National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. NATIONAL HURRICANE CENTER The center said Willa was about 175 miles south-southwest of Las Islas Marias, Mexico, and some 135 miles southwest of Cabo Corrientes, Mexico, and was moving north at about 7 mph. Hurricane-force winds extended outward up to 30 miles from the center and tropical-storm-force winds extended outward up to 105 miles. A hurricane warning was posted for a stretch of shore between San Blas and Mazatlan. A tropical storm warning was in effect for Playa Perula to San Blas and north of Mazatlan to Bahia Tempehuaya. Forecasters said Willa is expected to produce storm total rainfall accumulations of 6 to 12 inches, with local amounts up to 18 inches, across portions of western Jalisco, western Nayarit, and southern Sinaloa in Mexico. The rainfall could cause life-threatening flash flooding and landslides. Farther inland, Willa is expected to produce rainfall amounts of 2 to 4 inches across portions of Zacateca, Durango, southeast Chihuahua, and Coahuila in Mexico, with local amounts up to 6 inches possible. That could cause life-threatening flash flooding. After Willa makes its way across Mexico, it could drop between 1 and 3 inches of rain on central and southern Texas during the middle of the week, CBS News contributing meteorologist Jeff Berardelli reports. 
The additional rainfall could cause additional flooding in already saturated areas.']\n", 97 | "\n", 98 | "\n", 99 | "wc=WordCloud(use_tfidf=False,stopwords=ENGLISH_STOP_WORDS)\n", 100 | "\n", 101 | "#randomize color, show only top 40\n", 102 | "embed_code=wc.get_embed_code(text=texts,random_color=True,topn=40)\n", 103 | "HTML(embed_code)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "### Randomized color\n", 111 | "This is the same example as above, with the colors randomized." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 12, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/html": [ 122 | "
storm  projected  flooding  oct  rainfall  et  shows  forecast  near  mph  western  south  life  warning  major  north  san  threatening  winds  mazatlan  tuesday  makes  category  forecasters  11  miles  portions  willa  surge  extended  national  tropical  landfall  local  cause  coast  path  inches  force  southern  outward  southwest  expected  mexico  flash  amounts  center  produce  hurricane  map 
" 123 | ], 124 | "text/plain": [ 125 | "" 126 | ] 127 | }, 128 | "execution_count": 12, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "#randomize color, show only top 50\n", 135 | "embed_code=wc.get_embed_code(text=texts,random_color=True,topn=50)\n", 136 | "HTML(embed_code)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Generate word clouds from multiple documents " 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 5, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "[nltk_data] Downloading package reuters to /Users/kavgan/nltk_data...\n", 156 | "[nltk_data] Package reuters is already up-to-date!\n" 157 | ] 158 | }, 159 | { 160 | "data": { 161 | "text/html": [ 162 | "
board  told  ual  group  offer  purchase  takeover  year  securities  business  unit  chairman  terms  national  billion  bank  international  bid  company  share  new  industries  mln  agreement  april  statement  buy  price  acquisition  american  stake  stock  1986  sale  acquired  companies  report  dlrs  acquire  merger  plan  shares  agreed  lt  000  corp  analysts  pct  chemical  sell 
" 163 | ], 164 | "text/plain": [ 165 | "" 166 | ] 167 | }, 168 | "execution_count": 5, 169 | "metadata": {}, 170 | "output_type": "execute_result" 171 | } 172 | ], 173 | "source": [ 174 | "wc=WordCloud(use_tfidf=False,stopwords=ENGLISH_STOP_WORDS)\n", 175 | "\n", 176 | "nltk.download('reuters')\n", 177 | "\n", 178 | "#get all articles related to acquisitions\n", 179 | "category_docs = reuters.fileids(\"acq\");\n", 180 | "\n", 181 | "\n", 182 | "list_of_documents=[]\n", 183 | "\n", 184 | "#use raw content from the first 100 documents\n", 185 | "for i in range (100):\n", 186 | " document_id = category_docs[i]\n", 187 | " list_of_documents.append(reuters.raw(document_id)) \n", 188 | " \n", 189 | "\n", 190 | "#randomize color, show only top 50\n", 191 | "embed_code=wc.get_embed_code(text=list_of_documents,random_color=True,topn=50)\n", 192 | "HTML(embed_code)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "## Generate word clouds from existing weights\n", 200 | "Let's say you already have a set of words with corresponding weights and you just want to visualize them; that is also an option with this library. All you need to do is make sure that the weights are normalized to the range [0, 1]." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 6, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "data": { 210 | "text/html": [ 211 | "
ambitious  cool-place  nice-work  smart  awesome  great-job  nice-colors  intelligent  small-font  fun-place  medium-font  word-cloud  crazy  cool-cloud  phrase-cloud 
" 212 | ], 213 | "text/plain": [ 214 | "" 215 | ] 216 | }, 217 | "execution_count": 6, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "wc=WordCloud(use_tfidf=False,stopwords=ENGLISH_STOP_WORDS)\n", 224 | "\n", 225 | "#words with corresponding weights\n", 226 | "list_of_scores=[['nice-work',0.2],['great-job',0.7],['cool-place',0.1],['cool-cloud',0.6],['phrase-cloud',0.34],['word-cloud',0.625],['nice-colors',0.525],['small-font',0.4],['fun-place',0.6],['awesome',0.4],['intelligent',0.4],['medium-font',0.4],['crazy',0.2],['smart',0.3],['ambitious',0.4]]\n", 227 | "\n", 228 | "#randomize color, show only top 50\n", 229 | "embed_code=wc.get_embed_code(text_scores=pd.DataFrame(list_of_scores),random_color=True,topn=50)\n", 230 | "HTML(embed_code)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [] 246 | } 247 | ], 248 | "metadata": { 249 | "kernelspec": { 250 | "display_name": "Python 3", 251 | "language": "python", 252 | "name": "python3" 253 | }, 254 | "language_info": { 255 | "codemirror_mode": { 256 | "name": "ipython", 257 | "version": 3 258 | }, 259 | "file_extension": ".py", 260 | "mimetype": "text/x-python", 261 | "name": "python", 262 | "nbconvert_exporter": "python", 263 | "pygments_lexer": "ipython3", 264 | "version": "3.6.5" 265 | } 266 | }, 267 | "nbformat": 4, 268 | "nbformat_minor": 2 269 | } 270 | --------------------------------------------------------------------------------