├── word_cloud
├── __init__.py
└── word_cloud_generator.py
├── _config.yml
├── word_cloud.gif
├── .gitignore
├── setup.py
├── README.md
├── Word Cloud Examples.ipynb
└── Example word clouds.ipynb
/word_cloud/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-hacker
--------------------------------------------------------------------------------
/word_cloud.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kavgan/word_cloud/HEAD/word_cloud.gif
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.xml
2 | *.iml
3 | .ipynb_checkpoints/Word Cloud Examples-checkpoint.ipynb
4 | *.pyc
5 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | import io,os,sys
3 |
def tag():
    """Return the release version read from the ``version`` environment variable.

    Returns ``None`` when the variable is not set.
    """
    return os.environ.get("version")
6 |
7 |
def read_text_lines(fname, encoding="utf-8"):
    """Return the full text content of ``fname`` as a single string.

    Args:
        fname: Path of the text file to read.
        encoding: Codec used to decode the file. Defaults to UTF-8 so the
            result does not depend on the locale of the machine running the
            build.

    Returns:
        The decoded file content, line endings included.
    """
    # The context manager closes the handle even if decoding fails.
    with io.open(fname, encoding=encoding) as fd:
        return fd.read()
12 |
13 |
# Package metadata. The version and the download URL both come from the
# ``version`` environment variable (see ``tag``).
setup(
    name="word_cloud",
    version=tag(),
    packages=find_packages(),
    description='Word cloud of data scientist',
    # Read through the helper so the README file handle is closed right away
    # instead of being left to the garbage collector.
    long_description=read_text_lines("README.md"),
    # The README is Markdown; declare it so package indexes render it as such.
    long_description_content_type='text/markdown',
    classifiers=[
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'Topic :: Text Processing :: Linguistic'
    ],
    author='kavgan',
    author_email='ganesan.kavita@gmail.com',
    license='Apache',
    url='https://github.com/kavgan/word_cloud',
    download_url='https://github.com/kavgan/word_cloud/archive/{0}.tar.gz'.format(tag()),
    keywords=['word cloud', 'visualization', 'text mining'],
    install_requires=[
        'scikit-learn>=0.19.1',
        'pandas>=0.20.3'
    ],
    include_package_data=True,
    entry_points={

    }
)
42 |
43 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # word_cloud
2 | Library for word cloud visualization for data scientists. Use within Jupyter notebook, from a webapp, etc.
3 |
4 | 
5 |
6 | ## Features
7 |
8 | - Generate word cloud for individual documents
9 | - Generate word cloud using a list of documents
10 | - Generate word cloud for words or phrases that already have scores defined
11 | - Embed in Jupyter Notebook
12 | - Show on an HTML page
13 | - Randomize colors
14 |
15 |
16 | ## Quick Start
17 |
18 | 1. Install with pip
19 |
20 | ```
21 | pip install git+ssh://git@github.com/kavgan/word_cloud.git
22 | ```
23 |
24 | 2. Instantiate WordCloud, get word cloud HTML code and display!
25 |
26 | ``` python
27 | from word_cloud.word_cloud_generator import WordCloud
28 | from IPython.core.display import HTML
29 |
30 | ENGLISH_STOP_WORDS = frozenset([
31 | "a", "about", "above", "across", "after", "afterwards", "again", "against",
32 | "all", "almost", "alone", "along", "already", "also", "although", "always",
33 | "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
34 | "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
35 | "around", "as", "at", "back", "be", "became", "because", "become",
36 | "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
37 | "below", "beside", "besides", "between", "beyond", "bill", "both",
38 | "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
39 | "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
40 | "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
41 | "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
42 | "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
43 | "find", "fire", "first", "five", "for", "former", "formerly", "forty",
44 | "found", "four", "from", "front", "full", "further", "get", "give", "go",
45 | "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
46 | "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
47 | "within", "without", "would", "yet", "said","you", "your", "yours", "yourself",
48 | "yourselves"])
49 |
50 | # list of documents
51 | texts=['MEXICO CITY — Newly formed Hurricane Willa rapidly intensified off Mexico\'s Pacific coast','MEXICO CITY — Newly formed Hurricane Willa rapidly intensified off Mexico\'s Pacific coast Sunday and early Monday and became a major Category 5 storm, the U.S. National Hurricane Center said. As of 11 a.m. ET., Willa had maximum sustained winds of 160 mph -- just 3 mph over the threshold for a Category 5. Willa was "potentially catastrophic," forecasters warned. The hurricane center said it could make landfall along Mexico\'s southwestern coast Tuesday afternoon or evening and bring with it a life-threatening storm surge -- especially near and to the south of where the center of Willa makes landfall. Near the coast, the surge will be accompanied by large and destructive waves. Willa is also forecast to bring high winds and heavy rainfall. "Slight weakening is forecast to begin on Tuesday, but Willa is expected to be an extremely dangerous major hurricane when it reaches the coast of Mexico," the center said. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. NATIONAL HURRICANE CENTER The center said Willa was about 175 miles south-southwest of Las Islas Marias, Mexico, and some 135 miles southwest of Cabo Corrientes, Mexico, and was moving north at about 7 mph. Hurricane-force winds extended outward up to 30 miles from the center and tropical-storm-force winds extended outward up to 105 miles. A hurricane warning was posted for a stretch of shore between San Blas and Mazatlan. A tropical storm warning was in effect for Playa Perula to San Blas and north of Mazatlan to Bahia Tempehuaya. 
Forecasters said Willa is expected to produce storm total rainfall accumulations of 6 to 12 inches, with local amounts up to 18 inches, across portions of western Jalisco, western Nayarit, and southern Sinaloa in Mexico. The rainfall could cause life-threatening flash flooding and landslides. Farther inland, Willa is expected to produce rainfall amounts of 2 to 4 inches across portions of Zacateca, Durango, southeast Chihuahua, and Coahuila in Mexico, with local amounts up to 6 inches possible. That could cause life-threatening flash flooding. After Willa makes its way across Mexico, it could drop between 1 and 3 inches of rain on central and southern Texas during the middle of the week, CBS News contributing meteorologist Jeff Berardelli reports. The additional rainfall could cause additional flooding in already saturated areas.','early Monday and became a major Category 5 storm, the U.S. National Hurricane Center said. As of 11 a.m. ET., Willa had maximum sustained winds of 160 mph -- just 3 mph over the threshold for a Category 5. Willa was "potentially catastrophic," forecasters warned. The hurricane center said it could make landfall along Mexico\'s southwestern coast Tuesday afternoon or evening and bring with it a life-threatening storm surge -- especially near and to the south of where the center of Willa makes landfall. Near the coast, the surge will be accompanied by large and destructive waves. Willa is also forecast to bring high winds and heavy rainfall. "Slight weakening is forecast to begin on Tuesday, but Willa is expected to be an extremely dangerous major hurricane when it reaches the coast of Mexico," the center said. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. 
NATIONAL HURRICANE CENTER The center said Willa was about 175 miles south-southwest of Las Islas Marias, Mexico, and some 135 miles southwest of Cabo Corrientes, Mexico, and was moving north at about 7 mph. Hurricane-force winds extended outward up to 30 miles from the center and tropical-storm-force winds extended outward up to 105 miles. A hurricane warning was posted for a stretch of shore between San Blas and Mazatlan. A tropical storm warning was in effect for Playa Perula to San Blas and north of Mazatlan to Bahia Tempehuaya. Forecasters said Willa is expected to produce storm total rainfall accumulations of 6 to 12 inches, with local amounts up to 18 inches, across portions of western Jalisco, western Nayarit, and southern Sinaloa in Mexico. The rainfall could cause life-threatening flash flooding and landslides. Farther inland, Willa is expected to produce rainfall amounts of 2 to 4 inches across portions of Zacateca, Durango, southeast Chihuahua, and Coahuila in Mexico, with local amounts up to 6 inches possible. That could cause life-threatening flash flooding. After Willa makes its way across Mexico, it could drop between 1 and 3 inches of rain on central and southern Texas during the middle of the week, CBS News contributing meteorologist Jeff Berardelli reports. The additional rainfall could cause additional flooding in already saturated areas.']
52 |
53 | # initialize WordCloud
54 | wc=WordCloud(stopwords=ENGLISH_STOP_WORDS)
55 |
56 | # get html code
57 | embed_code=wc.get_embed_code(text=texts,random_color=True,topn=40)
58 |
59 | # display
60 | HTML(embed_code)
61 |
62 | ```
63 |
64 | ## More Examples
65 | - [Check out the Jupyter notebook in this repo](https://github.com/kavgan/word_cloud/blob/master/Example%20word%20clouds.ipynb) (the word cloud only renders if your server is running)
66 | - [Jupyter Notebook on Google's Colaboratory](https://colab.research.google.com/drive/1AkdUKEFmaYom77r6KPh18jdQrplIQbKQ)
67 | - Article about this [Python word cloud](http://kavita-ganesan.com/word-cloud-for-data-scientists/#.W86v6RNKj64) module
68 |
--------------------------------------------------------------------------------
/word_cloud/word_cloud_generator.py:
--------------------------------------------------------------------------------
1 | '''
2 |
3 | Generate HTML code for word cloud
4 |
5 | '''
6 |
7 | import pandas as pd
8 | import logging
9 | import numpy as np
10 | import random
11 | from sklearn.feature_extraction.text import CountVectorizer
12 | from sklearn.feature_extraction.text import TfidfTransformer
13 |
14 |
15 | class WordCloud:
16 |
17 | def __init__(self, stopwords=[], use_tfidf=False):
18 |
19 | self.use_tfidf = use_tfidf
20 | self.data = []
21 | self.color_choices = ['#b82c2c',
22 | '#a55571',
23 | '#bc72d0',
24 | '#8000FF',
25 | '#3498DB',
26 | '#FF5733',
27 | '#223AE6',
28 | '#2ECC71',
29 | '#5F6A6A',
30 | '#6C22E6',
31 | '#CE22E6',
32 | '#ACB02E',
33 | '#B18904',
34 | '#848484',
35 | '#04B404',
36 | '#5882FA',
37 | '#FF0080',
38 | '#0489B1',
39 | '#FA5858',
40 | '#DBA901',
41 | '#00b4ff',
42 | '#008080',
43 | '#003366',
44 | '#725394'
45 | ]
46 | self.color_choices = ['#b82c2c',
47 | '#a55571',
48 | '#bc72d0',
49 | '#8000FF',
50 | '#3498DB',
51 | '#FF5733',
52 | '#223AE6',
53 | '#2ECC71',
54 | '#5F6A6A',
55 | '#6C22E6',
56 | '#CE22E6',
57 | '#ACB02E',
58 | '#B18904',
59 | '#848484',
60 | '#04B404',
61 | '#5882FA',
62 | '#FF0080',
63 | '#0489B1',
64 | '#FA5858',
65 | '#DBA901',
66 | '#00b4ff',
67 | '#008080',
68 | '#003366',
69 | '#725394'
70 | ]
71 |
72 | # load a set of stop words
73 | self.stopwords = stopwords
74 |
def get_color_code(self, score):
    """Pick the palette entry for the 0.05-wide bucket that ``score`` falls in.

    Thresholds start at 0 and grow by 0.05; the first threshold that is
    greater than or equal to ``score`` selects the colour. Scores above
    every threshold below 1 fall back to the first palette entry.
    """
    palette = self.color_choices
    threshold = 0
    bucket = 0

    while True:
        if threshold >= 1:
            # No bucket matched: use the default colour.
            return palette[0]
        if score <= threshold:
            return palette[bucket]
        bucket += 1
        threshold = threshold + 0.05
89 |
90 |
def get_font_size(self, score: float):
    """Map ``score`` to a font scale: the larger the score, the larger the scale.

    The scale starts at 0.5 and grows by 0.15 for every 0.05 score step
    needed to reach ``score``; stepping stops once the threshold reaches 1.
    """
    font_scale = 0.5
    threshold = 0

    # Walk the thresholds upward until one covers the score or the range
    # is exhausted; the scale grows in lockstep.
    while threshold < 1 and not score <= threshold:
        threshold = threshold + 0.05
        font_scale += 0.15

    return font_scale
117 |
def get_embed_code(self, text_scores: pd.DataFrame = None, text: list = [], topn=100, random_color=True):
    """Build the word-cloud markup and return it as one string.

    Args:
        text_scores: Optional two-column frame of already scored words. When
            given, it is used as is and its columns are renamed in place to
            ``words`` and ``score`` (the caller's frame is modified).
        text: List of documents; used only when ``text_scores`` is None.
        topn: Number of top-scoring words requested from
            ``extract_topn_from_vector``.
        random_color: When true, ``self.color_choices`` is shuffled in place
            before colours are assigned.

    Returns:
        The joined markup string, or ``None`` (after logging an error) when
        neither ``text_scores`` nor a non-empty ``text`` was supplied.
    """

    if text_scores is None and len(text) > 0:
        # Score the raw documents and keep the best ``topn`` words.
        items = self.extract_topn_from_vector(text, topn=topn)
        text_df = pd.DataFrame(items, columns=['words', 'score'])
    elif text_scores is not None:
        # Same object as the argument: the rename below is visible to the caller.
        text_df = text_scores
        text_df.columns = ['words', 'score']
    else:
        logging.error(
            "There is a problem with your input text. Did you provide any?")
        return

    if random_color:
        # Shuffles the instance palette itself, so the new order persists
        # for later calls.
        random.shuffle(self.color_choices)

    word_cloud_items = []

    # NOTE(review): the string literals in this method look truncated in this
    # copy of the file (the format call below passes three arguments but only
    # {2} is referenced) — confirm them against the upstream source.
    html = [
        "
"]
    for idx, row in text_df.iterrows():
        # Multi-word phrases are joined with hyphens.
        word = row.words.replace(" ", "-")
        scale = self.get_font_size(row.score)
        color_code = self.get_color_code(row.score)
        word_cloud_items.append(
            " {2} ".format(
                color_code, scale, word))

    # Randomize the order in which the words appear.
    random.shuffle(word_cloud_items)
    random.shuffle(word_cloud_items)

    html.extend(word_cloud_items)
    html.append("
")
    return ''.join(html)
152 |
def sort_coo(self, coo_matrix):
    """Return ``(column, value)`` pairs of ``coo_matrix`` ordered by value, then column, both descending."""
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
156 |
157 |
def get_ranks(self, word_vector):
    """Scale ``word_vector`` so that its largest entry becomes 1.

    Args:
        word_vector: Numeric scores accepted by ``np.max`` and
            ``np.multiply``.

    Returns:
        The scores divided by their maximum. A vector whose maximum is 0 is
        returned unscaled (as floats) rather than divided by zero.
    """
    peak = np.max(word_vector)

    # Nothing to normalize against: dividing by a zero maximum would turn
    # every entry into NaN.
    if peak == 0:
        return np.multiply(word_vector, 1.0)

    return np.multiply(word_vector, 1 / peak)
167 |
def get_normalized_tf(self, cv: CountVectorizer, text: list):
    """Count terms over all documents joined together, scaled by the largest count.

    ``cv`` is fitted on the single space-joined document, so it holds the
    vocabulary afterwards.
    """
    counts = cv.fit_transform([' '.join(text)])
    peak = np.max(counts)

    # Divide every raw count by the highest one.
    return np.multiply(counts, 1 / peak)
179 |
def get_tfidf_scores(self, cv: CountVectorizer, text: list):
    """Score the space-joined documents with tf-idf weights learned from the individual documents."""
    # Per-document counts; this also fits the vocabulary of ``cv``.
    counts_per_doc = cv.fit_transform(text)

    weigher = TfidfTransformer(smooth_idf=False, use_idf=True, norm='l2')
    weigher.fit(counts_per_doc)

    # Apply the learned idf weights to all documents taken as one text.
    merged_counts = cv.transform([' '.join(text)])
    return weigher.transform(merged_counts)
194 |
def extract_topn_from_vector(self, text: list, topn=10):
    """Return the ``topn`` highest-scoring vocabulary terms of ``text``.

    Args:
        text: List of documents to score.
        topn: Maximum number of terms to return.

    Returns:
        A list of ``[term, score]`` pairs ordered by descending score.
        Scores are tf-idf values when ``self.use_tfidf`` is true, otherwise
        term counts normalized by the largest count.
    """
    stop_words = self.stopwords
    # Newer scikit-learn releases accept only a string, a list or None for
    # ``stop_words``; other collections (e.g. a frozenset) are converted.
    if stop_words is not None and not isinstance(stop_words, (str, list)):
        stop_words = list(stop_words)

    cv = CountVectorizer(stop_words=stop_words)

    if self.use_tfidf:
        word_scores_vector = self.get_tfidf_scores(cv, text)
    else:
        word_scores_vector = self.get_normalized_tf(cv, text)

    # Best-scoring (vocabulary index, score) pairs first.
    sorted_items = self.sort_coo(word_scores_vector.tocoo())[:topn]

    # Resolve the vocabulary once instead of once per item.
    # ``get_feature_names`` is missing from recent scikit-learn releases, so
    # prefer ``get_feature_names_out`` and fall back for older ones.
    if hasattr(cv, "get_feature_names_out"):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()

    return [[feature_names[idx], score] for idx, score in sorted_items]
222 |
--------------------------------------------------------------------------------
/Word Cloud Examples.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "data": {
10 | "text/html": [
11 | " near forecast rainfall forecasters center cause coast oct path life miles projected mazatlan makes hurricane produce shows willa category south warning map surge mph flooding tuesday local north winds landfall national san southern outward portions 11 southwest mexico storm threatening flash expected inches extended et tropical force western amounts major
"
12 | ],
13 | "text/plain": [
14 | ""
15 | ]
16 | },
17 | "execution_count": 4,
18 | "metadata": {},
19 | "output_type": "execute_result"
20 | }
21 | ],
22 | "source": [
23 | "from word_cloud.word_cloud import WordCloud\n",
24 | "from IPython.core.display import HTML\n",
25 | "\n",
26 | "ENGLISH_STOP_WORDS = frozenset([\n",
27 | " \"a\", \"about\", \"above\", \"across\", \"after\", \"afterwards\", \"again\", \"against\",\n",
28 | " \"all\", \"almost\", \"alone\", \"along\", \"already\", \"also\", \"although\", \"always\",\n",
29 | " \"am\", \"among\", \"amongst\", \"amoungst\", \"amount\", \"an\", \"and\", \"another\",\n",
30 | " \"any\", \"anyhow\", \"anyone\", \"anything\", \"anyway\", \"anywhere\", \"are\",\n",
31 | " \"around\", \"as\", \"at\", \"back\", \"be\", \"became\", \"because\", \"become\",\n",
32 | " \"becomes\", \"becoming\", \"been\", \"before\", \"beforehand\", \"behind\", \"being\",\n",
33 | " \"below\", \"beside\", \"besides\", \"between\", \"beyond\", \"bill\", \"both\",\n",
34 | " \"bottom\", \"but\", \"by\", \"call\", \"can\", \"cannot\", \"cant\", \"co\", \"con\",\n",
35 | " \"could\", \"couldnt\", \"cry\", \"de\", \"describe\", \"detail\", \"do\", \"done\",\n",
36 | " \"down\", \"due\", \"during\", \"each\", \"eg\", \"eight\", \"either\", \"eleven\", \"else\",\n",
37 | " \"elsewhere\", \"empty\", \"enough\", \"etc\", \"even\", \"ever\", \"every\", \"everyone\",\n",
38 | " \"everything\", \"everywhere\", \"except\", \"few\", \"fifteen\", \"fifty\", \"fill\",\n",
39 | " \"find\", \"fire\", \"first\", \"five\", \"for\", \"former\", \"formerly\", \"forty\",\n",
40 | " \"found\", \"four\", \"from\", \"front\", \"full\", \"further\", \"get\", \"give\", \"go\",\n",
41 | " \"had\", \"has\", \"hasnt\", \"have\", \"he\", \"hence\", \"her\", \"here\", \"hereafter\",\n",
42 | " \"hereby\", \"herein\", \"hereupon\", \"hers\", \"herself\", \"him\", \"himself\", \"his\",\n",
43 | " \"how\", \"however\", \"hundred\", \"i\", \"ie\", \"if\", \"in\", \"inc\", \"indeed\",\n",
44 | " \"interest\", \"into\", \"is\", \"it\", \"its\", \"itself\", \"keep\", \"last\", \"latter\",\n",
45 | " \"latterly\", \"least\", \"less\", \"ltd\", \"made\", \"many\", \"may\", \"me\",\n",
46 | " \"meanwhile\", \"might\", \"mill\", \"mine\", \"more\", \"moreover\", \"most\", \"mostly\",\n",
47 | " \"move\", \"much\", \"must\", \"my\", \"myself\", \"name\", \"namely\", \"neither\",\n",
48 | " \"never\", \"nevertheless\", \"next\", \"nine\", \"no\", \"nobody\", \"none\", \"noone\",\n",
49 | " \"nor\", \"not\", \"nothing\", \"now\", \"nowhere\", \"of\", \"off\", \"often\", \"on\",\n",
50 | " \"once\", \"one\", \"only\", \"onto\", \"or\", \"other\", \"others\", \"otherwise\", \"our\",\n",
51 | " \"ours\", \"ourselves\", \"out\", \"over\", \"own\", \"part\", \"per\", \"perhaps\",\n",
52 | " \"please\", \"put\", \"rather\", \"re\", \"same\", \"see\", \"seem\", \"seemed\",\n",
53 | " \"seeming\", \"seems\", \"serious\", \"several\", \"she\", \"should\", \"show\", \"side\",\n",
54 | " \"since\", \"sincere\", \"six\", \"sixty\", \"so\", \"some\", \"somehow\", \"someone\",\n",
55 | " \"something\", \"sometime\", \"sometimes\", \"somewhere\", \"still\", \"such\",\n",
56 | " \"system\", \"take\", \"ten\", \"than\", \"that\", \"the\", \"their\", \"them\",\n",
57 | " \"themselves\", \"then\", \"thence\", \"there\", \"thereafter\", \"thereby\",\n",
58 | " \"therefore\", \"therein\", \"thereupon\", \"these\", \"they\", \"thick\", \"thin\",\n",
59 | " \"third\", \"this\", \"those\", \"though\", \"three\", \"through\", \"throughout\",\n",
60 | " \"thru\", \"thus\", \"to\", \"together\", \"too\", \"top\", \"toward\", \"towards\",\n",
61 | " \"twelve\", \"twenty\", \"two\", \"un\", \"under\", \"until\", \"up\", \"upon\", \"us\",\n",
62 | " \"very\", \"via\", \"was\", \"we\", \"well\", \"were\", \"what\", \"whatever\", \"when\",\n",
63 | " \"whence\", \"whenever\", \"where\", \"whereafter\", \"whereas\", \"whereby\",\n",
64 | " \"wherein\", \"whereupon\", \"wherever\", \"whether\", \"which\", \"while\", \"whither\",\n",
65 | " \"who\", \"whoever\", \"whole\", \"whom\", \"whose\", \"why\", \"will\", \"with\",\n",
66 | " \"within\", \"without\", \"would\", \"yet\", \"said\",\"you\", \"your\", \"yours\", \"yourself\",\n",
67 | " \"yourselves\"])\n",
68 | "\n",
69 | "wc=WordCloud(use_tfidf=False,stopwords=ENGLISH_STOP_WORDS)\n",
70 | "texts=['MEXICO CITY — Newly formed Hurricane Willa rapidly intensified off Mexico\\'s Pacific coast','MEXICO CITY — Newly formed Hurricane Willa rapidly intensified off Mexico\\'s Pacific coast Sunday and early Monday and became a major Category 5 storm, the U.S. National Hurricane Center said. As of 11 a.m. ET., Willa had maximum sustained winds of 160 mph -- just 3 mph over the threshold for a Category 5. Willa was \"potentially catastrophic,\" forecasters warned. The hurricane center said it could make landfall along Mexico\\'s southwestern coast Tuesday afternoon or evening and bring with it a life-threatening storm surge -- especially near and to the south of where the center of Willa makes landfall. Near the coast, the surge will be accompanied by large and destructive waves. Willa is also forecast to bring high winds and heavy rainfall. \"Slight weakening is forecast to begin on Tuesday, but Willa is expected to be an extremely dangerous major hurricane when it reaches the coast of Mexico,\" the center said. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. NATIONAL HURRICANE CENTER The center said Willa was about 175 miles south-southwest of Las Islas Marias, Mexico, and some 135 miles southwest of Cabo Corrientes, Mexico, and was moving north at about 7 mph. Hurricane-force winds extended outward up to 30 miles from the center and tropical-storm-force winds extended outward up to 105 miles. A hurricane warning was posted for a stretch of shore between San Blas and Mazatlan. A tropical storm warning was in effect for Playa Perula to San Blas and north of Mazatlan to Bahia Tempehuaya. 
Forecasters said Willa is expected to produce storm total rainfall accumulations of 6 to 12 inches, with local amounts up to 18 inches, across portions of western Jalisco, western Nayarit, and southern Sinaloa in Mexico. The rainfall could cause life-threatening flash flooding and landslides. Farther inland, Willa is expected to produce rainfall amounts of 2 to 4 inches across portions of Zacateca, Durango, southeast Chihuahua, and Coahuila in Mexico, with local amounts up to 6 inches possible. That could cause life-threatening flash flooding. After Willa makes its way across Mexico, it could drop between 1 and 3 inches of rain on central and southern Texas during the middle of the week, CBS News contributing meteorologist Jeff Berardelli reports. The additional rainfall could cause additional flooding in already saturated areas.','early Monday and became a major Category 5 storm, the U.S. National Hurricane Center said. As of 11 a.m. ET., Willa had maximum sustained winds of 160 mph -- just 3 mph over the threshold for a Category 5. Willa was \"potentially catastrophic,\" forecasters warned. The hurricane center said it could make landfall along Mexico\\'s southwestern coast Tuesday afternoon or evening and bring with it a life-threatening storm surge -- especially near and to the south of where the center of Willa makes landfall. Near the coast, the surge will be accompanied by large and destructive waves. Willa is also forecast to bring high winds and heavy rainfall. \"Slight weakening is forecast to begin on Tuesday, but Willa is expected to be an extremely dangerous major hurricane when it reaches the coast of Mexico,\" the center said. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. 
NATIONAL HURRICANE CENTER The center said Willa was about 175 miles south-southwest of Las Islas Marias, Mexico, and some 135 miles southwest of Cabo Corrientes, Mexico, and was moving north at about 7 mph. Hurricane-force winds extended outward up to 30 miles from the center and tropical-storm-force winds extended outward up to 105 miles. A hurricane warning was posted for a stretch of shore between San Blas and Mazatlan. A tropical storm warning was in effect for Playa Perula to San Blas and north of Mazatlan to Bahia Tempehuaya. Forecasters said Willa is expected to produce storm total rainfall accumulations of 6 to 12 inches, with local amounts up to 18 inches, across portions of western Jalisco, western Nayarit, and southern Sinaloa in Mexico. The rainfall could cause life-threatening flash flooding and landslides. Farther inland, Willa is expected to produce rainfall amounts of 2 to 4 inches across portions of Zacateca, Durango, southeast Chihuahua, and Coahuila in Mexico, with local amounts up to 6 inches possible. That could cause life-threatening flash flooding. After Willa makes its way across Mexico, it could drop between 1 and 3 inches of rain on central and southern Texas during the middle of the week, CBS News contributing meteorologist Jeff Berardelli reports. The additional rainfall could cause additional flooding in already saturated areas.']\n",
71 | "embed_code=wc.get_embed_code(text=texts,random_color=False,topn=50)\n",
72 | "HTML(embed_code)"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": []
81 | }
82 | ],
83 | "metadata": {
84 | "kernelspec": {
85 | "display_name": "Python 3",
86 | "language": "python",
87 | "name": "python3"
88 | },
89 | "language_info": {
90 | "codemirror_mode": {
91 | "name": "ipython",
92 | "version": 3
93 | },
94 | "file_extension": ".py",
95 | "mimetype": "text/x-python",
96 | "name": "python",
97 | "nbconvert_exporter": "python",
98 | "pygments_lexer": "ipython3",
99 | "version": "3.6.5"
100 | }
101 | },
102 | "nbformat": 4,
103 | "nbformat_minor": 1
104 | }
105 |
--------------------------------------------------------------------------------
/Example word clouds.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from word_cloud.word_cloud_generator import WordCloud\n",
10 | "from IPython.core.display import HTML\n",
11 | "from nltk.corpus import reuters\n",
12 | "import nltk\n",
13 | "import pandas as pd"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 2,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "ENGLISH_STOP_WORDS = frozenset([\n",
23 | " \"a\", \"about\", \"above\", \"across\", \"after\", \"afterwards\", \"again\", \"against\",\n",
24 | " \"all\", \"almost\", \"alone\", \"along\", \"already\", \"also\", \"although\", \"always\",\n",
25 | " \"am\", \"among\", \"amongst\", \"amoungst\", \"amount\", \"an\", \"and\", \"another\",\n",
26 | " \"any\", \"anyhow\", \"anyone\", \"anything\", \"anyway\", \"anywhere\", \"are\",\n",
27 | " \"around\", \"as\", \"at\", \"back\", \"be\", \"became\", \"because\", \"become\",\n",
28 | " \"becomes\", \"becoming\", \"been\", \"before\", \"beforehand\", \"behind\", \"being\",\n",
29 | " \"below\", \"beside\", \"besides\", \"between\", \"beyond\", \"bill\", \"both\",\n",
30 | " \"bottom\", \"but\", \"by\", \"call\", \"can\", \"cannot\", \"cant\", \"co\", \"con\",\n",
31 | " \"could\", \"couldnt\", \"cry\", \"de\", \"describe\", \"detail\", \"do\", \"done\",\n",
32 | " \"down\", \"due\", \"during\", \"each\", \"eg\", \"eight\", \"either\", \"eleven\", \"else\",\n",
33 | " \"elsewhere\", \"empty\", \"enough\", \"etc\", \"even\", \"ever\", \"every\", \"everyone\",\n",
34 | " \"everything\", \"everywhere\", \"except\", \"few\", \"fifteen\", \"fifty\", \"fill\",\n",
35 | " \"find\", \"fire\", \"first\", \"five\", \"for\", \"former\", \"formerly\", \"forty\",\n",
36 | " \"found\", \"four\", \"from\", \"front\", \"full\", \"further\", \"get\", \"give\", \"go\",\n",
37 | " \"had\", \"has\", \"hasnt\", \"have\", \"he\", \"hence\", \"her\", \"here\", \"hereafter\",\n",
38 | " \"hereby\", \"herein\", \"hereupon\", \"hers\", \"herself\", \"him\", \"himself\", \"his\",\n",
39 | " \"how\", \"however\", \"hundred\", \"i\", \"ie\", \"if\", \"in\", \"inc\", \"indeed\",\n",
40 | " \"interest\", \"into\", \"is\", \"it\", \"its\", \"itself\", \"keep\", \"last\", \"latter\",\n",
41 | " \"latterly\", \"least\", \"less\", \"ltd\", \"made\", \"many\", \"may\", \"me\",\n",
42 | " \"meanwhile\", \"might\", \"mill\", \"mine\", \"more\", \"moreover\", \"most\", \"mostly\",\n",
43 | " \"move\", \"much\", \"must\", \"my\", \"myself\", \"name\", \"namely\", \"neither\",\n",
44 | " \"never\", \"nevertheless\", \"next\", \"nine\", \"no\", \"nobody\", \"none\", \"noone\",\n",
45 | " \"nor\", \"not\", \"nothing\", \"now\", \"nowhere\", \"of\", \"off\", \"often\", \"on\",\n",
46 | " \"once\", \"one\", \"only\", \"onto\", \"or\", \"other\", \"others\", \"otherwise\", \"our\",\n",
47 | " \"ours\", \"ourselves\", \"out\", \"over\", \"own\", \"part\", \"per\", \"perhaps\",\n",
48 | " \"please\", \"put\", \"rather\", \"re\", \"same\", \"see\", \"seem\", \"seemed\",\n",
49 | " \"seeming\", \"seems\", \"serious\", \"several\", \"she\", \"should\", \"show\", \"side\",\n",
50 | " \"since\", \"sincere\", \"six\", \"sixty\", \"so\", \"some\", \"somehow\", \"someone\",\n",
51 | " \"something\", \"sometime\", \"sometimes\", \"somewhere\", \"still\", \"such\",\n",
52 | " \"system\", \"take\", \"ten\", \"than\", \"that\", \"the\", \"their\", \"them\",\n",
53 | " \"themselves\", \"then\", \"thence\", \"there\", \"thereafter\", \"thereby\",\n",
54 | " \"therefore\", \"therein\", \"thereupon\", \"these\", \"they\", \"thick\", \"thin\",\n",
55 | " \"third\", \"this\", \"those\", \"though\", \"three\", \"through\", \"throughout\",\n",
56 | " \"thru\", \"thus\", \"to\", \"together\", \"too\", \"top\", \"toward\", \"towards\",\n",
57 | " \"twelve\", \"twenty\", \"two\", \"un\", \"under\", \"until\", \"up\", \"upon\", \"us\",\n",
58 | " \"very\", \"via\", \"was\", \"we\", \"well\", \"were\", \"what\", \"whatever\", \"when\",\n",
59 | " \"whence\", \"whenever\", \"where\", \"whereafter\", \"whereas\", \"whereby\",\n",
60 | " \"wherein\", \"whereupon\", \"wherever\", \"whether\", \"which\", \"while\", \"whither\",\n",
61 | " \"who\", \"whoever\", \"whole\", \"whom\", \"whose\", \"why\", \"will\", \"with\",\n",
62 | " \"within\", \"without\", \"would\", \"yet\", \"said\",\"you\", \"your\", \"yours\", \"yourself\",\n",
63 | " \"yourselves\"])\n"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "## Generate word clouds with a single text document\n",
71 | "\n",
72 | "This example showcases how you can generate word clouds from just one document. While the colors can be randomized, in this example the colors are based on the default color settings. By default, words are weighted by word counts unless you explicitly ask for `tfidf` weighting. Tfidf weighting makes sense only if you have a lot of documents to start with; otherwise the `idf` values would be incorrect."
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 19,
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "data": {
82 | "text/html": [
83 | " inches willa center national south cause tropical hurricane 11 path mazatlan miles winds threatening mexico san warning outward southern southwest tuesday storm mph et surge coast oct shows rainfall flooding projected expected life produce western north portions amounts near map
"
84 | ],
85 | "text/plain": [
86 | ""
87 | ]
88 | },
89 | "execution_count": 19,
90 | "metadata": {},
91 | "output_type": "execute_result"
92 | }
93 | ],
94 | "source": [
95 | "#only one news article here\n",
96 | "texts=['MEXICO CITY — Newly formed Hurricane Willa rapidly intensified off Mexico\\'s Pacific coast','MEXICO CITY — Newly formed Hurricane Willa rapidly intensified off Mexico\\'s Pacific coast Sunday and early Monday and became a major Category 5 storm, the U.S. National Hurricane Center said. As of 11 a.m. ET., Willa had maximum sustained winds of 160 mph -- just 3 mph over the threshold for a Category 5. Willa was \"potentially catastrophic,\" forecasters warned. The hurricane center said it could make landfall along Mexico\\'s southwestern coast Tuesday afternoon or evening and bring with it a life-threatening storm surge -- especially near and to the south of where the center of Willa makes landfall. Near the coast, the surge will be accompanied by large and destructive waves. Willa is also forecast to bring high winds and heavy rainfall. \"Slight weakening is forecast to begin on Tuesday, but Willa is expected to be an extremely dangerous major hurricane when it reaches the coast of Mexico,\" the center said. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. NATIONAL HURRICANE CENTER The center said Willa was about 175 miles south-southwest of Las Islas Marias, Mexico, and some 135 miles southwest of Cabo Corrientes, Mexico, and was moving north at about 7 mph. Hurricane-force winds extended outward up to 30 miles from the center and tropical-storm-force winds extended outward up to 105 miles. A hurricane warning was posted for a stretch of shore between San Blas and Mazatlan. A tropical storm warning was in effect for Playa Perula to San Blas and north of Mazatlan to Bahia Tempehuaya. 
Forecasters said Willa is expected to produce storm total rainfall accumulations of 6 to 12 inches, with local amounts up to 18 inches, across portions of western Jalisco, western Nayarit, and southern Sinaloa in Mexico. The rainfall could cause life-threatening flash flooding and landslides. Farther inland, Willa is expected to produce rainfall amounts of 2 to 4 inches across portions of Zacateca, Durango, southeast Chihuahua, and Coahuila in Mexico, with local amounts up to 6 inches possible. That could cause life-threatening flash flooding. After Willa makes its way across Mexico, it could drop between 1 and 3 inches of rain on central and southern Texas during the middle of the week, CBS News contributing meteorologist Jeff Berardelli reports. The additional rainfall could cause additional flooding in already saturated areas.','early Monday and became a major Category 5 storm, the U.S. National Hurricane Center said. As of 11 a.m. ET., Willa had maximum sustained winds of 160 mph -- just 3 mph over the threshold for a Category 5. Willa was \"potentially catastrophic,\" forecasters warned. The hurricane center said it could make landfall along Mexico\\'s southwestern coast Tuesday afternoon or evening and bring with it a life-threatening storm surge -- especially near and to the south of where the center of Willa makes landfall. Near the coast, the surge will be accompanied by large and destructive waves. Willa is also forecast to bring high winds and heavy rainfall. \"Slight weakening is forecast to begin on Tuesday, but Willa is expected to be an extremely dangerous major hurricane when it reaches the coast of Mexico,\" the center said. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. A map made by the U.S. National Hurricane Center shows the projected path for Hurricane Willa as of 11 a.m. ET on Oct. 22, 2018. 
NATIONAL HURRICANE CENTER The center said Willa was about 175 miles south-southwest of Las Islas Marias, Mexico, and some 135 miles southwest of Cabo Corrientes, Mexico, and was moving north at about 7 mph. Hurricane-force winds extended outward up to 30 miles from the center and tropical-storm-force winds extended outward up to 105 miles. A hurricane warning was posted for a stretch of shore between San Blas and Mazatlan. A tropical storm warning was in effect for Playa Perula to San Blas and north of Mazatlan to Bahia Tempehuaya. Forecasters said Willa is expected to produce storm total rainfall accumulations of 6 to 12 inches, with local amounts up to 18 inches, across portions of western Jalisco, western Nayarit, and southern Sinaloa in Mexico. The rainfall could cause life-threatening flash flooding and landslides. Farther inland, Willa is expected to produce rainfall amounts of 2 to 4 inches across portions of Zacateca, Durango, southeast Chihuahua, and Coahuila in Mexico, with local amounts up to 6 inches possible. That could cause life-threatening flash flooding. After Willa makes its way across Mexico, it could drop between 1 and 3 inches of rain on central and southern Texas during the middle of the week, CBS News contributing meteorologist Jeff Berardelli reports. The additional rainfall could cause additional flooding in already saturated areas.']\n",
97 | "\n",
98 | "\n",
99 | "wc=WordCloud(use_tfidf=False,stopwords=ENGLISH_STOP_WORDS)\n",
100 | "\n",
101 | "#randomize colors, show only the top 40 words\n",
102 | "embed_code=wc.get_embed_code(text=texts,random_color=True,topn=40)\n",
103 | "HTML(embed_code)"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "### Randomized color\n",
111 | "This is the same example as above, with the colors randomized"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 12,
117 | "metadata": {},
118 | "outputs": [
119 | {
120 | "data": {
121 | "text/html": [
122 | " storm projected flooding oct rainfall et shows forecast near mph western south life warning major north san threatening winds mazatlan tuesday makes category forecasters 11 miles portions willa surge extended national tropical landfall local cause coast path inches force southern outward southwest expected mexico flash amounts center produce hurricane map
"
123 | ],
124 | "text/plain": [
125 | ""
126 | ]
127 | },
128 | "execution_count": 12,
129 | "metadata": {},
130 | "output_type": "execute_result"
131 | }
132 | ],
133 | "source": [
134 | "#randomize colors, show only the top 50 words\n",
135 | "embed_code=wc.get_embed_code(text=texts,random_color=True,topn=50)\n",
136 | "HTML(embed_code)"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "## Generate word clouds from multiple documents "
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 5,
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "name": "stdout",
153 | "output_type": "stream",
154 | "text": [
155 | "[nltk_data] Downloading package reuters to /Users/kavgan/nltk_data...\n",
156 | "[nltk_data] Package reuters is already up-to-date!\n"
157 | ]
158 | },
159 | {
160 | "data": {
161 | "text/html": [
162 | " board told ual group offer purchase takeover year securities business unit chairman terms national billion bank international bid company share new industries mln agreement april statement buy price acquisition american stake stock 1986 sale acquired companies report dlrs acquire merger plan shares agreed lt 000 corp analysts pct chemical sell
"
163 | ],
164 | "text/plain": [
165 | ""
166 | ]
167 | },
168 | "execution_count": 5,
169 | "metadata": {},
170 | "output_type": "execute_result"
171 | }
172 | ],
173 | "source": [
174 | "wc=WordCloud(use_tfidf=False,stopwords=ENGLISH_STOP_WORDS)\n",
175 | "\n",
176 | "nltk.download('reuters')\n",
177 | "\n",
178 | "#get all articles related to acquisitions\n",
179 | "category_docs = reuters.fileids(\"acq\");\n",
180 | "\n",
181 | "\n",
182 | "list_of_documents=[]\n",
183 | "\n",
184 | "#use raw content from the first 100 documents\n",
185 | "for i in range (100):\n",
186 | " document_id = category_docs[i]\n",
187 | " list_of_documents.append(reuters.raw(document_id)) \n",
188 | " \n",
189 | "\n",
190 | "#randomize colors, show only the top 50 words\n",
191 | "embed_code=wc.get_embed_code(text=list_of_documents,random_color=True,topn=50)\n",
192 | "HTML(embed_code)"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "metadata": {},
198 | "source": [
199 | "## Generate word clouds from existing weights\n",
200 | "Let's say you already have a set of words with corresponding weights and you just want to visualize them; that is also an option with this library. All you need to do is make sure that the weights are normalized to the range [0, 1]."
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 6,
206 | "metadata": {},
207 | "outputs": [
208 | {
209 | "data": {
210 | "text/html": [
211 | " ambitious cool-place nice-work smart awesome great-job nice-colors intelligent small-font fun-place medium-font word-cloud crazy cool-cloud phrase-cloud
"
212 | ],
213 | "text/plain": [
214 | ""
215 | ]
216 | },
217 | "execution_count": 6,
218 | "metadata": {},
219 | "output_type": "execute_result"
220 | }
221 | ],
222 | "source": [
223 | "wc=WordCloud(use_tfidf=False,stopwords=ENGLISH_STOP_WORDS)\n",
224 | "\n",
225 | "#words with corresponding weights\n",
226 | "list_of_scores=[['nice-work',0.2],['great-job',0.7],['cool-place',0.1],['cool-cloud',0.6],['phrase-cloud',0.34],['word-cloud',0.625],['nice-colors',0.525],['small-font',0.4],['fun-place',0.6],['awesome',0.4],['intelligent',0.4],['medium-font',0.4],['crazy',0.2],['smart',0.3],['ambitious',0.4]]\n",
227 | "\n",
228 | "#randomize colors, show only the top 50 words\n",
229 | "embed_code=wc.get_embed_code(text_scores=pd.DataFrame(list_of_scores),random_color=True,topn=50)\n",
230 | "HTML(embed_code)"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": []
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": []
246 | }
247 | ],
248 | "metadata": {
249 | "kernelspec": {
250 | "display_name": "Python 3",
251 | "language": "python",
252 | "name": "python3"
253 | },
254 | "language_info": {
255 | "codemirror_mode": {
256 | "name": "ipython",
257 | "version": 3
258 | },
259 | "file_extension": ".py",
260 | "mimetype": "text/x-python",
261 | "name": "python",
262 | "nbconvert_exporter": "python",
263 | "pygments_lexer": "ipython3",
264 | "version": "3.6.5"
265 | }
266 | },
267 | "nbformat": 4,
268 | "nbformat_minor": 2
269 | }
270 |
--------------------------------------------------------------------------------