├── PyData-Language-of-Novice-Talks.ipynb ├── PyData-Scattertext-Part-1.ipynb ├── PyData-Scattertext-Part-2.ipynb ├── PyData-Scattertext-Part-3.ipynb ├── PyData2017Kessler.pptx ├── README.md ├── img ├── genderandparty.png └── gensim_similarity.png ├── output ├── Conventions2012Gender.html ├── Conventions2012GenderAndParty.html ├── Conventions2012GenderAndPartyCorner.html ├── Conventions2012ScattertextLog.html ├── Conventions2012ScattertextRankData.html ├── Conventions2012ScattertextRankDataJitter.html ├── Conventions2012ScattertextRankDefault.html ├── Conventions2012ScattertextScale.html ├── CornervsLog.html ├── EmpathGender.html ├── L2vsLog.html ├── LOPriorvsLog.html ├── Pydata2016vs2017.html ├── Pydata2017vs2016.html ├── PydataAdvancedVsRest.html ├── PydataNoviceVsNotNovice.html ├── SFS_UnifPrior.html ├── SFSvsLog.html ├── demo_similarity.html └── demo_similarity_gensim.html ├── pydata_talks.csv └── ~$PyData2017Kessler.pptx /PyData-Language-of-Novice-Talks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Using Scattertext to Analyze PyData Talks\n", 8 | "Let's pull titles abstracts and descriptions of PyData talks to see how novice-level talks differed from intermediate and advanced talks.\n", 9 | "\n", 10 | "Please check out Scattertext on Github: https://github.com/JasonKessler/scattertext" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/html": [ 21 | "" 22 | ], 23 | "text/plain": [ 24 | "" 25 | ] 26 | }, 27 | "metadata": {}, 28 | "output_type": "display_data" 29 | } 30 | ], 31 | "source": [ 32 | "import pandas as pd\n", 33 | "import requests\n", 34 | "from bs4 import BeautifulSoup\n", 35 | "import re, time\n", 36 | "import pygal\n", 37 | "import scattertext as st\n", 38 | "from IPython.display import IFrame\n", 39 | "from 
IPython.core.display import display, HTML\n", 40 | "import seaborn as sns\n", 41 | "display(HTML(\"\"))\n", 42 | "import spacy\n", 43 | "import scattertext as st\n", 44 | "%matplotlib inline" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## First, let's scrape pydata.org" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 2, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "def parse_talk(url):\n", 63 | " d = {}\n", 64 | " try:\n", 65 | " soup = BeautifulSoup(requests.get(url).text, 'lxml')\n", 66 | " content = soup.find_all('div', class_='container')[1]\n", 67 | " d['author'] = content.find_all('a')[0].contents[0]\n", 68 | " d['title'] = content.find_all('h2')[0].contents[0]\n", 69 | " d['level'] = content.find_all('dd')[0].contents[0] \n", 70 | " d['description'] = soup.find_all('div', class_='description')[0].get_text()\n", 71 | " d['abstract'] = soup.find_all('div', class_='abstract')[0].get_text()\n", 72 | " except:\n", 73 | " print('bad', url)\n", 74 | " return None\n", 75 | " \n", 76 | " return d\n", 77 | "\n", 78 | "def pull_pydata_schedule(loc, year):\n", 79 | " url = 'https://pydata.org/'+loc+str(year)+'/schedule/' \n", 80 | " soup = BeautifulSoup(requests.get(url).text, 'lxml')\n", 81 | " content = soup.find_all('div', class_='container')[1]\n", 82 | " talks = []\n", 83 | " for slot in content.find_all('td', class_='slot'):\n", 84 | " for link in slot.find_all('a'): \n", 85 | " d = parse_talk('https://pydata.org'+link.attrs['href'])\n", 86 | " if d is not None:\n", 87 | " d['location'] = loc\n", 88 | " d['year'] = str(year)\n", 89 | " talks.append(d)\n", 90 | " time.sleep(5) # for politeness\n", 91 | " print(loc, year)\n", 92 | " return pd.DataFrame(talks)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 3, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | 
"text": [ 104 | "seattle 2017\n", 105 | "london 2017\n", 106 | "bad https://pydata.orghttps://pydata.org/barcelona2017/schedule/presentation/42/\n", 107 | "bad https://pydata.orghttps://pydata.org/barcelona2017/schedule/presentation/33/\n", 108 | "bad https://pydata.orghttps://pydata.org/barcelona2017/schedule/presentation/34/\n", 109 | "bad https://pydata.orghttps://pydata.org/barcelona2017/schedule/presentation/52/\n", 110 | "barcelona 2017\n", 111 | "bad https://pydata.org/berlin2017/program/breakfast-and-lunch-program/\n", 112 | "bad https://pydata.org/berlin2017/keynote-speakers/\n", 113 | "bad https://pydata.org/berlin2017/program/breakfast-and-lunch-program/\n", 114 | "bad https://pydata.org/berlin2017/keynote-speakers#veronica-valeros\n", 115 | "bad https://pydata.org/berlin2017/program/breakfast-and-lunch-program/\n", 116 | "bad https://pydata.org/berlin2017/keynote-speakers#toby-walsh\n", 117 | "bad https://pydata.org/berlin2017/program/breakfast-and-lunch-program/\n", 118 | "bad https://pydata.org/berlin2017/keynote-speakers/#ethical-machine-learning-panel\n", 119 | "berlin 2017\n", 120 | "dc 2016\n", 121 | "carolinas 2016\n", 122 | "chicago 2016\n", 123 | "sfo 2016\n", 124 | "paris 2016\n", 125 | "berlin 2016\n", 126 | "london 2016\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "sched = pd.concat([pull_pydata_schedule('seattle', 2017),\n", 132 | " pull_pydata_schedule('london', 2017),\n", 133 | " pull_pydata_schedule('barcelona', 2017), \n", 134 | " pull_pydata_schedule('berlin', 2017), \n", 135 | " pull_pydata_schedule('dc', 2016),\n", 136 | " pull_pydata_schedule('carolinas', 2016),\n", 137 | " pull_pydata_schedule('chicago', 2016), \n", 138 | " pull_pydata_schedule('sfo', 2016), \n", 139 | " pull_pydata_schedule('paris', 2016), \n", 140 | " pull_pydata_schedule('berlin', 2016), \n", 141 | " pull_pydata_schedule('london', 2016)])" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 23, 147 | "metadata": { 148 | 
"collapsed": true 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "sched.to_csv('pydata_talks.csv', index=False)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 3, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "sched = pd.read_csv('pydata_talks.csv')" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 4, 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "nlp = spacy.load('en')" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 5, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "sched = sched[~sched['title'].isin(['BoF', 'Unconference Presentation'])]" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 6, 191 | "metadata": { 192 | "collapsed": true 193 | }, 194 | "outputs": [], 195 | "source": [ 196 | "sched['is_novice'] = (sched.level == 'Novice').apply(lambda x: 'Novice' if x else 'Not Novice')" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 7, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "sched['parse'] = (sched['title'] + '\\n \\n' + sched['abstract'].fillna('') + '\\n \\n' + sched['description'].fillna('')).apply(nlp)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 8, 213 | "metadata": { 214 | "collapsed": true 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "sched = sched.loc[sched['title'].drop_duplicates().index]" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "## Let's see how descriptions of novice-directed talks sound compared to directed at more seasoned audiences" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 12, 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "data": { 235 | 
"text/html": [ 236 | "\n", 237 | " \n", 244 | " " 245 | ], 246 | "text/plain": [ 247 | "" 248 | ] 249 | }, 250 | "execution_count": 12, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "html = st.produce_scattertext_explorer(st.CorpusFromParsedDocuments(sched, category_col = 'is_novice', parsed_col = 'parse').build(),\n", 257 | " category='Novice',\n", 258 | " category_name='Novice',\n", 259 | " not_category_name='Intermediate or Advanced',\n", 260 | " minimum_term_frequency=8,\n", 261 | " pmi_threshold_coefficient=10,\n", 262 | " width_in_pixels=1000,\n", 263 | " term_ranker=st.OncePerDocFrequencyRanker,\n", 264 | " use_full_doc=True,\n", 265 | " metadata=sched['author'] + ' ('+sched['location'] + ', '+ sched['level'] + ')',)\n", 266 | "file_name = 'output/PydataNoviceVsNotNovice.html'\n", 267 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 268 | "IFrame(src=file_name, width = 1200, height=700)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "## Let's see how the experiened talk descriptions sound" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 23, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/html": [ 286 | "\n", 287 | " \n", 294 | " " 295 | ], 296 | "text/plain": [ 297 | "" 298 | ] 299 | }, 300 | "execution_count": 23, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "sched['is_advanced'] = (sched.level == 'Experienced').apply(lambda x: 'Experienced' if x else 'Not Experienced')\n", 307 | "html = st.produce_scattertext_explorer(st.CorpusFromParsedDocuments(sched, category_col = 'is_advanced', parsed_col = 'parse').build(),\n", 308 | " category='Experienced',\n", 309 | " category_name='Experienced',\n", 310 | " not_category_name='Not Experienced',\n", 311 | " minimum_term_frequency=8,\n", 312 | " pmi_filter_thresold=8, \n", 313 | " 
width_in_pixels=1000,\n", 314 | " term_ranker=st.OncePerDocFrequencyRanker,\n", 315 | " use_full_doc=True,\n", 316 | " metadata=sched['author'] + ' ('+sched['location'] + ', '+ sched['level'] + ')',)\n", 317 | "file_name = 'output/PydataAdvancedVsRest.html'\n", 318 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 319 | "IFrame(src=file_name, width = 1200, height=700)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": { 326 | "collapsed": true 327 | }, 328 | "outputs": [], 329 | "source": [] 330 | } 331 | ], 332 | "metadata": { 333 | "anaconda-cloud": {}, 334 | "kernelspec": { 335 | "display_name": "Python [Root]", 336 | "language": "python", 337 | "name": "Python [Root]" 338 | }, 339 | "language_info": { 340 | "codemirror_mode": { 341 | "name": "ipython", 342 | "version": 3 343 | }, 344 | "file_extension": ".py", 345 | "mimetype": "text/x-python", 346 | "name": "python", 347 | "nbconvert_exporter": "python", 348 | "pygments_lexer": "ipython3", 349 | "version": "3.5.2" 350 | } 351 | }, 352 | "nbformat": 4, 353 | "nbformat_minor": 2 354 | } 355 | -------------------------------------------------------------------------------- /PyData-Scattertext-Part-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Scattertext\n", 8 | "\n", 9 | "## @jasonkessler\n", 10 | "\n", 11 | "https://github.com/JasonKessler/scattertext\n", 12 | "\n", 13 | "\n", 14 | "\n", 15 | "Cite as:\n", 16 | "Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 
2017.\n", 17 | "\n", 18 | "Link to preprint: https://arxiv.org/abs/1703.00565\n", 19 | "\n", 20 | "`\n", 21 | "@article{kessler2017scattertext,\n", 22 | " author = {Kessler, Jason S.},\n", 23 | " title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ},\n", 24 | " booktitle = {ACL System Demonstrations},\n", 25 | " year = {2017},\n", 26 | "}\n", 27 | "`" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 4, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/html": [ 38 | "" 39 | ], 40 | "text/plain": [ 41 | "" 42 | ] 43 | }, 44 | "metadata": {}, 45 | "output_type": "display_data" 46 | } 47 | ], 48 | "source": [ 49 | "%matplotlib inline\n", 50 | "import scattertext as st\n", 51 | "import re, io\n", 52 | "from pprint import pprint\n", 53 | "import pandas as pd\n", 54 | "import numpy as np\n", 55 | "from scipy.stats import rankdata, hmean, norm\n", 56 | "import spacy\n", 57 | "import os, pkgutil, json, urllib\n", 58 | "from urllib.request import urlopen\n", 59 | "from IPython.display import IFrame\n", 60 | "from IPython.core.display import display, HTML\n", 61 | "from scattertext import CorpusFromPandas, produce_scattertext_explorer\n", 62 | "display(HTML(\"\"))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 5, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "nlp = spacy.load('en')\n", 74 | "# If this doesn't work, please uncomment the following line and use a regex-based parser instead\n", 75 | "#nlp = st.whitespace_nlp_with_sentences" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "# Grab the 2012 political convention data set and preview it" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 6, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "convention_df = st.SampleCorpora.ConventionData2012.get_data()" 94 | ] 95 | }, 96 
| { 97 | "cell_type": "code", 98 | "execution_count": 7, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "party democrat\n", 105 | "speaker BARACK OBAMA\n", 106 | "text Thank you. Thank you. Thank you. Thank you so ...\n", 107 | "Name: 0, dtype: object" 108 | ] 109 | }, 110 | "execution_count": 7, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "convention_df.iloc[0]" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 8, 122 | "metadata": { 123 | "scrolled": true 124 | }, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "Document Count\n", 131 | "party\n", 132 | "democrat 123\n", 133 | "republican 66\n", 134 | "Name: text, dtype: int64\n", 135 | "Word Count\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "print(\"Document Count\")\n", 141 | "print(convention_df.groupby('party')['text'].count())\n", 142 | "print(\"Word Count\")\n", 143 | "convention_df.groupby('party').apply(lambda x: x.text.apply(lambda x: len(x.split())).sum())\n", 144 | "convention_df['parsed'] = convention_df.text.apply(nlp)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "# Turn it into a Scattertext corpus, and have spaCy parse it." 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 9, 157 | "metadata": { 158 | "collapsed": true 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "corpus = st.CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parsed').build()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "# Scattertext has some functions to find how associated words are with categories \n", 170 | "# I've reworded this section since the talk\n", 171 | "## Lots of ways to do this. 
I'm partial to a novel technique called Scaled F-Score\n", 172 | "# Intuition:\n", 173 | "### Associated terms have a *relatively* high category-specific precision and category-specific term frequency (i.e., % of terms in category are term)\n", 174 | "### Take the harmonic mean of precision and frequency (both have to be high)\n", 175 | "### Hyper-parameters are pretty much universal (beta and transformation function)\n", 176 | "\n", 177 | "Given a word $w_i \\in W$ and a category $c_j \\in C$, define the precision of the word $w_i$ with respect to a category as:\n", 178 | "$$ \\mbox{prec}(w_i, c_j) = \\frac{\\#(w_i, c_j)}{\\sum_{c \\in C} \\#(w_i, c)}. $$\n", 179 | "\n", 180 | "The function $\\#(w_i, c_j)$ represents either the number of times $w_i$ occurs in a document labeled with the category $c_j$ or the number of documents labeled $c_j$ which contain $w_i$.\n", 181 | "\n", 182 | "Similarly, define the frequency a word occurs in the category as:\n", 183 | "\n", 184 | "$$ \\mbox{freq}(w_i, c_j) = \\frac{\\#(w_i, c_j)}{\\sum_{w \\in W} \\#(w, c_j)}. $$\n", 185 | "\n", 186 | "The F-Score of these two values is defined as:\n", 187 | "\n", 188 | "$$ \\mathcal{F}_\\beta(\\mbox{prec}, \\mbox{freq}) = (1 + \\beta^2) \\frac{\\mbox{prec} \\cdot \\mbox{freq}}{\\beta^2 \\cdot \\mbox{prec} + \\mbox{freq}}. $$\n", 189 | "\n", 190 | "$\\beta \\in \\mathcal{R}^+$ is a scaling factor where precision is favored if $\\beta < 1$, frequency if $\\beta > 1$, and both are equally weighted if $\\beta = 1$. F-Score is equivalent to the harmonic mean where $\\beta = 1$." 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 8, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/html": [ 201 | "

\n", 202 | "\n", 215 | "\n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | "

	democrat freq	republican freq	dem_precision	dem_freq_pct	dem_hmean
term
the	3402	2532	0.573306	0.022343	0.043009
and	2709	2233	0.548159	0.017791	0.034464
to	2340	1667	0.583978	0.015368	0.029948
a	1602	1345	0.543604	0.010521	0.020643
of	1569	1377	0.532587	0.010304	0.020218
that	1400	1051	0.571195	0.009195	0.018098
we	1318	1146	0.534903	0.008656	0.017036
in	1291	986	0.566974	0.008479	0.016708
i	1098	851	0.563366	0.007211	0.014240
's	1037	631	0.621703	0.006811	0.013473

\n", 317 | "

" 318 | ], 319 | "text/plain": [ 320 | " democrat freq republican freq dem_precision dem_freq_pct dem_hmean\n", 321 | "term \n", 322 | "the 3402 2532 0.573306 0.022343 0.043009\n", 323 | "and 2709 2233 0.548159 0.017791 0.034464\n", 324 | "to 2340 1667 0.583978 0.015368 0.029948\n", 325 | "a 1602 1345 0.543604 0.010521 0.020643\n", 326 | "of 1569 1377 0.532587 0.010304 0.020218\n", 327 | "that 1400 1051 0.571195 0.009195 0.018098\n", 328 | "we 1318 1146 0.534903 0.008656 0.017036\n", 329 | "in 1291 986 0.566974 0.008479 0.016708\n", 330 | "i 1098 851 0.563366 0.007211 0.014240\n", 331 | "'s 1037 631 0.621703 0.006811 0.013473" 332 | ] 333 | }, 334 | "execution_count": 8, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "term_freq_df = corpus.get_term_freq_df()\n", 341 | "term_freq_df['dem_precision'] = term_freq_df['democrat freq'] * 1./(term_freq_df['democrat freq'] + term_freq_df['republican freq'])\n", 342 | "term_freq_df['dem_freq_pct'] = term_freq_df['democrat freq'] * 1./term_freq_df['democrat freq'].sum()\n", 343 | "term_freq_df['dem_hmean'] = term_freq_df.apply(lambda x: (hmean([x['dem_precision'], x['dem_freq_pct']])\n", 344 | " if x['dem_precision'] > 0 and x['dem_freq_pct'] > 0 \n", 345 | " else 0), axis=1) \n", 346 | "term_freq_df.sort_values(by='dem_hmean', ascending=False).iloc[:10]" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "## Solution:\n", 354 | "### Take the normal CDF of precision and frequency percentage scores, which will fall between 0 and 1, which scales and standardizes both scores.\n", 355 | "\n", 356 | "Define the the Normal CDF as:\n", 357 | "\n", 358 | "$$ \\Phi(z) = \\int_{-\\infty}^z \\mathcal{N}(x; \\mu, \\sigma^2)\\ \\mathrm{d}x.$$\n", 359 | "\n", 360 | "Where $ \\mathcal{N} $ is the PDF of the Normal distribution, $\\mu$ is the mean, and $\\sigma^2$ is the variance.\n", 361 | "\n", 362 | "$\\Phi$ is used to scale and 
standardize the precisions and frequencies, and place them on the same scale $[0,1]$.\n", 363 | "\n", 364 | "Now we can define Scaled F-Score as the harmonic mean of the Normal CDF transformed frequency and precision:\n", 365 | "\n", 366 | "$$ \\mathcal{S}_{\\beta}(w_i, c_j) = \\mathcal{F}_{\\beta}(\\Phi(\\mbox{prec}(w_i, c_j)), \\Phi(\\mbox{freq}(w_i, c_j))).$$\n", 367 | "\n", 368 | "$\\mu$ and $\\sigma^2$ are defined separately as the mean and variance of precision and frequency.\n", 369 | "\n", 370 | "A $\\beta$ of 0.5 is recommended and is the default value in Scattertext.\n", 371 | "\n", 372 | "Note that any function with the range of $[0,1]$ (this includes the identity function) may be used in place of $\\Phi$." 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 15, 378 | "metadata": {}, 379 | "outputs": [ 380 | { 381 | "data": { 382 | "text/html": [ 383 | "

\n", 384 | "\n", 397 | "\n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | "

	democrat freq	republican freq	dem_precision	dem_freq_pct	dem_hmean	dem_precision_normcdf	dem_freq_pct_normcdf	dem_scaled_f_score
term
middle class	148	18	0.891566	0.000972	0.001942	0.769762	1.000000	0.869905
auto	37	0	1.000000	0.000243	0.000486	0.836010	0.889307	0.861835
fair	45	3	0.937500	0.000296	0.000591	0.799485	0.933962	0.861507
insurance	54	6	0.900000	0.000355	0.000709	0.775397	0.965959	0.860251
forward	105	16	0.867769	0.000690	0.001378	0.753443	0.999858	0.859334
president barack	47	4	0.921569	0.000309	0.000617	0.789447	0.942572	0.859241
class	161	25	0.865591	0.001057	0.002112	0.751919	1.000000	0.858395
middle	164	27	0.858639	0.001077	0.002151	0.747021	1.000000	0.855194
the middle	98	17	0.852174	0.000644	0.001286	0.742422	0.999640	0.852041
medicare	84	15	0.848485	0.000552	0.001103	0.739778	0.998050	0.849722

\n", 535 | "

" 536 | ], 537 | "text/plain": [ 538 | " democrat freq republican freq dem_precision dem_freq_pct \\\n", 539 | "term \n", 540 | "middle class 148 18 0.891566 0.000972 \n", 541 | "auto 37 0 1.000000 0.000243 \n", 542 | "fair 45 3 0.937500 0.000296 \n", 543 | "insurance 54 6 0.900000 0.000355 \n", 544 | "forward 105 16 0.867769 0.000690 \n", 545 | "president barack 47 4 0.921569 0.000309 \n", 546 | "class 161 25 0.865591 0.001057 \n", 547 | "middle 164 27 0.858639 0.001077 \n", 548 | "the middle 98 17 0.852174 0.000644 \n", 549 | "medicare 84 15 0.848485 0.000552 \n", 550 | "\n", 551 | " dem_hmean dem_precision_normcdf dem_freq_pct_normcdf \\\n", 552 | "term \n", 553 | "middle class 0.001942 0.769762 1.000000 \n", 554 | "auto 0.000486 0.836010 0.889307 \n", 555 | "fair 0.000591 0.799485 0.933962 \n", 556 | "insurance 0.000709 0.775397 0.965959 \n", 557 | "forward 0.001378 0.753443 0.999858 \n", 558 | "president barack 0.000617 0.789447 0.942572 \n", 559 | "class 0.002112 0.751919 1.000000 \n", 560 | "middle 0.002151 0.747021 1.000000 \n", 561 | "the middle 0.001286 0.742422 0.999640 \n", 562 | "medicare 0.001103 0.739778 0.998050 \n", 563 | "\n", 564 | " dem_scaled_f_score \n", 565 | "term \n", 566 | "middle class 0.869905 \n", 567 | "auto 0.861835 \n", 568 | "fair 0.861507 \n", 569 | "insurance 0.860251 \n", 570 | "forward 0.859334 \n", 571 | "president barack 0.859241 \n", 572 | "class 0.858395 \n", 573 | "middle 0.855194 \n", 574 | "the middle 0.852041 \n", 575 | "medicare 0.849722 " 576 | ] 577 | }, 578 | "execution_count": 15, 579 | "metadata": {}, 580 | "output_type": "execute_result" 581 | } 582 | ], 583 | "source": [ 584 | "def normcdf(x):\n", 585 | " return norm.cdf(x, x.mean(), x.std())\n", 586 | "term_freq_df['dem_precision_normcdf'] = normcdf(term_freq_df['dem_precision'])\n", 587 | "term_freq_df['dem_freq_pct_normcdf'] = normcdf(term_freq_df['dem_freq_pct'])\n", 588 | "term_freq_df['dem_scaled_f_score'] = hmean([term_freq_df['dem_precision_normcdf'], 
term_freq_df['dem_freq_pct_normcdf']])\n", 589 | "term_freq_df.sort_values(by='dem_scaled_f_score', ascending=False).iloc[:10]" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 16, 595 | "metadata": { 596 | "scrolled": false 597 | }, 598 | "outputs": [ 599 | { 600 | "data": { 601 | "text/html": [ 602 | "

\n", 603 | "\n", 616 | "\n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | "

	democrat freq	republican freq	dem_precision	dem_freq_pct	dem_hmean	dem_precision_normcdf	dem_freq_pct_normcdf	dem_scaled_f_score	dem_corner_score
term
auto	37	0	1.0	0.000243	0.000486	0.83601	0.889307	0.861835	0.919467
america forward	28	0	1.0	0.000184	0.000368	0.83601	0.817094	0.826444	0.919436
auto industry	24	0	1.0	0.000158	0.000315	0.83601	0.777205	0.805536	0.919413
insurance companies	24	0	1.0	0.000158	0.000315	0.83601	0.777205	0.805536	0.919413
pell	23	0	1.0	0.000151	0.000302	0.83601	0.766509	0.799752	0.919404
last week	22	0	1.0	0.000144	0.000289	0.83601	0.755535	0.793738	0.919393
pell grants	21	0	1.0	0.000138	0.000276	0.83601	0.744288	0.787487	0.919381
platform	20	0	1.0	0.000131	0.000263	0.83601	0.732776	0.780996	0.919369
women 's	20	0	1.0	0.000131	0.000263	0.83601	0.732776	0.780996	0.919369
millionaires	18	0	1.0	0.000118	0.000236	0.83601	0.708996	0.767282	0.919333

\n", 766 | "

" 767 | ], 768 | "text/plain": [ 769 | " democrat freq republican freq dem_precision \\\n", 770 | "term \n", 771 | "auto 37 0 1.0 \n", 772 | "america forward 28 0 1.0 \n", 773 | "auto industry 24 0 1.0 \n", 774 | "insurance companies 24 0 1.0 \n", 775 | "pell 23 0 1.0 \n", 776 | "last week 22 0 1.0 \n", 777 | "pell grants 21 0 1.0 \n", 778 | "platform 20 0 1.0 \n", 779 | "women 's 20 0 1.0 \n", 780 | "millionaires 18 0 1.0 \n", 781 | "\n", 782 | " dem_freq_pct dem_hmean dem_precision_normcdf \\\n", 783 | "term \n", 784 | "auto 0.000243 0.000486 0.83601 \n", 785 | "america forward 0.000184 0.000368 0.83601 \n", 786 | "auto industry 0.000158 0.000315 0.83601 \n", 787 | "insurance companies 0.000158 0.000315 0.83601 \n", 788 | "pell 0.000151 0.000302 0.83601 \n", 789 | "last week 0.000144 0.000289 0.83601 \n", 790 | "pell grants 0.000138 0.000276 0.83601 \n", 791 | "platform 0.000131 0.000263 0.83601 \n", 792 | "women 's 0.000131 0.000263 0.83601 \n", 793 | "millionaires 0.000118 0.000236 0.83601 \n", 794 | "\n", 795 | " dem_freq_pct_normcdf dem_scaled_f_score \\\n", 796 | "term \n", 797 | "auto 0.889307 0.861835 \n", 798 | "america forward 0.817094 0.826444 \n", 799 | "auto industry 0.777205 0.805536 \n", 800 | "insurance companies 0.777205 0.805536 \n", 801 | "pell 0.766509 0.799752 \n", 802 | "last week 0.755535 0.793738 \n", 803 | "pell grants 0.744288 0.787487 \n", 804 | "platform 0.732776 0.780996 \n", 805 | "women 's 0.732776 0.780996 \n", 806 | "millionaires 0.708996 0.767282 \n", 807 | "\n", 808 | " dem_corner_score \n", 809 | "term \n", 810 | "auto 0.919467 \n", 811 | "america forward 0.919436 \n", 812 | "auto industry 0.919413 \n", 813 | "insurance companies 0.919413 \n", 814 | "pell 0.919404 \n", 815 | "last week 0.919393 \n", 816 | "pell grants 0.919381 \n", 817 | "platform 0.919369 \n", 818 | "women 's 0.919369 \n", 819 | "millionaires 0.919333 " 820 | ] 821 | }, 822 | "execution_count": 16, 823 | "metadata": {}, 824 | "output_type": "execute_result" 825 
| } 826 | ], 827 | "source": [ 828 | "term_freq_df['dem_corner_score'] = corpus.get_corner_scores('democrat')\n", 829 | "term_freq_df.sort_values(by='dem_corner_score', ascending=False).iloc[:10]" 830 | ] 831 | }, 832 | { 833 | "cell_type": "code", 834 | "execution_count": 17, 835 | "metadata": {}, 836 | "outputs": [ 837 | { 838 | "name": "stdout", 839 | "output_type": "stream", 840 | "text": [ 841 | "Top 10 Democratic terms\n", 842 | "['auto',\n", 843 | " 'america forward',\n", 844 | " 'fought for',\n", 845 | " 'insurance companies',\n", 846 | " 'auto industry',\n", 847 | " 'fair',\n", 848 | " 'pell',\n", 849 | " 'last week',\n", 850 | " 'fighting for',\n", 851 | " 'president barack']\n", 852 | "Top 10 Republican terms\n", 853 | "['unemployment',\n", 854 | " 'do better',\n", 855 | " 'liberty',\n", 856 | " 'olympics',\n", 857 | " 'built it',\n", 858 | " 'it has',\n", 859 | " 'ann',\n", 860 | " 'reagan',\n", 861 | " 'big government',\n", 862 | " 'story of']\n" 863 | ] 864 | } 865 | ], 866 | "source": [ 867 | "term_freq_df = corpus.get_term_freq_df()\n", 868 | "term_freq_df['Republican Score'] = corpus.get_scaled_f_scores('republican')\n", 869 | "term_freq_df['Democratic Score'] = corpus.get_scaled_f_scores('democrat')\n", 870 | "print(\"Top 10 Democratic terms\")\n", 871 | "pprint(list(term_freq_df.sort_values(by='Democratic Score', ascending=False).index[:10]))\n", 872 | "print(\"Top 10 Republican terms\")\n", 873 | "pprint(list(term_freq_df.sort_values(by='Republican Score', ascending=False).index[:10]))" 874 | ] 875 | }, 876 | { 877 | "cell_type": "markdown", 878 | "metadata": {}, 879 | "source": [ 880 | "# Make and visualize chart, scale based on raw frequency.\n", 881 | "### - A word used 10 times by Republicans will be at position 10 on the x-axis \n", 882 | "### - This isn't very useful. 
Everything but the most frequent terms is squished into the lower-left corner\n", 883 | "### - The corner-distance scores are largely stopwords\n", 884 | "### - By default, color words by Scaled F-Score" 885 | ] 886 | }, 887 | { 888 | "cell_type": "code", 889 | "execution_count": 31, 890 | "metadata": { 891 | "scrolled": false 892 | }, 893 | "outputs": [ 894 | { 895 | "data": { 896 | "text/html": [ 897 | "\n", 898 | " \n", 905 | " " 906 | ], 907 | "text/plain": [ 908 | "" 909 | ] 910 | }, 911 | "execution_count": 31, 912 | "metadata": {}, 913 | "output_type": "execute_result" 914 | } 915 | ], 916 | "source": [ 917 | "html = produce_scattertext_explorer(corpus,\n", 918 | " category='democrat',\n", 919 | " category_name='Democratic',\n", 920 | " not_category_name='Republican',\n", 921 | " width_in_pixels=1000,\n", 922 | " minimum_term_frequency=5,\n", 923 | " transform=st.Scalers.scale,\n", 924 | " metadata=convention_df['speaker'])\n", 925 | "file_name = 'output/Conventions2012ScattertextScale.html'\n", 926 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 927 | "IFrame(src=file_name, width = 1200, height=700)\n" 928 | ] 929 | }, 930 | { 931 | "cell_type": "markdown", 932 | "metadata": {}, 933 | "source": [ 934 | "## Using log scales seems to help a bit, but blank space and stop words still dominate the graph\n", 935 | "### The characteristic terms look much more informative" 936 | ] 937 | }, 938 | { 939 | "cell_type": "code", 940 | "execution_count": 32, 941 | "metadata": {}, 942 | "outputs": [ 943 | { 944 | "data": { 945 | "text/html": [ 946 | "\n", 947 | " \n", 954 | " " 955 | ], 956 | "text/plain": [ 957 | "" 958 | ] 959 | }, 960 | "execution_count": 32, 961 | "metadata": {}, 962 | "output_type": "execute_result" 963 | } 964 | ], 965 | "source": [ 966 | "html = st.produce_scattertext_explorer(corpus,\n", 967 | " category='democrat',\n", 968 | " category_name='Democratic',\n", 969 | " not_category_name='Republican',\n", 970 | " minimum_term_frequency=5,\n", 971 
| " width_in_pixels=1000,\n", 972 | " transform=st.Scalers.log_scale_standardize)\n", 973 | "file_name = 'output/Conventions2012ScattertextLog.html'\n", 974 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 975 | "IFrame(src=file_name, width = 1200, height=700)\n" 976 | ] 977 | }, 978 | { 979 | "cell_type": "markdown", 980 | "metadata": {}, 981 | "source": [ 982 | "# Rank terms by frequency percentiles instead of raw frequencies. \n", 983 | "### A term at the middle of the x-axis will be mentioned by Republicans at the median frequency.\n", 984 | "### This nicely distributes terms throughout the space\n", 985 | "### But, terms occurring with the same frequencies in both classes are stacked atop each other.\n", 986 | "### Can't mouseover points not at top of stack." 987 | ] 988 | }, 989 | { 990 | "cell_type": "code", 991 | "execution_count": 33, 992 | "metadata": { 993 | "scrolled": false 994 | }, 995 | "outputs": [ 996 | { 997 | "data": { 998 | "text/html": [ 999 | "\n", 1000 | " \n", 1007 | " " 1008 | ], 1009 | "text/plain": [ 1010 | "" 1011 | ] 1012 | }, 1013 | "execution_count": 33, 1014 | "metadata": {}, 1015 | "output_type": "execute_result" 1016 | } 1017 | ], 1018 | "source": [ 1019 | "html = produce_scattertext_explorer(corpus,\n", 1020 | " category='democrat',\n", 1021 | " category_name='Democratic',\n", 1022 | " not_category_name='Republican',\n", 1023 | " width_in_pixels=1000,\n", 1024 | " minimum_term_frequency=5,\n", 1025 | " transform=st.Scalers.percentile,\n", 1026 | " metadata=convention_df['speaker'])\n", 1027 | "file_name = 'output/Conventions2012ScattertextRankData.html'\n", 1028 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 1029 | "IFrame(src=file_name, width = 1200, height=700)\n" 1030 | ] 1031 | }, 1032 | { 1033 | "cell_type": "markdown", 1034 | "metadata": {}, 1035 | "source": [ 1036 | "# One solution is to randomly jitter each point\n", 1037 | "## Points don't leave enough space for many labels\n", 1038 | "## Top terms 
largely a result of jitter" 1039 | ] 1040 | }, 1041 | { 1042 | "cell_type": "code", 1043 | "execution_count": 34, 1044 | "metadata": {}, 1045 | "outputs": [ 1046 | { 1047 | "data": { 1048 | "text/html": [ 1049 | "\n", 1050 | " \n", 1057 | " " 1058 | ], 1059 | "text/plain": [ 1060 | "" 1061 | ] 1062 | }, 1063 | "execution_count": 34, 1064 | "metadata": {}, 1065 | "output_type": "execute_result" 1066 | } 1067 | ], 1068 | "source": [ 1069 | "html = produce_scattertext_explorer(corpus,\n", 1070 | " category='democrat',\n", 1071 | " category_name='Democratic',\n", 1072 | " not_category_name='Republican',\n", 1073 | " width_in_pixels=1000,\n", 1074 | " jitter=0.1,\n", 1075 | " minimum_term_frequency=5,\n", 1076 | " transform=st.Scalers.percentile,\n", 1077 | " metadata=convention_df['speaker'])\n", 1078 | "file_name = 'output/Conventions2012ScattertextRankDataJitter.html'\n", 1079 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 1080 | "IFrame(src=file_name, width = 1200, height=700)\n" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "markdown", 1085 | "metadata": {}, 1086 | "source": [ 1087 | "# The preferred solution is to fall back to alphabetic order among equally frequent terms\n", 1088 | "## Lets you mouseover all points\n", 1089 | "## Leaves a bit of room for labels\n", 1090 | "## Top points may be slightly distorted" 1091 | ] 1092 | }, 1093 | { 1094 | "cell_type": "code", 1095 | "execution_count": 22, 1096 | "metadata": { 1097 | "scrolled": false 1098 | }, 1099 | "outputs": [ 1100 | { 1101 | "data": { 1102 | "text/html": [ 1103 | "\n", 1104 | " \n", 1111 | " " 1112 | ], 1113 | "text/plain": [ 1114 | "" 1115 | ] 1116 | }, 1117 | "execution_count": 22, 1118 | "metadata": {}, 1119 | "output_type": "execute_result" 1120 | } 1121 | ], 1122 | "source": [ 1123 | "html = produce_scattertext_explorer(corpus,\n", 1124 | " category='democrat',\n", 1125 | " category_name='Democratic',\n", 1126 | " not_category_name='Republican',\n", 1127 | " width_in_pixels=1000,\n", 
1128 | " minimum_term_frequency=5,\n", 1129 | " metadata=convention_df['speaker'],\n", 1130 | " term_significance = st.LogOddsRatioUninformativeDirichletPrior())\n", 1131 | "file_name = 'output/Conventions2012ScattertextRankDefault.html'\n", 1132 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 1133 | "IFrame(src=file_name, width = 1200, height=700)" 1134 | ] 1135 | }, 1136 | { 1137 | "cell_type": "markdown", 1138 | "metadata": {}, 1139 | "source": [ 1140 | "# Scattertext can also be used for alternative visualizations\n", 1141 | "## Visualize L2-penalized logistic regression coefficients vs. log term frequency" 1142 | ] 1143 | }, 1144 | { 1145 | "cell_type": "code", 1146 | "execution_count": 23, 1147 | "metadata": { 1148 | "collapsed": true 1149 | }, 1150 | "outputs": [], 1151 | "source": [ 1152 | "def scale(ar): \n", 1153 | " return (ar - ar.min()) / (ar.max() - ar.min())\n", 1154 | "\n", 1155 | "def zero_centered_scale(ar):\n", 1156 | " scores = np.zeros(len(ar))\n", 1157 | " scores[ar > 0] = scale(ar[ar > 0])\n", 1158 | " scores[ar < 0] = -scale(-ar[ar < 0])\n", 1159 | " return (scores + 1) / 2.\n", 1160 | "\n", 1161 | "frequencies_scaled = scale(np.log(term_freq_df.sum(axis=1).values))" 1162 | ] 1163 | }, 1164 | { 1165 | "cell_type": "code", 1166 | "execution_count": 24, 1167 | "metadata": {}, 1168 | "outputs": [ 1169 | { 1170 | "name": "stderr", 1171 | "output_type": "stream", 1172 | "text": [ 1173 | "/Users/kesslej/anaconda3/lib/python3.5/site-packages/sklearn/linear_model/logistic.py:1228: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. 
Got 'n_jobs' = -1.\n", 1174 | " \" = {}.\".format(self.n_jobs))\n" 1175 | ] 1176 | }, 1177 | { 1178 | "data": { 1179 | "text/html": [ 1180 | "\n", 1181 | " \n", 1188 | " " 1189 | ], 1190 | "text/plain": [ 1191 | "" 1192 | ] 1193 | }, 1194 | "execution_count": 24, 1195 | "metadata": {}, 1196 | "output_type": "execute_result" 1197 | } 1198 | ], 1199 | "source": [ 1200 | "from sklearn.linear_model import LogisticRegression\n", 1201 | "scores = corpus.get_logreg_coefs('democrat',\n", 1202 | " LogisticRegression(penalty='l2', C=10, max_iter=10000, n_jobs=-1))\n", 1203 | "scores_scaled = zero_centered_scale(scores)\n", 1204 | "\n", 1205 | "html = produce_scattertext_explorer(corpus,\n", 1206 | " category='democrat',\n", 1207 | " category_name='Democratic',\n", 1208 | " not_category_name='Republican',\n", 1209 | " minimum_term_frequency=5,\n", 1210 | " width_in_pixels=1000,\n", 1211 | " x_coords=frequencies_scaled,\n", 1212 | " y_coords=scores_scaled,\n", 1213 | " scores=scores,\n", 1214 | " sort_by_dist=False,\n", 1215 | " metadata=convention_df['speaker'],\n", 1216 | " x_label='Log frequency',\n", 1217 | " y_label='L2-Penalized Log Reg Coef')\n", 1218 | "file_name = 'output/L2vsLog.html'\n", 1219 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 1220 | "IFrame(src=file_name, width = 1200, height=700)" 1221 | ] 1222 | }, 1223 | { 1224 | "cell_type": "markdown", 1225 | "metadata": {}, 1226 | "source": [ 1227 | "## We can see how this compares to Scaled F-Score" 1228 | ] 1229 | }, 1230 | { 1231 | "cell_type": "code", 1232 | "execution_count": 30, 1233 | "metadata": {}, 1234 | "outputs": [ 1235 | { 1236 | "data": { 1237 | "text/html": [ 1238 | "\n", 1239 | " \n", 1246 | " " 1247 | ], 1248 | "text/plain": [ 1249 | "" 1250 | ] 1251 | }, 1252 | "execution_count": 30, 1253 | "metadata": {}, 1254 | "output_type": "execute_result" 1255 | } 1256 | ], 1257 | "source": [ 1258 | "html = produce_scattertext_explorer(corpus,\n", 1259 | " category='democrat',\n", 1260 | " 
category_name='Democratic',\n", 1261 | " not_category_name='Republican',\n", 1262 | " minimum_term_frequency=5,\n", 1263 | " width_in_pixels=1000,\n", 1264 | " x_coords=frequencies_scaled,\n", 1265 | " y_coords=corpus.get_scaled_f_scores('democrat', beta=0.5),\n", 1266 | " scores=corpus.get_scaled_f_scores('democrat', beta=0.5),\n", 1267 | " sort_by_dist=False,\n", 1268 | " metadata=convention_df['speaker'],\n", 1269 | " x_label='Log Frequency',\n", 1270 | " y_label='Scaled F-Score')\n", 1271 | "file_name = 'output/SFSvsLog.html'\n", 1272 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 1273 | "IFrame(src=file_name, width = 1200, height=700)" 1274 | ] 1275 | }, 1276 | { 1277 | "cell_type": "markdown", 1278 | "metadata": {}, 1279 | "source": [ 1280 | "# Penalized log-odds-ratio has recently become popular\n", 1281 | "\n", 1282 | "Burt L. Monroe, Michael P. Colaresi, and Kevin M. Quinn. 2008. Fightin’ words: Lexical feature selection and evaluation for identifying the content of political conflict. Political Analysis." 
1283 | ] 1284 | }, 1285 | { 1286 | "cell_type": "code", 1287 | "execution_count": 10, 1288 | "metadata": { 1289 | "collapsed": true 1290 | }, 1291 | "outputs": [], 1292 | "source": [ 1293 | "freq_df = corpus.get_term_freq_df().rename(columns={'democrat freq': 'y_dem', 'republican freq': 'y_rep'})\n", 1294 | "a_w = 0.01\n", 1295 | "y_i, y_j = freq_df['y_dem'].values, freq_df['y_rep'].values" 1296 | ] 1297 | }, 1298 | { 1299 | "cell_type": "code", 1300 | "execution_count": 14, 1301 | "metadata": { 1302 | "collapsed": true 1303 | }, 1304 | "outputs": [], 1305 | "source": [ 1306 | "n_i, n_j = y_i.sum(), y_j.sum()\n", 1307 | "a_0 = len(freq_df) * a_w\n", 1308 | "delta_i_j = ( np.log((y_i + a_w) / (n_i + a_0 - y_i - a_w))\n", 1309 | " - np.log((y_j + a_w) / (n_j + a_0 - y_j - a_w)))\n", 1310 | "var_delta_i_j = ( 1./(y_i + a_w) + 1./(y_i + a_0 - y_i - a_w)\n", 1311 | " + 1./(y_j + a_w) + 1./(n_j + a_0 - n_j - a_w))\n", 1312 | "zeta_i_j = delta_i_j/np.sqrt(var_delta_i_j)\n", 1313 | "max_abs_zeta = max(zeta_i_j.max(), -zeta_i_j.min())\n", 1314 | "zeta_scaled_for_charting = ((((zeta_i_j > 0).astype(float) * (zeta_i_j/max_abs_zeta))*0.5 + 0.5)\n", 1315 | " + ((zeta_i_j < 0).astype(float) * (zeta_i_j/max_abs_zeta) * 0.5))" 1316 | ] 1317 | }, 1318 | { 1319 | "cell_type": "code", 1320 | "execution_count": 153, 1321 | "metadata": {}, 1322 | "outputs": [ 1323 | { 1324 | "data": { 1325 | "text/html": [ 1326 | "\n", 1327 | " \n", 1334 | " " 1335 | ], 1336 | "text/plain": [ 1337 | "" 1338 | ] 1339 | }, 1340 | "execution_count": 153, 1341 | "metadata": {}, 1342 | "output_type": "execute_result" 1343 | } 1344 | ], 1345 | "source": [ 1346 | "html = produce_scattertext_explorer(corpus,\n", 1347 | " category='democrat',\n", 1348 | " category_name='Democratic',\n", 1349 | " not_category_name='Republican',\n", 1350 | " minimum_term_frequency=5,\n", 1351 | " width_in_pixels=1000,\n", 1352 | " x_coords=frequencies_scaled,\n", 1353 | " y_coords=zeta_scaled_for_charting,\n", 1354 | " 
scores=zeta_i_j,\n", 1355 | " sort_by_dist=False,\n", 1356 | " metadata=convention_df['speaker'],\n", 1357 | " x_label='Log Frequency',\n", 1358 | " y_label='Log Odds Ratio w/ Uninformative Prior (alpha_w=0.01)')\n", 1359 | "file_name = 'output/LOPriorvsLog.html'\n", 1360 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 1361 | "IFrame(src=file_name, width = 1200, height=700)" 1362 | ] 1363 | }, 1364 | { 1365 | "cell_type": "markdown", 1366 | "metadata": {}, 1367 | "source": [ 1368 | "## And finally, corner score" 1369 | ] 1370 | }, 1371 | { 1372 | "cell_type": "code", 1373 | "execution_count": 38, 1374 | "metadata": { 1375 | "scrolled": false 1376 | }, 1377 | "outputs": [ 1378 | { 1379 | "data": { 1380 | "text/html": [ 1381 | "\n", 1382 | " \n", 1389 | " " 1390 | ], 1391 | "text/plain": [ 1392 | "" 1393 | ] 1394 | }, 1395 | "execution_count": 38, 1396 | "metadata": {}, 1397 | "output_type": "execute_result" 1398 | } 1399 | ], 1400 | "source": [ 1401 | "corner_scores = corpus.get_corner_scores('democrat')\n", 1402 | "html = produce_scattertext_explorer(corpus,\n", 1403 | " category='democrat',\n", 1404 | " category_name='Democratic',\n", 1405 | " not_category_name='Republican',\n", 1406 | " minimum_term_frequency=5,\n", 1407 | " width_in_pixels=1000,\n", 1408 | " x_coords=frequencies_scaled,\n", 1409 | " y_coords=corner_scores,\n", 1410 | " scores=corner_scores,\n", 1411 | " sort_by_dist=False,\n", 1412 | " metadata=convention_df['speaker'],\n", 1413 | " x_label='Log Frequency',\n", 1414 | " y_label='Corner Scores')\n", 1415 | "file_name = 'output/CornervsLog.html'\n", 1416 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 1417 | "IFrame(src=file_name, width = 1200, height=700)" 1418 | ] 1419 | }, 1420 | { 1421 | "cell_type": "code", 1422 | "execution_count": null, 1423 | "metadata": { 1424 | "collapsed": true 1425 | }, 1426 | "outputs": [], 1427 | "source": [] 1428 | } 1429 | ], 1430 | "metadata": { 1431 | "anaconda-cloud": {}, 1432 | "kernelspec": { 
1433 | "display_name": "Python [Root]", 1434 | "language": "python", 1435 | "name": "Python [Root]" 1436 | }, 1437 | "language_info": { 1438 | "codemirror_mode": { 1439 | "name": "ipython", 1440 | "version": 3 1441 | }, 1442 | "file_extension": ".py", 1443 | "mimetype": "text/x-python", 1444 | "name": "python", 1445 | "nbconvert_exporter": "python", 1446 | "pygments_lexer": "ipython3", 1447 | "version": "3.5.2" 1448 | } 1449 | }, 1450 | "nbformat": 4, 1451 | "nbformat_minor": 2 1452 | } 1453 | -------------------------------------------------------------------------------- /PyData-Scattertext-Part-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Using Scattertext and AgeFromName to find gender-discriminating terms\n", 8 | "\n", 9 | "https://github.com/JasonKessler/scattertext\n", 10 | "\n", 11 | "https://github.com/JasonKessler/agefromname\n", 12 | "\n", 13 | "Cite as:\n", 14 | "Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 
2017.\n", 15 | "\n", 16 | "Link to preprint: https://arxiv.org/abs/1703.00565\n", 17 | "\n", 18 | "`\n", 19 | "@article{kessler2017scattertext,\n", 20 | " author = {Kessler, Jason S.},\n", 21 | " title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ},\n", 22 | " booktitle = {ACL System Demonstrations},\n", 23 | " year = {2017},\n", 24 | "}\n", 25 | "`" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/html": [ 36 | "" 37 | ], 38 | "text/plain": [ 39 | "" 40 | ] 41 | }, 42 | "metadata": {}, 43 | "output_type": "display_data" 44 | } 45 | ], 46 | "source": [ 47 | "%matplotlib inline\n", 48 | "import scattertext as st\n", 49 | "import re, io, itertools\n", 50 | "from pprint import pprint\n", 51 | "import pandas as pd\n", 52 | "import numpy as np\n", 53 | "import spacy\n", 54 | "import os, pkgutil, json, urllib\n", 55 | "from urllib.request import urlopen\n", 56 | "from IPython.display import IFrame\n", 57 | "from IPython.core.display import display, HTML\n", 58 | "display(HTML(\"\"))" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "nlp = spacy.load('en')\n", 70 | "# If this doesn't work, please uncomment the following line and use a regex-based parser instead\n", 71 | "#nlp = st.whitespace_nlp_with_sentences" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "convention_df = st.SampleCorpora.ConventionData2012.get_data()\n", 83 | "convention_df['parsed'] = convention_df.text.apply(nlp)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## The `agefromname` package takes \n", 91 | "### - a first name,\n", 92 | "### - optional: a minimum age,\n", 93 | "### - optional: 
current year\n", 94 | "## and returns\n", 95 | "### - the probability someone is male or female\n", 96 | "### `pip install agefromname`" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 6, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "from agefromname import AgeFromName" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 7, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "gender_imputer = AgeFromName()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 8, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "0.031370941932688919" 130 | ] 131 | }, 132 | "execution_count": 8, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "gender_imputer.prob_male('kelsey')" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 9, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "0.83377422744681196" 150 | ] 151 | }, 152 | "execution_count": 9, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "gender_imputer.prob_male('kelsey', minimum_age=70)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 10, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "" 170 | ] 171 | }, 172 | "execution_count": 10, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | }, 176 | { 177 | "data": { 178 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAW8AAAEPCAYAAACNyEVOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xl8VNX9//HXJxBWWQJRVhEECuLCpiwqGqwsoiLuC6t+\ni1q14tcNsVX4VS3WVsWlKm64UIzLFw1YiiglxRVEFNmC0goiIoKCICAEcn5/nJswhAlZmMydSd7P\nx+M+cufeM3c+uTP55My595xjzjlERCS5pIQdgIiIlJ6St4hIElLyFhFJQkreIiJJSMlbRCQJKXmL\niCShYpO3mT1rZuvNbPEByjxsZl+a2SIz6xzbEEVEpLCS1LwnAf2L2mlmA4A2zrm2wJXA4zGKTURE\nilBs8nbOvQtsOkCRgcDzQdl5QH0zaxSb8EREJJpYtHk3A9ZEPP4GaB6D44qISBFidcHSCj1Wn3sR\nkXJUNQbHWAscHvG4ebBtH2amhC4iUgbOucIV5JjUvKcBwwDMrAew2Tm3vogAEnYZO3Zs6DEovsoX\nm+JTfMUtRSm25m1mLwGnAulmtgYYC6QGyXiic26GmQ0ws5XANuDyUib/hHf33bBgAezcuXfZtWvv\nsmcP3HILjBwZdqQiUlkUm7ydc5eWoMx1sQknMZ1yChx9NFSvvu9SrZpffvoJhg+HZcvgr3+FKlXC\njlhEKrpYtHlXCBkZGUXuO+WU4p//0Udw4YVw9tmQmQl168YuNjhwfIkgkeNL5NhA8R2syhqfHahN\nJaYvZObi9Vphyc2F66+Hd9+FN9+Eli3DjkhEkp2Z4aJcsFTyjjHn4JFH4M9/hqwsOP74sCOSZGC2\n39+mVELRcqSSd5y98Ya/gDlpEpx1VtjRSKIL/kDDDkNCVNRnoKjkrVEFy8mgQb7pZORIeOKJsKMR\nkYpGFyzLUffu8N57MGAAfP893HEH6NuxiMSCknc5a90a5s6F00+HX36Be+5RAheRg6dmkzho1Ajm\nzIGZM+Gmm/xFTZFkN3HiRP73f/83psd87rnn6NWrV4nKZmdn07t374N+zVWrVpGSkkJeXt5BH6so\n3bt3Z9myZTE9ppJ3nKSnw+zZ8P77cO21UI6fE5GYadmyJbVq1aJOnTo0btyYyy+/nG3btrFr1y7u\nuecebr31VmBvAuzSpcs+z9+4cSPVqlWjVatW5R7riBEjuOOOOwoeL126lCZNmvDAAw+U+2sX5+ab\nb+bOO++M6TGVvOMoLQ3efhsWL4Zhw3zXepFEZma8+eabbN26lYULF7JgwQLuvvtusrKyaN++PU2a\nNNmn/I4dO1i6dGnB4ylTpnDkkUfG5VZIMyt4nU8//ZTTTjuNO++8kxtvvLHcX7s4Z599NnPmzGH9\n+qjDPpWJknec1a0Ls2bBli0wcCBs2xZ2RCIl07RpU8444wyWLFnCzJkzo/YcHDp0KM8//3zB4xdf\nfJFhw4btcwvcvffeS5s2bahbty5HH300b7zxRpGvmZOTQ58+fWjYsCHt27fn1VdfPWCMzjnmz59P\n3759GT9+PL/97W8Ltue/bnp6OhdffDGbNkWfY+a5556jdevW1K1blyOPPJIpU6YU7Hv22Wfp0KED\nDRo0oH///nz99dcAXHvttdx88837HGfgwIFMmDABgBo1atC1a1feeuutA8ZfGkreIahZE6ZOhaZN\n4bTTYOPGsCMSKVp+4l2zZg0zZsygc+fOLF68mHbt2u1XdvDgwWRmZuKcY9myZfz888907959nzJt\n2rThvffeY8uWLYwdO5YhQ4ZErZFu27aNPn36MGTIEDZs2EBmZibXXHMNy5cvLzLWefPmccYZZzBh\nwgSuuOKKgu0PP/ww06ZNY+7cuaxbt460tDSuvfbaqK85atQoZs6cyZYtW/jwww/p1KkTAFlZWYwf\nP57XX3+djRs30qtXLy691A/9NGLECF566aWCc7Vx40Zmz57N4
MGDC4591FFHsWjRoiJjLy0l75BU\nrQrPPAO9e0OvXvCvf+lCphTNLDZLaTnnGDRoEGlpafTq1YuMjAxuv/12Nm/eTJ06dfYr37x5c9q1\na8fbb7/NCy+8wLBhw/Yrc8EFF9C4cWMALrroItq2bcu8efP2K/fmm2/SqlUrhg8fTkpKCp06deK8\n884rsvbtnGPevHnUr1+f/v33nXZ34sSJ3H333TRt2pTU1FTGjh3La6+9FvUiZUpKCosXL2bHjh00\natSIDh06APDEE08wZswY2rVrR0pKCmPGjOGzzz5jzZo1nHDCCdSrV4/Zs2cDkJmZSe/evTn00EML\njlunTh02b95c1KkuNSXvEJnBvffC738P110HHTvCs8/6WwpFIjkXm6W0zIysrCw2bdrEqlWrePTR\nR6lRowZpaWls2bIlavlhw4YxadIkMjMzGTp06H69Bl944QU6d+5MWloaaWlpLFmyhB9++GG/Y61e\nvZp58+YVlEtLS2PKlClFthubGddeey1du3alT58++yTKVatWce655xYcp0OHDlStWnW/Y9WuXZuX\nX36ZJ554gqZNm3LWWWexYsWKgnhGjRpVcIyGDRsCsHatn3tm2LBhTJ48GYDJkyczdOjQfY69ZcsW\n0tLSDni+S0PJOwEMGQJLl/rhZF97DY44Av7977CjEinacccdxxdffBF133nnnceMGTNo3bo1zZvv\nO53t6tWrufLKK/nb3/7Gjz/+yKZNmzjmmGOidgtv0aIFp556Kps2bSpYtm7dyt/+9rci46patSpT\npkyhRYsW9OvXj61btxYca+bMmfsca/v27ftdcAXo27cvs2bN4rvvvqN9+/aMDAbqb9GiBU8++eQ+\nx9i2bRs9evQAYMiQIWRlZbFo0SJycnIYNGjQPsddvnw5HTt2PMBZLR0l7wRhBn37wowZMGWKH172\ngw/CjkokugEDBvDvImoYtWvXZs6cOTz99NP77du2bRtmRnp6Onl5eUyaNIklS5ZEPc6ZZ57JF198\nweTJk8nNzSU3N5ePP/6YnJycqOXzZ56pWrUqr776Kunp6QwYMIDt27dz9dVXc/vttxdcYNywYQPT\npk3b7xjff/89WVlZbNu2jdTUVGrXrk2VYID+q6++mj/96U8F92v/9NNP+zThNG/enOOPP55hw4Zx\nwQUXUL169YJ9v/zyCwsXLqRPnz5RYy8LJe8E9Otfw4sv+vFR5s8POxqR/Z111lnk5OSwbt26gm2R\ntwN26dJln3u78/d16NCBm266iZ49e9K4cWOWLFnCySefvE+5/LJ16tRh1qxZZGZm0qxZM5o0acKY\nMWPYVcQ9tpHPTU1NZerUqdSoUYOBAwdy5ZVXMnDgQPr27UvdunXp2bMn8yP+uPKfl5eXx4MPPkiz\nZs1o2LAh7777Lo8//jgAgwYNYvTo0VxyySXUq1ePY489dr+7R4YPH87ixYv3azKZPn06vXv3Lmjr\nj4l4zePmX0pKY/p05w47zLlPPgk7Eilvyfj38eSTT7obbrghtNefM2eOy8jICO31o5k7d65r0aLF\nftu7d+/uli5desDnFvUZCLbvl1M1tkkCO+ssPyLhgAEwebIfH0UkUYzUpK37yM3NZcKECVHPy0cf\nfRTz11OzSYI791yfuC+/3M/Ss3172BGJJIbIZpKwLV++nLS0NNavX88NN9wQl9fUZAxJYtMmfzvh\nJ5/49vATTgg7IoklTcYgmoyhgkpLg7//Hf74R9+c8sAD6tQjUpmp5p2EVq+G88+HNm3g6afhkEPC\njkgOlmreopp3JXDEEX6Gntq1oUcPKKKvhIhUYKp5JzHn4Kmn4A9/8GOFH3ts2BFJWSXKhTcJV2lq\n3rpVMImZwZVXQo0acOmlvkNPrVphRyVloYqNlJZq3hWAczB0KNSpA0FnMBGpINTmXYGZwWOP+Uke\npk4NOxoRiQfVvCuQefP87
DwffwwtWoQdjYjEgmrelUD37nDjjTB4MOTmhh2NiJQn1bwrmLw8fw/4\nzp1+bHBdwBRJbqp5VxIpKfDKK3DoodCnj+9WLyIVj5J3BZSaCpMm+WaUU06Bb78NOyIRiTUl7woq\nJQXuvx8uuwxOPhm++irsiEQkltRJpwIzgzFjoG5dP0v9v/4FRx4ZdlQiEgtK3pXAtdf6RH7aaUrg\nIhVFsc0mZtbfzHLM7EszGx1lf7qZzTSzz8xsiZmNKJdI5aBccw2MHu1r4P/9b9jRiMjBOmDyNrMq\nwKNAf6ADcKmZHVWo2HXAp865TkAGcL+ZqUafgH77W7jtNsjIgFdf9bcVikhyKq7m3Q1Y6Zxb5ZzL\nBTKBcwqVWQfUDdbrAj8453bHNkyJld/+1o9EeP/9cMwxfoq13Xq3RJJOccm7GbAm4vE3wbZITwFH\nm9m3wCJgVOzCk/LQrx98+CE8/LBP5O3bw7PPqlemSDIprnmjJF0ibwc+c85lmFlr4G0z6+ic21q4\n4Lhx4wrWMzIyyMjIKEWoEktmfjb600+HuXP99Gp33eXvThkxAqpVCztCkcopOzub7OzsYssdsHu8\nmfUAxjnn+gePxwB5zrk/R5SZAdzjnHs/eDwbGO2cW1DoWOoen+Def98n8JwcP7lD69ZhRyQiZe0e\nvwBoa2YtzawacDEwrVCZHOD04EUaAe0A3c+QhE46CWbOhJtv9qMTbtkSdkQiUpRiB6YyszOACUAV\n4Bnn3HgzuwrAOTfRzNKBSUAL/D+D8c65KVGOo5p3knDOX9j89lt4/XWoUiXsiEQqr6Jq3hpVUKLa\ntcsPbHXiiTB+fNjRiFReGlVQSqVaNT+kbGYmTNnve5SIhE3JW4p06KGQlQWjRvlmlC++CDsiEcmn\n5C0HdNxxsHgxpKf7C5rnnONvLVQLmEi41OYtJbZtG7zwAkyYAFWrwsiRftb6hg3Djkyk4lKbtxy0\n2rV980lOjp+tfsECfy/4kCGa8EEk3pS8pdTM4NRT/bgoX33lh5jt0sW3j4tIfKjZRGLigw98Dbxv\nXz/oVe3aYUckUjGo2UTK1Yknwqef+nbxLl3g7bfDjkikYlPNW2IuKwtuugk6dPC18LZtw45IJHmp\n5i1xc845sHSpn/i4Z0+45RbYsSPsqEQqFiVvKRfVq8Ott/okvmYNnHACLFkSdlQiFYeSt5SrRo3g\npZd8M0rv3vDEE+rgIxILavOWuFmxAi65BFq2hL/+VeOFi5SE2rwldO3awUcfQefO0L07XHGFZrIX\nKSslb4mr6tXhzjth5Upo0QK6dYPLL4eFC8OOTCS5qNlEQrVpEzz+ODz5pB/86qqrfNNKnTphRyaS\nGDQZgyS0PXt8x56JE2HOHD8RxLnnwplnQr16YUcnEh4lb0kaGzbA9OkwdaoffvbXv/a188aNw45M\nJP6UvCUpbdni70x5/nl44w1/sVOkMlHylqT26qtwzTX+PvHzzw87GpH4UfKWpLdwIQwaBIMHw7Bh\n0L69H55WpCLTfd6S9Lp0gXnzYP16GDDA9948/3x4+mnIyws7OpH4Us1bktbXX8O778Ijj0CrVjBp\nEtSoEXZUIrGlZhOpsHbsgOHD/VRsb7zh7xcXqSjUbCIVVs2akJkJvXr5IWi//DLsiETKn5K3VAgp\nKTB+vB+Gtnt3uO46P1GySEWl5C0VysiR8PnnkJYGGRm+p+bMmWFHJRJ7avOWCmvnTn9/+B13+MGv\n7rhDtxZK8tEFS6m0vvvOj5HSubPvZp+aGnZEIiWn5C2V2s8/w4UX+vVXX4VDDgk3HpGS0t0mUqkd\ncghMmwbNmsEpp8A334QdkcjBUfKWSiM1FZ56Ci6+2N+RMm9e2BGJlJ2aTaRSmj7dT8P24IMwZEjY\n0YgUTW3eIoUsWQIDB/q28Lvv1oVMSUxq8xYp5JhjYP58f1/4ySf7eTVFkkWxydvM+ptZjpl
9aWaj\niyiTYWafmtkSM8uOeZQi5SQ9Hf7xDz/MbM+e8NxzoC+IkgwO2GxiZlWAFcDpwFrgY+BS59zyiDL1\ngfeBfs65b8ws3Tm3Mcqx1GwiCW3JErjsMjjqKD/MrCZBlkRQ1maTbsBK59wq51wukAmcU6jMZcD/\nOee+AYiWuEWSQX4zSr160KOHBriSxFZc8m4GrIl4/E2wLVJboIGZzTGzBWY2NJYBisRTjRrw5JMw\nahScdJJvUhFJRMUl75K0c6QCXYABQD/gDjNre7CBiYTpyishKwuuugrGjYPdu8OOSGRfVYvZvxY4\nPOLx4fjad6Q1wEbn3A5gh5nNBToC+33pHDduXMF6RkYGGRkZpY9YJE569oSPP4ahQ/3dKM8/D+3a\nhR2VVHTZ2dlkZ2cXW664C5ZV8Rcsfw18C8xn/wuW7YFH8bXu6sA84GLn3LJCx9IFS0lKeXl+QKux\nY/3IhL/7nR8/XCQeytxJx8zOACYAVYBnnHPjzewqAOfcxKDMzcDlQB7wlHPu4SjHUfKWpPbllzBi\nBFSrBs8+6+fNFClv6mEpEgN79sADD8B998E99/jJHzRGuJQnJW+RGFq61E96nJ7u7wlv3jzsiKSi\nUvd4kRg6+mj48EM48UTo1k33hEv8FXe3iYgUITUV7rwTmjSBvn3hvff8eOEi8aDkLXKQRo6ETZt8\nAp87Fxo2DDsiqQzU5i0SI7fdBnPmwDvvaFwUiR1dsBQpZ875HplffeW71VerFnZEUhEoeYvEwZ49\nfnKHmjXhxRfVmUcOnu42EYmDKlXg73+H1at9M4pIeVHyFomxmjX9TPXTp8NDD4UdjVRUuttEpBw0\naAAzZ/phZZs0gYsuCjsiqWjU5i1Sjj77DE4/3XfoaauBkqUM1OYtEoJOnXxHnqFDNSa4xJaSt0g5\nu+46qFsXxo8POxKpSNRsIhIHa9dCly7w5ptwwglhRyPJRM0mIiFq1gweftg3n2zfHnY0UhGo5i0S\nR0OG+DtRHt5vuhKR6NTDUiQB/Pijnwfz3Xehffuwo5FkoGYTkQTQoAHceiuMGRN2JJLsVPMWibNf\nfoFf/QoyM/1kDiIHopq3SIKoUQP++EdfA1d9RspKyVskBEOHwk8/+fFPRMpCyVskBFWqwL33+pEH\n1fNSykLJWyQkAwbAYYfB88+HHYkkI12wFAnR++/D8OHwxReauEGi0wVLkQR04olQqxZkZ4cdiSQb\nJW+REJn52eefeirsSCTZqNlEJGSbNkGrVrByJaSnhx2NJBo1m4gkqLQ0GDjQT1gsUlJK3iIJIL/p\nRF9OpaSUvEUSwMkn+8T9wQdhRyLJQslbJAGYwW9+owuXUnK6YCmSIDZs8JMUr1oF9euHHY0kCl2w\nFElwhx4K/frB3/8ediSSDJS8RRLI//wPvPBC2FFIMlCziUgCyc2Fxo1h8WJo2jTsaCQRqNlEJAmk\npvoBq6ZNCzsSSXTFJm8z629mOWb2pZmNPkC5E8xst5mdF9sQRSqXc86BN94IOwpJdAdsNjGzKsAK\n4HRgLfAxcKlzbnmUcm8D24FJzrn/i3IsNZuIlMDWrdCsGaxZA/XqhR2NhK2szSbdgJXOuVXOuVwg\nEzgnSrnfAa8BGw46UpFKrk4dOOUU+Oc/w45EEllxybsZsCbi8TfBtgJm1gyf0B8PNql6LXKQzjkH\nsrLCjkISWXHJuySJeAJwW9AmYsEiIgfh7LNh5kzYuTPsSCRRVS1m/1rg8IjHh+Nr35G6AplmBpAO\nnGFmuc65/a6Xjxs3rmA9IyODjIyM0kcsUgk0bgwdOvhJGvr1Czsaiafs7GyySzA7R3EXLKviL1j+\nGvgWmE+UC5YR5ScB051zU6Ps0wVLkVK47z746it4/PHiy0rFVaYLls653cB1wFvAMuBl59xyM7vK\nzK4qn1BFBGDQIH+/d15e2JFIIlIPS5EE1qEDPPccdOs
WdiQSFvWwFElCgwapw45Ep+QtksAGDYLX\nXw87CklESt4iCez442HLFlixIuxIJNEoeYsksJQU1b4lOiVvkQR37rlK3rI/3W0ikuByc6FRIz/G\nd7NmxZeXikV3m4gkqdRUOPNMjXUi+1LyFkkCajqRwtRsIpIEtm2DJk1g9WpISws7GoknNZuIJLHa\ntaF3b/jHP8KORBKFkrdIklDTiURSs4lIkvjhBzjySPjuO6hZM+xoJF7UbCKS5Bo2hK5dYdassCOR\nRKDkLZJE1HQi+dRsIpJE1q6FY4/1TSfVqoUdjcSDmk1EKoBmzeDoo9V0IkreIknnoovglVfCjkLC\npmYTkSSzbp2fYWfdOqhRI+xopLyp2USkgmjSBDp2hLfeCjsSCZOSt0gSuvhiePnlsKOQMKnZRCQJ\nrV8P7dr5phN12KnY1GwiUoE0auSnSJsxI+xIJCxK3iJJSnedVG5qNhFJUhs3QuvW8O23ftRBqZjU\nbCJSwaSnQ48eGia2slLyFkliF18ML74YdhQSBjWbiCSx7dv9MLGzZ/tu81LxqNlEpAKqVQuuvx7u\nuy/sSCTeVPMWSXKbN/sLlwsXwhFHhB2NxJpq3iIVVP368JvfwP33hx2JxJNq3iIVwLp1vs17xQo4\n9NCwo5FYUs1bpAJr0sR32nnoobAjkXhRzVukgvjPf6B7d/jvf6Fu3bCjkVhRzVukgmvdGvr0gYkT\nw45E4kE1b5EKZNEiOOMMX/vWRA0Vg2reIpVAx47QqZN6XVYGJUreZtbfzHLM7EszGx1l/2AzW2Rm\nn5vZ+2Z2XOxDFZGSuO0232lnz56wI5HyVGzyNrMqwKNAf6ADcKmZHVWo2H+BU5xzxwF3AU/GOlAR\nKZlevfztglOnhh2JlKeS1Ly7ASudc6ucc7lAJnBOZAHn3IfOuZ+Ch/OA5rENU0RKyszXvu+9F3SZ\nqeIqSfJuBqyJePxNsK0o/wNofg+REJ11FvzyC7zzTtiRSHmpWoIyJf7fbWa9gSuAk6LtHzduXMF6\nRkYGGRkZJT20iJRCSgqMHg3jx/vbByV5ZGdnk52dXWy5Ym8VNLMewDjnXP/g8Rggzzn350LljgOm\nAv2dcyujHEe3CorEUW4utGnjp0rr3j3saKSsDuZWwQVAWzNraWbVgIuBaYUO3gKfuIdES9wiEn+p\nqXDTTfDXv4YdiZSHEnXSMbMzgAlAFeAZ59x4M7sKwDk30cyeBs4Fvg6ekuuc61boGKp5i8TZli1+\nmNilS6Fp07CjkbIoquatHpYiFdzVV/vEfeedYUciZaHkLVJJLVoEZ54Jq1ZB1ZLcoiAJRd3jRSqp\njh2hRQuYPj3sSCSWlLxFKoFrroHHHw87CoklNZuIVAK//OJr3++/D23bhh2NlIaaTUQqsRo1YMQI\njfVdkajmLVJJ/Oc/0KMHfP011KwZdjRSUqp5i1RyrVtD166+x6UkP9W8RSqRWbP8xcvPP4datcKO\nRkpC93mLCACDB0PjxnD//WFHIiWh5C0iAGzcCMceC6+/7tvAJbGpzVtEAEhPh4cegiuugJ07w45G\nykrJW6QSuvBCaN8e7ror7EikrNRsIlJJrVvnu86/9RZ07hx2NFIUNZuIyD6aNIG//MU3n+Tmhh2N\nlJaSt0glNmwYNGqkCRuSkZpNRCq51at955333vPt4JJY1GwiIlEdcQSMGwe/+Q3k5YUdjZSUkreI\ncM01/udjj4Ubh5Scmk1EBIAVK+Dkk2HBAl8bl8SgZhMROaB27eCWW+C883wvTElsSt4iUuCWW6Bf\nPzjlFFi7Nuxo5EA0HamIFDCDP/0J0tKgVy8/CmGbNmFHJdEoeYvIfm65BerXh1NPhRkzfE9MSSxq\nNhGRqEaOhAkT4PTT/eTFut8gsehuExE5oBUr4LLLoHlzeOYZPyqhxI/uNhGRMmnXDj780P/s1Amy\nslQLTwSqeYtIic2
eDTfeCFWqwB/+AIMGQYqqgOVKM+mISEzk5cH06X4s8J074YYb4NxzoUGDsCOr\nmJS8RSSmnIOZM+Hpp+Gdd6BnTz/Jw4ABfrhZiQ0lbxEpNz//DP/4B7zyCsyZAzVq+JEKu3aFY46B\nli390rChv5dcSk7JW0Tiwjk/zOwnn/glJwe++gpWrYLdu+Hww6FZM2ja1P887DDf5JKWtvdnvXr+\nPvNatZTslbxFJHSbN8OaNfDtt35ZuxY2bIAff4RNm/zPzZv3Lrm5cMghULu2T+TRlho1oFo1qF59\n78/CS7Vqe/dVqeIvsuYvVapA1ap+SU31j/O35a9HPo72s/ASy4u4St4iknR27fJNMtu3+2XbNtix\nY9/HO3f6ZdeuveuRS/72/J979vhvB3l5fj0vz/+T2L3b/9yzZ++ye3f0x9F+5j9/927/bSHaP4DI\nJf8fR0pK9HL5++fPV/IWEYmLvLy9CT0y+ecv+f848tcLl4vc37OnkreISNIpcw9LM+tvZjlm9qWZ\njS6izMPB/kVm1jkWAYuISNEOmLzNrArwKNAf6ABcamZHFSozAGjjnGsLXAk8Xk6xlqvs7OywQzgg\nxVd2iRwbKL6DVVnjK67m3Q1Y6Zxb5ZzLBTKBcwqVGQg8D+CcmwfUN7NGMY+0nFXWD0CsJHJ8iRwb\nKL6DVVnjKy55NwPWRDz+JthWXJnmBx+aiIgUpbjkXdIrjIUb03VlUkSkHB3wbhMz6wGMc871Dx6P\nAfKcc3+OKPMEkO2cywwe5wCnOufWFzqWErqISBlEu9ukuGnQFgBtzawl8C1wMXBpoTLTgOuAzCDZ\nby6cuIt6cRERKZsDJm/n3G4zuw54C6gCPOOcW25mVwX7JzrnZpjZADNbCWwDLi/3qEVEKrm4ddIR\nEZHYKfPwKWb2rJmtN7PFEds6mtmHZva5mU0zszrB9m5m9mmwfG5mF0c8p6uZLQ46+Tx0cL9O2eKL\n2N/CzH42s5sSKT4za2lmOyLO4WOJFF+w77hg35Jgf7VEic/MBkecu0/NbI+ZHVde8ZUythpm9lKw\nfZmZ3RbxnEQ4d9XMbFKw/TMzOzUO8R1uZnPMbGnwebo+2N7AzN42sy/MbJaZ1Y94zpggjhwz61ue\nMZY2vmD7HDPbamaPFDpW2eNzzpVpAXoBnYHFEds+BnoF65cDfwzWawIpwXpjYCNQJXg8H+gWrM8A\n+pc1prLGF7H/NeBl4KaIbaHHB7SMLFfoOIkQX1VgEXBs8Dgt4v0OPb5CzzsG33eh3M5fKc/dCOCl\niL+Tr4AnhOkOAAAJrUlEQVQWiXLugGvxzaUAhwIL4vDZawx0CtYPAVYARwH3AbcG20cD9wbrHYDP\ngNTgb2Ule1sVyuP9LW18tYCTgKuARwodq8zxHewv0bLQB2BzxPrhwNIoz2kF/CdYbwIsj9h3CfBE\nLD4ApY0PGBSc/LEEyTtR4itcLqJMosQ3AHgxUeMr9Jw/AXeVd3ylOHf98Bf9qwDpQSKonyjnDt/D\nekjEvneAE8o7vkKxvgGcDuQAjYJtjYGcYH0MMDqi/EygR7xiLC6+iHIjiEjeBxtfrKcOXWpm+T0w\nL8R/CICCppOlwFLgxmBzM3ynnnxr2b8TULnHZ2aHALcC4wqVT4j4Aq2Cr/zZZnZygsX3K8CZ2Uwz\n+8TMbkmw+CJdBLwUrMczvqixOefeArYA64BVwF+cc5vjHFuR8eG/UQ00sypm1groiu+EF5f4zN/p\n1hmYh0+M+XeyrQfye3I3LRRLfmfCwttjHmMJ48tX+ALjQZ3DWCfvK4BrzGwB/uvErvwdzrn5zrmj\ngS7AQ2ZWL8avfTDxjQMedM5tZ/8OR/FUVHzfAoc75zrj//FNsULt9SHHVxU4Gbgs+HmumZ1G/Dtr\nFfn5AzCz7sB259yyOMdVZGxmNgTfXNIE/6305iBJJkR8wLP4BLMAeBD4ANhDHN7bo
FL1f8Ao59zW\nyH3OV1VDvdsi7PiKu8+7VJxzK/BfAzGzXwFnRimTY2b/Adqwf1f65vj/PuUiSnwDgl3dgPPN7D78\nV9Y8M9sBTA05vjOD7bsI/piccwuD89c2iCX0+PDDI8x1zv0Y7JuB/yc9OUHiy3cJMCXicdzO3wE+\neycCrzvn9gAbzOx9fO32vXjFVkR8+Z+9Pez9pkwQ3xfAT+UZn5ml4hPji865N4LN682ssXPuOzNr\nAnwfbF/Lvt+ymuNzS7m9v6WMrygHFV9Ma95mdmjwMwX4A8EIg+bvlqgarB+BTzxfOue+A7aYWXcz\nM2Aovv2oXESJ7wkA59wpzrlWzrlWwATgHufcYwkQX/75Szc/wiNmdiT+/P3XObcuEeLD9wM41sxq\nBu/zqfg204Q4fxHbLsQPrgZAPM9fUZ89fDvpacG+2vi22pxEOXfBe1o7WO8D5Drncsrz3AXHewZY\n5pybELFrGjA8WB8e8XrTgEvM3xnTCv/3Mb+8zmEZ4it4auSDgz6HB9FI/xL+6/wufM3rCuB6/AWX\nFcCfIsoOAZYAn+KvrvaP2NcVWIy/QvxwDC8ilDi+Qs8bC9yYSPEB50Wcv0+AMxMpvqD84CDGxQRX\n2RMsvgzggyjHiXl8pXxvq+O/oSzGXw+6qTxjK0N8LfH/YJYBs/DNd+Ud38lAHv4Okk+DpT/QAH/B\n9IsglvoRz7k9iCMH6FfO729Z4lsF/ABsDc55+4ONT510RESSUKwvWIqISBwoeYuIJCElbxGRJKTk\nLSKShJS8RUSSkJK3iEgSUvKuRMx718z6R2y70Mz+GWZcycjMbjCzmnF4nQwzmx7x+G4z+6cFQ+5G\nKd/SIoZ6lYpLybsScf6m/quBB8ysejA2wz3ANWU5Xn6v2UpqFH6ozxILei+WmZn9AegJDHJ+yASp\nxJS8Kxnn3FJgOnAbcCe+d98fzGyemS00s4FQUIOba36UwE/MrGewPSOovWfhewQWMLPLzezBiMcj\nzeyBYP1G84POLzazURGvETkhwM1mNrZwzGZ2qJm9Zmbzg+XEYPs48xMLzDGz/5jZ7yKeM8zMFpmf\nQOCFiNf7V7D9HTPLH1XyOTM7P+K5P0f8rtlm9qqZLTezycH26/Ej1s0xs9nBtr5m9kFwrl6J6FK+\nyszuNbNPgAsK/V5nm9lHwXl/28wOi/aWBWVvwo89crZzbqf5Uf7+EpyPRWZ2ZZTzdnTwvn4alGkd\nbB8Ssf0JM0sxsyuKeu8kQcWqS6uW5FnwNcYc4HP8+NaDg+318d2ja+FHuqsebG8LfBysZwA/A0dE\nOW5tfDff/Ik23geOxncB/jw4Zm18N/pO7D+m9E3A2CjHnQKcFKy3wI8pAX40yPfwg/A3JJjkI3jN\nFUCD/N8r+DkdGBqsX44fEApgEnB+xOttjfhdN+MTteFH1Dsx2PdVxPHTgX8DNYPHo4E7IsrdXMT7\nENl9+jfAX6OUyQA24btcHxKx/Urg98F6dfxkCi0jzynwCHBZsF4VqIGfNGBaxHv0GH5MjajvXdif\nVS1FL5X5a2+l5ZzbbmYv45PwRcDZZnZzsLs6foS274BHzawjfgjQthGHmO+cWx3luNvM7F/B8XKA\nVOfc0qCmPdU5twPAzKbiZ3OZVsKQTweO8mP3AFAnqNk64B/OuVzgBzP7Hj8I/mnAKy4Y5dD58bHB\nD/o0KFifjJ98ozjznXPfBnF/hk+OHxQq0wM/m8sHQYzVCpV5uYhjH25mrwQxV8Mn+sIc8CX+H2tf\n/EiXBOvHmll+bb4ufqTOlRHP/QD4vZk1x5//lWb2a/w/0wVBrDWB74p674qIWxKAknfllRcsBpzn\nnPsycqeZjQPWOeeGmh/R8JeI3dsOcNyngd8Dy/FjQYNPQJEjqlmwbTf7Nt0VdQHQgO6uUDtvkHwi\nt+3Bf6YLv17hYxVWEEfQLh15MXBnlONH87Zz7
rIi9hV1vh7B17bfND835Lgi4l2PH/hrtpn96JzL\nDvZd55x7e5/CfnIAAJxzL5nZR8BZwAwzuyrY9bxz7vYorxXtvZMEpTZveQs/ohwAZtY5WK2Lr30D\nDMM3RxTLOTcfPy7xZeydseZdYJDtHV50ULDte+Aw8xO0VscnmWhmFYqx44FCAP4FXGhmDYLyacG+\nD/BjeoNPhnOD9VX42ijAQHwzTHG24s8R+FlUTopoU65tZm2LfOZedfGj+4GfIqtIwT/X84DJwe//\nFn7yhPyhln9lZvtcQDWzI51zXznnHgGygGOB2cAFtncI2AZm1iJ4jWjvnSQoJe/KzQF3AanmZwdf\nAvy/YN9jwPCgqaAdvokl8nkH8grwnnPuJwDn3KfAc/jhgD8CnnLOLQqaO/4YbJ+FH3Y02rGvB44P\nLrotxU/kWmQszs+Ucw/w7yD++4NdvwMuN7NF+OQ9Ktj+FHBqULZHCX/XJ4GZZjbbObeBYCLh4Ngf\n4M9ZccYBr5qfvWZDEa9VMCOLc24Bvq1+Gn7o0WXAwuCi7+Ps/Qebf5yLzM9u/in+OsALzrnl+PG6\nZwWxzsI32+Tb572TxKUhYSXmzN+X/IBzbk7YsUjp6L1LHqp5S8yYWX0zW4GfJ1J//ElE713yUc1b\nRCQJqeYtIpKElLxFRJKQkreISBJS8hYRSUJK3iIiSUjJW0QkCf1/YaQBhn5DI9QAAAAASUVORK5C\nYII=\n", 179 | "text/plain": [ 180 | "" 181 | ] 182 | }, 183 | "metadata": {}, 184 | "output_type": "display_data" 185 | } 186 | ], 187 | "source": [ 188 | "(pd.DataFrame([{'Year you encounter a Kelsey': year, \n", 189 | " 'P(Male|Kelsey)': gender_imputer.prob_male('kelsey', current_year=year)}\n", 190 | " for year in range(1930, 2015)])\n", 191 | " .set_index('Year you encounter a Kelsey')\n", 192 | " .plot())" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "## Let's assign speakers a gender if, assuming they're at least 35, \n", 200 | "## there's a 90% someone with their name is male or female.\n", 201 | "## Otherwise, drop their speech from the dataset." 
202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 9, 207 | "metadata": { 208 | "collapsed": true 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "convention_df['speaker_gender'] = (convention_df.speaker\n", 213 | " .apply(lambda x: (gender_imputer.prob_male(x.split()[0], minimum_age = 35) if x else 0.5))\n", 214 | " .apply(lambda x: 'Male' if x > 0.9 else 'Female' if x < 0.1 else '?')\n", 215 | ")" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 10, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "data": { 225 | "text/html": [ 226 | "

\n", 227 | "\n", 240 | "\n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | "

	party	speaker	text	parsed	speaker_gender
0	democrat	BARACK OBAMA	Thank you. Thank you. Thank you. Thank you so ...	(Thank, you, ., Thank, you, ., Thank, you, ., ...	?
1	democrat	MICHELLE OBAMA	Thank you so much. Tonight, I am so thrilled a...	(Thank, you, so, much, ., Tonight, ,, I, am, s...	Female
2	democrat	RICHARD DURBIN	Thank you. It is a singular honor to be here t...	(Thank, you, ., It, is, a, singular, honor, to...	Male

\n", 278 | "

" 279 | ], 280 | "text/plain": [ 281 | " party speaker \\\n", 282 | "0 democrat BARACK OBAMA \n", 283 | "1 democrat MICHELLE OBAMA \n", 284 | "2 democrat RICHARD DURBIN \n", 285 | "\n", 286 | " text \\\n", 287 | "0 Thank you. Thank you. Thank you. Thank you so ... \n", 288 | "1 Thank you so much. Tonight, I am so thrilled a... \n", 289 | "2 Thank you. It is a singular honor to be here t... \n", 290 | "\n", 291 | " parsed speaker_gender \n", 292 | "0 (Thank, you, ., Thank, you, ., Thank, you, ., ... ? \n", 293 | "1 (Thank, you, so, much, ., Tonight, ,, I, am, s... Female \n", 294 | "2 (Thank, you, ., It, is, a, singular, honor, to... Male " 295 | ] 296 | }, 297 | "execution_count": 10, 298 | "metadata": {}, 299 | "output_type": "execute_result" 300 | } 301 | ], 302 | "source": [ 303 | "convention_df.iloc[:3]" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 11, 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "data": { 313 | "text/plain": [ 314 | "Male 105\n", 315 | "Female 65\n", 316 | "? 
19\n", 317 | "Name: speaker_gender, dtype: int64" 318 | ] 319 | }, 320 | "execution_count": 11, 321 | "metadata": {}, 322 | "output_type": "execute_result" 323 | } 324 | ], 325 | "source": [ 326 | "convention_df.speaker_gender.value_counts()" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 12, 332 | "metadata": { 333 | "collapsed": true 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "convention_df_gender = convention_df[convention_df.speaker_gender.isin(['Male', 'Female'])]\n", 338 | "convention_df_gender['speaker_party'] = (convention_df_gender['speaker'] \n", 339 | " + ' (' + convention_df_gender['party'].apply(lambda x: x.upper()[0]) + ')')" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "## Let's plot the differences in language use by gender" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 13, 352 | "metadata": { 353 | "scrolled": false 354 | }, 355 | "outputs": [ 356 | { 357 | "data": { 358 | "text/html": [ 359 | "\n", 360 | " \n", 367 | " " 368 | ], 369 | "text/plain": [ 370 | "" 371 | ] 372 | }, 373 | "execution_count": 13, 374 | "metadata": {}, 375 | "output_type": "execute_result" 376 | } 377 | ], 378 | "source": [ 379 | "corpus_gender = st.CorpusFromParsedDocuments(convention_df_gender, \n", 380 | " category_col='speaker_gender', \n", 381 | " parsed_col='parsed').build()\n", 382 | "html = st.produce_scattertext_explorer(corpus_gender,\n", 383 | " category='Female',\n", 384 | " category_name='Female',\n", 385 | " not_category_name='Male',\n", 386 | " minimum_term_frequency=5,\n", 387 | " width_in_pixels=1000,\n", 388 | " metadata=convention_df_gender['speaker_party'])\n", 389 | "file_name = 'output/Conventions2012Gender.html'\n", 390 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 391 | "IFrame(src=file_name, width = 1200, height=700)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": 
[ 398 | "# Let's see how gender and party-associated terms differ" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "## Compute gender and party scaled f-scores" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 14, 411 | "metadata": { 412 | "collapsed": true 413 | }, 414 | "outputs": [], 415 | "source": [ 416 | "female_scores = corpus_gender.get_scaled_f_scores('Female')\n", 417 | "democratic_scores = (st.CorpusFromParsedDocuments(convention_df_gender, \n", 418 | " category_col='party', \n", 419 | " parsed_col='parsed')\n", 420 | " .build()\n", 421 | " .get_scaled_f_scores('democrat'))" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "# Use custom coordinates to plot the gender scaled f-score vs. the party scaled f-score" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 15, 434 | "metadata": { 435 | "scrolled": false 436 | }, 437 | "outputs": [ 438 | { 439 | "data": { 440 | "text/html": [ 441 | "\n", 442 | " \n", 449 | " " 450 | ], 451 | "text/plain": [ 452 | "" 453 | ] 454 | }, 455 | "execution_count": 15, 456 | "metadata": {}, 457 | "output_type": "execute_result" 458 | } 459 | ], 460 | "source": [ 461 | "html = st.produce_scattertext_explorer(corpus_gender,\n", 462 | " category='Female',\n", 463 | " category_name='Female',\n", 464 | " not_category_name='Male',\n", 465 | " minimum_term_frequency=5,\n", 466 | " pmi_filter_thresold=4,\n", 467 | " width_in_pixels=1000,\n", 468 | " scores=female_scores,\n", 469 | " sort_by_dist=False,\n", 470 | " x_coords=democratic_scores,\n", 471 | " y_coords=female_scores,\n", 472 | " show_characteristic=False,\n", 473 | " metadata=(convention_df_gender['speaker'] \n", 474 | " + ' (' \n", 475 | " + convention_df_gender['party'].apply(lambda x: x.upper()[0]) \n", 476 | " + ')'),\n", 477 | " x_label='More Democratic',\n", 478 | " y_label='More Female')\n", 479 | "file_name 
= 'output/Conventions2012GenderAndParty.html'\n", 480 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 481 | "IFrame(src=file_name, width = 1200, height=700)" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": { 487 | "collapsed": true 488 | }, 489 | "source": [ 490 | "# Women and men appear to discuss different topics \n", 491 | "## The package Empath uses a crowd-sourced topic model to categorize a given document\n", 492 | "\n", 493 | "Fast, Ethan, Binbin Chen, and Michael S. Bernstein. \"Empath: Understanding topic signals in large-scale text.\" Proceedings of the 2016 CHI Conference on Human Factors in Computing Systems. ACM, 2016." 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 16, 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "data": { 503 | "text/html": [ 504 | "\n", 505 | " \n", 512 | " " 513 | ], 514 | "text/plain": [ 515 | "" 516 | ] 517 | }, 518 | "execution_count": 16, 519 | "metadata": {}, 520 | "output_type": "execute_result" 521 | } 522 | ], 523 | "source": [ 524 | "empath_corpus = st.CorpusFromParsedDocuments(convention_df_gender,\n", 525 | " category_col='speaker_gender',\n", 526 | " feats_from_spacy_doc=st.FeatsFromOnlyEmpath(),\n", 527 | " parsed_col='text').build()\n", 528 | "html = st.produce_scattertext_explorer(empath_corpus,\n", 529 | " category='Female',\n", 530 | " category_name='Female',\n", 531 | " not_category_name='Male',\n", 532 | " width_in_pixels=1000,\n", 533 | " metadata=convention_df_gender['speaker_party'],\n", 534 | " use_non_text_features=True,\n", 535 | " use_full_doc=True)\n", 536 | "file_name = 'output/EmpathGender.html'\n", 537 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 538 | "IFrame(src=file_name, width = 1200, height=700)\n" 539 | ] 540 | } 541 | ], 542 | "metadata": { 543 | "anaconda-cloud": {}, 544 | "kernelspec": { 545 | "display_name": "Python [Root]", 546 | "language": "python", 547 | "name": "Python [Root]" 548 | }, 549 | 
"language_info": { 550 | "codemirror_mode": { 551 | "name": "ipython", 552 | "version": 3 553 | }, 554 | "file_extension": ".py", 555 | "mimetype": "text/x-python", 556 | "name": "python", 557 | "nbconvert_exporter": "python", 558 | "pygments_lexer": "ipython3", 559 | "version": "3.5.2" 560 | } 561 | }, 562 | "nbformat": 4, 563 | "nbformat_minor": 1 564 | } 565 | -------------------------------------------------------------------------------- /PyData-Scattertext-Part-3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Topic-specific term associations through word representations\n", 8 | "## How do Democrats and Republicans talk different about jobs\n", 9 | "\n", 10 | "https://github.com/JasonKessler/scattertext\n", 11 | "\n", 12 | "Cite as:\n", 13 | "Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 
2017.\n", 14 | "\n", 15 | "Link to preprint: https://arxiv.org/abs/1703.00565\n", 16 | "\n", 17 | "`\n", 18 | "@article{kessler2017scattertext,\n", 19 | " author = {Kessler, Jason S.},\n", 20 | " title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ},\n", 21 | " booktitle = {ACL System Demonstrations},\n", 22 | " year = {2017},\n", 23 | "}\n", 24 | "`" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/html": [ 35 | "" 36 | ], 37 | "text/plain": [ 38 | "" 39 | ] 40 | }, 41 | "metadata": {}, 42 | "output_type": "display_data" 43 | } 44 | ], 45 | "source": [ 46 | "%matplotlib inline\n", 47 | "import scattertext as st\n", 48 | "from gensim.models import word2vec\n", 49 | "import re, io, itertools\n", 50 | "from pprint import pprint\n", 51 | "import pandas as pd\n", 52 | "import numpy as np\n", 53 | "import spacy\n", 54 | "import os, pkgutil, json, urllib\n", 55 | "from urllib.request import urlopen\n", 56 | "from IPython.display import IFrame\n", 57 | "from IPython.core.display import display, HTML\n", 58 | "display(HTML(\"\"))" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 2, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "nlp = spacy.load('en')\n", 70 | "# If this doesn't work, please uncomment the following line and use a regex-based parser instead\n", 71 | "#nlp = st.whitespace_nlp_with_sentences" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## Load the 2012 Conventions Dataset\n", 79 | "### We'll limit the study to unigrams" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "convention_df = st.SampleCorpora.ConventionData2012.get_data()\n", 91 | "convention_df['parsed'] = convention_df.text.apply(nlp)\n", 92 | 
"corpus = (st.CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parsed')\n", 93 | " .build()\n", 94 | " .get_unigram_corpus())" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## Use Gensim to run Word2Vec on the corpus.\n", 102 | "### Word2Vec encodes each word in a dense K-dimensional vector space\n", 103 | "### Cosine distances between terms vectors correspond to semantic similarity " 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 4, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "[('create', 0.9190447926521301),\n", 115 | " ('businesses', 0.8814688920974731),\n", 116 | " ('million', 0.8395127058029175),\n", 117 | " ('taxes', 0.8300786018371582),\n", 118 | " ('millions', 0.829835057258606),\n", 119 | " ('created', 0.8269357085227966),\n", 120 | " ('pay', 0.8228686451911926),\n", 121 | " ('families', 0.8117849826812744),\n", 122 | " ('lives', 0.8079125881195068),\n", 123 | " ('debt', 0.8053802847862244)]" 124 | ] 125 | }, 126 | "execution_count": 4, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "model = word2vec.Word2Vec(size=100, window=5, min_count=10, workers=4)\n", 133 | "model = st.Word2VecFromParsedCorpus(corpus, model).train(epochs=10000)\n", 134 | "model.wv.most_similar('jobs')" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 5, 140 | "metadata": { 141 | "scrolled": false 142 | }, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "text/plain": [ 147 | "9677" 148 | ] 149 | }, 150 | "execution_count": 5, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "corpus._df[corpus._parsed_col].apply(lambda x: len(list(x.sents))).sum()\n", 157 | "#model.corpus_count" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## Draw the 
Scattertext by only coloring points that are associated with a category (p < 0.05 via log-odds w/ prior)\n", 165 | "### The top Democratic and Republican terms are ranked by their similarity to \"jobs\"\n", 166 | "#### Only the terms associated to a category are considered. \n", 167 | "### On the far right, the most similar terms, regardless of category association, are listed." 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 6, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/html": [ 178 | "\n", 179 | " \n", 186 | " " 187 | ], 188 | "text/plain": [ 189 | "" 190 | ] 191 | }, 192 | "execution_count": 6, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "target_term = 'jobs'\n", 199 | "\n", 200 | "html = st.word_similarity_explorer_gensim(corpus,\n", 201 | " category='democrat',\n", 202 | " category_name='Democratic',\n", 203 | " not_category_name='Republican',\n", 204 | " target_term=target_term,\n", 205 | " minimum_term_frequency=5,\n", 206 | " width_in_pixels=1000,\n", 207 | " word2vec=model,\n", 208 | " metadata=convention_df['speaker'])\n", 209 | "file_name = 'output/demo_similarity_gensim.html'\n", 210 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 211 | "IFrame(src=file_name, width = 1200, height=700)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "## Instead of using vectors trained on the Corpus, we can use the spaCy-provided word vectors trained on the Common Crawl Corpus.\n", 219 | "### These are trained on a lot more data, but aren't specific to the corpus" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 7, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "data": { 229 | "text/html": [ 230 | "\n", 231 | " \n", 238 | " " 239 | ], 240 | "text/plain": [ 241 | "" 242 | ] 243 | }, 244 | "execution_count": 7, 245 | "metadata": {}, 246 | "output_type": 
"execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "# Note: this will fail if you did not use spaCy as your parser.\n", 251 | "html = st.word_similarity_explorer(corpus,\n", 252 | " category='democrat',\n", 253 | " category_name='Democratic',\n", 254 | " not_category_name='Republican',\n", 255 | " target_term='jobs',\n", 256 | " minimum_term_frequency=5,\n", 257 | " width_in_pixels=1000,\n", 258 | " metadata=convention_df['speaker'])\n", 259 | "file_name = 'output/demo_similarity.html'\n", 260 | "open(file_name, 'wb').write(html.encode('utf-8'))\n", 261 | "IFrame(src=file_name, width = 1200, height=700)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": { 268 | "collapsed": true 269 | }, 270 | "outputs": [], 271 | "source": [] 272 | } 273 | ], 274 | "metadata": { 275 | "anaconda-cloud": {}, 276 | "kernelspec": { 277 | "display_name": "Python [Root]", 278 | "language": "python", 279 | "name": "Python [Root]" 280 | }, 281 | "language_info": { 282 | "codemirror_mode": { 283 | "name": "ipython", 284 | "version": 3 285 | }, 286 | "file_extension": ".py", 287 | "mimetype": "text/x-python", 288 | "name": "python", 289 | "nbconvert_exporter": "python", 290 | "pygments_lexer": "ipython3", 291 | "version": "3.5.2" 292 | } 293 | }, 294 | "nbformat": 4, 295 | "nbformat_minor": 1 296 | } 297 | -------------------------------------------------------------------------------- /PyData2017Kessler.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonKessler/Scattertext-PyData/5e892805319a2522210038f3c6eeb2dc13e7b2e8/PyData2017Kessler.pptx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scattertext-PyData 2 | Notebooks for the Seattle PyData 2017 talk on Scattertext 3 | 4 | A guide to using the python package 
[Scattertext](http://github.com/JasonKessler/scattertext). If you feel so moved, please star it, fork it, or even contribute! 5 | 6 | Check out the introductory presentation [here](https://github.com/JasonKessler/Scattertext-PyData/raw/master/PyData2017Kessler.pptx). 7 | 8 | # Video 9 | [![Watch the PyData talk here](https://raw.githubusercontent.com/JasonKessler/jasonkessler.github.io/master/scattertext_youtube.png)](https://www.youtube.com/watch?v=H7X9CA2pWKo) 10 | 11 | # Using the notebooks 12 | The notebooks look best in Chrome. 13 | 14 | ## Slow but interactive way 15 | In order to use these notebooks, please clone this repo and run the following commands (in Python 3): 16 | ``` 17 | $ git clone https://github.com/JasonKessler/Scattertext-PyData 18 | $ pip3 install scattertext agefromname 19 | $ cd Scattertext-PyData 20 | $ jupyter notebook 21 | ``` 22 | ## Fast and non-interactive way 23 | * [First Notebook](https://nbviewer.jupyter.org/github/JasonKessler/Scattertext-PyData/blob/master/PyData-Scattertext-Part-1.ipynb) how to use Scattertext to visualize differences in document types. 24 | [![Conventions-Visualization.html](https://jasonkessler.github.io/2012conventions0.0.2.2.png)](https://nbviewer.jupyter.org/github/JasonKessler/Scattertext-PyData/blob/master/PyData-Scattertext-Part-1.ipynb) 25 | * [Second Notebook](https://nbviewer.jupyter.org/github/JasonKessler/Scattertext-PyData/blob/master/PyData-Scattertext-Part-2.ipynb) how to use Scattertext and AgeFromName to understand how language, gender and political party intersect. 
26 | [![Gender and Party](https://github.com/JasonKessler/Scattertext-PyData/raw/master/img/genderandparty.png)](https://nbviewer.jupyter.org/github/JasonKessler/Scattertext-PyData/blob/master/PyData-Scattertext-Part-2.ipynb) 27 | * [Third Notebook](https://nbviewer.jupyter.org/github/JasonKessler/Scattertext-PyData/blob/master/PyData-Scattertext-Part-3.ipynb) how to use Scattertext to visualize how the same word or semantic type is discussed differently between document categories. In this case, we explore how "jobs" is discussed differently by Republicans and Democrats. 28 | [![Word representations](https://github.com/JasonKessler/Scattertext-PyData/raw/master/img/gensim_similarity.png)](https://nbviewer.jupyter.org/github/JasonKessler/Scattertext-PyData/blob/master/PyData-Scattertext-Part-3.ipynb) 29 | 30 | 31 | -------------------------------------------------------------------------------- /img/genderandparty.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonKessler/Scattertext-PyData/5e892805319a2522210038f3c6eeb2dc13e7b2e8/img/genderandparty.png -------------------------------------------------------------------------------- /img/gensim_similarity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonKessler/Scattertext-PyData/5e892805319a2522210038f3c6eeb2dc13e7b2e8/img/gensim_similarity.png -------------------------------------------------------------------------------- /~$PyData2017Kessler.pptx: -------------------------------------------------------------------------------- 1 | Microsoft Office User Microsoft Office User --------------------------------------------------------------------------------