├── Humor-Sans.ttf ├── LICENSE ├── xkcd.py ├── getngrams.py └── README.md /Humor-Sans.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/econpy/google-ngrams/HEAD/Humor-Sans.ttf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Matt Nicklay 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /xkcd.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import matplotlib.font_manager as fm 3 | from matplotlib import cm 4 | from matplotlib.ticker import FuncFormatter 5 | from matplotlib.backends.backend_agg import FigureCanvasAgg 6 | 7 | 8 | def plotXKCD(ngramCSVfile): 9 | fin = open(ngramCSVfile, 'r') 10 | ngrams = fin.readline().strip().split(',')[1:] 11 | data_vals = [[] for ngram in ngrams] 12 | years = [] 13 | for line in fin: 14 | sp = line.strip().split(',') 15 | years.append(int(sp[0])) 16 | for i, s in enumerate(sp[1:]): 17 | data_vals[i].append(float(s)*100) # Make percentage 18 | fin.close() 19 | 20 | # Set up a figure 21 | plt.xkcd(scale=2, randomness=2.75) 22 | fig = plt.Figure() 23 | canvas = FigureCanvasAgg(fig) 24 | num_ngrams = len(ngrams) 25 | 26 | # Plot the data 27 | ax = fig.add_subplot(1, 1, 1) 28 | if len(data_vals) >= 1: 29 | for k, data, label in zip(list(range(num_ngrams)), data_vals, ngrams): 30 | if label.startswith('_'): # The legend doesn't like labels that 31 | label = label[1:] # start with an underscore. 32 | if k == 0: 33 | ax.plot(years, data, label=label, 34 | color=cm.jet(1.*k/num_ngrams), lw=2) 35 | else: 36 | ax.plot(years, data, 'white', lw=6) 37 | ax.plot(years, data, label=label, 38 | color=cm.jet(1.*k/num_ngrams), lw=2) 39 | 40 | # Create the Humor-Sans font properties object 41 | prop = fm.FontProperties(fname='Humor-Sans.ttf', size=17) 42 | 43 | # Define axes min/max/center 44 | xlim, ylim = ax.get_xlim(), ax.get_ylim() 45 | xmid = (xlim[1] - xlim[0])/2 + xlim[0] 46 | 47 | # Create the legend and change the font 48 | legend = ax.legend(loc='best', fontsize=9) 49 | for label in legend.get_texts(): 50 | label.set_fontproperties(prop) 51 | 52 | # Don't show frame around legend 53 | legend.draw_frame(False) 54 | 55 | # Do not display top and right axes 56 | ax.spines["right"].set_visible(False) 57 | ax.spines["top"].set_visible(False) 58 | 59 | # Remove unneeded ticks 60 | ax.tick_params(axis='both', direction='out') 61 | ax.get_xaxis().tick_bottom() 62 | ax.get_yaxis().tick_left() 63 | 64 | # Set tick labels text 65 | prop.set_size(11) 66 | for label in ax.get_xticklabels(): 67 | label.set_fontproperties(prop) 68 | for label in ax.get_yticklabels(): 69 | label.set_fontproperties(prop) 70 | 71 | # Add percentage sign to y-axis ticks 72 | ax.yaxis.set_major_formatter(FuncFormatter(lambda y, pos=0: '%s%%' % y)) 73 | 74 | # Set the ticks on each axes 75 | ax.set_xticks([xlim[0], xmid, xlim[1]]) 76 | ax.set_yticks([ylim[1]]) 77 | 78 | # Change tick thickness 79 | ax.xaxis.set_tick_params(width=1, length=4) 80 | ax.yaxis.set_tick_params(width=1, length=4) 81 | 82 | fig.savefig(ngramCSVfile.replace('.csv', '.png'), dpi=190) 83 | 84 | if __name__ == '__main__': 85 | import sys 86 | plotXKCD(sys.argv[1]) 87 | -------------------------------------------------------------------------------- /getngrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -* 3 | from ast import literal_eval 4 | from pandas import DataFrame # http://github.com/pydata/pandas 5 | import re 6 | import requests # http://github.com/kennethreitz/requests 7 | import subprocess 8 | import sys 9 | 10 | corpora = dict(eng_us_2012=17, eng_us_2009=5, eng_gb_2012=18, eng_gb_2009=6, 11 | chi_sim_2012=23, chi_sim_2009=11, eng_2012=15, eng_2009=0, 12 | eng_fiction_2012=16, eng_fiction_2009=4, eng_1m_2009=1, 13 | fre_2012=19, fre_2009=7, ger_2012=20, ger_2009=8, heb_2012=24, 14 | heb_2009=9, spa_2012=21, spa_2009=10, rus_2012=25, rus_2009=12, 15 | ita_2012=22) 16 | 17 | 18 | def getNgrams(query, corpus, startYear, endYear, smoothing, caseInsensitive): 19 | params = dict(content=query, year_start=startYear, year_end=endYear, 20 | corpus=corpora[corpus], smoothing=smoothing, 21 | case_insensitive=caseInsensitive) 22 | if params['case_insensitive'] is False: 23 | params.pop('case_insensitive') 24 | if '?' in params['content']: 25 | params['content'] = params['content'].replace('?', '*') 26 | if '@' in params['content']: 27 | params['content'] = params['content'].replace('@', '=>') 28 | req = requests.get('http://books.google.com/ngrams/graph', params=params) 29 | res = re.findall('var data = (.*?);\\n', req.text) 30 | if res: 31 | data = {qry['ngram']: qry['timeseries'] 32 | for qry in literal_eval(res[0])} 33 | df = DataFrame(data) 34 | df.insert(0, 'year', list(range(startYear, endYear + 1))) 35 | else: 36 | df = DataFrame() 37 | return req.url, params['content'], df 38 | 39 | 40 | def runQuery(argumentString): 41 | arguments = argumentString.split() 42 | query = ' '.join([arg for arg in arguments if not arg.startswith('-')]) 43 | if '?' in query: 44 | query = query.replace('?', '*') 45 | if '@' in query: 46 | query = query.replace('@', '=>') 47 | params = [arg for arg in arguments if arg.startswith('-')] 48 | corpus, startYear, endYear, smoothing = 'eng_2012', 1800, 2000, 3 49 | printHelp, caseInsensitive, allData = False, False, False 50 | toSave, toPrint, toPlot = True, True, False 51 | 52 | # parsing the query parameters 53 | for param in params: 54 | if '-nosave' in param: 55 | toSave = False 56 | elif '-noprint' in param: 57 | toPrint = False 58 | elif '-plot' in param: 59 | toPlot = True 60 | elif '-corpus' in param: 61 | corpus = param.split('=')[1].strip() 62 | elif '-startYear' in param: 63 | startYear = int(param.split('=')[1]) 64 | elif '-endYear' in param: 65 | endYear = int(param.split('=')[1]) 66 | elif '-smoothing' in param: 67 | smoothing = int(param.split('=')[1]) 68 | elif '-caseInsensitive' in param: 69 | caseInsensitive = True 70 | elif '-alldata' in param: 71 | allData = True 72 | elif '-help' in param: 73 | printHelp = True 74 | else: 75 | print(('Did not recognize the following argument: %s' % param)) 76 | if printHelp: 77 | print('See README file.') 78 | else: 79 | if '*' in query and caseInsensitive is True: 80 | caseInsensitive = False 81 | notifyUser = True 82 | warningMessage = "*NOTE: Wildcard and case-insensitive " + \ 83 | "searches can't be combined, so the " + \ 84 | "case-insensitive option was ignored." 85 | elif '_INF' in query and caseInsensitive is True: 86 | caseInsensitive = False 87 | notifyUser = True 88 | warningMessage = "*NOTE: Inflected form and case-insensitive " + \ 89 | "searches can't be combined, so the " + \ 90 | "case-insensitive option was ignored." 91 | else: 92 | notifyUser = False 93 | url, urlquery, df = getNgrams(query, corpus, startYear, endYear, 94 | smoothing, caseInsensitive) 95 | if not allData: 96 | if caseInsensitive is True: 97 | for col in df.columns: 98 | if col.count('(All)') == 1: 99 | df[col.replace(' (All)', '')] = df.pop(col) 100 | elif col.count(':chi_') == 1 or corpus.startswith('chi_'): 101 | pass 102 | elif col.count(':ger_') == 1 or corpus.startswith('ger_'): 103 | pass 104 | elif col.count(':heb_') == 1 or corpus.startswith('heb_'): 105 | pass 106 | elif col.count('(All)') == 0 and col != 'year': 107 | if col not in urlquery.split(','): 108 | df.pop(col) 109 | if '_INF' in query: 110 | for col in df.columns: 111 | if '_INF' in col: 112 | df.pop(col) 113 | if '*' in query: 114 | for col in df.columns: 115 | if '*' in col: 116 | df.pop(col) 117 | if toPrint: 118 | print((','.join(df.columns.tolist()))) 119 | for row in df.iterrows(): 120 | try: 121 | print(('%d,' % int(row[1].values[0]) + 122 | ','.join(['%.12f' % s for s in row[1].values[1:]]))) 123 | except: 124 | print((','.join([str(s) for s in row[1].values]))) 125 | queries = ''.join(urlquery.replace(',', '_').split()) 126 | if '*' in queries: 127 | queries = queries.replace('*', 'WILDCARD') 128 | if caseInsensitive is True: 129 | word_case = 'caseInsensitive' 130 | else: 131 | word_case = 'caseSensitive' 132 | filename = '%s-%s-%d-%d-%d-%s.csv' % (queries, corpus, startYear, 133 | endYear, smoothing, word_case) 134 | if toSave: 135 | for col in df.columns: 136 | if '>' in col: 137 | df[col.replace('>', '>')] = df.pop(col) 138 | df.to_csv(filename, index=False) 139 | print(('Data saved to %s' % filename)) 140 | if toPlot: 141 | try: 142 | subprocess.call(['python', 'xkcd.py', filename]) 143 | except: 144 | if not toSave: 145 | print(('Currently, if you want to create a plot you ' + 146 | 'must also save the data. Rerun your query, ' + 147 | 'removing the -nosave option.')) 148 | else: 149 | print(('Plotting Failed: %s' % filename)) 150 | if notifyUser: 151 | print(warningMessage) 152 | 153 | if __name__ == '__main__': 154 | argumentString = ' '.join(sys.argv[1:]) 155 | if argumentString == '': 156 | argumentString = eval(input('Enter query (or -help):')) 157 | else: 158 | try: 159 | runQuery(argumentString) 160 | except: 161 | print('An error occurred.') 162 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # About # 2 | 3 | Here you'll find a basic python script to retrieve data behind the trajectories plotted on the [Google Ngram Viewer](http://books.google.com/ngrams). A Python script that creates [XKCD](http://www.xkcd.com) style plots from the ngram CSV data is also included, making it simple to create some awesome looking plots! 4 | 5 | # Dependencies 6 | 7 | * [matplotlib](http://github.com/matplotlib/matplotlib) >= 1.3.1 8 | * [pandas](http://github.com/pydata/pandas) 9 | * [requests](http://github.com/kennethreitz/requests) 10 | 11 | 12 | # Usage # 13 | 14 | Simply type the same query you would type at the [Google Ngram Viewer](http://books.google.com/ngrams) and retrieve the data in csv format. 15 | 16 | 17 | 18 | #### Quick Gotchas #### 19 | 20 | * By default, the data is printed on screen and saved to a file in the working directory. 21 | * Add the `-plot` option to your query and an XKCD style plot like the one to the left will be saved in the working directory as well. 22 | * Searches are case-sensitive by default. To perform case-insenitive searches, pass the `-caseInsensitive` option to your query. The result will be the sum of all common formats of the query (lowercase, uppercase, titlecase, etc). 23 | * The syntax for [modifier](#modifier-searches) and [wildcard](#wildcard-searches) searches has been slightly modified in order to make the script work as a command line tool. See [below](#more-complicated-examples) for more information on these minor changes. 24 | 25 | 26 | ### Options ### 27 | * **corpus** [default: eng_2012] *This will run the query in CORPUS. Possible values are recapitulated below and [here](http://books.google.com/ngrams/info).* 28 | * **startYear** [default: 1800] 29 | * **endYear** [default: 2000] 30 | * **smoothing** [default: 3] *Smoothing parameter (integer)* 31 | * **caseInsensitive** *Return case-insensitive results* 32 | * **plot** *Return an XKCD style plot as a .png file* 33 | * **alldata** *Return every column of available data*** 34 | * **nosave** *Results will not be saved to file* 35 | * **noprint** *Results will not be printed on screen* 36 | * **help** *Prints this screen* 37 | 38 | \*\* This can be used with inflection, wildcard, and case-insensitive searches (otherwise it does nothing) where one column is the sum of some of the other columns (labeled with a column name ending in "(All)" or an asterisk for wildcard searches). In the [Google Ngram Viewer](http://books.google.com/ngrams), the columns whose sum makes up this column is viewable by right clicking on the ngram plot. In the `getngrams.py` script, these columns are dropped by default, but you can keep them by adding `-alldata` to your query. 39 | 40 | # Examples # 41 | 42 | There are tons of examples below that demonstrate of all kinds of available queries. 43 | 44 | ### Basic Examples ### 45 | 46 | Here are some basic example uses of `getngrams.py`: 47 | 48 | ``` 49 | python getngrams.py Albert Einstein, Charles Darwin 50 | python getngrams.py aluminum, copper, steel -noprint 51 | python getngrams.py Pearl Harbor, Watergate -corpus=eng_2009 52 | python getngrams.py bells and whistles -startYear=1900 -endYear=2001 -smoothing=2 53 | python getngrams.py internet --startYear=1980 --endYear=2000 --corpus=eng_2012 -caseInsensitive 54 | ``` 55 | 56 | ### More Complicated Examples ### 57 | 58 | ##### Wildcard Searches ##### 59 | 60 | As in the full [Google Ngram Viewer](http://books.google.com/ngrams), you can also perform wildcard searches using `getngrams.py`. 61 | 62 | When doing a wildcard search, use the `?` character instead of the `*` character. Using an asterisk will cause the `getngrams.py` script to fail because your shell will expand the asterisk before Python has a chance to see it. 63 | 64 | ``` 65 | python getngrams.py United ? --startYear=1850 --endYear=2000 -alldata 66 | python getngrams.py University of ? 67 | python getngrams.py University of ?, ? State University -alldata 68 | ``` 69 | 70 | ##### Modifier Searches ##### 71 | 72 | Modifier searches let you see how often one more modifies another word. The usual syntax for doing a modifier search is by using the `=>` operator. For example, running the query `dessert=>tasty` would match all instances of when the word *tasty* was used to modify the word *dessert*. 73 | 74 | Modifier searches can be done using `getngrams.py`, but you must replace the `=>` operator with the `@` character. 75 | 76 | ``` 77 | python getngrams.py car@fast -startYear=1900 -endYear=2000 78 | python getngrams.py car@fast -startYear=1900 -endYear=2000 -alldata 79 | python getngrams.py drink@?_NOUN -startYear=1900 -endYear=2000 -alldata 80 | ``` 81 | 82 | For more information on wildcard and modifier searches, take a look at the [About Ngram Viewer](https://books.google.com/ngrams/info) page for more in depth documentation. 83 | 84 | ##### Other Examples ##### 85 | 86 | ``` 87 | python getngrams.py book ? hotel, book_INF a hotel --startYear=1920 --endYear=2000 -alldata 88 | python getngrams.py read ?_DET book 89 | python getngrams.py _DET_ bright_ADJ rainbow 90 | python getngrams.py _START_ President ?_NOUN 91 | python getngrams.py _ROOT_@will 92 | ``` 93 | 94 | ##### Possible Corpora ##### 95 | 96 | ``` 97 | eng_2012, eng_2009, eng_us_2012, eng_us_2009, eng_gb_2012, eng_gb_2009, chi_sim_2012, chi_sim_2009, fre_2012, 98 | fre_2009, ger_2012, ger_2009, spa_2012, spa_2009, rus_2012, rus_2009, heb_2012, heb_2009, ita_2012, 99 | eng_fiction_2012, eng_fiction_2009, eng_1m_2009 100 | ``` 101 | 102 | ## Plotting ## 103 | 104 | There are 2 easy ways to create your own plots using a CSV file produced by running a query with `getngrams.py`. To demonstrate the 2 methods, we'll run the following query: 105 | 106 | ```bash 107 | python getngrams.py railroad,radio,television,internet -startYear=1900 -endYear=2000 -caseInsensitive 108 | ``` 109 | 110 | ### Plotting w/ xkcd.py ### 111 | 112 | The first way to create a plot is to use the supplied `xkcd.py` script to generate awesome [XKCD](http://www.xkcd.com) style charts. However, there are two ways to use the script: 113 | 114 | 1. Add the `-plot` option to your command when running `getngrams.py`: 115 | 116 | ```bash 117 | python getngrams.py railroad,radio,television,internet -startYear=1900 -endYear=2000 -plot -caseInsensitive 118 | ``` 119 | 120 | 2. You can also use `xkcd.py` directly by passing the CSV file as an argument: 121 | 122 | ```bash 123 | python xkcd.py railroad_radio_television_internet-eng_2012-1900-2000-3-caseInsensitive.csv 124 | ``` 125 | 126 | Both methods produce the same chart: 127 | 128 | ![](https://s3.amazonaws.com/ngramplots/xkcd_example.png) 129 | 130 | 131 | ### Plotting w/ Pandas ### 132 | 133 | Another way to plot data from an ngram CSV file is to read the file into a pandas DataFrame object and call the .plot() option on it. Here we do that, but also convert the data to percentages first and add a title to the plot: 134 | 135 | ```python 136 | from pandas import read_csv 137 | df = read_csv('railroad_radio_television_internet-eng_2012-1900-2000-3-caseInsensitive.csv', 138 | index_col=0, 139 | parse_dates=True) 140 | for col in df.columns: 141 | df[col] = [i*100 for i in df[col]] 142 | df.plot(title='Railroad, Radio, Television, and Internet') 143 | ``` 144 | 145 | ![](https://s3.amazonaws.com/ngramplots/pandas_simple.png) 146 | 147 | ### License ### 148 | MIT License 149 | 150 | Moreover, PLEASE do respect the terms of service of the Google Ngram Viewer while using this code. This code is meant to help viewers retrieve data behind a few queries, not bang at Google's servers with thousands of queries. The complete dataset can be freely downloaded [here](http://storage.googleapis.com/books/ngrams/books/datasetsv2.html). This code is not a Google product and is not endorsed by Google in any way. 151 | 152 | With this in mind... happy plotting! 153 | --------------------------------------------------------------------------------