├── .gitignore ├── requirements.txt ├── credentials-example.py ├── run.py ├── config.py ├── functions.py ├── README.md └── examples └── app-growth.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .idea 3 | credentials.py 4 | local-queries.py 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | oauth2client 2 | httplib2 3 | google-api-python-client 4 | pandas 5 | xlsxwriter 6 | python-dateutil 7 | matplotlib 8 | -------------------------------------------------------------------------------- /credentials-example.py: -------------------------------------------------------------------------------- 1 | # set credentials variables from Google developer console 2 | client_id = '' 3 | client_secret= '' 4 | redirect_uri = '' 5 | # set credentials for oauth connection by running config.py 2 times 6 | access_code = '' 7 | access_token = '' 8 | refresh_token = '' 9 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | from functions import return_ga_data, save_df_to_excel 2 | 3 | df = return_ga_data( 4 | start_date='2017-09-13', 5 | end_date='2017-09-21', 6 | view_id='100555616', 7 | metrics=[ 8 | {'expression': 'ga:sessions'}, 9 | ], 10 | dimensions=[ 11 | {'name': 'ga:source'}, 12 | ], 13 | split_dates=True, 14 | group_by=['ga:source'], 15 | dimensionFilterClauses=[ 16 | { 17 | 'operator': 'OR', 18 | 'filters': [ 19 | { 20 | 'dimensionName': 'ga:userType', 21 | 'not': False, 22 | 'expressions':[ 23 | 'new visitor' 24 | ], 25 | 'caseSensitive': False 26 | } 27 | ], 28 | 29 | } 30 | ], 31 | segments=[] 32 | ) 33 | 34 | print(df) 35 | 36 | # save_df_to_excel( 37 | # df=df, 38 | # path='C:\\Users\\Erik\\Documents\\', 39 | # file_name='test_export', 40 | # sheet_name='data' 41 | # ) 42 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # import libraries 2 | from credentials import client_id, client_secret, redirect_uri, access_code, access_token, refresh_token 3 | from oauth2client.client import OAuth2WebServerFlow, GoogleCredentials 4 | import httplib2 5 | from googleapiclient.discovery import build 6 | 7 | # create connection based on project credentials 8 | flow = OAuth2WebServerFlow(client_id=client_id, 9 | client_secret=client_secret, 10 | scope='https://www.googleapis.com/auth/analytics', 11 | redirect_uri=redirect_uri) 12 | 13 | # capture different states of connection 14 | if access_code == '': 15 | # first run prints oauth URL 16 | auth_uri = flow.step1_get_authorize_url() 17 | print (auth_uri) 18 | elif access_token == '' and refresh_token == '': 19 | # second run returns access and refresh token 20 | credentials = flow.step2_exchange(access_code) 21 | print(credentials.access_token) 22 | print(credentials.refresh_token) 23 | else: 24 | # third and future run connect through access token an refresh token 25 | credentials = GoogleCredentials(access_token, client_id, client_secret, refresh_token, 3920, 'https://accounts.google.com/o/oauth2/token', 'test') 26 | http = httplib2.Http() 27 | http = credentials.authorize(http) 28 | service = build('analytics', 'v4', http=http) 29 | -------------------------------------------------------------------------------- /functions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from config import service 3 | from datetime import datetime 4 | from dateutil.rrule import rrule, DAILY 5 | from time import sleep 6 | 7 | def convert_reponse_to_df(response): 8 | list = [] 9 | # parse report data 10 | for report in response.get('reports', []): 11 | 12 | columnHeader = report.get('columnHeader', {}) 13 | dimensionHeaders = columnHeader.get('dimensions', []) 14 | metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', []) 15 | rows = report.get('data', {}).get('rows', []) 16 | 17 | for row in rows: 18 | dict = {} 19 | dimensions = row.get('dimensions', []) 20 | dateRangeValues = row.get('metrics', []) 21 | 22 | for header, dimension in zip(dimensionHeaders, dimensions): 23 | dict[header] = dimension 24 | 25 | for i, values in enumerate(dateRangeValues): 26 | for metric, value in zip(metricHeaders, values.get('values')): 27 | if ',' in value or ',' in value: 28 | dict[metric.get('name')] = float(value) 29 | else: 30 | dict[metric.get('name')] = int(value) 31 | list.append(dict) 32 | 33 | df = pd.DataFrame(list) 34 | return df 35 | 36 | 37 | def get_report(analytics, start_date, end_date, view_id, metrics, dimensions, dimensionFilterClauses=[], segments=[]): 38 | return analytics.reports().batchGet( 39 | body={ 40 | 'reportRequests': [ 41 | { 42 | 'viewId': view_id, 43 | 'dateRanges': [{'startDate':start_date, 'endDate': end_date}], 44 | 'metrics': metrics, 45 | 'dimensions': dimensions, 46 | 'pageSize': 10000, 47 | 'dimensionFilterClauses': dimensionFilterClauses, 48 | 'segments': segments, 49 | }] 50 | } 51 | ).execute() 52 | 53 | 54 | def return_ga_data(start_date, end_date, view_id, metrics, dimensions, split_dates, group_by=[], dimensionFilterClauses=[], segments=[]): 55 | if split_dates == False: 56 | return convert_reponse_to_df(get_report(service, start_date, end_date, view_id, metrics, dimensions, dimensionFilterClauses, segments)) 57 | else: 58 | start_date = datetime.strptime(start_date, '%Y-%m-%d').date() 59 | end_date = datetime.strptime(end_date, '%Y-%m-%d').date() 60 | 61 | df_total = pd.DataFrame() 62 | for date in rrule(freq=DAILY, dtstart=start_date, until=end_date): 63 | date = str(date.date()) 64 | df_total = df_total.append(convert_reponse_to_df(get_report(service, date, date, view_id, metrics, dimensions, dimensionFilterClauses, segments))) 65 | sleep(1) 66 | 67 | if len(group_by) != 0: 68 | df_total = df_total.groupby(group_by).sum() 69 | 70 | return df_total 71 | 72 | 73 | def save_df_to_excel(df, path, file_name, sheet_name): 74 | writer = pd.ExcelWriter(path+file_name+'.xlsx', engine='xlsxwriter') 75 | df.to_excel(writer, sheet_name=sheet_name) 76 | writer.save() 77 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google Analytics Reporting API v4 in Python with pandas 2 | This repo contains a setup to get started with the Google Analytics Reporting API v4 in Python. It takes three steps: 3 | 4 | *For a step by step guide to setting up a project like this, ready my tutorial on [The Marketing Technologist](https://www.themarketingtechnologist.co/getting-started-with-the-google-analytics-reporting-api-in-python/)*. 5 | 6 | # 1. Create a project 7 | First, create a project in your Google Developer console. I highly recommend using [the 17 steps of this post](https://www.themarketingtechnologist.co/google-oauth-2-enable-your-application-to-access-data-from-a-google-user/). Add a `credentials.py` file to your Python project and create variables for the `client_id`, `client_secret` and `redirect_uri` and fill out the corresponding values. 8 | 9 | *For your Python project, I recommend using Python 3.x over 2.7 because it's better at handling special characters in strings.* 10 | 11 | # 2. Connect to the API 12 | To connect to the Google Analytics API, run `config.py` two times: 13 | 14 | 1. Your first run prints a URL in the console. Open the URL, grant access to your Google account of choice, copy the `&code=` parameter value and add an `access_code` variable with the parameter value in `credentials.py`. 15 | 2. Your second run prints the access token and refresh token. For both values, create a variable (`access_token` and `refresh_token`) and set the corresponding values. 16 | - the refresh token is only returned with your first API connection. If the second line says `None`, revoke your app's access at https://myaccount.google.com/permissions, clear the `access_code` and reconnect. 17 | 18 | All future runs will use the access token and refresh token to connect to the API. 19 | 20 | # 3. Run your report 21 | Lastly, you can run `run.py` to return a report in a DataFrame. The `return_ga_data` function returns a [pandas](http://pandas.pydata.org/) DataFarme. The example code is set to return sessions by source: 22 | 23 | ```python 24 | df = return_ga_data( 25 | start_date='2017-09-13', 26 | end_date='2017-09-21', 27 | view_id='100555616', 28 | metrics=[{'expression': 'ga:sessions'},], 29 | dimensions=[{'name': 'ga:source'}], 30 | split_dates=False, 31 | group_by=[], 32 | dimensionFilterClauses=[ 33 | { 34 | 'operator': 'OR', 35 | 'filters': [ 36 | { 37 | 'dimensionName': 'ga:userType', 38 | 'not': False, 39 | 'expressions':[ 40 | 'new visitor' 41 | ], 42 | 'caseSensitive': False 43 | } 44 | ], 45 | 46 | } 47 | ], 48 | segments=[] 49 | ) 50 | ``` 51 | A brief description of each parameter: 52 | 53 | - `start_date` & `end_date`: 54 | - date format in `'YYYY-MM-DD'` 55 | - relative date: `'today'`, `'yesterday'`, `'NdaysAgo'` (where N is the amount of days) 56 | - `view_id`: the ID of the Google Analytics view you want to import data from. 57 | - `metrics`: the list of sessions you want to import (max. 10) - full list [here](https://developers.google.com/analytics/devguides/reporting/core/dimsmets). 58 | - `dimensions`: the list of dimensions you want to import (max. 9) - full list [here](https://developers.google.com/analytics/devguides/reporting/core/dimsmets). 59 | - `split_dates`: boolean. If true each day in your date range is queries seperately and merged into a data frame later on. 60 | - `group_by` (optional): if you enable `split_dates` you can group the data on a dimension of choice. Especially handy when you're not include the date in your export. 61 | - `dimensionFilterClauses` (optional): filter data based on dimensions if required ([see documention here](https://developers.google.com/analytics/devguides/reporting/core/v4/rest/v4/reports/batchGet#DimensionFilterClause). 62 | - `segments` (optional): use to apply segments ([see example here](https://developers.google.com/analytics/devguides/reporting/core/v4/samples#segments)). Requires the dimension `ga:segment` in your query. 63 | 64 | ## Export to Excel 65 | 66 | As not all of your data analyst friends are as cool as you are, I've added basic DataFrame to Excel export function. Here's how you can use it: 67 | 68 | ```python 69 | save_df_to_excel( 70 | df=df, 71 | path='C:\\Users\\Erik\\Documents\\', 72 | file_name='test_export', 73 | sheet_name='data' 74 | ) 75 | ``` 76 | 77 | ## To do 78 | 79 | - Create function to make use of segments easier. 80 | 81 | -------------------------------------------------------------------------------- /examples/app-growth.py: -------------------------------------------------------------------------------- 1 | from functions import return_ga_data 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import matplotlib.ticker as mticker 5 | import numpy as np 6 | import math 7 | import datetime 8 | from dateutil import relativedelta 9 | 10 | ga_view_id = '100555616' 11 | 12 | def return_last_sunday_date_string(): 13 | yesterday = datetime.date.today() - datetime.timedelta(1) 14 | print(yesterday) 15 | last_sunday = yesterday - datetime.timedelta((yesterday.weekday() + 1) % 7) 16 | print(last_sunday) 17 | return str(last_sunday) 18 | 19 | 20 | query_start_date = '2015-01-01' 21 | query_end_date = return_last_sunday_date_string() 22 | 23 | df_new_users = return_ga_data( 24 | start_date=query_start_date, 25 | end_date=query_end_date, 26 | view_id=ga_view_id, 27 | metrics=[ 28 | {'expression': 'ga:goal1Completions'}, 29 | ], 30 | dimensions=[ 31 | {'name': 'ga:isoYear'}, 32 | {'name': 'ga:isoWeek'}, 33 | ], 34 | split_dates=False, 35 | dimensionFilterClauses=[ 36 | { 37 | 'operator': 'OR', 38 | 'filters': [ 39 | { 40 | 'dimensionName': 'ga:userType', 41 | 'not': False, 42 | 'expressions':[ 43 | 'new visitor' 44 | ], 45 | 'caseSensitive': False 46 | } 47 | ], 48 | 49 | } 50 | ], 51 | ) 52 | 53 | df_returning_users = return_ga_data( 54 | start_date=query_start_date, 55 | end_date=query_end_date, 56 | view_id=ga_view_id, 57 | metrics=[ 58 | {'expression': 'ga:users'}, 59 | ], 60 | dimensions=[ 61 | {'name': 'ga:isoYear'}, 62 | {'name': 'ga:isoWeek'}, 63 | {'name': 'ga:segment'} 64 | ], 65 | split_dates=False, 66 | segments=[{ 67 | "dynamicSegment": 68 | { 69 | "name": "Sessions with app use", 70 | "sessionSegment": 71 | { 72 | "segmentFilters":[ 73 | { 74 | "simpleSegment": 75 | { 76 | "orFiltersForSegment": 77 | { 78 | "segmentFilterClauses": [ 79 | { 80 | "metricFilter": 81 | { 82 | "metricName":"ga:goal1Completions", 83 | "operator":"GREATER_THAN", 84 | "comparisonValue":"0" 85 | } 86 | }] 87 | } 88 | } 89 | }] 90 | } 91 | } 92 | }] 93 | ) 94 | 95 | df_app_growth = pd.merge(df_returning_users, df_new_users, on=['ga:isoYear','ga:isoWeek'], how='outer') 96 | df_app_growth['Week of Year'] = df_app_growth["ga:isoYear"].map(str) + df_app_growth["ga:isoWeek"].map(str) 97 | df_app_growth.rename(columns={'ga:users': 'Weekly Active Users', 'ga:goal1Completions': 'New App Users'}, inplace=True) 98 | df_app_growth = df_app_growth.fillna(0) 99 | df_app_growth['New App Users (cumulative)'] = df_app_growth['New App Users'].cumsum() 100 | # df_app_growth = df_app_growth.set_index('Week of Year') 101 | # print(df_app_growth) 102 | 103 | 104 | def plot_dual_axis_line_chart(title, df, main_color, sub_color, grid_color, yaxis_color, xaxis_column_name, left_yaxis_column_name, 105 | right_yaxis_column_name, number_of_yaxis_ticks, xaxis_tick_interval, xaxis_label_rotation_degrees, 106 | yaxis_tick_width, round_yvalues_to): 107 | df_plot = df 108 | fig, ax1 = plt.subplots() 109 | ax2 = ax1.twinx() 110 | 111 | ax1.grid(color=grid_color, linestyle='solid', linewidth=1, axis='y') 112 | 113 | ax1.spines['left'].set_color(yaxis_color) 114 | ax2.spines['left'].set_color(yaxis_color) 115 | ax1.spines['right'].set_color(yaxis_color) 116 | ax2.spines['right'].set_color(yaxis_color) 117 | ax1.spines['top'].set_color(yaxis_color) 118 | ax2.spines['top'].set_color(yaxis_color) 119 | 120 | ax1.plot(df_plot.index.values, df_plot[left_yaxis_column_name], main_color) 121 | ax1.set_xlabel(xaxis_column_name) 122 | ax1.set_ylabel(left_yaxis_column_name, color=main_color) 123 | ax1.tick_params('y', colors=main_color, width =yaxis_tick_width, length=0) 124 | 125 | ax2.plot(df_plot.index.values, df_plot[right_yaxis_column_name], sub_color) 126 | ax2.set_xlabel(xaxis_column_name) 127 | ax2.set_ylabel(right_yaxis_column_name, color=sub_color) 128 | ax2.tick_params('y', colors=sub_color, width=yaxis_tick_width, length=0) 129 | 130 | yaxis_left_rounded_max = int(round_yvalues_to * math.ceil(float(df_plot[left_yaxis_column_name].max()) / round_yvalues_to)) 131 | yaxis_right_rounded_max = int(round_yvalues_to * math.ceil(float(df_plot[right_yaxis_column_name].max()) / round_yvalues_to)) 132 | 133 | ax1.set_yticks(np.arange(0, yaxis_left_rounded_max*1.01, yaxis_left_rounded_max/number_of_yaxis_ticks)) 134 | ax2.set_yticks(np.arange(0, yaxis_right_rounded_max*1.01, yaxis_right_rounded_max/number_of_yaxis_ticks)) 135 | 136 | ax1.set_ylim(ymin=0, ymax=yaxis_left_rounded_max*1.02) 137 | ax2.set_ylim(ymin=0, ymax=yaxis_right_rounded_max*1.02) 138 | 139 | xticklocs = np.arange(0, len(df_plot.index.values), xaxis_tick_interval) 140 | ticks = df_plot[xaxis_column_name][0::xaxis_tick_interval] 141 | ax1.set_xticklabels(ticks) 142 | ax1.xaxis.set_major_locator(mticker.FixedLocator(xticklocs)) 143 | plt.xlim([0,len(df_plot.index.values)]) 144 | for tick in ax1.get_xticklabels(): 145 | tick.set_rotation(xaxis_label_rotation_degrees) 146 | 147 | plt.title(title, y=1.08) 148 | fig.tight_layout() 149 | plt.show() 150 | 151 | # double axis 152 | # https://matplotlib.org/examples/api/two_scales.html 153 | 154 | plot_dual_axis_line_chart( 155 | title = 'Fuuut App Growth', 156 | df = df_app_growth, 157 | main_color = '#2d6891', 158 | sub_color = '#d9734e', 159 | grid_color = '#dddddd', 160 | yaxis_color = '#ffffff', 161 | xaxis_column_name = 'Week of Year', 162 | left_yaxis_column_name = 'Weekly Active Users', 163 | right_yaxis_column_name = 'New App Users (cumulative)', 164 | round_yvalues_to=50, 165 | number_of_yaxis_ticks=5, 166 | xaxis_tick_interval=5, 167 | xaxis_label_rotation_degrees = 90, 168 | yaxis_tick_width=0, 169 | ) 170 | --------------------------------------------------------------------------------