├── .gitignore
└── search_console_query.py

/.gitignore:
--------------------------------------------------------------------------------
*.swp
.DS_Store
*.dat
__pycache__/

--------------------------------------------------------------------------------
/search_console_query.py:
--------------------------------------------------------------------------------
import logging
import webbrowser
import csv
import json
import time
import sys
import os
from datetime import datetime, timedelta
import itertools
import argparse
from collections import OrderedDict

import httplib2
from oauth2client.file import Storage
from oauth2client.client import flow_from_clientsecrets
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

WEBMASTER_CREDENTIALS_FILE_PATH = "webmaster_credentials.dat"


def rate_limit(max_per_minute):
    """
    Decorator to prevent a function from being called more than max_per_minute times per minute.
    Args:
        max_per_minute. Numeric type.
            The maximum number of times the function should run per minute.
    """
    min_interval = 60.0 / float(max_per_minute)

    def decorate(func):
        last_time_called = [0.0]

        def rate_limited_function(*args, **kwargs):
            # time.monotonic() is used instead of time.clock(), which was removed
            # in Python 3.8 and measured CPU time rather than wall time on Unix.
            elapsed = time.monotonic() - last_time_called[0]
            wait_for = min_interval - elapsed
            if wait_for > 0:
                time.sleep(wait_for)
            ret = func(*args, **kwargs)
            last_time_called[0] = time.monotonic()
            return ret
        return rate_limited_function
    return decorate


def acquire_new_oauth2_credentials(secrets_file):
    """
    Runs the OAuth 2.0 installed-application flow in the browser and exchanges
    the resulting authentication code for credentials.
    Args:
        secrets_file. The file path to a JSON file of client secrets, containing:
            client_id; client_secret; redirect_uris; auth_uri; token_uri.
    Returns:
        credentials for use with Google APIs.
    """
    flow = flow_from_clientsecrets(
        secrets_file,
        scope="https://www.googleapis.com/auth/webmasters.readonly",
        redirect_uri="http://localhost")
    auth_uri = flow.step1_get_authorize_url()
    webbrowser.open(auth_uri)
    print("If your browser did not open automatically, please visit this URL: " + auth_uri)
    auth_code = input("Enter the authentication code: ")
    credentials = flow.step2_exchange(auth_code)
    return credentials
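
# A rough sketch of the client secrets JSON that flow_from_clientsecrets reads
# from --secrets_file. This is illustrative only: the values are placeholders,
# and the file downloaded from the Google API Console may use "web" rather
# than "installed" as the top-level key.
#
#     {
#       "installed": {
#         "client_id": "<client-id>.apps.googleusercontent.com",
#         "client_secret": "<client-secret>",
#         "redirect_uris": ["http://localhost"],
#         "auth_uri": "https://accounts.google.com/o/oauth2/auth",
#         "token_uri": "https://oauth2.googleapis.com/token"
#       }
#     }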


def load_oauth2_credentials(secrets_file):
    """
    Args:
        secrets_file. The file path to a JSON file of client secrets.
    Returns:
        credentials for use with Google APIs.
    Side effect:
        If the stored credentials file does not exist or is invalid, fetch new
        credentials via the OAuth flow and save them to WEBMASTER_CREDENTIALS_FILE_PATH.
    """
    storage = Storage(WEBMASTER_CREDENTIALS_FILE_PATH)
    credentials = storage.get()
    if credentials is None or credentials.invalid:
        credentials = acquire_new_oauth2_credentials(secrets_file)
        storage.put(credentials)
    return credentials


def create_search_console_client(credentials):
    """
    The search console client allows us to perform queries against the API.
    To create it, pass in your already authenticated credentials.

    Args:
        credentials. An object representing Google API credentials.
    Returns:
        service. An object used to perform queries against the API.
    """
    http_auth = httplib2.Http()
    http_auth = credentials.authorize(http_auth)
    service = build('webmasters', 'v3', http=http_auth)
    return service


def date_range(start_date, end_date, delta=timedelta(days=1)):
    """
    Yields a stream of datetime objects, for all days within a range.
    The range is inclusive, so both start_date and end_date will be returned,
    as well as all dates in between.

    Args:
        start_date: The datetime object representing the first day in the range.
        end_date: The datetime object representing the last day in the range.
        delta: A datetime.timedelta instance, specifying the step interval. Defaults to one day.
    Yields:
        Each datetime object in the range.
    """
    current_date = start_date
    while current_date <= end_date:
        yield current_date
        current_date += delta


def generate_filters(**kwargs):
    """
    Yields a filter list for each combination of the filter values provided.
    Keyword arguments with empty values are dropped before the combinations are built.
    """
    kwargs = OrderedDict((k, v) for k, v in kwargs.items() if v)
    dimensions = kwargs.keys()
    values = list(kwargs.values())
    for vals in itertools.product(*values):
        yield [{
            'dimension': dim,
            'operator': 'equals',
            'expression': val} for dim, val in zip(dimensions, vals)
        ]
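
# A small illustration (hypothetical values) of what generate_filters yields.
# With no page filters, one device and two countries, one filter list is
# produced per (device, country) combination:
#
#     list(generate_filters(page=[], device=['mobile'], country=['usa', 'gbr']))
#     # [
#     #   [{'dimension': 'device', 'operator': 'equals', 'expression': 'mobile'},
#     #    {'dimension': 'country', 'operator': 'equals', 'expression': 'usa'}],
#     #   [{'dimension': 'device', 'operator': 'equals', 'expression': 'mobile'},
#     #    {'dimension': 'country', 'operator': 'equals', 'expression': 'gbr'}],
#     # ]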


@rate_limit(200)
def execute_request(service, property_uri, request, max_retries=5, wait_interval=4,
                    retry_errors=(503, 500)):
    """
    Executes a searchanalytics request.
    Args:
        service: The webmasters service object/client to use for execution.
        property_uri: Matches the URI in Google Search Console.
        request: The request to be executed.
        max_retries. Optional. Sets the maximum number of retry attempts.
        wait_interval. Optional. Sets the number of seconds to wait between each retry attempt.
        retry_errors. Optional. Retry the request whenever these HTTP error codes are encountered.
    Returns:
        The API response as a dict, or None if the request could not be completed.
    """
    response = None
    retries = 0
    while retries <= max_retries:
        try:
            response = service.searchanalytics().query(siteUrl=property_uri, body=request).execute()
        except HttpError as err:
            decoded_error_body = err.content.decode('utf-8')
            json_error = json.loads(decoded_error_body)
            if json_error['error']['code'] in retry_errors:
                time.sleep(wait_interval)
                retries += 1
                continue
        break
    return response


def parse_command_line_options():
    """
    Parses arguments from the command line and returns them as an argparse Namespace object.
    """
    parser = argparse.ArgumentParser(description="Query the Google Search Console API for every day in a date range.")
    parser.add_argument('property_uri', type=str, help='The property URI to query. Must exactly match a property URI in Google Search Console.')
    parser.add_argument('start_date', type=str, help='The start date for the query. Should not be more than 90 days ago.')
    parser.add_argument('end_date', type=str, help='The last date to query. Should not be sooner than two days ago.')
    parser.add_argument('--secrets_file', type=str, default='credentials.json', help='File path of your Google Client ID and Client Secret.')
    parser.add_argument('--config_file', type=str, help='File path of a config file containing settings for this Search Console property.')
    parser.add_argument('--output_location', type=str, help='The folder where output CSV files will be written.', default="")
    parser.add_argument('--url_type', type=str, help='A label prepended to each output file name and included in every output row.', default="")
    parser.add_argument('--max-rows-per-day', '-n', type=int, default=100, help='The maximum number of rows to return for each day in the range.')

    filters = parser.add_argument_group('filters')
    filters.add_argument('--page_filters_file', type=str, help='File path of a plain-text list of pages (one full URL per line) to filter by.', default="")
    filters.add_argument('--devices', nargs='*', type=str, help='List of devices to filter by. By default we segment by device.',
                         default=['mobile', 'desktop', 'tablet'])
    filters.add_argument('--countries', nargs='*', type=str, help='List of countries to filter by.', default=[])
    return parser.parse_args()


def read_page_paths_from_file(page_filters_file, property_uri):
    """
    Args:
        page_filters_file. The file path of a plain text file containing a list of URLs
            to filter by in Google Search Console, one URL per line.
        property_uri. The Search Console property URI that every listed URL must contain.
    Returns:
        A list of those URLs, if they all specify the full GSC property correctly.
        Otherwise, raises a ValueError.
    """
    pages = []
    with open(page_filters_file, "r") as file_handle:
        for line in file_handle.readlines():
            if property_uri in line:
                pages.append(line.strip("\n"))
            else:
                raise ValueError("Page filter does not include the property uri: {}".format(line))
    return pages
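
# A hypothetical page_filters_file for the property https://www.example.com/.
# Each line must be a full URL containing the property URI, otherwise
# read_page_paths_from_file raises a ValueError:
#
#     https://www.example.com/blog/
#     https://www.example.com/products/widgets/
#     https://www.example.com/about/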


def main():
    """
    Fetch and parse all command line options.
    Dispatch queries to the GSC API and write one CSV file per day.
    """
    args = parse_command_line_options()

    if args.page_filters_file:
        try:
            pages = read_page_paths_from_file(args.page_filters_file, args.property_uri)
        except IOError as err:
            logging.error("%s is not a valid file path", args.page_filters_file)
            sys.exit(err)
        except ValueError as err:
            logging.error("All page filters must include the full URL of the Google Search Console property.")
            sys.exit(err)
    else:
        pages = []

    # Prepare the API service
    credentials = load_oauth2_credentials(args.secrets_file)
    service = create_search_console_client(credentials)

    start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
    end_date = datetime.strptime(args.end_date, "%Y-%m-%d")

    for day in date_range(start_date, end_date):
        output_file = os.path.join(
            args.output_location,
            "{}_{}.csv".format(args.url_type, day.strftime("%Y%m%d"))
        )
        day = day.strftime("%Y-%m-%d")
        output_rows = []

        for filter_set in generate_filters(page=pages, device=args.devices, country=args.countries):

            request = {
                'startDate': day,
                'endDate': day,
                'dimensions': ['query'],
                'rowLimit': args.max_rows_per_day,
                'dimensionFilterGroups': [
                    {
                        "groupType": "and",
                        "filters": filter_set
                    }
                ]
            }

            response = execute_request(service, args.property_uri, request)

            if response is None:
                logging.error("Request failed %s", json.dumps(request, indent=2))
                continue

            if 'rows' in response:

                if pages:
                    filters = [pages[0], 'worldwide', 'all_devices', args.url_type]
                else:
                    filters = ['gsc_property', 'worldwide', 'all_devices', args.url_type]

                filter_mapping = {'page': 0, 'country': 1, 'device': 2}
                for _filter in filter_set:
                    filters[filter_mapping[_filter['dimension']]] = _filter['expression']

                for row in response['rows']:
                    keys = ','.join(row['keys'])
                    output_row = [keys, row['clicks'], row['impressions'], row['ctr'], row['position']]
                    output_row.extend(filters)
                    output_rows.append(output_row)

        with open(output_file, 'w', newline="", encoding="utf-8-sig") as file_handle:
            csvwriter = csv.writer(file_handle)
            csvwriter.writerows(output_rows)

        logging.info("Query for %s complete", day)


if __name__ == '__main__':
    # Configure logging so the per-day progress messages are actually emitted.
    logging.basicConfig(level=logging.INFO)
    main()

--------------------------------------------------------------------------------
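
A hypothetical invocation of the script (the property URI, dates, and output folder are placeholders; the flags correspond to the options defined in parse_command_line_options):

    python search_console_query.py https://www.example.com/ 2023-01-01 2023-01-07 --output_location ./output --url_type example --devices mobile desktop -n 250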