├── .gitignore
└── search_console_query.py

/.gitignore:
--------------------------------------------------------------------------------
*.swp
.DS_Store
*.dat
__pycache__/

--------------------------------------------------------------------------------
/search_console_query.py:
--------------------------------------------------------------------------------
import logging
import webbrowser
import csv
import json
import time
import sys
import os
from datetime import datetime, timedelta
import itertools
import argparse
from collections import OrderedDict

import httplib2
from oauth2client.file import Storage
from oauth2client.client import flow_from_clientsecrets
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

WEBMASTER_CREDENTIALS_FILE_PATH = "webmaster_credentials.dat"


def rate_limit(max_per_minute):
    """
    Decorator to prevent a function from being called more than max_per_minute times per minute.
    Args:
        max_per_minute. Numeric type.
            The maximum number of times the function should run per minute.
    """
    min_interval = 60.0 / float(max_per_minute)

    def decorate(func):
        last_time_called = [0.0]

        def rate_limited_function(*args, **kwargs):
            # time.monotonic() is used instead of time.clock(), which was removed
            # in Python 3.8 and measured CPU time rather than wall time on Unix.
            elapsed = time.monotonic() - last_time_called[0]
            wait_for = min_interval - elapsed
            if wait_for > 0:
                time.sleep(wait_for)
            ret = func(*args, **kwargs)
            last_time_called[0] = time.monotonic()
            return ret
        return rate_limited_function
    return decorate


def acquire_new_oauth2_credentials(secrets_file):
    """
    Runs the OAuth 2.0 installed-application flow in the browser and exchanges
    the resulting authentication code for credentials.
    Args:
        secrets_file. The file path to a JSON file of client secrets, containing:
            client_id; client_secret; redirect_uris; auth_uri; token_uri.
    Returns:
        credentials for use with Google APIs.
    """
    flow = flow_from_clientsecrets(
        secrets_file,
        scope="https://www.googleapis.com/auth/webmasters.readonly",
        redirect_uri="http://localhost")
    auth_uri = flow.step1_get_authorize_url()
    webbrowser.open(auth_uri)
    print("If your browser did not open automatically, please visit this URL: " + auth_uri)
    auth_code = input("Enter the authentication code: ")
    credentials = flow.step2_exchange(auth_code)
    return credentials
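
# A rough sketch of the client secrets JSON that flow_from_clientsecrets reads
# from --secrets_file. This is illustrative only: the values are placeholders,
# and the file downloaded from the Google API Console may use "web" rather
# than "installed" as the top-level key.
#
#     {
#       "installed": {
#         "client_id": "<client-id>.apps.googleusercontent.com",
#         "client_secret": "<client-secret>",
#         "redirect_uris": ["http://localhost"],
#         "auth_uri": "https://accounts.google.com/o/oauth2/auth",
#         "token_uri": "https://oauth2.googleapis.com/token"
#       }
#     }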


def load_oauth2_credentials(secrets_file):
    """
    Args:
        secrets_file. The file path to a JSON file of client secrets.
    Returns:
        credentials for use with Google APIs.
    Side effect:
        If the stored credentials file does not exist or is invalid, fetch new
        credentials via the OAuth flow and save them to WEBMASTER_CREDENTIALS_FILE_PATH.
    """
    storage = Storage(WEBMASTER_CREDENTIALS_FILE_PATH)
    credentials = storage.get()
    if credentials is None or credentials.invalid:
        credentials = acquire_new_oauth2_credentials(secrets_file)
        storage.put(credentials)
    return credentials


def create_search_console_client(credentials):
    """
    The search console client allows us to perform queries against the API.
    To create it, pass in your already authenticated credentials.

    Args:
        credentials. An object representing Google API credentials.
    Returns:
        service. An object used to perform queries against the API.
    """
    http_auth = httplib2.Http()
    http_auth = credentials.authorize(http_auth)
    service = build('webmasters', 'v3', http=http_auth)
    return service


def date_range(start_date, end_date, delta=timedelta(days=1)):
    """
    Yields a stream of datetime objects, for all days within a range.
    The range is inclusive, so both start_date and end_date will be returned,
    as well as all dates in between.

    Args:
        start_date: The datetime object representing the first day in the range.
        end_date: The datetime object representing the last day in the range.
        delta: A datetime.timedelta instance, specifying the step interval. Defaults to one day.
    Yields:
        Each datetime object in the range.
    """
    current_date = start_date
    while current_date <= end_date:
        yield current_date
        current_date += delta


def generate_filters(**kwargs):
    """
    Yields a filter list for each combination of the filter values provided.
    Keyword arguments with empty values are dropped before the combinations are built.
    """
    kwargs = OrderedDict((k, v) for k, v in kwargs.items() if v)
    dimensions = kwargs.keys()
    values = list(kwargs.values())
    for vals in itertools.product(*values):
        yield [{
            'dimension': dim,
            'operator': 'equals',
            'expression': val} for dim, val in zip(dimensions, vals)
        ]
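
# A small illustration (hypothetical values) of what generate_filters yields.
# With no page filters, one device and two countries, one filter list is
# produced per (device, country) combination:
#
#     list(generate_filters(page=[], device=['mobile'], country=['usa', 'gbr']))
#     # [
#     #   [{'dimension': 'device', 'operator': 'equals', 'expression': 'mobile'},
#     #    {'dimension': 'country', 'operator': 'equals', 'expression': 'usa'}],
#     #   [{'dimension': 'device', 'operator': 'equals', 'expression': 'mobile'},
#     #    {'dimension': 'country', 'operator': 'equals', 'expression': 'gbr'}],
#     # ]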


@rate_limit(200)
def execute_request(service, property_uri, request, max_retries=5, wait_interval=4,
                    retry_errors=(503, 500)):
    """
    Executes a searchanalytics request.
    Args:
        service: The webmasters service object/client to use for execution.
        property_uri: Matches the URI in Google Search Console.
        request: The request to be executed.
        max_retries. Optional. Sets the maximum number of retry attempts.
        wait_interval. Optional. Sets the number of seconds to wait between each retry attempt.
        retry_errors. Optional. Retry the request whenever these HTTP error codes are encountered.
    Returns:
        The API response as a dict, or None if the request could not be completed.
    """
    response = None
    retries = 0
    while retries <= max_retries:
        try:
            response = service.searchanalytics().query(siteUrl=property_uri, body=request).execute()
        except HttpError as err:
            decoded_error_body = err.content.decode('utf-8')
            json_error = json.loads(decoded_error_body)
            if json_error['error']['code'] in retry_errors:
                time.sleep(wait_interval)
                retries += 1
                continue
        break
    return response


def parse_command_line_options():
    """
    Parses arguments from the command line and returns them as an argparse Namespace object.
    """
    parser = argparse.ArgumentParser(description="Query the Google Search Console API for every day in a date range.")
    parser.add_argument('property_uri', type=str, help='The property URI to query. Must exactly match a property URI in Google Search Console.')
    parser.add_argument('start_date', type=str, help='The start date for the query. Should not be more than 90 days ago.')
    parser.add_argument('end_date', type=str, help='The last date to query. Should not be sooner than two days ago.')
    parser.add_argument('--secrets_file', type=str, default='credentials.json', help='File path of your Google Client ID and Client Secret.')
    parser.add_argument('--config_file', type=str, help='File path of a config file containing settings for this Search Console property.')
    parser.add_argument('--output_location', type=str, help='The folder where output CSV files will be written.', default="")
    parser.add_argument('--url_type', type=str, help='A label prepended to each output file name and included in every output row.', default="")
    parser.add_argument('--max-rows-per-day', '-n', type=int, default=100, help='The maximum number of rows to return for each day in the range.')

    filters = parser.add_argument_group('filters')
    filters.add_argument('--page_filters_file', type=str, help='File path of a plain-text list of pages (one full URL per line) to filter by.', default="")
    filters.add_argument('--devices', nargs='*', type=str, help='List of devices to filter by. By default we segment by device.',
                         default=['mobile', 'desktop', 'tablet'])
    filters.add_argument('--countries', nargs='*', type=str, help='List of countries to filter by.', default=[])
    return parser.parse_args()


def read_page_paths_from_file(page_filters_file, property_uri):
    """
    Args:
        page_filters_file. The file path of a plain text file containing a list of URLs
            to filter by in Google Search Console, one URL per line.
        property_uri. The Search Console property URI that every listed URL must contain.
    Returns:
        A list of those URLs, if they all specify the full GSC property correctly.
        Otherwise, raises a ValueError.
    """
    pages = []
    with open(page_filters_file, "r") as file_handle:
        for line in file_handle.readlines():
            if property_uri in line:
                pages.append(line.strip("\n"))
            else:
                raise ValueError("Page filter does not include the property uri: {}".format(line))
    return pages
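
# A hypothetical page_filters_file for the property https://www.example.com/.
# Each line must be a full URL containing the property URI, otherwise
# read_page_paths_from_file raises a ValueError:
#
#     https://www.example.com/blog/
#     https://www.example.com/products/widgets/
#     https://www.example.com/about/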


def main():
    """
    Fetch and parse all command line options.
    Dispatch queries to the GSC API and write one CSV file per day.
    """
    args = parse_command_line_options()

    if args.page_filters_file:
        try:
            pages = read_page_paths_from_file(args.page_filters_file, args.property_uri)
        except IOError as err:
            logging.error("%s is not a valid file path", args.page_filters_file)
            sys.exit(err)
        except ValueError as err:
            logging.error("All page filters must include the full URL of the Google Search Console property.")
            sys.exit(err)
    else:
        pages = []

    # Prepare the API service
    credentials = load_oauth2_credentials(args.secrets_file)
    service = create_search_console_client(credentials)

    start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
    end_date = datetime.strptime(args.end_date, "%Y-%m-%d")

    for day in date_range(start_date, end_date):
        output_file = os.path.join(
            args.output_location,
            "{}_{}.csv".format(args.url_type, day.strftime("%Y%m%d"))
        )
        day = day.strftime("%Y-%m-%d")
        output_rows = []

        for filter_set in generate_filters(page=pages, device=args.devices, country=args.countries):

            request = {
                'startDate': day,
                'endDate': day,
                'dimensions': ['query'],
                'rowLimit': args.max_rows_per_day,
                'dimensionFilterGroups': [
                    {
                        "groupType": "and",
                        "filters": filter_set
                    }
                ]
            }

            response = execute_request(service, args.property_uri, request)

            if response is None:
                logging.error("Request failed %s", json.dumps(request, indent=2))
                continue

            if 'rows' in response:

                if pages:
                    filters = [pages[0], 'worldwide', 'all_devices', args.url_type]
                else:
                    filters = ['gsc_property', 'worldwide', 'all_devices', args.url_type]

                filter_mapping = {'page': 0, 'country': 1, 'device': 2}
                for _filter in filter_set:
                    filters[filter_mapping[_filter['dimension']]] = _filter['expression']

                for row in response['rows']:
                    keys = ','.join(row['keys'])
                    output_row = [keys, row['clicks'], row['impressions'], row['ctr'], row['position']]
                    output_row.extend(filters)
                    output_rows.append(output_row)

        with open(output_file, 'w', newline="", encoding="utf-8-sig") as file_handle:
            csvwriter = csv.writer(file_handle)
            csvwriter.writerows(output_rows)

        logging.info("Query for %s complete", day)


if __name__ == '__main__':
    # Configure logging so the per-day progress messages are actually emitted.
    logging.basicConfig(level=logging.INFO)
    main()

--------------------------------------------------------------------------------
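
A hypothetical invocation of the script (the property URI, dates, and output folder are placeholders; the flags correspond to the options defined in parse_command_line_options):

    python search_console_query.py https://www.example.com/ 2023-01-01 2023-01-07 --output_location ./output --url_type example --devices mobile desktop -n 250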