├── application ├── README.md └── main.py ├── infrastructure ├── README.md ├── project_services.tf ├── main.tf ├── service_account.tf ├── variables.tf ├── bigquery.tf └── bigquery_schema │ └── google_search_console_data_table.json ├── .gitignore ├── README.md ├── Makefile └── requirements.txt /application/README.md: -------------------------------------------------------------------------------- 1 | The App uses Python. -------------------------------------------------------------------------------- /infrastructure/README.md: -------------------------------------------------------------------------------- 1 | Infrastructure is managed by Terraform. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Local .terraform directories 2 | **/.terraform/* 3 | 4 | # .tfstate files 5 | *.tfstate 6 | *.tfstate.* 7 | 8 | .vscode/ -------------------------------------------------------------------------------- /infrastructure/project_services.tf: -------------------------------------------------------------------------------- 1 | resource "google_project_service" "apis" { 2 | for_each = toset(var.apis) 3 | service = each.value 4 | disable_dependent_services = true 5 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google Search Console Exporter 2 | Export Google Search Data via the **Google Search Console API** to **BigQuery**. 
3 | ## Flowchart 4 | ```mermaid 5 | graph LR; 6 | a((timer)) --> b[extract] --> c[load]; 7 | ``` -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | tf-apply: 2 | terraform -chdir=infrastructure init 3 | terraform -chdir=infrastructure fmt 4 | terraform -chdir=infrastructure validate 5 | terraform -chdir=infrastructure plan 6 | terraform -chdir=infrastructure apply 7 | 8 | run: 9 | python application/main.py -------------------------------------------------------------------------------- /infrastructure/main.tf: -------------------------------------------------------------------------------- 1 | terraform {} 2 | 3 | provider "google" { 4 | project = var.project_id # test project 5 | region = var.region 6 | } 7 | 8 | locals { 9 | default_labels = { 10 | "provisioner" = "terraform", 11 | "git_project" = "pygsc" 12 | } 13 | } -------------------------------------------------------------------------------- /infrastructure/service_account.tf: -------------------------------------------------------------------------------- 1 | resource "google_service_account" "bigquery_exporter" { 2 | account_id = "bigquery-exporter" 3 | display_name = "Exports Data from GSC to BigQuery" 4 | } 5 | 6 | resource "google_service_account_iam_binding" "account-iam" { 7 | service_account_id = google_service_account.bigquery_exporter.name 8 | role = "roles/iam.serviceAccountTokenCreator" 9 | members = [ 10 | "user:ms@thinq.digital", 11 | ] 12 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cachetools==5.2.0 2 | certifi==2022.6.15 3 | charset-normalizer==2.0.12 4 | google-api-core==2.8.2 5 | google-api-python-client==2.51.0 6 | google-auth==2.8.0 7 | google-auth-httplib2==0.1.0 8 | googleapis-common-protos==1.56.2 9 | 
httplib2==0.20.4 10 | idna==3.3 11 | protobuf==3.20.1 12 | pyasn1==0.4.8 13 | pyasn1-modules==0.2.8 14 | pyparsing==3.0.9 15 | requests==2.28.0 16 | rsa==4.8 17 | six==1.16.0 18 | uritemplate==4.1.1 19 | urllib3==1.26.9 20 | -------------------------------------------------------------------------------- /infrastructure/variables.tf: -------------------------------------------------------------------------------- 1 | variable "project_id" { 2 | type = string 3 | default = "onyx-dragon-349408" 4 | } 5 | 6 | variable "region" { 7 | type = string 8 | default = "europe-west3" 9 | } 10 | 11 | variable "apis" { 12 | description = "List of apis to enable" 13 | type = list(string) 14 | default = [ 15 | "searchconsole.googleapis.com", 16 | "iamcredentials.googleapis.com", 17 | "cloudresourcemanager.googleapis.com" //needed by terraform 18 | ] 19 | } -------------------------------------------------------------------------------- /infrastructure/bigquery.tf: -------------------------------------------------------------------------------- 1 | resource "google_bigquery_dataset" "dataset" { 2 | dataset_id = "google_search_console" 3 | friendly_name = "google_search_console" 4 | description = "Hold the data from the Google Search Console" 5 | location = var.region 6 | labels = merge(local.default_labels, {}) 7 | } 8 | 9 | resource "google_bigquery_table" "default" { 10 | deletion_protection = false 11 | dataset_id = google_bigquery_dataset.dataset.dataset_id 12 | table_id = "google_search_console_data" 13 | 14 | time_partitioning { 15 | type = "DAY" 16 | field = "date" 17 | } 18 | 19 | labels = merge(local.default_labels, {}) 20 | 21 | schema = file("bigquery_schema/google_search_console_data_table.json") 22 | 23 | } -------------------------------------------------------------------------------- /infrastructure/bigquery_schema/google_search_console_data_table.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "date", 4 
"""Export Google Search Console data for one day and load it into BigQuery.

Impersonates the ``bigquery-exporter`` service account, pages through the
Search Analytics API for a single day, flattens the rows, and loads them
into that day's partition of the ``google_search_console_data`` table.

Usage:
    python application/main.py <domain> [--date YYYY-MM-DD]
"""
import argparse

import google.auth
import googleapiclient.discovery
from google.auth import impersonated_credentials
from google.cloud import bigquery


parser = argparse.ArgumentParser(
    description="Export Google Search Console data to BigQuery.")
parser.add_argument(
    "domain", help="GSC domain property (queried as sc-domain:<domain>)")
# Optional; defaults to the previously hard-coded date for backward compatibility.
parser.add_argument(
    "--date", default="2022-01-01", help="Day to export, YYYY-MM-DD")
args = parser.parse_args()

# Only the read-only Search Console scope is needed to pull the report.
target_scopes = ["https://www.googleapis.com/auth/webmasters.readonly"]
source_credentials, project_id = google.auth.default()
target_credentials = impersonated_credentials.Credentials(
    source_credentials=source_credentials,
    target_principal='bigquery-exporter@onyx-dragon-349408.iam.gserviceaccount.com',
    target_scopes=target_scopes,
    lifetime=60)

search_console_service = googleapiclient.discovery.build(
    'searchconsole',
    'v1',
    credentials=target_credentials,
    cache_discovery=False)

# Order matters: the API returns each row's 'keys' list in this order.
dimensions = ['date', 'country', 'device', 'page', 'query']


def call_google_search_console_api(date: str) -> list:
    """Fetch all Search Analytics rows for a single day.

    Pages through the API 25 000 rows at a time and stops as soon as a
    short (or empty) page comes back, which avoids the extra, always-empty
    request the previous version made after the last page.

    Args:
        date: Day to query (YYYY-MM-DD); used as both start and end date.

    Returns:
        The raw API rows, each a dict with 'keys' plus the metric fields.
    """
    return_data = []
    row_limit = 25000
    start_row = 0

    while True:
        payload = {
            'startDate': date,
            'endDate': date,
            'searchType': 'web',
            'dimensions': dimensions,
            'rowLimit': row_limit,
            'startRow': start_row,
        }

        chunk = search_console_service.searchanalytics().query(
            siteUrl=f'sc-domain:{args.domain}',
            body=payload).execute()

        rows = chunk.get('rows', [])
        return_data.extend(rows)

        if len(rows) < row_limit:  # short page == last page
            break
        start_row += row_limit

    return return_data


def prepare_data_bigquery(data: list) -> list:
    """Flatten API rows into BigQuery-ready records.

    Zips each row's 'keys' list onto the queried dimensions and merges the
    result with the row's metrics (clicks, impressions, ctr, position).
    Unlike the previous version, the input rows are not mutated.
    """
    out = []
    for row in data:
        mapped = dict(zip(dimensions, row['keys']))
        metrics = {key: value for key, value in row.items() if key != 'keys'}
        out.append(mapped | metrics)
    return out


date = args.date
gsc_data = call_google_search_console_api(date=date)
gsc_data_transformed = prepare_data_bigquery(gsc_data)

job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    # Partition spec must match the table definition in infrastructure/bigquery.tf.
    time_partitioning=bigquery.TimePartitioning(field="date"),
)

bigquery_client = bigquery.Client()
# Target the single day partition ($YYYYMMDD) so WRITE_TRUNCATE replaces only
# that day's data. Client.dataset() is deprecated, so build the table id directly;
# the client's default project is used for an unqualified dataset.table id.
table_id = f'google_search_console.google_search_console_data${date.replace("-", "")}'

load_job = bigquery_client.load_table_from_json(
    json_rows=gsc_data_transformed,
    destination=table_id,
    job_config=job_config,
)
load_job.result()  # block until the load finishes so failures raise here