├── .gitignore
├── GA4_tables_backfill.ipynb
├── README.md
├── backfill-GA4-schema.md
├── backfill-ga4.py
├── config.json
└── tansfer_divar_data_from_huggingface_to_bigquery.ipynb
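Note: every notebook cell below loads its settings from config.json. The keys it expects can be read off the code (SERVICE_ACCOUNT_FILE, CLIENT_SECRET_FILE, SCOPES, PROPERTY_ID, INITIAL_FETCH_FROM_DATE, FETCH_TO_DATE, DATASET_ID). The sketch below writes a starter file with placeholder values; these values are illustrative assumptions, not the project's actual configuration.

# Sketch: write a starter config.json with the keys the notebook cells read.
# Key names come from the code; every value below is a placeholder.
import json

config = {
    "SERVICE_ACCOUNT_FILE": "service_account.json",   # service-account key used for BigQuery
    "CLIENT_SECRET_FILE": "client_secret.json",       # OAuth client secret for the GA4 Data API
    "SCOPES": ["https://www.googleapis.com/auth/analytics.readonly"],
    "PROPERTY_ID": "123456789",                       # numeric GA4 property ID
    "INITIAL_FETCH_FROM_DATE": "2023-01-01",          # start of the backfill window (YYYY-MM-DD)
    "FETCH_TO_DATE": "2024-01-01",                    # end of the backfill window (YYYY-MM-DD)
    "DATASET_ID": "analytics_backfill",               # target BigQuery dataset
}

with open("config.json", "w") as f:
    json.dump(config, f, indent=2)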
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/GA4_tables_backfill.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "include_colab_link": true
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | },
13 | "language_info": {
14 | "name": "python"
15 | }
16 | },
17 | "cells": [
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "id": "view-in-github",
22 | "colab_type": "text"
23 | },
24 | "source": [
25 | "
"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {
32 | "id": "0WJNl6xTYm4_",
33 | "collapsed": true
34 | },
35 | "outputs": [],
36 | "source": [
37 | "!pip install google-analytics-data==0.18.4\n",
38 | "!pip install google-cloud-bigquery\n",
39 | "!pip install google-auth==2.27.0\n",
40 | "!pip install google-auth-oauthlib\n",
41 | "!pip install google-auth-httplib2"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "source": [
47 | "import pandas as pd\n",
48 | "from google.cloud import bigquery\n",
49 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n",
50 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n",
51 | "from google.oauth2 import service_account\n",
52 | "from google_auth_oauthlib.flow import Flow\n",
53 | "import json\n",
54 | "import os\n",
55 | "import pickle\n",
56 | "\n",
57 | "# Load configuration from a JSON file\n",
58 | "with open(\"config.json\", \"r\") as f:\n",
59 | " config = json.load(f)\n",
60 | "\n",
61 | "# Authenticate with service account for BigQuery\n",
62 | "creds1 = service_account.Credentials.from_service_account_file(\n",
63 | " config['SERVICE_ACCOUNT_FILE'],\n",
64 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n",
65 | ")\n",
66 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n",
67 | "\n",
68 | "# Authenticate for GA4 Analytics Data API using OAuth2\n",
69 | "def authenticate_ga4():\n",
70 | " creds = None\n",
71 | " if os.path.exists('token.pickle'):\n",
72 | " with open('token.pickle', 'rb') as token:\n",
73 | " creds = pickle.load(token)\n",
74 | " else:\n",
75 | " flow = Flow.from_client_secrets_file(\n",
76 | " config['CLIENT_SECRET_FILE'],\n",
77 | " scopes=config['SCOPES'],\n",
78 | " redirect_uri='http://localhost:8080/'\n",
79 | " )\n",
80 | " auth_url, _ = flow.authorization_url(prompt='consent')\n",
81 | " print('Please go to this URL and finish the authentication: ', auth_url)\n",
82 | " code = input('Enter the authorization code: ')\n",
83 | " flow.fetch_token(code=code)\n",
84 | " creds = flow.credentials\n",
85 | " with open('token.pickle', 'wb') as token:\n",
86 | " pickle.dump(creds, token)\n",
87 | " return creds\n",
88 | "\n",
89 | "# Function to paginate and fetch GA4 report data with logging\n",
90 | "def run_report_with_pagination(client, request, limit=10000):\n",
91 | " all_rows = []\n",
92 | " offset = 0\n",
93 | " page_number = 1\n",
94 | "\n",
95 | " while True:\n",
96 | " # Apply offset and limit to request\n",
97 | " request.offset = offset\n",
98 | " request.limit = limit\n",
99 | "\n",
100 | " # Fetch report data\n",
101 | " response = client.run_report(request)\n",
102 | " all_rows.extend(response.rows)\n",
103 | "\n",
104 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n",
105 | "\n",
106 | " # If fewer rows are fetched than the limit, we're done\n",
107 | " if len(response.rows) < limit:\n",
108 | " break\n",
109 | "\n",
110 | " # Update offset and page number to get the next set of rows\n",
111 | " offset += limit\n",
112 | " page_number += 1\n",
113 | "\n",
114 | " return all_rows\n",
115 | "\n",
116 | "# Function to fetch GA4 data using pagination\n",
117 | "def get_ga4_report(client):\n",
118 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n",
119 | " request = RunReportRequest(\n",
120 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n",
121 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n",
122 | " dimensions=[\n",
123 | " Dimension(name='transactionId'),\n",
124 | " Dimension(name='itemName'),\n",
125 | " Dimension(name='date') # Added 'date' dimension\n",
126 | " ],\n",
127 | " metrics=[\n",
128 | " Metric(name='itemPurchaseQuantity'),\n",
129 | " Metric(name='itemRevenue')\n",
130 | " ]\n",
131 | " )\n",
132 | " return run_report_with_pagination(client, request)\n",
133 | "\n",
134 | "# Function to convert GA4 response to a DataFrame\n",
135 | "def response_to_dataframe(response):\n",
136 | " list_rows = []\n",
137 | " for row in response:\n",
138 | " transaction_id = row.dimension_values[0].value\n",
139 | " item_name = row.dimension_values[1].value\n",
140 | " date_value = row.dimension_values[2].value # Added date handling\n",
141 | " list_rows.append({\n",
142 | " 'transactionId': transaction_id,\n",
143 | " 'itemName': item_name,\n",
144 | " 'date': date_value, # Added date column\n",
145 | " 'itemPurchaseQuantity': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n",
146 | " 'itemRevenue': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0\n",
147 | " })\n",
148 | " return pd.DataFrame(list_rows)\n",
149 | "\n",
150 | "# Function to upload data to BigQuery\n",
151 | "def upload_to_bigquery(df, table_id):\n",
152 | " # Define BigQuery schema\n",
153 | " schema = [\n",
154 | " bigquery.SchemaField(\"transactionId\", \"STRING\"),\n",
155 | " bigquery.SchemaField(\"itemName\", \"STRING\"),\n",
156 | " bigquery.SchemaField(\"date\", \"STRING\"), # Added date field in schema\n",
157 | " bigquery.SchemaField(\"itemPurchaseQuantity\", \"INTEGER\"),\n",
158 | " bigquery.SchemaField(\"itemRevenue\", \"FLOAT\")\n",
159 | " ]\n",
160 | "\n",
161 | " # Configure BigQuery job to partition the table by the 'transactionId' field\n",
162 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n",
163 | " job_config = bigquery.LoadJobConfig(\n",
164 | " schema=schema,\n",
165 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE\n",
166 | " )\n",
167 | "\n",
168 | " # Upload the DataFrame to BigQuery\n",
169 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n",
170 | " print(f\"Data uploaded to {table_ref}\")\n",
171 | "\n",
172 | "# Main function\n",
173 | "def main():\n",
174 | " try:\n",
175 | " # Authenticate GA4 using OAuth2\n",
176 | " creds = authenticate_ga4()\n",
177 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n",
178 | "\n",
179 | " # Fetch GA4 data\n",
180 | " ga4_response = get_ga4_report(client_ga4)\n",
181 | "\n",
182 | " # Convert the response to a DataFrame\n",
183 | " ga4_df = response_to_dataframe(ga4_response)\n",
184 | "\n",
185 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n",
186 | " table_id = 'ga4_transaction_items'\n",
187 | " csv_filename = f\"{table_id}.csv\"\n",
188 | "\n",
189 | " # Save the DataFrame to a CSV file\n",
190 | " ga4_df.to_csv(csv_filename, index=False)\n",
191 | " print(f\"Data saved to {csv_filename}\")\n",
192 | "\n",
193 | " # Upload the DataFrame to BigQuery\n",
194 | " upload_to_bigquery(ga4_df, table_id)\n",
195 | " except Exception as e:\n",
196 | " print(f\"Error occurred: {e}\")\n",
197 | "\n",
198 | "if __name__ == '__main__':\n",
199 | " main()\n"
200 | ],
201 | "metadata": {
202 | "id": "003OzBhNUl7b"
203 | },
204 | "execution_count": null,
205 | "outputs": []
206 | },
207 | {
208 | "cell_type": "code",
209 | "source": [
210 | "import pandas as pd\n",
211 | "from google.cloud import bigquery\n",
212 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n",
213 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n",
214 | "from google.oauth2 import service_account\n",
215 | "from google_auth_oauthlib.flow import Flow\n",
216 | "import json\n",
217 | "import os\n",
218 | "import pickle\n",
219 | "\n",
220 | "# Load configuration from a JSON file\n",
221 | "with open(\"config.json\", \"r\") as f:\n",
222 | " config = json.load(f)\n",
223 | "\n",
224 | "# Authenticate with service account for BigQuery\n",
225 | "creds1 = service_account.Credentials.from_service_account_file(\n",
226 | " config['SERVICE_ACCOUNT_FILE'],\n",
227 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n",
228 | ")\n",
229 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n",
230 | "\n",
231 | "# Authenticate for GA4 Analytics Data API using OAuth2\n",
232 | "def authenticate_ga4():\n",
233 | " creds = None\n",
234 | " if os.path.exists('token.pickle'):\n",
235 | " with open('token.pickle', 'rb') as token:\n",
236 | " creds = pickle.load(token)\n",
237 | " else:\n",
238 | " flow = Flow.from_client_secrets_file(\n",
239 | " config['CLIENT_SECRET_FILE'],\n",
240 | " scopes=config['SCOPES'],\n",
241 | " redirect_uri='http://localhost:8080/'\n",
242 | " )\n",
243 | " auth_url, _ = flow.authorization_url(prompt='consent')\n",
244 | " print('Please go to this URL and finish the authentication: ', auth_url)\n",
245 | " code = input('Enter the authorization code: ')\n",
246 | " flow.fetch_token(code=code)\n",
247 | " creds = flow.credentials\n",
248 | " with open('token.pickle', 'wb') as token:\n",
249 | " pickle.dump(creds, token)\n",
250 | " return creds\n",
251 | "\n",
252 | "# Function to paginate and fetch GA4 report data with logging\n",
253 | "def run_report_with_pagination(client, request, limit=10000):\n",
254 | " all_rows = []\n",
255 | " offset = 0\n",
256 | " page_number = 1\n",
257 | "\n",
258 | " while True:\n",
259 | " # Apply offset and limit to request\n",
260 | " request.offset = offset\n",
261 | " request.limit = limit\n",
262 | "\n",
263 | " # Fetch report data\n",
264 | " response = client.run_report(request)\n",
265 | " all_rows.extend(response.rows)\n",
266 | "\n",
267 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n",
268 | "\n",
269 | " # If fewer rows are fetched than the limit, we're done\n",
270 | " if len(response.rows) < limit:\n",
271 | " break\n",
272 | "\n",
273 | " # Update offset and page number to get the next set of rows\n",
274 | " offset += limit\n",
275 | " page_number += 1\n",
276 | "\n",
277 | " return all_rows\n",
278 | "\n",
279 | "# Function to fetch GA4 data using pagination\n",
280 | "def get_ga4_report(client):\n",
281 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n",
282 | " request = RunReportRequest(\n",
283 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n",
284 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n",
285 | " dimensions=[Dimension(name='date'), Dimension(name='sessionDefaultChannelGroup')],\n",
286 | " metrics=[\n",
287 | " Metric(name='sessions'),\n",
288 | " Metric(name='totalUsers'),\n",
289 | " Metric(name='newUsers'),\n",
290 | " Metric(name='ecommercePurchases'),\n",
291 | " Metric(name='purchaseRevenue'),\n",
292 | " ]\n",
293 | " )\n",
294 | " return run_report_with_pagination(client, request)\n",
295 | "\n",
296 | "# Function to convert GA4 response to a DataFrame\n",
297 | "def response_to_dataframe(response):\n",
298 | " list_rows = []\n",
299 | " for row in response:\n",
300 | " try:\n",
301 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n",
302 | " except ValueError:\n",
303 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n",
304 | " session_channel_group = row.dimension_values[1].value\n",
305 | " list_rows.append({\n",
306 | " 'date': date_value,\n",
307 | " 'sessionPrimaryChannelGroup': session_channel_group,\n",
308 | " 'sessions': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n",
309 | " 'totalUsers': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n",
310 | " 'newUsers': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n",
311 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0,\n",
312 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0\n",
313 | " })\n",
314 | " return pd.DataFrame(list_rows)\n",
315 | "\n",
316 | "# Function to upload data to BigQuery\n",
317 | "def upload_to_bigquery(df, table_id):\n",
318 | " # Define BigQuery schema\n",
319 | " schema = [\n",
320 | " bigquery.SchemaField(\"date\", \"DATE\"),\n",
321 | " bigquery.SchemaField(\"sessionPrimaryChannelGroup\", \"STRING\"),\n",
322 | " bigquery.SchemaField(\"sessions\", \"INTEGER\"),\n",
323 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n",
324 | " bigquery.SchemaField(\"newUsers\", \"INTEGER\"),\n",
325 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n",
326 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\")\n",
327 | " ]\n",
328 | "\n",
329 | " # Configure BigQuery job to partition the table by the 'date' column\n",
330 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n",
331 | " job_config = bigquery.LoadJobConfig(\n",
332 | " schema=schema,\n",
333 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
334 | " time_partitioning=bigquery.TimePartitioning(\n",
335 | " type_=bigquery.TimePartitioningType.DAY,\n",
336 | " field='date'\n",
337 | " )\n",
338 | " )\n",
339 | "\n",
340 | " # Upload the DataFrame to BigQuery\n",
341 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n",
342 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n",
343 | "\n",
344 | "# Main function\n",
345 | "def main():\n",
346 | " try:\n",
347 | " # Authenticate GA4 using OAuth2\n",
348 | " creds = authenticate_ga4()\n",
349 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n",
350 | "\n",
351 | " # Fetch GA4 data\n",
352 | " ga4_response = get_ga4_report(client_ga4)\n",
353 | "\n",
354 | " # Convert the response to a DataFrame\n",
355 | " ga4_df = response_to_dataframe(ga4_response)\n",
356 | "\n",
357 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n",
358 | " table_id = 'ga4_data_session_channel_group'\n",
359 | " csv_filename = f\"{table_id}.csv\"\n",
360 | "\n",
361 | " # Save the DataFrame to a CSV file\n",
362 | " ga4_df.to_csv(csv_filename, index=False)\n",
363 | " print(f\"Data saved to {csv_filename}\")\n",
364 | "\n",
365 | " # Upload the DataFrame to BigQuery\n",
366 | " upload_to_bigquery(ga4_df, table_id)\n",
367 | " except Exception as e:\n",
368 | " print(f\"Error occurred: {e}\")\n",
369 | "\n",
370 | "if __name__ == '__main__':\n",
371 | " main()"
372 | ],
373 | "metadata": {
374 | "id": "TaCbme6LYqD4",
375 | "collapsed": true
376 | },
377 | "execution_count": null,
378 | "outputs": []
379 | },
380 | {
381 | "cell_type": "code",
382 | "source": [
383 | "import pandas as pd\n",
384 | "from google.cloud import bigquery\n",
385 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n",
386 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n",
387 | "from google.oauth2 import service_account\n",
388 | "from google_auth_oauthlib.flow import Flow\n",
389 | "import json\n",
390 | "import os\n",
391 | "import pickle\n",
392 | "\n",
393 | "# Load configuration from a JSON file\n",
394 | "with open(\"config.json\", \"r\") as f:\n",
395 | " config = json.load(f)\n",
396 | "\n",
397 | "# Authenticate with service account for BigQuery\n",
398 | "creds1 = service_account.Credentials.from_service_account_file(\n",
399 | " config['SERVICE_ACCOUNT_FILE'],\n",
400 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n",
401 | ")\n",
402 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n",
403 | "\n",
404 | "# Authenticate for GA4 Analytics Data API using OAuth2\n",
405 | "def authenticate_ga4():\n",
406 | " creds = None\n",
407 | " if os.path.exists('token.pickle'):\n",
408 | " with open('token.pickle', 'rb') as token:\n",
409 | " creds = pickle.load(token)\n",
410 | " else:\n",
411 | " flow = Flow.from_client_secrets_file(\n",
412 | " config['CLIENT_SECRET_FILE'],\n",
413 | " scopes=config['SCOPES'],\n",
414 | " redirect_uri='http://localhost:8080/'\n",
415 | " )\n",
416 | " auth_url, _ = flow.authorization_url(prompt='consent')\n",
417 | " print('Please go to this URL and finish the authentication: ', auth_url)\n",
418 | " code = input('Enter the authorization code: ')\n",
419 | " flow.fetch_token(code=code)\n",
420 | " creds = flow.credentials\n",
421 | " with open('token.pickle', 'wb') as token:\n",
422 | " pickle.dump(creds, token)\n",
423 | " return creds\n",
424 | "\n",
425 | "# Function to paginate and fetch GA4 report data with logging\n",
426 | "def run_report_with_pagination(client, request, limit=10000):\n",
427 | " all_rows = []\n",
428 | " offset = 0\n",
429 | " page_number = 1\n",
430 | "\n",
431 | " while True:\n",
432 | " # Apply offset and limit to request\n",
433 | " request.offset = offset\n",
434 | " request.limit = limit\n",
435 | "\n",
436 | " # Fetch report data\n",
437 | " response = client.run_report(request)\n",
438 | " all_rows.extend(response.rows)\n",
439 | "\n",
440 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n",
441 | "\n",
442 | " # If fewer rows are fetched than the limit, we're done\n",
443 | " if len(response.rows) < limit:\n",
444 | " break\n",
445 | "\n",
446 | " # Update offset and page number to get the next set of rows\n",
447 | " offset += limit\n",
448 | " page_number += 1\n",
449 | "\n",
450 | " return all_rows\n",
451 | "\n",
452 | "# Function to fetch GA4 data using pagination\n",
453 | "def get_ga4_report(client):\n",
454 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n",
455 | " request = RunReportRequest(\n",
456 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n",
457 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n",
458 | " dimensions=[\n",
459 | " Dimension(name='date'),\n",
460 | " Dimension(name='sessionSource'),\n",
461 | " Dimension(name='sessionCampaignName'),\n",
462 | " Dimension(name='sessionMedium')\n",
463 | " ],\n",
464 | " metrics=[\n",
465 | " Metric(name='sessions'),\n",
466 | " Metric(name='totalUsers'),\n",
467 | " Metric(name='newUsers'),\n",
468 | " Metric(name='ecommercePurchases'),\n",
469 | " Metric(name='purchaseRevenue'),\n",
470 | " ]\n",
471 | " )\n",
472 | " return run_report_with_pagination(client, request)\n",
473 | "\n",
474 | "# Function to convert GA4 response to a DataFrame\n",
475 | "def response_to_dataframe(response):\n",
476 | " list_rows = []\n",
477 | " for row in response:\n",
478 | " try:\n",
479 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n",
480 | " except ValueError:\n",
481 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n",
482 | " session_source = row.dimension_values[1].value\n",
483 | " session_campaign_name = row.dimension_values[2].value\n",
484 | " session_medium = row.dimension_values[3].value\n",
485 | " list_rows.append({\n",
486 | " 'date': date_value,\n",
487 | " 'sessionSource': session_source,\n",
488 | " 'sessionCampaignName': session_campaign_name,\n",
489 | " 'sessionMedium': session_medium,\n",
490 | " 'sessions': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n",
491 | " 'totalUsers': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n",
492 | " 'newUsers': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n",
493 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0,\n",
494 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0\n",
495 | " })\n",
496 | " return pd.DataFrame(list_rows)\n",
497 | "\n",
498 | "# Function to upload data to BigQuery\n",
499 | "def upload_to_bigquery(df, table_id):\n",
500 | " # Define BigQuery schema\n",
501 | " schema = [\n",
502 | " bigquery.SchemaField(\"date\", \"DATE\"),\n",
503 | " bigquery.SchemaField(\"sessionSource\", \"STRING\"),\n",
504 | " bigquery.SchemaField(\"sessionCampaignName\", \"STRING\"),\n",
505 | " bigquery.SchemaField(\"sessionMedium\", \"STRING\"),\n",
506 | " bigquery.SchemaField(\"sessions\", \"INTEGER\"),\n",
507 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n",
508 | " bigquery.SchemaField(\"newUsers\", \"INTEGER\"),\n",
509 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n",
510 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\")\n",
511 | " ]\n",
512 | "\n",
513 | " # Configure BigQuery job to partition the table by the 'date' column\n",
514 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n",
515 | " job_config = bigquery.LoadJobConfig(\n",
516 | " schema=schema,\n",
517 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
518 | " time_partitioning=bigquery.TimePartitioning(\n",
519 | " type_=bigquery.TimePartitioningType.DAY,\n",
520 | " field='date'\n",
521 | " )\n",
522 | " )\n",
523 | "\n",
524 | " # Upload the DataFrame to BigQuery\n",
525 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n",
526 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n",
527 | "\n",
528 | "# Main function\n",
529 | "def main():\n",
530 | " try:\n",
531 | " # Authenticate GA4 using OAuth2\n",
532 | " creds = authenticate_ga4()\n",
533 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n",
534 | "\n",
535 | " # Fetch GA4 data\n",
536 | " ga4_response = get_ga4_report(client_ga4)\n",
537 | "\n",
538 | " # Convert the response to a DataFrame\n",
539 | " ga4_df = response_to_dataframe(ga4_response)\n",
540 | "\n",
541 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n",
542 | " table_id = 'ga4_data_session_source_campaign_medium'\n",
543 | " csv_filename = f\"{table_id}.csv\"\n",
544 | "\n",
545 | " # Save the DataFrame to a CSV file\n",
546 | " ga4_df.to_csv(csv_filename, index=False)\n",
547 | " print(f\"Data saved to {csv_filename}\")\n",
548 | "\n",
549 | " # Upload the DataFrame to BigQuery\n",
550 | " upload_to_bigquery(ga4_df, table_id)\n",
551 | " except Exception as e:\n",
552 | " print(f\"Error occurred: {e}\")\n",
553 | "\n",
554 | "if __name__ == '__main__':\n",
555 | " main()\n"
556 | ],
557 | "metadata": {
558 | "id": "Wz5wF6MHbIAC"
559 | },
560 | "execution_count": null,
561 | "outputs": []
562 | },
563 | {
564 | "cell_type": "code",
565 | "source": [
566 | "import pandas as pd\n",
567 | "from google.cloud import bigquery\n",
568 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n",
569 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n",
570 | "from google.oauth2 import service_account\n",
571 | "from google_auth_oauthlib.flow import Flow\n",
572 | "import json\n",
573 | "import os\n",
574 | "import pickle\n",
575 | "\n",
576 | "# Load configuration from a JSON file\n",
577 | "with open(\"config.json\", \"r\") as f:\n",
578 | " config = json.load(f)\n",
579 | "\n",
580 | "# Authenticate with service account for BigQuery\n",
581 | "creds1 = service_account.Credentials.from_service_account_file(\n",
582 | " config['SERVICE_ACCOUNT_FILE'],\n",
583 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n",
584 | ")\n",
585 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n",
586 | "\n",
587 | "# Authenticate for GA4 Analytics Data API using OAuth2\n",
588 | "def authenticate_ga4():\n",
589 | " creds = None\n",
590 | " if os.path.exists('token.pickle'):\n",
591 | " with open('token.pickle', 'rb') as token:\n",
592 | " creds = pickle.load(token)\n",
593 | " else:\n",
594 | " flow = Flow.from_client_secrets_file(\n",
595 | " config['CLIENT_SECRET_FILE'],\n",
596 | " scopes=config['SCOPES'],\n",
597 | " redirect_uri='http://localhost:8080/'\n",
598 | " )\n",
599 | " auth_url, _ = flow.authorization_url(prompt='consent')\n",
600 | " print('Please go to this URL and finish the authentication: ', auth_url)\n",
601 | " code = input('Enter the authorization code: ')\n",
602 | " flow.fetch_token(code=code)\n",
603 | " creds = flow.credentials\n",
604 | " with open('token.pickle', 'wb') as token:\n",
605 | " pickle.dump(creds, token)\n",
606 | " return creds\n",
607 | "\n",
608 | "# Function to paginate and fetch GA4 report data with logging\n",
609 | "def run_report_with_pagination(client, request, limit=10000):\n",
610 | " all_rows = []\n",
611 | " offset = 0\n",
612 | " page_number = 1\n",
613 | "\n",
614 | " while True:\n",
615 | " # Apply offset and limit to request\n",
616 | " request.offset = offset\n",
617 | " request.limit = limit\n",
618 | "\n",
619 | " # Fetch report data\n",
620 | " response = client.run_report(request)\n",
621 | " all_rows.extend(response.rows)\n",
622 | "\n",
623 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n",
624 | "\n",
625 | " # If fewer rows are fetched than the limit, we're done\n",
626 | " if len(response.rows) < limit:\n",
627 | " break\n",
628 | "\n",
629 | " # Update offset and page number to get the next set of rows\n",
630 | " offset += limit\n",
631 | " page_number += 1\n",
632 | "\n",
633 | " return all_rows\n",
634 | "\n",
635 | "# Function to fetch GA4 data using pagination\n",
636 | "def get_ga4_report(client):\n",
637 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n",
638 | " request = RunReportRequest(\n",
639 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n",
640 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n",
641 | " dimensions=[\n",
642 | " Dimension(name='date'),\n",
643 | " Dimension(name='country'),\n",
644 | " Dimension(name='language'),\n",
645 | " Dimension(name='city')\n",
646 | " ],\n",
647 | " metrics=[\n",
648 | " Metric(name='sessions'),\n",
649 | " Metric(name='screenPageViews'),\n",
650 | " Metric(name='totalUsers'),\n",
651 | " Metric(name='newUsers'),\n",
652 | " Metric(name='ecommercePurchases'),\n",
653 | " Metric(name='purchaseRevenue')\n",
654 | " ]\n",
655 | " )\n",
656 | " return run_report_with_pagination(client, request)\n",
657 | "\n",
658 | "# Function to convert GA4 response to a DataFrame\n",
659 | "def response_to_dataframe(response):\n",
660 | " list_rows = []\n",
661 | " for row in response:\n",
662 | " try:\n",
663 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n",
664 | " except ValueError:\n",
665 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n",
666 | " country = row.dimension_values[1].value\n",
667 | " language = row.dimension_values[2].value\n",
668 | " city = row.dimension_values[3].value\n",
669 | " list_rows.append({\n",
670 | " 'date': date_value,\n",
671 | " 'country': country,\n",
672 | " 'language': language,\n",
673 | " 'city': city,\n",
674 | " 'sessions': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n",
675 | " 'screenPageViews': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n",
676 | " 'totalUsers': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n",
677 | " 'newUsers': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0,\n",
678 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0,\n",
679 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[5].value, errors='coerce') or 0\n",
680 | " })\n",
681 | " return pd.DataFrame(list_rows)\n",
682 | "\n",
683 | "# Function to upload data to BigQuery\n",
684 | "def upload_to_bigquery(df, table_id):\n",
685 | " # Define BigQuery schema\n",
686 | " schema = [\n",
687 | " bigquery.SchemaField(\"date\", \"DATE\"),\n",
688 | " bigquery.SchemaField(\"country\", \"STRING\"),\n",
689 | " bigquery.SchemaField(\"language\", \"STRING\"),\n",
690 | " bigquery.SchemaField(\"city\", \"STRING\"),\n",
691 | " bigquery.SchemaField(\"sessions\", \"INTEGER\"),\n",
692 | " bigquery.SchemaField(\"screenPageViews\", \"INTEGER\"),\n",
693 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n",
694 | " bigquery.SchemaField(\"newUsers\", \"INTEGER\"),\n",
695 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n",
696 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\")\n",
697 | " ]\n",
698 | "\n",
699 | " # Configure BigQuery job to partition the table by the 'date' column\n",
700 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n",
701 | " job_config = bigquery.LoadJobConfig(\n",
702 | " schema=schema,\n",
703 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
704 | " time_partitioning=bigquery.TimePartitioning(\n",
705 | " type_=bigquery.TimePartitioningType.DAY,\n",
706 | " field='date'\n",
707 | " )\n",
708 | " )\n",
709 | "\n",
710 | " # Upload the DataFrame to BigQuery\n",
711 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n",
712 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n",
713 | "\n",
714 | "# Main function\n",
715 | "def main():\n",
716 | " try:\n",
717 | " # Authenticate GA4 using OAuth2\n",
718 | " creds = authenticate_ga4()\n",
719 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n",
720 | "\n",
721 | " # Fetch GA4 data\n",
722 | " ga4_response = get_ga4_report(client_ga4)\n",
723 | "\n",
724 | " # Convert the response to a DataFrame\n",
725 | " ga4_df = response_to_dataframe(ga4_response)\n",
726 | "\n",
727 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n",
728 | " table_id = 'ga4_data_country_language_city'\n",
729 | " csv_filename = f\"{table_id}.csv\"\n",
730 | "\n",
731 | " # Save the DataFrame to a CSV file\n",
732 | " ga4_df.to_csv(csv_filename, index=False)\n",
733 | " print(f\"Data saved to {csv_filename}\")\n",
734 | "\n",
735 | " # Upload the DataFrame to BigQuery\n",
736 | " upload_to_bigquery(ga4_df, table_id)\n",
737 | " except Exception as e:\n",
738 | " print(f\"Error occurred: {e}\")\n",
739 | "\n",
740 | "if __name__ == '__main__':\n",
741 | " main()\n"
742 | ],
743 | "metadata": {
744 | "id": "e-Oqh-oNfbC1"
745 | },
746 | "execution_count": null,
747 | "outputs": []
748 | },
749 | {
750 | "cell_type": "code",
751 | "source": [
752 | "import pandas as pd\n",
753 | "from google.cloud import bigquery\n",
754 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n",
755 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n",
756 | "from google.oauth2 import service_account\n",
757 | "from google_auth_oauthlib.flow import Flow\n",
758 | "import json\n",
759 | "import os\n",
760 | "import pickle\n",
761 | "\n",
762 | "# Load configuration from a JSON file\n",
763 | "with open(\"config.json\", \"r\") as f:\n",
764 | " config = json.load(f)\n",
765 | "\n",
766 | "# Authenticate with service account for BigQuery\n",
767 | "creds1 = service_account.Credentials.from_service_account_file(\n",
768 | " config['SERVICE_ACCOUNT_FILE'],\n",
769 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n",
770 | ")\n",
771 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n",
772 | "\n",
773 | "# Authenticate for GA4 Analytics Data API using OAuth2\n",
774 | "def authenticate_ga4():\n",
775 | " creds = None\n",
776 | " if os.path.exists('token.pickle'):\n",
777 | " with open('token.pickle', 'rb') as token:\n",
778 | " creds = pickle.load(token)\n",
779 | " else:\n",
780 | " flow = Flow.from_client_secrets_file(\n",
781 | " config['CLIENT_SECRET_FILE'],\n",
782 | " scopes=config['SCOPES'],\n",
783 | " redirect_uri='http://localhost:8080/'\n",
784 | " )\n",
785 | " auth_url, _ = flow.authorization_url(prompt='consent')\n",
786 | " print('Please go to this URL and finish the authentication: ', auth_url)\n",
787 | " code = input('Enter the authorization code: ')\n",
788 | " flow.fetch_token(code=code)\n",
789 | " creds = flow.credentials\n",
790 | " with open('token.pickle', 'wb') as token:\n",
791 | " pickle.dump(creds, token)\n",
792 | " return creds\n",
793 | "\n",
794 | "# Function to paginate and fetch GA4 report data with logging\n",
795 | "def run_report_with_pagination(client, request, limit=1000):\n",
796 | " all_rows = []\n",
797 | " offset = 0\n",
798 | " page_number = 1\n",
799 | "\n",
800 | " while True:\n",
801 | " # Apply offset and limit to request\n",
802 | " request.offset = offset\n",
803 | " request.limit = limit\n",
804 | "\n",
805 | " # Fetch report data\n",
806 | " response = client.run_report(request)\n",
807 | " all_rows.extend(response.rows)\n",
808 | "\n",
809 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n",
810 | "\n",
811 | " # If fewer rows are fetched than the limit, we're done\n",
812 | " if len(response.rows) < limit:\n",
813 | " break\n",
814 | "\n",
815 | " # Update offset and page number to get the next set of rows\n",
816 | " offset += limit\n",
817 | " page_number += 1\n",
818 | "\n",
819 | " return all_rows\n",
820 | "\n",
821 | "# Function to fetch GA4 data using pagination\n",
822 | "def get_ga4_report(client):\n",
823 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n",
824 | " request = RunReportRequest(\n",
825 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n",
826 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n",
827 | " dimensions=[\n",
828 | " Dimension(name='date'),\n",
829 | " Dimension(name='itemName')\n",
830 | " ],\n",
831 | " metrics=[\n",
832 | " Metric(name='itemPurchaseQuantity'),\n",
833 | " Metric(name='itemRevenue')\n",
834 | " ]\n",
835 | " )\n",
836 | " return run_report_with_pagination(client, request)\n",
837 | "\n",
838 | "# Function to convert GA4 response to a DataFrame\n",
839 | "def response_to_dataframe(response):\n",
840 | " list_rows = []\n",
841 | " for row in response:\n",
842 | " try:\n",
843 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n",
844 | " except ValueError:\n",
845 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n",
846 | " item_name = row.dimension_values[1].value\n",
847 | " list_rows.append({\n",
848 | " 'date': date_value,\n",
849 | " 'itemName': item_name,\n",
850 | " 'itemPurchaseQuantity': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n",
851 | " 'itemRevenue': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0\n",
852 | " })\n",
853 | " return pd.DataFrame(list_rows)\n",
854 | "\n",
855 | "# Function to upload data to BigQuery\n",
856 | "def upload_to_bigquery(df, table_id):\n",
857 | " # Define BigQuery schema\n",
858 | " schema = [\n",
859 | " bigquery.SchemaField(\"date\", \"DATE\"),\n",
860 | " bigquery.SchemaField(\"itemName\", \"STRING\"),\n",
861 | " bigquery.SchemaField(\"itemPurchaseQuantity\", \"INTEGER\"),\n",
862 | " bigquery.SchemaField(\"itemRevenue\", \"FLOAT\")\n",
863 | " ]\n",
864 | "\n",
865 | " # Configure BigQuery job to partition the table by the 'date' column\n",
866 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n",
867 | " job_config = bigquery.LoadJobConfig(\n",
868 | " schema=schema,\n",
869 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
870 | " time_partitioning=bigquery.TimePartitioning(\n",
871 | " type_=bigquery.TimePartitioningType.DAY,\n",
872 | " field='date'\n",
873 | " )\n",
874 | " )\n",
875 | "\n",
876 | " # Upload the DataFrame to BigQuery\n",
877 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n",
878 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n",
879 | "\n",
880 | "# Main function\n",
881 | "def main():\n",
882 | " try:\n",
883 | " # Authenticate GA4 using OAuth2\n",
884 | " creds = authenticate_ga4()\n",
885 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n",
886 | "\n",
887 | " # Fetch GA4 data\n",
888 | " ga4_response = get_ga4_report(client_ga4)\n",
889 | "\n",
890 | " # Convert the response to a DataFrame\n",
891 | " ga4_df = response_to_dataframe(ga4_response)\n",
892 | "\n",
893 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n",
894 | " table_id = 'ga4_data_item_name'\n",
895 | " csv_filename = f\"{table_id}.csv\"\n",
896 | "\n",
897 | " # Save the DataFrame to a CSV file\n",
898 | " ga4_df.to_csv(csv_filename, index=False)\n",
899 | " print(f\"Data saved to {csv_filename}\")\n",
900 | "\n",
901 | " # Upload the DataFrame to BigQuery\n",
902 | " upload_to_bigquery(ga4_df, table_id)\n",
903 | " except Exception as e:\n",
904 | " print(f\"Error occurred: {e}\")\n",
905 | "\n",
906 | "if __name__ == '__main__':\n",
907 | " main()\n"
908 | ],
909 | "metadata": {
910 | "id": "RKU2hiP7gynQ"
911 | },
912 | "execution_count": null,
913 | "outputs": []
914 | },
915 | {
916 | "cell_type": "code",
917 | "source": [
918 | "import pandas as pd\n",
919 | "from google.cloud import bigquery\n",
920 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n",
921 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n",
922 | "from google.oauth2 import service_account\n",
923 | "from google_auth_oauthlib.flow import Flow\n",
924 | "import json\n",
925 | "import os\n",
926 | "import pickle\n",
927 | "\n",
928 | "# Load configuration from a JSON file\n",
929 | "with open(\"config.json\", \"r\") as f:\n",
930 | " config = json.load(f)\n",
931 | "\n",
932 | "# Authenticate with service account for BigQuery\n",
933 | "creds1 = service_account.Credentials.from_service_account_file(\n",
934 | " config['SERVICE_ACCOUNT_FILE'],\n",
935 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n",
936 | ")\n",
937 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n",
938 | "\n",
939 | "# Authenticate for GA4 Analytics Data API using OAuth2\n",
940 | "def authenticate_ga4():\n",
941 | " creds = None\n",
942 | " if os.path.exists('token.pickle'):\n",
943 | " with open('token.pickle', 'rb') as token:\n",
944 | " creds = pickle.load(token)\n",
945 | " else:\n",
946 | " flow = Flow.from_client_secrets_file(\n",
947 | " config['CLIENT_SECRET_FILE'],\n",
948 | " scopes=config['SCOPES'],\n",
949 | " redirect_uri='http://localhost:8080/'\n",
950 | " )\n",
951 | " auth_url, _ = flow.authorization_url(prompt='consent')\n",
952 | " print('Please go to this URL and finish the authentication: ', auth_url)\n",
953 | " code = input('Enter the authorization code: ')\n",
954 | " flow.fetch_token(code=code)\n",
955 | " creds = flow.credentials\n",
956 | " with open('token.pickle', 'wb') as token:\n",
957 | " pickle.dump(creds, token)\n",
958 | " return creds\n",
959 | "\n",
960 | "# Function to paginate and fetch GA4 report data with logging\n",
961 | "def run_report_with_pagination(client, request, limit=1000):\n",
962 | " all_rows = []\n",
963 | " offset = 0\n",
964 | " page_number = 1\n",
965 | "\n",
966 | " while True:\n",
967 | " # Apply offset and limit to request\n",
968 | " request.offset = offset\n",
969 | " request.limit = limit\n",
970 | "\n",
971 | " # Fetch report data\n",
972 | " response = client.run_report(request)\n",
973 | " all_rows.extend(response.rows)\n",
974 | "\n",
975 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n",
976 | "\n",
977 | " # If fewer rows are fetched than the limit, we're done\n",
978 | " if len(response.rows) < limit:\n",
979 | " break\n",
980 | "\n",
981 | " # Update offset and page number to get the next set of rows\n",
982 | " offset += limit\n",
983 | " page_number += 1\n",
984 | "\n",
985 | " return all_rows\n",
986 | "\n",
987 | "# Function to fetch GA4 data using pagination\n",
988 | "def get_ga4_report(client):\n",
989 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n",
990 | " request = RunReportRequest(\n",
991 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n",
992 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n",
993 | " dimensions=[\n",
994 | " Dimension(name='date'),\n",
995 | " Dimension(name='browser'),\n",
996 | " Dimension(name='operatingSystem'),\n",
997 | " Dimension(name='deviceCategory')\n",
998 | " ],\n",
999 | " metrics=[\n",
1000 | " Metric(name='sessions'),\n",
1001 | " Metric(name='screenPageViews'),\n",
1002 | " Metric(name='totalUsers'),\n",
1003 | " Metric(name='newUsers'),\n",
1004 | " Metric(name='ecommercePurchases'),\n",
1005 | " Metric(name='purchaseRevenue')\n",
1006 | " ]\n",
1007 | " )\n",
1008 | " return run_report_with_pagination(client, request)\n",
1009 | "\n",
1010 | "# Function to convert GA4 response to a DataFrame\n",
1011 | "def response_to_dataframe(response):\n",
1012 | " list_rows = []\n",
1013 | " for row in response:\n",
1014 | " try:\n",
1015 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n",
1016 | " except ValueError:\n",
1017 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n",
1018 | " browser = row.dimension_values[1].value\n",
1019 | " operating_system = row.dimension_values[2].value\n",
1020 | " device_category = row.dimension_values[3].value\n",
1021 | " list_rows.append({\n",
1022 | " 'date': date_value,\n",
1023 | " 'browser': browser,\n",
1024 | " 'operatingSystem': operating_system,\n",
1025 | " 'deviceCategory': device_category,\n",
1026 | " 'sessions': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n",
1027 | " 'screenPageViews': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n",
1028 | " 'totalUsers': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n",
1029 | " 'newUsers': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0,\n",
1030 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0,\n",
1031 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[5].value, errors='coerce') or 0\n",
1032 | " })\n",
1033 | " return pd.DataFrame(list_rows)\n",
1034 | "\n",
1035 | "# Function to upload data to BigQuery\n",
1036 | "def upload_to_bigquery(df, table_id):\n",
1037 | " # Define BigQuery schema\n",
1038 | " schema = [\n",
1039 | " bigquery.SchemaField(\"date\", \"DATE\"),\n",
1040 | " bigquery.SchemaField(\"browser\", \"STRING\"),\n",
1041 | " bigquery.SchemaField(\"operatingSystem\", \"STRING\"),\n",
1042 | " bigquery.SchemaField(\"deviceCategory\", \"STRING\"),\n",
1043 | " bigquery.SchemaField(\"sessions\", \"INTEGER\"),\n",
1044 | " bigquery.SchemaField(\"screenPageViews\", \"INTEGER\"),\n",
1045 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n",
1046 | " bigquery.SchemaField(\"newUsers\", \"INTEGER\"),\n",
1047 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n",
1048 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\")\n",
1049 | " ]\n",
1050 | "\n",
1051 | " # Configure BigQuery job to partition the table by the 'date' column\n",
1052 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n",
1053 | " job_config = bigquery.LoadJobConfig(\n",
1054 | " schema=schema,\n",
1055 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
1056 | " time_partitioning=bigquery.TimePartitioning(\n",
1057 | " type_=bigquery.TimePartitioningType.DAY,\n",
1058 | " field='date'\n",
1059 | " )\n",
1060 | " )\n",
1061 | "\n",
1062 | " # Upload the DataFrame to BigQuery\n",
1063 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n",
1064 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n",
1065 | "\n",
1066 | "# Main function\n",
1067 | "def main():\n",
1068 | " try:\n",
1069 | " # Authenticate GA4 using OAuth2\n",
1070 | " creds = authenticate_ga4()\n",
1071 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n",
1072 | "\n",
1073 | " # Fetch GA4 data\n",
1074 | " ga4_response = get_ga4_report(client_ga4)\n",
1075 | "\n",
1076 | " # Convert the response to a DataFrame\n",
1077 | " ga4_df = response_to_dataframe(ga4_response)\n",
1078 | "\n",
1079 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n",
1080 | " table_id = 'ga4_data_browser_os_device'\n",
1081 | " csv_filename = f\"{table_id}.csv\"\n",
1082 | "\n",
1083 | " # Save the DataFrame to a CSV file\n",
1084 | " ga4_df.to_csv(csv_filename, index=False)\n",
1085 | " print(f\"Data saved to {csv_filename}\")\n",
1086 | "\n",
1087 | " # Upload the DataFrame to BigQuery\n",
1088 | " upload_to_bigquery(ga4_df, table_id)\n",
1089 | " except Exception as e:\n",
1090 | " print(f\"Error occurred: {e}\")\n",
1091 | "\n",
1092 | "if __name__ == '__main__':\n",
1093 | " main()\n"
1094 | ],
1095 | "metadata": {
1096 | "id": "YpYm_kTLiqsy"
1097 | },
1098 | "execution_count": null,
1099 | "outputs": []
1100 | },
1101 | {
1102 | "cell_type": "code",
1103 | "source": [
1104 | "import pandas as pd\n",
1105 | "from google.cloud import bigquery\n",
1106 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n",
1107 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n",
1108 | "from google.oauth2 import service_account\n",
1109 | "from google_auth_oauthlib.flow import Flow\n",
1110 | "import json\n",
1111 | "import os\n",
1112 | "import pickle\n",
1113 | "\n",
1114 | "# Load configuration from a JSON file\n",
1115 | "with open(\"config.json\", \"r\") as f:\n",
1116 | " config = json.load(f)\n",
1117 | "\n",
1118 | "# Authenticate with service account for BigQuery\n",
1119 | "creds1 = service_account.Credentials.from_service_account_file(\n",
1120 | " config['SERVICE_ACCOUNT_FILE'],\n",
1121 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n",
1122 | ")\n",
1123 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n",
1124 | "\n",
1125 | "# Authenticate for GA4 Analytics Data API using OAuth2\n",
1126 | "def authenticate_ga4():\n",
1127 | " creds = None\n",
1128 | " if os.path.exists('token.pickle'):\n",
1129 | " with open('token.pickle', 'rb') as token:\n",
1130 | " creds = pickle.load(token)\n",
1131 | " else:\n",
1132 | " flow = Flow.from_client_secrets_file(\n",
1133 | " config['CLIENT_SECRET_FILE'],\n",
1134 | " scopes=config['SCOPES'],\n",
1135 | " redirect_uri='http://localhost:8080/'\n",
1136 | " )\n",
1137 | " auth_url, _ = flow.authorization_url(prompt='consent')\n",
1138 | " print('Please go to this URL and finish the authentication: ', auth_url)\n",
1139 | " code = input('Enter the authorization code: ')\n",
1140 | " flow.fetch_token(code=code)\n",
1141 | " creds = flow.credentials\n",
1142 | " with open('token.pickle', 'wb') as token:\n",
1143 | " pickle.dump(creds, token)\n",
1144 | " return creds\n",
1145 | "\n",
1146 | "# Function to paginate and fetch GA4 report data with logging\n",
1147 | "def run_report_with_pagination(client, request, limit=10000):\n",
1148 | " all_rows = []\n",
1149 | " offset = 0\n",
1150 | " page_number = 1\n",
1151 | "\n",
1152 | " while True:\n",
1153 | " # Apply offset and limit to request\n",
1154 | " request.offset = offset\n",
1155 | " request.limit = limit\n",
1156 | "\n",
1157 | " # Fetch report data\n",
1158 | " response = client.run_report(request)\n",
1159 | " all_rows.extend(response.rows)\n",
1160 | "\n",
1161 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n",
1162 | "\n",
1163 | " # If fewer rows are fetched than the limit, we're done\n",
1164 | " if len(response.rows) < limit:\n",
1165 | " break\n",
1166 | "\n",
1167 | " # Update offset and page number to get the next set of rows\n",
1168 | " offset += limit\n",
1169 | " page_number += 1\n",
1170 | "\n",
1171 | " return all_rows\n",
1172 | "\n",
1173 | "# Function to fetch GA4 data using pagination\n",
1174 | "def get_ga4_report(client):\n",
1175 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n",
1176 | " request = RunReportRequest(\n",
1177 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n",
1178 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n",
1179 | " dimensions=[\n",
1180 | " Dimension(name='date'),\n",
1181 | " Dimension(name='firstUserMedium'),\n",
1182 | " Dimension(name='firstUserSource'),\n",
1183 | " Dimension(name='firstUserCampaignName')\n",
1184 | " ],\n",
1185 | " metrics=[\n",
1186 | " Metric(name='totalUsers'),\n",
1187 | " Metric(name='newUsers'),\n",
1188 | " Metric(name='ecommercePurchases'),\n",
1189 | " Metric(name='purchaseRevenue')\n",
1190 | " ]\n",
1191 | " )\n",
1192 | " return run_report_with_pagination(client, request)\n",
1193 | "\n",
1194 | "# Function to convert GA4 response to a DataFrame\n",
1195 | "def response_to_dataframe(response):\n",
1196 | " list_rows = []\n",
1197 | " for row in response:\n",
1198 | " try:\n",
1199 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n",
1200 | " except ValueError:\n",
1201 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n",
1202 | " first_user_medium = row.dimension_values[1].value\n",
1203 | " first_user_source = row.dimension_values[2].value\n",
1204 | " first_user_campaign_name = row.dimension_values[3].value\n",
1205 | " list_rows.append({\n",
1206 | " 'date': date_value,\n",
1207 | " 'firstUserMedium': first_user_medium,\n",
1208 | " 'firstUserSource': first_user_source,\n",
1209 | " 'firstUserCampaignName': first_user_campaign_name,\n",
1210 | " 'totalUsers': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n",
1211 | " 'newUsers': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n",
1212 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n",
1213 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0\n",
1214 | " })\n",
1215 | " return pd.DataFrame(list_rows)\n",
1216 | "\n",
1217 | "# Function to upload data to BigQuery\n",
1218 | "def upload_to_bigquery(df, table_id):\n",
1219 | " # Define BigQuery schema\n",
1220 | " schema = [\n",
1221 | " bigquery.SchemaField(\"date\", \"DATE\"),\n",
1222 | " bigquery.SchemaField(\"firstUserMedium\", \"STRING\"),\n",
1223 | " bigquery.SchemaField(\"firstUserSource\", \"STRING\"),\n",
1224 | " bigquery.SchemaField(\"firstUserCampaignName\", \"STRING\"),\n",
1225 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n",
1226 | " bigquery.SchemaField(\"newUsers\", \"INTEGER\"),\n",
1227 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n",
1228 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\")\n",
1229 | " ]\n",
1230 | "\n",
1231 | " # Configure BigQuery job to partition the table by the 'date' column\n",
1232 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n",
1233 | " job_config = bigquery.LoadJobConfig(\n",
1234 | " schema=schema,\n",
1235 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
1236 | " time_partitioning=bigquery.TimePartitioning(\n",
1237 | " type_=bigquery.TimePartitioningType.DAY,\n",
1238 | " field='date'\n",
1239 | " )\n",
1240 | " )\n",
1241 | "\n",
1242 | " # Upload the DataFrame to BigQuery\n",
1243 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n",
1244 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n",
1245 | "\n",
1246 | "# Main function\n",
1247 | "def main():\n",
1248 | " try:\n",
1249 | " # Authenticate GA4 using OAuth2\n",
1250 | " creds = authenticate_ga4()\n",
1251 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n",
1252 | "\n",
1253 | " # Fetch GA4 data\n",
1254 | " ga4_response = get_ga4_report(client_ga4)\n",
1255 | "\n",
1256 | " # Convert the response to a DataFrame\n",
1257 | " ga4_df = response_to_dataframe(ga4_response)\n",
1258 | "\n",
1259 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n",
1260 | " table_id = 'ga4_data_first_user_source_medium'\n",
1261 | " csv_filename = f\"{table_id}.csv\"\n",
1262 | "\n",
1263 | " # Save the DataFrame to a CSV file\n",
1264 | " ga4_df.to_csv(csv_filename, index=False)\n",
1265 | " print(f\"Data saved to {csv_filename}\")\n",
1266 | "\n",
1267 | " # Upload the DataFrame to BigQuery\n",
1268 | " upload_to_bigquery(ga4_df, table_id)\n",
1269 | " except Exception as e:\n",
1270 | " print(f\"Error occurred: {e}\")\n",
1271 | "\n",
1272 | "if __name__ == '__main__':\n",
1273 | " main()\n"
1274 | ],
1275 | "metadata": {
1276 | "id": "s5H79ndims88"
1277 | },
1278 | "execution_count": null,
1279 | "outputs": []
1280 | },
1281 | {
1282 | "cell_type": "code",
1283 | "source": [
1284 | "import pandas as pd\n",
1285 | "from google.cloud import bigquery\n",
1286 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n",
1287 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n",
1288 | "from google.oauth2 import service_account\n",
1289 | "from google_auth_oauthlib.flow import Flow\n",
1290 | "import json\n",
1291 | "import os\n",
1292 | "import pickle\n",
1293 | "\n",
1294 | "# Load configuration from a JSON file\n",
1295 | "with open(\"config.json\", \"r\") as f:\n",
1296 | " config = json.load(f)\n",
1297 | "\n",
1298 | "# Authenticate with service account for BigQuery\n",
1299 | "creds1 = service_account.Credentials.from_service_account_file(\n",
1300 | " config['SERVICE_ACCOUNT_FILE'],\n",
1301 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n",
1302 | ")\n",
1303 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n",
1304 | "\n",
1305 | "# Authenticate for GA4 Analytics Data API using OAuth2\n",
1306 | "def authenticate_ga4():\n",
1307 | " creds = None\n",
1308 | " if os.path.exists('token.pickle'):\n",
1309 | " with open('token.pickle', 'rb') as token:\n",
1310 | " creds = pickle.load(token)\n",
1311 | " else:\n",
1312 | " flow = Flow.from_client_secrets_file(\n",
1313 | " config['CLIENT_SECRET_FILE'],\n",
1314 | " scopes=config['SCOPES'],\n",
1315 | " redirect_uri='http://localhost:8080/'\n",
1316 | " )\n",
1317 | " auth_url, _ = flow.authorization_url(prompt='consent')\n",
1318 | " print('Please go to this URL and finish the authentication: ', auth_url)\n",
1319 | " code = input('Enter the authorization code: ')\n",
1320 | " flow.fetch_token(code=code)\n",
1321 | " creds = flow.credentials\n",
1322 | " with open('token.pickle', 'wb') as token:\n",
1323 | " pickle.dump(creds, token)\n",
1324 | " return creds\n",
1325 | "\n",
1326 | "# Function to paginate and fetch GA4 report data with logging\n",
1327 | "def run_report_with_pagination(client, request, limit=10000):\n",
1328 | " all_rows = []\n",
1329 | " offset = 0\n",
1330 | " page_number = 1\n",
1331 | "\n",
1332 | " while True:\n",
1333 | " # Apply offset and limit to request\n",
1334 | " request.offset = offset\n",
1335 | " request.limit = limit\n",
1336 | "\n",
1337 | " # Fetch report data\n",
1338 | " response = client.run_report(request)\n",
1339 | " all_rows.extend(response.rows)\n",
1340 | "\n",
1341 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n",
1342 | "\n",
1343 | " # If fewer rows are fetched than the limit, we're done\n",
1344 | " if len(response.rows) < limit:\n",
1345 | " break\n",
1346 | "\n",
1347 | " # Update offset and page number to get the next set of rows\n",
1348 | " offset += limit\n",
1349 | " page_number += 1\n",
1350 | "\n",
1351 | " return all_rows\n",
1352 | "\n",
1353 | "# Function to fetch GA4 data using pagination\n",
1354 | "def get_ga4_report(client):\n",
1355 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n",
1356 | " request = RunReportRequest(\n",
1357 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n",
1358 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n",
1359 | " dimensions=[\n",
1360 | " Dimension(name='date'),\n",
1361 | " Dimension(name='firstUserDefaultChannelGroup')\n",
1362 | " ],\n",
1363 | " metrics=[\n",
1364 | " Metric(name='totalUsers'),\n",
1365 | " Metric(name='newUsers'),\n",
1366 | " Metric(name='ecommercePurchases'),\n",
1367 | " Metric(name='purchaseRevenue')\n",
1368 | " ]\n",
1369 | " )\n",
1370 | " return run_report_with_pagination(client, request)\n",
1371 | "\n",
1372 | "# Function to convert GA4 response to a DataFrame\n",
1373 | "def response_to_dataframe(response):\n",
1374 | " list_rows = []\n",
1375 | " for row in response:\n",
1376 | " try:\n",
1377 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n",
1378 | " except ValueError:\n",
1379 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n",
1380 | " first_user_channel_group = row.dimension_values[1].value\n",
1381 | " list_rows.append({\n",
1382 | " 'date': date_value,\n",
1383 | " 'firstUserDefaultChannelGroup': first_user_channel_group,\n",
1384 | " 'totalUsers': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n",
1385 | " 'newUsers': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n",
1386 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n",
1387 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0\n",
1388 | " })\n",
1389 | " return pd.DataFrame(list_rows)\n",
1390 | "\n",
1391 | "# Function to upload data to BigQuery\n",
1392 | "def upload_to_bigquery(df, table_id):\n",
1393 | " # Define BigQuery schema\n",
1394 | " schema = [\n",
1395 | " bigquery.SchemaField(\"date\", \"DATE\"),\n",
1396 | " bigquery.SchemaField(\"firstUserDefaultChannelGroup\", \"STRING\"),\n",
1397 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n",
1398 | " bigquery.SchemaField(\"newUsers\", \"INTEGER\"),\n",
1399 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n",
1400 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\")\n",
1401 | " ]\n",
1402 | "\n",
1403 | " # Configure BigQuery job to partition the table by the 'date' column\n",
1404 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n",
1405 | " job_config = bigquery.LoadJobConfig(\n",
1406 | " schema=schema,\n",
1407 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
1408 | " time_partitioning=bigquery.TimePartitioning(\n",
1409 | " type_=bigquery.TimePartitioningType.DAY,\n",
1410 | " field='date'\n",
1411 | " )\n",
1412 | " )\n",
1413 | "\n",
1414 | " # Upload the DataFrame to BigQuery\n",
1415 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n",
1416 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n",
1417 | "\n",
1418 | "# Main function\n",
1419 | "def main():\n",
1420 | " try:\n",
1421 | " # Authenticate GA4 using OAuth2\n",
1422 | " creds = authenticate_ga4()\n",
1423 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n",
1424 | "\n",
1425 | " # Fetch GA4 data\n",
1426 | " ga4_response = get_ga4_report(client_ga4)\n",
1427 | "\n",
1428 | " # Convert the response to a DataFrame\n",
1429 | " ga4_df = response_to_dataframe(ga4_response)\n",
1430 | "\n",
1431 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n",
1432 | " table_id = 'ga4_data_first_user_channel_group'\n",
1433 | " csv_filename = f\"{table_id}.csv\"\n",
1434 | "\n",
1435 | " # Save the DataFrame to a CSV file\n",
1436 | " ga4_df.to_csv(csv_filename, index=False)\n",
1437 | " print(f\"Data saved to {csv_filename}\")\n",
1438 | "\n",
1439 | " # Upload the DataFrame to BigQuery\n",
1440 | " upload_to_bigquery(ga4_df, table_id)\n",
1441 | " except Exception as e:\n",
1442 | " print(f\"Error occurred: {e}\")\n",
1443 | "\n",
1444 | "if __name__ == '__main__':\n",
1445 | " main()\n"
1446 | ],
1447 | "metadata": {
1448 | "id": "Fv1ISKAUn-bf"
1449 | },
1450 | "execution_count": null,
1451 | "outputs": []
1452 | },
1453 | {
1454 | "cell_type": "code",
1455 | "source": [
1456 | "import pandas as pd\n",
1457 | "from google.cloud import bigquery\n",
1458 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n",
1459 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n",
1460 | "from google.oauth2 import service_account\n",
1461 | "from google_auth_oauthlib.flow import Flow\n",
1462 | "import json\n",
1463 | "import os\n",
1464 | "import pickle\n",
1465 | "\n",
1466 | "# Load configuration from a JSON file\n",
1467 | "with open(\"config.json\", \"r\") as f:\n",
1468 | " config = json.load(f)\n",
1469 | "\n",
1470 | "# Authenticate with service account for BigQuery\n",
1471 | "creds1 = service_account.Credentials.from_service_account_file(\n",
1472 | " config['SERVICE_ACCOUNT_FILE'],\n",
1473 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n",
1474 | ")\n",
1475 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n",
1476 | "\n",
1477 | "# Authenticate for GA4 Analytics Data API using OAuth2\n",
1478 | "def authenticate_ga4():\n",
1479 | " creds = None\n",
1480 | " if os.path.exists('token.pickle'):\n",
1481 | " with open('token.pickle', 'rb') as token:\n",
1482 | " creds = pickle.load(token)\n",
1483 | " else:\n",
1484 | " flow = Flow.from_client_secrets_file(\n",
1485 | " config['CLIENT_SECRET_FILE'],\n",
1486 | " scopes=config['SCOPES'],\n",
1487 | " redirect_uri='http://localhost:8080/'\n",
1488 | " )\n",
1489 | " auth_url, _ = flow.authorization_url(prompt='consent')\n",
1490 | " print('Please go to this URL and finish the authentication: ', auth_url)\n",
1491 | " code = input('Enter the authorization code: ')\n",
1492 | " flow.fetch_token(code=code)\n",
1493 | " creds = flow.credentials\n",
1494 | " with open('token.pickle', 'wb') as token:\n",
1495 | " pickle.dump(creds, token)\n",
1496 | " return creds\n",
1497 | "\n",
1498 | "# Function to paginate and fetch GA4 report data with logging\n",
1499 | "def run_report_with_pagination(client, request, limit=10000):\n",
1500 | " all_rows = []\n",
1501 | " offset = 0\n",
1502 | " page_number = 1\n",
1503 | "\n",
1504 | " while True:\n",
1505 | " # Apply offset and limit to request\n",
1506 | " request.offset = offset\n",
1507 | " request.limit = limit\n",
1508 | "\n",
1509 | " # Fetch report data\n",
1510 | " response = client.run_report(request)\n",
1511 | " all_rows.extend(response.rows)\n",
1512 | "\n",
1513 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n",
1514 | "\n",
1515 | " # If fewer rows are fetched than the limit, we're done\n",
1516 | " if len(response.rows) < limit:\n",
1517 | " break\n",
1518 | "\n",
1519 | " # Update offset and page number to get the next set of rows\n",
1520 | " offset += limit\n",
1521 | " page_number += 1\n",
1522 | "\n",
1523 | " return all_rows\n",
1524 | "\n",
1525 | "# Function to fetch GA4 data using pagination\n",
1526 | "def get_ga4_report(client):\n",
1527 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n",
1528 | " request = RunReportRequest(\n",
1529 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n",
1530 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n",
1531 | " dimensions=[\n",
1532 | " Dimension(name='date'),\n",
1533 | " Dimension(name='sessionSource'),\n",
1534 | " Dimension(name='sessionMedium'),\n",
1535 | " Dimension(name='sessionCampaignName')\n",
1536 | " ],\n",
1537 | " metrics=[\n",
1538 | " Metric(name='ecommercePurchases'),\n",
1539 | " Metric(name='averagePurchaseRevenue'),\n",
1540 | " Metric(name='purchaseRevenue'),\n",
1541 | " Metric(name='advertiserAdClicks'),\n",
1542 | " Metric(name='advertiserAdCost'),\n",
1543 | " Metric(name='advertiserAdCostPerClick'),\n",
1544 | " Metric(name='returnOnAdSpend')\n",
1545 | " ]\n",
1546 | " )\n",
1547 | " return run_report_with_pagination(client, request)\n",
1548 | "\n",
1549 | "# Function to convert GA4 response to a DataFrame\n",
1550 | "def response_to_dataframe(response):\n",
1551 | " list_rows = []\n",
1552 | " for row in response:\n",
1553 | " try:\n",
1554 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n",
1555 | " except ValueError:\n",
1556 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n",
1557 | " session_source = row.dimension_values[1].value\n",
1558 | " session_medium = row.dimension_values[2].value\n",
1559 | " session_campaign_name = row.dimension_values[3].value\n",
1560 | " list_rows.append({\n",
1561 | " 'date': date_value,\n",
1562 | " 'sessionSource': session_source,\n",
1563 | " 'sessionMedium': session_medium,\n",
1564 | " 'sessionCampaignName': session_campaign_name,\n",
1565 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n",
1566 | " 'averagePurchaseRevenue': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n",
1567 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n",
1568 | " 'advertiserAdClicks': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0,\n",
1569 | " 'advertiserAdCost': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0,\n",
1570 | " 'advertiserAdCostPerClick': pd.to_numeric(row.metric_values[5].value, errors='coerce') or 0,\n",
1571 | " 'returnOnAdSpend': pd.to_numeric(row.metric_values[6].value, errors='coerce') or 0\n",
1572 | " })\n",
1573 | " return pd.DataFrame(list_rows)\n",
1574 | "\n",
1575 | "# Function to upload data to BigQuery\n",
1576 | "def upload_to_bigquery(df, table_id):\n",
1577 | " # Define BigQuery schema\n",
1578 | " schema = [\n",
1579 | " bigquery.SchemaField(\"date\", \"DATE\"),\n",
1580 | " bigquery.SchemaField(\"sessionSource\", \"STRING\"),\n",
1581 | " bigquery.SchemaField(\"sessionMedium\", \"STRING\"),\n",
1582 | " bigquery.SchemaField(\"sessionCampaignName\", \"STRING\"),\n",
1583 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n",
1584 | " bigquery.SchemaField(\"averagePurchaseRevenue\", \"FLOAT\"),\n",
1585 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\"),\n",
1586 | " bigquery.SchemaField(\"advertiserAdClicks\", \"INTEGER\"),\n",
1587 | " bigquery.SchemaField(\"advertiserAdCost\", \"FLOAT\"),\n",
1588 | " bigquery.SchemaField(\"advertiserAdCostPerClick\", \"FLOAT\"),\n",
1589 | " bigquery.SchemaField(\"returnOnAdSpend\", \"FLOAT\")\n",
1590 | " ]\n",
1591 | "\n",
1592 | " # Configure BigQuery job to partition the table by the 'date' column\n",
1593 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n",
1594 | " job_config = bigquery.LoadJobConfig(\n",
1595 | " schema=schema,\n",
1596 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
1597 | " time_partitioning=bigquery.TimePartitioning(\n",
1598 | " type_=bigquery.TimePartitioningType.DAY,\n",
1599 | " field='date'\n",
1600 | " )\n",
1601 | " )\n",
1602 | "\n",
1603 | " # Upload the DataFrame to BigQuery\n",
1604 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n",
1605 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n",
1606 | "\n",
1607 | "# Main function\n",
1608 | "def main():\n",
1609 | " try:\n",
1610 | " # Authenticate GA4 using OAuth2\n",
1611 | " creds = authenticate_ga4()\n",
1612 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n",
1613 | "\n",
1614 | " # Fetch GA4 data\n",
1615 | " ga4_response = get_ga4_report(client_ga4)\n",
1616 | "\n",
1617 | " # Convert the response to a DataFrame\n",
1618 | " ga4_df = response_to_dataframe(ga4_response)\n",
1619 | "\n",
1620 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n",
1621 | " table_id = 'ga4_ads_data'\n",
1622 | " csv_filename = f\"{table_id}.csv\"\n",
1623 | "\n",
1624 | " # Save the DataFrame to a CSV file\n",
1625 | " ga4_df.to_csv(csv_filename, index=False)\n",
1626 | " print(f\"Data saved to {csv_filename}\")\n",
1627 | "\n",
1628 | " # Upload the DataFrame to BigQuery\n",
1629 | " upload_to_bigquery(ga4_df, table_id)\n",
1630 | " except Exception as e:\n",
1631 | " print(f\"Error occurred: {e}\")\n",
1632 | "\n",
1633 | "if __name__ == '__main__':\n",
1634 | " main()\n"
1635 | ],
1636 | "metadata": {
1637 | "id": "TB_tq1b0rvkt"
1638 | },
1639 | "execution_count": null,
1640 | "outputs": []
1641 | },
1642 | {
1643 | "cell_type": "code",
1644 | "source": [
1645 | "import pandas as pd\n",
1646 | "from google.cloud import bigquery\n",
1647 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n",
1648 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n",
1649 | "from google.oauth2 import service_account\n",
1650 | "from google_auth_oauthlib.flow import Flow\n",
1651 | "import json\n",
1652 | "import os\n",
1653 | "import pickle\n",
1654 | "\n",
1655 | "# Load configuration from a JSON file\n",
1656 | "with open(\"config.json\", \"r\") as f:\n",
1657 | " config = json.load(f)\n",
1658 | "\n",
1659 | "# Authenticate with service account for BigQuery\n",
1660 | "creds1 = service_account.Credentials.from_service_account_file(\n",
1661 | " config['SERVICE_ACCOUNT_FILE'],\n",
1662 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n",
1663 | ")\n",
1664 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n",
1665 | "\n",
1666 | "# Authenticate for GA4 Analytics Data API using OAuth2\n",
1667 | "def authenticate_ga4():\n",
1668 | " creds = None\n",
1669 | " if os.path.exists('token.pickle'):\n",
1670 | " with open('token.pickle', 'rb') as token:\n",
1671 | " creds = pickle.load(token)\n",
1672 | " else:\n",
1673 | " flow = Flow.from_client_secrets_file(\n",
1674 | " config['CLIENT_SECRET_FILE'],\n",
1675 | " scopes=config['SCOPES'],\n",
1676 | " redirect_uri='http://localhost:8080/'\n",
1677 | " )\n",
1678 | " auth_url, _ = flow.authorization_url(prompt='consent')\n",
1679 | " print('Please go to this URL and finish the authentication: ', auth_url)\n",
1680 | " code = input('Enter the authorization code: ')\n",
1681 | " flow.fetch_token(code=code)\n",
1682 | " creds = flow.credentials\n",
1683 | " with open('token.pickle', 'wb') as token:\n",
1684 | " pickle.dump(creds, token)\n",
1685 | " return creds\n",
1686 | "\n",
1687 | "# Function to paginate and fetch GA4 report data with logging\n",
1688 | "def run_report_with_pagination(client, request, limit=10000):\n",
1689 | " all_rows = []\n",
1690 | " offset = 0\n",
1691 | " page_number = 1\n",
1692 | "\n",
1693 | " while True:\n",
1694 | " # Apply offset and limit to request\n",
1695 | " request.offset = offset\n",
1696 | " request.limit = limit\n",
1697 | "\n",
1698 | " # Fetch report data\n",
1699 | " response = client.run_report(request)\n",
1700 | " all_rows.extend(response.rows)\n",
1701 | "\n",
1702 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n",
1703 | "\n",
1704 | " # If fewer rows are fetched than the limit, we're done\n",
1705 | " if len(response.rows) < limit:\n",
1706 | " break\n",
1707 | "\n",
1708 | " # Update offset and page number to get the next set of rows\n",
1709 | " offset += limit\n",
1710 | " page_number += 1\n",
1711 | "\n",
1712 | " return all_rows\n",
1713 | "\n",
1714 | "# Function to fetch GA4 data using pagination\n",
1715 | "def get_ga4_report(client):\n",
1716 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n",
1717 | " request = RunReportRequest(\n",
1718 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n",
1719 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n",
1720 | " dimensions=[\n",
1721 | " Dimension(name='transactionId'),\n",
1722 | " Dimension(name='itemName')\n",
1723 | " ],\n",
1724 | " metrics=[\n",
1725 | " Metric(name='itemPurchaseQuantity'),\n",
1726 | " Metric(name='itemRevenue')\n",
1727 | " ]\n",
1728 | " )\n",
1729 | " return run_report_with_pagination(client, request)\n",
1730 | "\n",
1731 | "# Function to convert GA4 response to a DataFrame\n",
1732 | "def response_to_dataframe(response):\n",
1733 | " list_rows = []\n",
1734 | " for row in response:\n",
1735 | " transaction_id = row.dimension_values[0].value\n",
1736 | " item_name = row.dimension_values[1].value\n",
1737 | " list_rows.append({\n",
1738 | " 'transactionId': transaction_id,\n",
1739 | " 'itemName': item_name,\n",
1740 | " 'itemPurchaseQuantity': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n",
1741 | " 'itemRevenue': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0\n",
1742 | " })\n",
1743 | " return pd.DataFrame(list_rows)\n",
1744 | "\n",
1745 | "# Function to upload data to BigQuery\n",
1746 | "def upload_to_bigquery(df, table_id):\n",
1747 | " # Define BigQuery schema\n",
1748 | " schema = [\n",
1749 | " bigquery.SchemaField(\"transactionId\", \"STRING\"),\n",
1750 | " bigquery.SchemaField(\"itemName\", \"STRING\"),\n",
1751 | " bigquery.SchemaField(\"itemPurchaseQuantity\", \"INTEGER\"),\n",
1752 | " bigquery.SchemaField(\"itemRevenue\", \"FLOAT\")\n",
1753 | " ]\n",
1754 | "\n",
1755 |         "    # Configure BigQuery job (no date partitioning, since 'transactionId' is a STRING)\n",
1756 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n",
1757 | " job_config = bigquery.LoadJobConfig(\n",
1758 | " schema=schema,\n",
1759 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE\n",
1760 | " )\n",
1761 | "\n",
1762 | " # Upload the DataFrame to BigQuery\n",
1763 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n",
1764 | " print(f\"Data uploaded to {table_ref}\")\n",
1765 | "\n",
1766 | "# Main function\n",
1767 | "def main():\n",
1768 | " try:\n",
1769 | " # Authenticate GA4 using OAuth2\n",
1770 | " creds = authenticate_ga4()\n",
1771 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n",
1772 | "\n",
1773 | " # Fetch GA4 data\n",
1774 | " ga4_response = get_ga4_report(client_ga4)\n",
1775 | "\n",
1776 | " # Convert the response to a DataFrame\n",
1777 | " ga4_df = response_to_dataframe(ga4_response)\n",
1778 | "\n",
1779 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n",
1780 | " table_id = 'ga4_transaction_items'\n",
1781 | " csv_filename = f\"{table_id}.csv\"\n",
1782 | "\n",
1783 | " # Save the DataFrame to a CSV file\n",
1784 | " ga4_df.to_csv(csv_filename, index=False)\n",
1785 | " print(f\"Data saved to {csv_filename}\")\n",
1786 | "\n",
1787 | " # Upload the DataFrame to BigQuery\n",
1788 | " upload_to_bigquery(ga4_df, table_id)\n",
1789 | " except Exception as e:\n",
1790 | " print(f\"Error occurred: {e}\")\n",
1791 | "\n",
1792 | "if __name__ == '__main__':\n",
1793 | " main()\n"
1794 | ],
1795 | "metadata": {
1796 | "id": "Dt6n-vqwt1NB"
1797 | },
1798 | "execution_count": null,
1799 | "outputs": []
1800 | },
1801 | {
1802 | "cell_type": "code",
1803 | "source": [
1804 | "import pandas as pd\n",
1805 | "from google.cloud import bigquery\n",
1806 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n",
1807 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n",
1808 | "from google.oauth2 import service_account\n",
1809 | "from google_auth_oauthlib.flow import Flow\n",
1810 | "import json\n",
1811 | "import os\n",
1812 | "import pickle\n",
1813 | "\n",
1814 | "# Load configuration from a JSON file\n",
1815 | "with open(\"config.json\", \"r\") as f:\n",
1816 | " config = json.load(f)\n",
1817 | "\n",
1818 | "# Authenticate with service account for BigQuery\n",
1819 | "creds1 = service_account.Credentials.from_service_account_file(\n",
1820 | " config['SERVICE_ACCOUNT_FILE'],\n",
1821 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n",
1822 | ")\n",
1823 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n",
1824 | "\n",
1825 | "# Authenticate for GA4 Analytics Data API using OAuth2\n",
1826 | "def authenticate_ga4():\n",
1827 | " creds = None\n",
1828 | " if os.path.exists('token.pickle'):\n",
1829 | " with open('token.pickle', 'rb') as token:\n",
1830 | " creds = pickle.load(token)\n",
1831 | " else:\n",
1832 | " flow = Flow.from_client_secrets_file(\n",
1833 | " config['CLIENT_SECRET_FILE'],\n",
1834 | " scopes=config['SCOPES'],\n",
1835 | " redirect_uri='http://localhost:8080/'\n",
1836 | " )\n",
1837 | " auth_url, _ = flow.authorization_url(prompt='consent')\n",
1838 | " print('Please go to this URL and finish the authentication: ', auth_url)\n",
1839 | " code = input('Enter the authorization code: ')\n",
1840 | " flow.fetch_token(code=code)\n",
1841 | " creds = flow.credentials\n",
1842 | " with open('token.pickle', 'wb') as token:\n",
1843 | " pickle.dump(creds, token)\n",
1844 | " return creds\n",
1845 | "\n",
1846 | "# Function to paginate and fetch GA4 report data with logging\n",
1847 | "def run_report_with_pagination(client, request, limit=10000):\n",
1848 | " all_rows = []\n",
1849 | " offset = 0\n",
1850 | " page_number = 1\n",
1851 | "\n",
1852 | " while True:\n",
1853 | " # Apply offset and limit to request\n",
1854 | " request.offset = offset\n",
1855 | " request.limit = limit\n",
1856 | "\n",
1857 | " # Fetch report data\n",
1858 | " response = client.run_report(request)\n",
1859 | " all_rows.extend(response.rows)\n",
1860 | "\n",
1861 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n",
1862 | "\n",
1863 | " # If fewer rows are fetched than the limit, we're done\n",
1864 | " if len(response.rows) < limit:\n",
1865 | " break\n",
1866 | "\n",
1867 | " # Update offset and page number to get the next set of rows\n",
1868 | " offset += limit\n",
1869 | " page_number += 1\n",
1870 | "\n",
1871 | " return all_rows\n",
1872 | "\n",
1873 | "# Function to fetch GA4 data using pagination\n",
1874 | "def get_ga4_report(client):\n",
1875 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n",
1876 | " request = RunReportRequest(\n",
1877 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n",
1878 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n",
1879 | " dimensions=[Dimension(name='date')],\n",
1880 | " metrics=[\n",
1881 | " Metric(name='sessions'),\n",
1882 | " Metric(name='totalUsers'),\n",
1883 | " Metric(name='newUsers'),\n",
1884 | " Metric(name='ecommercePurchases'),\n",
1885 | " Metric(name='purchaseRevenue'),\n",
1886 | " Metric(name='screenPageViews'),\n",
1887 | " Metric(name='eventCount'),\n",
1888 | " Metric(name='averageSessionDuration'),\n",
1889 | " Metric(name='engagedSessions'),\n",
1890 | " Metric(name='engagementRate')\n",
1891 | " ]\n",
1892 | " )\n",
1893 | " return run_report_with_pagination(client, request)\n",
1894 | "\n",
1895 | "# Function to convert GA4 response to a DataFrame\n",
1896 | "def response_to_dataframe(response):\n",
1897 | " list_rows = []\n",
1898 | " for row in response:\n",
1899 | " try:\n",
1900 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n",
1901 | " except ValueError:\n",
1902 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n",
1903 | " list_rows.append({\n",
1904 | " 'date': date_value,\n",
1905 | " 'sessions': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n",
1906 | " 'totalUsers': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n",
1907 | " 'newUsers': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n",
1908 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0,\n",
1909 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0,\n",
1910 | " 'screenPageViews': pd.to_numeric(row.metric_values[5].value, errors='coerce') or 0,\n",
1911 | " 'eventCount': pd.to_numeric(row.metric_values[6].value, errors='coerce') or 0,\n",
1912 | " 'averageSessionDuration': pd.to_numeric(row.metric_values[7].value, errors='coerce') or 0,\n",
1913 | " 'engagedSessions': pd.to_numeric(row.metric_values[8].value, errors='coerce') or 0,\n",
1914 | " 'engagementRate': pd.to_numeric(row.metric_values[9].value, errors='coerce') or 0\n",
1915 | " })\n",
1916 | " return pd.DataFrame(list_rows)\n",
1917 | "\n",
1918 | "# Function to upload data to BigQuery\n",
1919 | "def upload_to_bigquery(df, table_id):\n",
1920 | " # Define BigQuery schema\n",
1921 | " schema = [\n",
1922 | " bigquery.SchemaField(\"date\", \"DATE\"),\n",
1923 | " bigquery.SchemaField(\"sessions\", \"INTEGER\"),\n",
1924 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n",
1925 | " bigquery.SchemaField(\"newUsers\", \"INTEGER\"),\n",
1926 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n",
1927 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\"),\n",
1928 | " bigquery.SchemaField(\"screenPageViews\", \"INTEGER\"),\n",
1929 | " bigquery.SchemaField(\"eventCount\", \"INTEGER\"),\n",
1930 | " bigquery.SchemaField(\"averageSessionDuration\", \"FLOAT\"),\n",
1931 | " bigquery.SchemaField(\"engagedSessions\", \"INTEGER\"),\n",
1932 | " bigquery.SchemaField(\"engagementRate\", \"FLOAT\")\n",
1933 | " ]\n",
1934 | "\n",
1935 | " # Configure BigQuery job to partition the table by the 'date' column\n",
1936 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n",
1937 | " job_config = bigquery.LoadJobConfig(\n",
1938 | " schema=schema,\n",
1939 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
1940 | " time_partitioning=bigquery.TimePartitioning(\n",
1941 | " type_=bigquery.TimePartitioningType.DAY,\n",
1942 | " field='date'\n",
1943 | " )\n",
1944 | " )\n",
1945 | "\n",
1946 | " # Upload the DataFrame to BigQuery\n",
1947 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n",
1948 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n",
1949 | "\n",
1950 | "# Main function\n",
1951 | "def main():\n",
1952 | " try:\n",
1953 | " # Authenticate GA4 using OAuth2\n",
1954 | " creds = authenticate_ga4()\n",
1955 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n",
1956 | "\n",
1957 | " # Fetch GA4 data\n",
1958 | " ga4_response = get_ga4_report(client_ga4)\n",
1959 | "\n",
1960 | " # Convert the response to a DataFrame\n",
1961 | " ga4_df = response_to_dataframe(ga4_response)\n",
1962 | "\n",
1963 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n",
1964 | " table_id = 'ga4_all_metrics_data'\n",
1965 | " csv_filename = f\"{table_id}.csv\"\n",
1966 | "\n",
1967 | " # Save the DataFrame to a CSV file\n",
1968 | " ga4_df.to_csv(csv_filename, index=False)\n",
1969 | " print(f\"Data saved to {csv_filename}\")\n",
1970 | "\n",
1971 | " # Upload the DataFrame to BigQuery\n",
1972 | " upload_to_bigquery(ga4_df, table_id)\n",
1973 | " except Exception as e:\n",
1974 | " print(f\"Error occurred: {e}\")\n",
1975 | "\n",
1976 | "if __name__ == '__main__':\n",
1977 | " main()\n"
1978 | ],
1979 | "metadata": {
1980 | "id": "Wb8umpjezZsU"
1981 | },
1982 | "execution_count": null,
1983 | "outputs": []
1984 | },
1985 | {
1986 | "cell_type": "code",
1987 | "source": [
1988 | "import pandas as pd\n",
1989 | "from google.cloud import bigquery\n",
1990 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n",
1991 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n",
1992 | "from google.oauth2 import service_account\n",
1993 | "from google_auth_oauthlib.flow import Flow\n",
1994 | "import json\n",
1995 | "import os\n",
1996 | "import pickle\n",
1997 | "\n",
1998 | "# Load configuration from a JSON file\n",
1999 | "with open(\"config.json\", \"r\") as f:\n",
2000 | " config = json.load(f)\n",
2001 | "\n",
2002 | "# Authenticate with service account for BigQuery\n",
2003 | "creds1 = service_account.Credentials.from_service_account_file(\n",
2004 | " config['SERVICE_ACCOUNT_FILE'],\n",
2005 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n",
2006 | ")\n",
2007 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n",
2008 | "\n",
2009 | "# Authenticate for GA4 Analytics Data API using OAuth2\n",
2010 | "def authenticate_ga4():\n",
2011 | " creds = None\n",
2012 | " if os.path.exists('token.pickle'):\n",
2013 | " with open('token.pickle', 'rb') as token:\n",
2014 | " creds = pickle.load(token)\n",
2015 | " else:\n",
2016 | " flow = Flow.from_client_secrets_file(\n",
2017 | " config['CLIENT_SECRET_FILE'],\n",
2018 | " scopes=config['SCOPES'],\n",
2019 | " redirect_uri='http://localhost:8080/'\n",
2020 | " )\n",
2021 | " auth_url, _ = flow.authorization_url(prompt='consent')\n",
2022 | " print('Please go to this URL and finish the authentication: ', auth_url)\n",
2023 | " code = input('Enter the authorization code: ')\n",
2024 | " flow.fetch_token(code=code)\n",
2025 | " creds = flow.credentials\n",
2026 | " with open('token.pickle', 'wb') as token:\n",
2027 | " pickle.dump(creds, token)\n",
2028 | " return creds\n",
2029 | "\n",
2030 | "# Function to paginate and fetch GA4 report data with logging\n",
2031 | "def run_report_with_pagination(client, request, limit=10000):\n",
2032 | " all_rows = []\n",
2033 | " offset = 0\n",
2034 | " page_number = 1\n",
2035 | "\n",
2036 | " while True:\n",
2037 | " # Apply offset and limit to request\n",
2038 | " request.offset = offset\n",
2039 | " request.limit = limit\n",
2040 | "\n",
2041 | " # Fetch report data\n",
2042 | " response = client.run_report(request)\n",
2043 | " all_rows.extend(response.rows)\n",
2044 | "\n",
2045 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n",
2046 | "\n",
2047 | " # If fewer rows are fetched than the limit, we're done\n",
2048 | " if len(response.rows) < limit:\n",
2049 | " break\n",
2050 | "\n",
2051 | " # Update offset and page number to get the next set of rows\n",
2052 | " offset += limit\n",
2053 | " page_number += 1\n",
2054 | "\n",
2055 | " return all_rows\n",
2056 | "\n",
2057 | "# Function to fetch GA4 data using pagination\n",
2058 | "def get_ga4_report(client):\n",
2059 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n",
2060 | " request = RunReportRequest(\n",
2061 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n",
2062 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n",
2063 | " dimensions=[\n",
2064 | " Dimension(name='date'),\n",
2065 | " Dimension(name='eventName')\n",
2066 | " ],\n",
2067 | " metrics=[\n",
2068 | " Metric(name='eventCount'),\n",
2069 | " Metric(name='eventCountPerUser'),\n",
2070 | " Metric(name='eventValue')\n",
2071 | " ]\n",
2072 | " )\n",
2073 | " return run_report_with_pagination(client, request)\n",
2074 | "\n",
2075 | "# Function to convert GA4 response to a DataFrame\n",
2076 | "def response_to_dataframe(response):\n",
2077 | " list_rows = []\n",
2078 | " for row in response:\n",
2079 | " try:\n",
2080 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n",
2081 | " except ValueError:\n",
2082 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n",
2083 | " event_name = row.dimension_values[1].value\n",
2084 | " list_rows.append({\n",
2085 | " 'date': date_value,\n",
2086 | " 'eventName': event_name,\n",
2087 | " 'eventCount': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n",
2088 | " 'eventCountPerUser': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n",
2089 | " 'eventValue': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0\n",
2090 | " })\n",
2091 | " return pd.DataFrame(list_rows)\n",
2092 | "\n",
2093 | "# Function to upload data to BigQuery\n",
2094 | "def upload_to_bigquery(df, table_id):\n",
2095 | " # Define BigQuery schema\n",
2096 | " schema = [\n",
2097 | " bigquery.SchemaField(\"date\", \"DATE\"),\n",
2098 | " bigquery.SchemaField(\"eventName\", \"STRING\"),\n",
2099 | " bigquery.SchemaField(\"eventCount\", \"INTEGER\"),\n",
2100 | " bigquery.SchemaField(\"eventCountPerUser\", \"FLOAT\"),\n",
2101 | " bigquery.SchemaField(\"eventValue\", \"FLOAT\")\n",
2102 | " ]\n",
2103 | "\n",
2104 | " # Configure BigQuery job to partition the table by the 'date' column\n",
2105 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n",
2106 | " job_config = bigquery.LoadJobConfig(\n",
2107 | " schema=schema,\n",
2108 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
2109 | " time_partitioning=bigquery.TimePartitioning(\n",
2110 | " type_=bigquery.TimePartitioningType.DAY,\n",
2111 | " field='date'\n",
2112 | " )\n",
2113 | " )\n",
2114 | "\n",
2115 | " # Upload the DataFrame to BigQuery\n",
2116 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n",
2117 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n",
2118 | "\n",
2119 | "# Main function\n",
2120 | "def main():\n",
2121 | " try:\n",
2122 | " # Authenticate GA4 using OAuth2\n",
2123 | " creds = authenticate_ga4()\n",
2124 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n",
2125 | "\n",
2126 | " # Fetch GA4 data\n",
2127 | " ga4_response = get_ga4_report(client_ga4)\n",
2128 | "\n",
2129 | " # Convert the response to a DataFrame\n",
2130 | " ga4_df = response_to_dataframe(ga4_response)\n",
2131 | "\n",
2132 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n",
2133 | " table_id = 'ga4_event_metrics_data'\n",
2134 | " csv_filename = f\"{table_id}.csv\"\n",
2135 | "\n",
2136 | " # Save the DataFrame to a CSV file\n",
2137 | " ga4_df.to_csv(csv_filename, index=False)\n",
2138 | " print(f\"Data saved to {csv_filename}\")\n",
2139 | "\n",
2140 | " # Upload the DataFrame to BigQuery\n",
2141 | " upload_to_bigquery(ga4_df, table_id)\n",
2142 | " except Exception as e:\n",
2143 | " print(f\"Error occurred: {e}\")\n",
2144 | "\n",
2145 | "if __name__ == '__main__':\n",
2146 | " main()\n"
2147 | ],
2148 | "metadata": {
2149 | "id": "CgQ4MPAf1A_b"
2150 | },
2151 | "execution_count": null,
2152 | "outputs": []
2153 | },
2154 | {
2155 | "cell_type": "code",
2156 | "source": [
2157 | "import pandas as pd\n",
2158 | "from google.cloud import bigquery\n",
2159 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n",
2160 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n",
2161 | "from google.oauth2 import service_account\n",
2162 | "from google_auth_oauthlib.flow import Flow\n",
2163 | "import json\n",
2164 | "import os\n",
2165 | "import pickle\n",
2166 | "\n",
2167 | "# Load configuration from a JSON file\n",
2168 | "with open(\"config.json\", \"r\") as f:\n",
2169 | " config = json.load(f)\n",
2170 | "\n",
2171 | "# Authenticate with service account for BigQuery\n",
2172 | "creds1 = service_account.Credentials.from_service_account_file(\n",
2173 | " config['SERVICE_ACCOUNT_FILE'],\n",
2174 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n",
2175 | ")\n",
2176 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n",
2177 | "\n",
2178 | "# Authenticate for GA4 Analytics Data API using OAuth2\n",
2179 | "def authenticate_ga4():\n",
2180 | " creds = None\n",
2181 | " if os.path.exists('token.pickle'):\n",
2182 | " with open('token.pickle', 'rb') as token:\n",
2183 | " creds = pickle.load(token)\n",
2184 | " else:\n",
2185 | " flow = Flow.from_client_secrets_file(\n",
2186 | " config['CLIENT_SECRET_FILE'],\n",
2187 | " scopes=config['SCOPES'],\n",
2188 | " redirect_uri='http://localhost:8080/'\n",
2189 | " )\n",
2190 | " auth_url, _ = flow.authorization_url(prompt='consent')\n",
2191 | " print('Please go to this URL and finish the authentication: ', auth_url)\n",
2192 | " code = input('Enter the authorization code: ')\n",
2193 | " flow.fetch_token(code=code)\n",
2194 | " creds = flow.credentials\n",
2195 | " with open('token.pickle', 'wb') as token:\n",
2196 | " pickle.dump(creds, token)\n",
2197 | " return creds\n",
2198 | "\n",
2199 | "# Function to paginate and fetch GA4 report data with logging\n",
2200 | "def run_report_with_pagination(client, request, limit=250000):\n",
2201 | " all_rows = []\n",
2202 | " offset = 0\n",
2203 | " page_number = 1\n",
2204 | "\n",
2205 | " while True:\n",
2206 | " # Apply offset and limit to request\n",
2207 | " request.offset = offset\n",
2208 | " request.limit = limit\n",
2209 | "\n",
2210 | " # Fetch report data\n",
2211 | " response = client.run_report(request)\n",
2212 | " all_rows.extend(response.rows)\n",
2213 | "\n",
2214 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n",
2215 | "\n",
2216 | " # If fewer rows are fetched than the limit, we're done\n",
2217 | " if len(response.rows) < limit:\n",
2218 | " break\n",
2219 | "\n",
2220 | " # Update offset and page number to get the next set of rows\n",
2221 | " offset += limit\n",
2222 | " page_number += 1\n",
2223 | "\n",
2224 | " return all_rows\n",
2225 | "\n",
2226 | "# Function to fetch GA4 data using pagination\n",
2227 | "def get_ga4_report(client):\n",
2228 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n",
2229 | " request = RunReportRequest(\n",
2230 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n",
2231 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n",
2232 | " dimensions=[\n",
2233 | " Dimension(name='date'),\n",
2234 | " Dimension(name='pageLocation') # New dimension\n",
2235 | " ],\n",
2236 | " metrics=[\n",
2237 | " Metric(name='totalUsers'),\n",
2238 | " Metric(name='ecommercePurchases'),\n",
2239 | " Metric(name='purchaseRevenue'),\n",
2240 | " Metric(name='screenPageViews'),\n",
2241 | " Metric(name='eventCount'),\n",
2242 | " Metric(name='engagementRate') # New metrics\n",
2243 | " ]\n",
2244 | " )\n",
2245 | " return run_report_with_pagination(client, request)\n",
2246 | "\n",
2247 | "# Function to convert GA4 response to a DataFrame\n",
2248 | "def response_to_dataframe(response):\n",
2249 | " list_rows = []\n",
2250 | " for row in response:\n",
2251 | " try:\n",
2252 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n",
2253 | " except ValueError:\n",
2254 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n",
2255 | " page_location = row.dimension_values[1].value # New dimension\n",
2256 | " list_rows.append({\n",
2257 | " 'date': date_value,\n",
2258 | " 'pageLocation': page_location, # New dimension\n",
2259 | " 'totalUsers': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n",
2260 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n",
2261 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n",
2262 | " 'screenPageViews': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0,\n",
2263 | " 'eventCount': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0,\n",
2264 | " 'engagementRate': pd.to_numeric(row.metric_values[5].value, errors='coerce') or 0 # New metric\n",
2265 | " })\n",
2266 | " return pd.DataFrame(list_rows)\n",
2267 | "\n",
2268 | "# Function to upload data to BigQuery\n",
2269 | "def upload_to_bigquery(df, table_id):\n",
2270 | " # Define BigQuery schema\n",
2271 | " schema = [\n",
2272 | " bigquery.SchemaField(\"date\", \"DATE\"),\n",
2273 | " bigquery.SchemaField(\"pageLocation\", \"STRING\"), # New dimension\n",
2274 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n",
2275 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n",
2276 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\"),\n",
2277 | " bigquery.SchemaField(\"screenPageViews\", \"INTEGER\"), # New metric\n",
2278 | " bigquery.SchemaField(\"eventCount\", \"INTEGER\"), # New metric\n",
2279 | " bigquery.SchemaField(\"engagementRate\", \"FLOAT\") # New metric\n",
2280 | " ]\n",
2281 | "\n",
2282 | " # Configure BigQuery job to partition the table by the 'date' column\n",
2283 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n",
2284 | " job_config = bigquery.LoadJobConfig(\n",
2285 | " schema=schema,\n",
2286 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
2287 | " time_partitioning=bigquery.TimePartitioning(\n",
2288 | " type_=bigquery.TimePartitioningType.DAY,\n",
2289 | " field='date'\n",
2290 | " )\n",
2291 | " )\n",
2292 | "\n",
2293 | " # Upload the DataFrame to BigQuery\n",
2294 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n",
2295 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n",
2296 | "\n",
2297 | "# Main function\n",
2298 | "def main():\n",
2299 | " try:\n",
2300 | " # Authenticate GA4 using OAuth2\n",
2301 | " creds = authenticate_ga4()\n",
2302 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n",
2303 | "\n",
2304 | " # Fetch GA4 data\n",
2305 | " ga4_response = get_ga4_report(client_ga4)\n",
2306 | "\n",
2307 | " # Convert the response to a DataFrame\n",
2308 | " ga4_df = response_to_dataframe(ga4_response)\n",
2309 | "\n",
2310 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n",
2311 | " table_id = 'ga4_page_location_data' # New table name\n",
2312 | " csv_filename = f\"{table_id}.csv\"\n",
2313 | "\n",
2314 | " # Save the DataFrame to a CSV file\n",
2315 | " ga4_df.to_csv(csv_filename, index=False)\n",
2316 | " print(f\"Data saved to {csv_filename}\")\n",
2317 | "\n",
2318 | " # Upload the DataFrame to BigQuery\n",
2319 | " upload_to_bigquery(ga4_df, table_id)\n",
2320 | " except Exception as e:\n",
2321 | " print(f\"Error occurred: {e}\")\n",
2322 | "\n",
2323 | "if __name__ == '__main__':\n",
2324 | " main()"
2325 | ],
2326 | "metadata": {
2327 | "id": "dTobTzk1nDNi"
2328 | },
2329 | "execution_count": null,
2330 | "outputs": []
2331 | },
2332 | {
2333 | "cell_type": "code",
2334 | "source": [
2335 | "import pandas as pd\n",
2336 | "from google.cloud import bigquery\n",
2337 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n",
2338 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n",
2339 | "from google.oauth2 import service_account\n",
2340 | "from google_auth_oauthlib.flow import Flow\n",
2341 | "import json\n",
2342 | "import os\n",
2343 | "import pickle\n",
2344 | "\n",
2345 | "# Load configuration from a JSON file\n",
2346 | "with open(\"config.json\", \"r\") as f:\n",
2347 | " config = json.load(f)\n",
2348 | "\n",
2349 | "# Authenticate with service account for BigQuery\n",
2350 | "creds1 = service_account.Credentials.from_service_account_file(\n",
2351 | " config['SERVICE_ACCOUNT_FILE'],\n",
2352 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n",
2353 | ")\n",
2354 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n",
2355 | "\n",
2356 | "# Authenticate for GA4 Analytics Data API using OAuth2\n",
2357 | "def authenticate_ga4():\n",
2358 | " creds = None\n",
2359 | " if os.path.exists('token.pickle'):\n",
2360 | " with open('token.pickle', 'rb') as token:\n",
2361 | " creds = pickle.load(token)\n",
2362 | " else:\n",
2363 | " flow = Flow.from_client_secrets_file(\n",
2364 | " config['CLIENT_SECRET_FILE'],\n",
2365 | " scopes=config['SCOPES'],\n",
2366 | " redirect_uri='http://localhost:8080/'\n",
2367 | " )\n",
2368 | " auth_url, _ = flow.authorization_url(prompt='consent')\n",
2369 | " print('Please go to this URL and finish the authentication: ', auth_url)\n",
2370 | " code = input('Enter the authorization code: ')\n",
2371 | " flow.fetch_token(code=code)\n",
2372 | " creds = flow.credentials\n",
2373 | " with open('token.pickle', 'wb') as token:\n",
2374 | " pickle.dump(creds, token)\n",
2375 | " return creds\n",
2376 | "\n",
2377 | "# Function to paginate and fetch GA4 report data with logging\n",
2378 | "def run_report_with_pagination(client, request, limit=250000):\n",
2379 | " all_rows = []\n",
2380 | " offset = 0\n",
2381 | " page_number = 1\n",
2382 | "\n",
2383 | " while True:\n",
2384 | " # Apply offset and limit to request\n",
2385 | " request.offset = offset\n",
2386 | " request.limit = limit\n",
2387 | "\n",
2388 | " # Fetch report data\n",
2389 | " response = client.run_report(request)\n",
2390 | " all_rows.extend(response.rows)\n",
2391 | "\n",
2392 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n",
2393 | "\n",
2394 | " # If fewer rows are fetched than the limit, we're done\n",
2395 | " if len(response.rows) < limit:\n",
2396 | " break\n",
2397 | "\n",
2398 | " # Update offset and page number to get the next set of rows\n",
2399 | " offset += limit\n",
2400 | " page_number += 1\n",
2401 | "\n",
2402 | " return all_rows\n",
2403 | "\n",
2404 | "# Function to fetch GA4 data using pagination\n",
2405 | "def get_ga4_report(client):\n",
2406 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n",
2407 | " request = RunReportRequest(\n",
2408 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n",
2409 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n",
2410 | " dimensions=[\n",
2411 | " Dimension(name='date'),\n",
2412 | " Dimension(name='landingPage')\n",
2413 | " ],\n",
2414 | " metrics=[\n",
2415 | " Metric(name='totalUsers'),\n",
2416 | " Metric(name='ecommercePurchases'),\n",
2417 | " Metric(name='purchaseRevenue'),\n",
2418 | " Metric(name='sessions'),\n",
2419 | " Metric(name='eventCount'),\n",
2420 | " Metric(name='engagementRate')\n",
2421 | " ]\n",
2422 | " )\n",
2423 | " return run_report_with_pagination(client, request)\n",
2424 | "\n",
2425 | "# Function to convert GA4 response to a DataFrame\n",
2426 | "def response_to_dataframe(response):\n",
2427 | " list_rows = []\n",
2428 | " for row in response:\n",
2429 | " try:\n",
2430 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n",
2431 | " except ValueError:\n",
2432 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n",
2433 | " landing_page = row.dimension_values[1].value\n",
2434 | " list_rows.append({\n",
2435 | " 'date': date_value,\n",
2436 | " 'landingPage': landing_page,\n",
2437 | " 'totalUsers': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n",
2438 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n",
2439 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n",
2440 | " 'sessions': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0,\n",
2441 | " 'eventCount': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0,\n",
2442 | " 'engagementRate': pd.to_numeric(row.metric_values[5].value, errors='coerce') or 0\n",
2443 | " })\n",
2444 | " return pd.DataFrame(list_rows)\n",
2445 | "\n",
2446 | "# Function to upload data to BigQuery\n",
2447 | "def upload_to_bigquery(df, table_id):\n",
2448 | " # Define BigQuery schema\n",
2449 | " schema = [\n",
2450 | " bigquery.SchemaField(\"date\", \"DATE\"),\n",
2451 | " bigquery.SchemaField(\"landingPage\", \"STRING\"),\n",
2452 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n",
2453 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n",
2454 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\"),\n",
2455 | " bigquery.SchemaField(\"sessions\", \"INTEGER\"),\n",
2456 | " bigquery.SchemaField(\"eventCount\", \"INTEGER\"),\n",
2457 | " bigquery.SchemaField(\"engagementRate\", \"FLOAT\")\n",
2458 | " ]\n",
2459 | "\n",
2460 | " # Configure BigQuery job to partition the table by the 'date' column\n",
2461 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n",
2462 | " job_config = bigquery.LoadJobConfig(\n",
2463 | " schema=schema,\n",
2464 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
2465 | " time_partitioning=bigquery.TimePartitioning(\n",
2466 | " type_=bigquery.TimePartitioningType.DAY,\n",
2467 | " field='date'\n",
2468 | " )\n",
2469 | " )\n",
2470 | "\n",
2471 | " # Upload the DataFrame to BigQuery\n",
2472 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n",
2473 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n",
2474 | "\n",
2475 | "# Main function\n",
2476 | "def main():\n",
2477 | " try:\n",
2478 | " # Authenticate GA4 using OAuth2\n",
2479 | " creds = authenticate_ga4()\n",
2480 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n",
2481 | "\n",
2482 | " # Fetch GA4 data\n",
2483 | " ga4_response = get_ga4_report(client_ga4)\n",
2484 | "\n",
2485 | " # Convert the response to a DataFrame\n",
2486 | " ga4_df = response_to_dataframe(ga4_response)\n",
2487 | "\n",
2488 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n",
2489 | " table_id = 'ga4_landing_page_data'\n",
2490 | " csv_filename = f\"{table_id}.csv\"\n",
2491 | "\n",
2492 | " # Save the DataFrame to a CSV file\n",
2493 | " ga4_df.to_csv(csv_filename, index=False)\n",
2494 | " print(f\"Data saved to {csv_filename}\")\n",
2495 | "\n",
2496 | " # Upload the DataFrame to BigQuery\n",
2497 | " upload_to_bigquery(ga4_df, table_id)\n",
2498 | " except Exception as e:\n",
2499 | " print(f\"Error occurred: {e}\")\n",
2500 | "\n",
2501 | "if __name__ == '__main__':\n",
2502 | " main()"
2503 | ],
2504 | "metadata": {
2505 | "id": "T5zaPVqat6kl"
2506 | },
2507 | "execution_count": null,
2508 | "outputs": []
2509 | }
2510 | ]
2511 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Backfill-GA4-to-BigQuery
2 | The "Backfill-GA4-to-BigQuery" repository offers a solution for backfilling your GA4 data into BigQuery. This is useful if you need historical data from the start of your GA4 property, since GA4 data only becomes available in BigQuery after the two services are linked. The solution uses OAuth 2.0 credentials for desktop applications, which keeps authentication simple and works well in environments like Google Colab.
3 |
4 | ## What's New
5 |
6 | I've added a **notebook version** of the code for working with GA4 data using Python and BigQuery!
7 |
8 | - If you prefer **straightforward, ready-to-use scripts** for creating GA4-like tables with minimal effort, the notebook provides a streamlined approach for quick setup.
9 | - For those looking to **customize the dimensions, metrics, or data handling processes**, the original main code remains your go-to option for flexibility and control.
10 |
11 |
12 |
13 | ## Table of Contents
14 | 1. [Features](#features)
15 | 2. [Prerequisites](#prerequisites)
16 | 3. [Setup and Installation](#setup-and-installation)
17 |    - [Step 1: Create a New Project and Activate Analytics API](#step-1-create-a-new-project-and-activate-analytics-api)
18 | - [Step 2: Creating a Service Account](#step-2-creating-a-service-account)
19 | - [Step 3: Setting Up OAuth for Desktop App](#step-3-setting-up-oauth-for-desktop-app)
20 | - [Step 4: Configuration File](#step-4-configuration-file)
21 | - [Step 5: Installation of Dependencies](#step-5-installation-of-dependencies)
22 | - [Step 6: Running the Script](#step-6-running-the-script)
23 | - [Step 7: Authentication](#step-7-authentication)
24 | - [Step 8: QA](#step-8-qa)
25 | 4. [Using the Pre-Built Notebook for GA4 Reports](#using-the-pre-built-notebook-for-ga4-reports)
26 | 5. [Troubleshooting](#troubleshooting)
27 | 6. [Customization](#customization)
28 | 7. [Contributing](#contributing)
29 | 8. [Contact](#contact)
30 |
31 |
32 |
33 | ## Features
34 |
35 | - **OAuth 2.0 Authentication**: Simplifies authentication with OAuth 2.0 credentials, ideal for desktop apps and environments like Google Colab.
36 |
37 | - **Service Account Integration**: Securely connects to Google Cloud services using service accounts for enhanced security.
38 |
39 | - **Data Extraction from GA4**: Fetches comprehensive GA4 data from a specified start date, ideal for historical data backfilling.
40 |
41 | - **Customizable Configuration**: Offers a `config.json` file for user-specific settings like table prefixes and property IDs.
42 |
43 | - **BigQuery Integration**: Efficiently processes and stores data in BigQuery with proper schema management.
44 |
45 | - **Export Functionality**: Enables exporting GA4 data to CSV format for external use.
46 |
47 | - **Duplicate Check**: Incorporates mechanisms to avoid duplicate data entries in BigQuery.
48 |
49 | - **Flexible Data Retrieval**: Allows data fetching from a specific date or the previous day.
50 |
51 | - **Robust Error Handling**: Includes effective error handling and logging for smooth operation.
52 |
53 | - **Partitioning and clustering**: Dynamic partitioning and clustering for optimized query performance and cost management.
54 |
55 | - **Configurable End Date Range**: Provides precise control over the data retrieval period, making it easier to manage data quotas and analyze historical data within a specific timeframe.
56 |
57 |
58 |
59 | ## Prerequisites
60 | - Google Cloud account with billing enabled.
61 | - Access to Google Analytics 4 and Google BigQuery.
62 | - Python environment (Python 3.x recommended).
63 |
64 | ## Setup and Installation
65 |
66 | ### Step 1: Create a New Project and Activate Analytics API
67 | - Go to [Google Cloud Console](https://console.cloud.google.com/apis/api/analyticsdata.googleapis.com/metrics) to activate the Analytics API in your selected project.
68 |
69 | ### Step 2: Creating a Service Account
70 |
71 | 1. **Access Google Cloud Console**: Visit the [Google Cloud Console](https://console.cloud.google.com/).
72 |
73 | 2. **Create a Service Account**:
74 | - Navigate to "IAM & Admin > Service Accounts".
75 | - Click "Create Service Account", enter a name, description, and click "Create".
76 | - Grant necessary roles to the service account (e.g., Owner or BigQuery Admin + BigQuery Job User).
77 |
78 | 3. **Generate a Service Account Key**:
79 | - Click on the created service account to manage it.
80 | - Go to the "Keys" tab and click "Add Key", then "Create new key".
81 | - Choose "JSON" as the key type and click "Create".
82 |    - A JSON key file will be downloaded. Store it securely; a minimal sketch of using it to connect to BigQuery follows below.
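To sanity-check the downloaded key before wiring up the full script, a minimal sketch of connecting to BigQuery with it looks like this (the file name `service_account.json` is a placeholder for wherever you stored the key):

```python
from google.cloud import bigquery
from google.oauth2 import service_account

# Placeholder path: point this at the JSON key you downloaded above.
creds = service_account.Credentials.from_service_account_file(
    "service_account.json",
    scopes=["https://www.googleapis.com/auth/bigquery"],
)

# The project ID is read from the key file itself.
bq_client = bigquery.Client(credentials=creds, project=creds.project_id)
print("Connected to project:", bq_client.project)
```
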
83 |
84 |
85 | ### Step 3: Setting Up OAuth for Desktop App
86 |
87 | To set up OAuth for a desktop application, you need to create an OAuth client ID in Google Cloud Console. Before creating an OAuth client ID, make sure to configure your consent screen if you don't have one already.
88 |
89 | #### Configure the Consent Screen:
90 |
91 | 1. **Access Consent Screen Configuration**:
92 | - In the Google Cloud Console, navigate to "APIs & Services > OAuth consent screen".
93 | - Select the external user type.
94 |
95 | 2. **Fill in Consent Screen Details**:
96 | - Provide the necessary information, such as the app name, user support email, and developer contact information.
97 |    - Add your email (and others, if needed) in the "Test users" section.
98 |
99 | 3. **Publish the App**:
100 | - Once all necessary information is provided, save and publish your consent screen.
101 |
102 | #### Create OAuth 2.0 Client ID:
103 |
104 | 1. **Navigate to Credentials**:
105 | - Go to "APIs & Services > Credentials".
106 |
107 | 2. **Create OAuth Client ID**:
108 | - Click "Create Credentials" and select "OAuth client ID".
109 | - Choose "Desktop app" as the Application type.
110 | - Provide a name for the client ID and click "Create".
111 |
112 | 3. **Download Client Configuration**:
113 | - After the OAuth client ID is created, download the client configuration JSON file.
114 | - This file contains your client ID and secret, which are essential for the OAuth flow.
115 |
116 | #### Note:
117 |
118 | - The script uses a `token.pickle` file to store access and refresh tokens. Once authenticated, you won't need to repeat the authentication process unless the token is revoked or expired (this caching pattern is sketched below).
119 | - Ensure that the JSON file is stored securely and referenced correctly in your project.
120 |
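The caching behaviour described above roughly follows this pattern (a simplified sketch of what the script does; file names match the repository defaults):

```python
import os
import pickle
from google_auth_oauthlib.flow import Flow

TOKEN_FILE = "token.pickle"

if os.path.exists(TOKEN_FILE):
    # Reuse previously stored credentials instead of repeating the OAuth flow.
    with open(TOKEN_FILE, "rb") as token:
        creds = pickle.load(token)
else:
    # First run: walk through the OAuth consent flow manually.
    flow = Flow.from_client_secrets_file(
        "client_secret.json",  # the OAuth client configuration you downloaded
        scopes=["https://www.googleapis.com/auth/analytics.readonly"],
        redirect_uri="http://localhost:8080/",
    )
    auth_url, _ = flow.authorization_url(prompt="consent")
    print("Open this URL and authorize access:", auth_url)
    flow.fetch_token(code=input("Enter the authorization code: "))
    creds = flow.credentials
    with open(TOKEN_FILE, "wb") as token:
        pickle.dump(creds, token)
```
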
121 |
122 | ### Step 4: Configuration File
123 | Fill out and save a `config.json` file with your specific parameters.
124 | Example:
125 | ```json
126 | {
127 | "CLIENT_SECRET_FILE": "",
128 | "SERVICE_ACCOUNT_FILE": "",
129 | "SCOPES": ["https://www.googleapis.com/auth/analytics.readonly"],
130 | "PROPERTY_ID": "",
131 | "INITIAL_FETCH_FROM_DATE": "2022-01-01",
132 | "FETCH_TO_DATE": "today",
133 | "DATASET_ID": "",
134 | "TABLE_PREFIX": "_backfill_GA4",
135 | "PARTITION_BY": "Event_Date",
136 | "CLUSTER_BY": "Event_Name"
137 | }
138 | ```
139 | - **Client Secret and Service Account File**: Replace the placeholders with the actual paths to your OAuth client secret and service account JSON files.
140 |
141 | - **Property ID and Dataset ID**: Insert your Google Analytics Property ID and the BigQuery Dataset ID where data will be stored.
142 |
143 | - **Initial Fetch Date**: Set the initial date from which to fetch historical data in `YYYY-MM-DD` format.
144 |
145 | - **FETCH_TO_DATE**: Specify the end date for data fetching in `YYYY-MM-DD` format. If the value is missing or invalid, it defaults to today's date (see the sketch after this list).
146 |
147 | - **Table Prefix**: Specify the prefix for your BigQuery tables. If the specified prefix does not exist, the script will create tables with this prefix in BigQuery.
148 |
149 | - **PARTITION_BY**: Specifies the column for table partitioning. Default is Event_Date, which is highly recommended for optimal data management.
150 |
151 | - **CLUSTER_BY**: Specifies the column(s) for table clustering. Default is Event_Name, aligning with common querying patterns. While this choice is optimal for many use cases, you may customize this field.
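As a reference for the date handling described above, here is a minimal sketch of reading `config.json` and validating `FETCH_TO_DATE`; it mirrors the helper in `backfill-ga4.py`, where an unparsable value such as `"today"` simply falls back to today's date:

```python
import datetime
import json

with open("config.json", "r") as f:
    config = json.load(f)

def get_valid_end_date(end_date_str):
    """Return the configured end date, or today's date if it is missing or invalid."""
    try:
        return datetime.datetime.strptime(end_date_str, "%Y-%m-%d").date()
    except (TypeError, ValueError):
        return datetime.date.today()

fetch_to_date = get_valid_end_date(config.get("FETCH_TO_DATE")).strftime("%Y-%m-%d")
print("Data will be fetched up to:", fetch_to_date)
```
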
152 | ### Step 5: Installation of Dependencies
153 | Install the necessary Python packages:
154 | ```bash
155 | !pip install google-analytics-data==0.18.4
156 | !pip install google-cloud-bigquery
157 | !pip install google-auth==2.27.0
158 | !pip install google-auth-oauthlib
159 | !pip install google-auth-httplib2
160 | ```
161 |
162 | ### Step 6: Running the Script
163 |
164 | After configuring the `config.json` file and saving the script as `backfill-ga4.py`, run it with the desired flags.
165 |
166 | - **Execute the Script with Flags**:
167 | - Use the `%run` command followed by the script name and the desired flag.
168 | - For fetching data from yesterday, use:
169 | ```bash
170 | %run backfill-ga4.py --yesterday
171 | ```
172 | - For fetching data from the initial fetch date specified in your `config.json`, use:
173 | ```bash
174 | %run backfill-ga4.py --initial_fetch
175 | ```
176 |   - This will start the authentication flow; a condensed sketch of how these flags map to date ranges follows below.
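Under the hood, the two flags simply select the date range passed to the GA4 Data API. A condensed sketch of that logic (the dates shown are illustrative; the real script takes them from `config.json`):

```python
import argparse
import datetime

parser = argparse.ArgumentParser(description="Fetch data based on date range.")
parser.add_argument("--yesterday", action="store_true", help="Fetch data from yesterday only.")
parser.add_argument("--initial_fetch", action="store_true", help="Fetch data from a wide date range.")
args = parser.parse_args()

if args.yesterday:
    # Single-day range: yesterday for both the start and end date.
    day = datetime.date.today() - datetime.timedelta(days=1)
    start_date = end_date = day.strftime("%Y-%m-%d")
elif args.initial_fetch:
    # Full backfill: from INITIAL_FETCH_FROM_DATE up to FETCH_TO_DATE (illustrative values here).
    start_date, end_date = "2022-01-01", datetime.date.today().strftime("%Y-%m-%d")
else:
    raise SystemExit("No valid date range argument provided.")

print(f"Fetching data from {start_date} to {end_date}")
```
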
177 |
178 | ### Step 7: Authentication
179 |
180 | - **Run the Script**:
181 | - Execute your Python script.
182 | - It will prompt you to open a URL for authentication.
183 |    - Ensure that you choose a Google account that has the necessary access to the selected property.
184 |    - If you did not verify your app earlier, select "Go to 'YourPublishedAPP' (unsafe)" to continue and receive the authorization code on localhost.
185 |    - The code appears in the URL between "code=" and the next ampersand (screenshot attached; a small helper for extracting it is sketched below).
186 |    [Screenshot](https://postimg.cc/6TFYHQKN)
187 | - Copy and paste this code back into the script.
188 |    - The data retrieval process for the specified date range should then complete. Verify that the exported data is
189 |      visible both in the BigQuery table and in the downloadable CSV file.
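If you would rather not pick the code out of the URL by hand, a tiny standard-library helper does the same thing (the URL below is a placeholder for the one you are redirected to):

```python
from urllib.parse import parse_qs, urlparse

# Paste the full localhost URL from your browser's address bar (placeholder shown here).
redirect_url = "http://localhost:8080/?code=PASTE_CODE_HERE&scope=..."

# The authorization code is the value of the "code" query parameter.
auth_code = parse_qs(urlparse(redirect_url).query)["code"][0]
print("Authorization code:", auth_code)
```
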
190 |
191 | ### Step 8: QA
192 |
193 | - **Check for Successful Setup**:
194 | - Upon successful completion of the script, it should indicate that the authentication process is complete and data fetching has started.
195 | - Now, you should be able to see the new tables in your Google Analytics BigQuery dataset (`DATASET_ID` specified in your `config.json`).
196 | - Additionally, the `output.csv` file in your project directory should contain the fetched data.
197 |    - If the tables are visible and the CSV file contains data, everything is set up correctly. An optional row-count check is sketched below.
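For the optional row-count check, a minimal sketch looks like this (project, dataset, and table names are placeholders; the table name matches the default `TABLE_PREFIX`):

```python
from google.cloud import bigquery

# Assumes your default credentials (e.g., the service account set up earlier) are available.
client = bigquery.Client()

query = """
    SELECT Event_Date, COUNT(*) AS row_count
    FROM `your-project.your_dataset._backfill_GA4`  -- placeholders: replace with your values
    GROUP BY Event_Date
    ORDER BY Event_Date
"""

for row in client.query(query).result():
    print(row.Event_Date, row.row_count)
```
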
198 |
199 |
200 | ## Using the Pre-Built Notebook for GA4 Reports
201 |
202 | This repository now includes a **custom notebook** for exporting **13 of the most useful GA4 reports** into BigQuery and CSV format. The notebook simplifies the process, eliminating the need to dive into the source code. Follow the steps below to configure and run it. Here is a breakdown of the tables that will be exported after running the notebook (a short sketch of how each report is fetched and saved follows the table):
203 |
204 |
205 | | **Table Name** | **Dimensions** | **Metrics** |
206 | |-----------------------------------|---------------------------------------------|-------------------------------------------------------------------------------------------------|
207 | | `ga4_transaction_items` | `transactionId`, `itemName`, `date` | `itemPurchaseQuantity`, `itemRevenue` |
208 | | `ga4_data_session_channel_group` | `date`, `sessionDefaultChannelGroup` | `sessions`, `totalUsers`, `newUsers`, `ecommercePurchases`, `purchaseRevenue` |
209 | | `ga4_data_session_source_campaign_medium` | `date`, `sessionSource`, `sessionCampaignName`, `sessionMedium` | `sessions`, `totalUsers`, `newUsers`, `ecommercePurchases`, `purchaseRevenue` |
210 | | `ga4_data_country_language_city` | `date`, `country`, `language`, `city` | `sessions`, `screenPageViews`, `totalUsers`, `newUsers`, `ecommercePurchases`, `purchaseRevenue` |
211 | | `ga4_data_item_name` | `date`, `itemName` | `itemPurchaseQuantity`, `itemRevenue` |
212 | | `ga4_data_browser_os_device` | `date`, `browser`, `operatingSystem`, `deviceCategory` | `sessions`, `screenPageViews`, `totalUsers`, `newUsers`, `ecommercePurchases`, `purchaseRevenue` |
213 | | `ga4_data_first_user_source_medium` | `date`, `firstUserMedium`, `firstUserSource`, `firstUserCampaignName` | `totalUsers`, `newUsers`, `ecommercePurchases`, `purchaseRevenue` |
214 | | `ga4_data_first_user_channel_group` | `date`, `firstUserDefaultChannelGroup` | `totalUsers`, `newUsers`, `ecommercePurchases`, `purchaseRevenue` |
215 | | `ga4_ads_data` | `date`, `sessionSource`, `sessionMedium`, `sessionCampaignName` | `ecommercePurchases`, `averagePurchaseRevenue`, `purchaseRevenue`, `advertiserAdClicks`, `advertiserAdCost`, `advertiserAdCostPerClick`, `returnOnAdSpend` |
216 | | `ga4_all_metrics_data` | `date` | `sessions`, `totalUsers`, `newUsers`, `ecommercePurchases`, `purchaseRevenue`, `screenPageViews`, `eventCount`, `averageSessionDuration`, `engagedSessions`, `engagementRate` |
217 | | `ga4_event_metrics_data` | `date`, `eventName` | `eventCount`, `eventCountPerUser`, `eventValue` |
218 | | `ga4_page_location_data` | `date`, `pageLocation` | `totalUsers`, `ecommercePurchases`, `purchaseRevenue`, `screenPageViews`, `eventCount`, `engagementRate` |
219 | | `ga4_landing_page_data` | `date`, `landingPage` | `totalUsers`, `ecommercePurchases`, `purchaseRevenue`, `sessions`, `eventCount`, `engagementRate` |
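Each row above corresponds to one report the notebook fetches and writes out. As a rough illustration of that flow (a sketch of what the notebook's helpers do, not their exact implementation), flattening a GA4 API response into a DataFrame and saving it looks like this:

```python
import pandas as pd

def response_to_dataframe(response):
    """Flatten a RunReportResponse into one column per dimension and metric."""
    dim_names = [d.name for d in response.dimension_headers]
    metric_names = [m.name for m in response.metric_headers]
    records = []
    for row in response.rows:
        record = {name: value.value for name, value in zip(dim_names, row.dimension_values)}
        record.update({name: value.value for name, value in zip(metric_names, row.metric_values)})
        records.append(record)
    return pd.DataFrame(records)

# After fetching a report (e.g., the landing-page report):
# df = response_to_dataframe(ga4_response)
# df.to_csv("ga4_landing_page_data.csv", index=False)
```
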
220 |
221 |
222 |
223 | ### Steps to Use the Notebook
224 |
225 | 1. **Initial Steps**:
226 | The first three steps (creating a dataset, activating the Analytics API, and setting up OAuth) remain the same as detailed in the [Setup and Installation](#setup-and-installation) section.
227 |
228 | 2. **Prepare the Configuration File (`config.json`)**:
229 | Use the following template for the `config.json` file:
230 | ```json
231 | {
232 | "CLIENT_SECRET_FILE": "/path/to/your/client_secret.json",
233 | "SERVICE_ACCOUNT_FILE": "/path/to/your/service_account.json",
234 | "PROPERTY_ID": "",
235 | "INITIAL_FETCH_FROM_DATE": "YYYY-MM-DD",
236 | "FETCH_TO_DATE": "today",
237 | "DATASET_ID": "",
238 | "SCOPES": ["https://www.googleapis.com/auth/analytics.readonly", "https://www.googleapis.com/auth/bigquery"]
239 | }
240 | ```
241 | Replace placeholders with your project-specific details.
242 |
243 | 3. **Run the Notebook**:
244 | - Upload the `config.json` file to the notebook directory.
245 | - Open and execute the cells in the notebook sequentially.
246 | - During execution, you will be prompted to authorize access. Follow the instructions to complete the OAuth flow.
247 | - Once authorized, the script will fetch the data and save it to BigQuery and a downloadable CSV.
248 |
249 |
250 | ## Troubleshooting
251 |
252 | ### AttributeError on Script Execution
253 |
254 | **Issue:** Encountering an `AttributeError` related to `credentials.universe_domain` when running the script.
255 |
256 | **Solution:** This is likely due to version mismatches in `google-auth` and `google-analytics-data` libraries. Resolve it by upgrading both libraries:
257 |
258 | ```shell
259 | pip install --upgrade google-analytics-data google-auth
260 | ```
261 |
262 | Run this command in your terminal or command prompt to ensure you're using compatible versions, which should fix the issue.
263 |
264 |
265 | ## Customization
266 |
267 | Your project can be customized to fetch different metrics and dimensions based on your specific needs. Use the [Google Analytics Data API schema](https://developers.google.com/analytics/devguides/reporting/data/v1/api-schema) to understand the available metrics and dimensions. You can then modify the script to query different sets of data from your Google Analytics account.
268 |
269 | - **Tailor Metrics and Dimensions**: In the script, identify the sections where API requests are constructed and modify the `metrics` and `dimensions` according to your requirements (see the sketch below).
270 | - **Consult API Schema**: The API schema documentation provides a comprehensive list of all available metrics and dimensions, along with their descriptions and usage.
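For example, swapping in a different set of dimensions and metrics only requires changing the lists inside the report request. The field names below are valid GA4 Data API names, and the property ID is a placeholder:

```python
from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest

request = RunReportRequest(
    property="properties/YOUR_PROPERTY_ID",  # placeholder
    date_ranges=[DateRange(start_date="2024-01-01", end_date="today")],
    # Swap these lists to pull whichever dimensions and metrics you need.
    dimensions=[Dimension(name="date"), Dimension(name="deviceCategory")],
    metrics=[Metric(name="sessions"), Metric(name="engagementRate")],
)
# response = client.run_report(request)  # `client` is the BetaAnalyticsDataClient created by the script
```
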
271 |
272 |
273 | ## Contributing
274 |
275 | Contributions to this project are welcome! Here's how you can help:
276 |
277 | - **Reporting Issues**: Report issues or bugs by opening a new issue in the GitHub repository.
278 | - **Feature Requests**: If you have ideas for new features or improvements, feel free to create an issue describing your suggestion.
279 | - **Submitting Pull Requests**: You can contribute directly to the codebase. Please ensure your code adheres to the project's coding standards and include tests for new features.
280 |
281 |
282 |
--------------------------------------------------------------------------------
/backfill-GA4-schema.md:
--------------------------------------------------------------------------------
1 | [Backfill GA4 schema screenshot](https://postimg.cc/hhQk6djC)
2 |
--------------------------------------------------------------------------------
/backfill-ga4.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import argparse
3 | import datetime
4 | import sys
5 | import json
6 | import os
7 | import pickle
8 | from google.analytics.data_v1beta import BetaAnalyticsDataClient, OrderBy
9 | from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest
10 | from google.cloud import bigquery
11 | from google.oauth2 import service_account
12 | from google.cloud.exceptions import NotFound
13 | from google_auth_oauthlib.flow import Flow
14 |
15 | # Load configuration
16 | with open("config.json", "r") as f:
17 | config = json.load(f)
18 |
19 | # function to validate and ensure FETCH_TO_DATE is correct
20 | def get_valid_end_date(end_date_str):
21 | try:
22 | valid_end_date = datetime.datetime.strptime(end_date_str, '%Y-%m-%d').date()
23 | except (TypeError, ValueError):
24 | valid_end_date = datetime.date.today()
25 | return valid_end_date
26 |
27 | # Validate or default FETCH_TO_DATE from the config
28 | valid_end_date = get_valid_end_date(config.get('FETCH_TO_DATE'))
29 | FETCH_TO_DATE = valid_end_date.strftime('%Y-%m-%d')
30 |
31 | # Function to check if an event exists in BigQuery
32 | def exists_in_bigquery(event_name, event_date, event_count, channel_group, dataset_id, bq_client):
33 | year = event_date[:4]
34 | month = event_date[4:6]
35 | table_id = f'{TABLE_PREFIX}{year}{month}01'
36 | table_ref = bq_client.dataset(dataset_id).table(table_id)
37 |
38 | try:
39 | bq_client.get_table(table_ref)
40 | except NotFound:
41 | return False
42 |
43 |
44 | params = [
45 | bigquery.ScalarQueryParameter('event_name', 'STRING', event_name),
46 | bigquery.ScalarQueryParameter('event_date', 'INTEGER', event_date),
47 | bigquery.ScalarQueryParameter('event_count', 'INTEGER', event_count),
48 | bigquery.ScalarQueryParameter('channel_group', 'STRING', channel_group)
49 | ]
50 |
51 |     # Count rows already present that match this event exactly (Event_Date is stored as a DATE).
52 |     query = f"SELECT COUNT(*) FROM `{bq_client.project}.{dataset_id}.{table_id}` WHERE Event_Name = @event_name AND Event_Date = PARSE_DATE('%Y%m%d', CAST(@event_date AS STRING)) AND Event_Count = @event_count AND Channel = @channel_group"
53 |     job_config = bigquery.QueryJobConfig(query_parameters=params)
54 |     result = bq_client.query(query, job_config=job_config).result()
55 | count = list(result)[0][0]
56 |
57 | if count > 0:
58 | print(f"..record already exists in BigQuery ({count})", flush=True)
59 |
60 | return count > 0
61 |
62 | def get_table_ref(year, month):
63 | table_id = f'{TABLE_PREFIX}{year}{month}01'
64 | return bq_client.dataset(DATASET_ID).table(table_id)
65 |
66 | # Configuration parameters
67 | CLIENT_SECRET_FILE = config['CLIENT_SECRET_FILE']
68 | SCOPES = config['SCOPES']
69 | TABLE_PREFIX = config['TABLE_PREFIX']
70 | PROPERTY_ID = config['PROPERTY_ID']
71 | DATASET_ID = config['DATASET_ID']
72 | INITIAL_FETCH_FROM_DATE = config['INITIAL_FETCH_FROM_DATE']
73 | SERVICE_ACCOUNT_FILE = config['SERVICE_ACCOUNT_FILE']
74 | PARTITION_BY = config.get('PARTITION_BY', 'Event_Date') # Default to Event_Date
75 | CLUSTER_BY = config.get('CLUSTER_BY', 'Event_Name')
76 | FETCH_TO_DATE = config.get('FETCH_TO_DATE', datetime.date.today().strftime('%Y-%m-%d'))
77 |
78 |
79 | # Command line arguments for date range
80 | parser = argparse.ArgumentParser(description='Fetch data based on date range.')
81 | parser.add_argument('--yesterday', action='store_true', help='Fetch data from yesterday only.')
82 | parser.add_argument('--initial_fetch', action='store_true', help='Fetch data from a wide date range.')
83 | args = parser.parse_args()
84 |
85 | # Determine date range
86 | start_date = None
87 | end_date = None
88 | if args.yesterday:
89 | date = datetime.date.today() - datetime.timedelta(days=1)
90 | start_date = end_date = date.strftime('%Y-%m-%d')
91 | elif args.initial_fetch:
92 | confirmation = input("Using the initial_fetch might result in duplicated records. Do you want to proceed? (yes/no): ").strip().lower()
93 | if confirmation == 'yes':
94 | start_date = INITIAL_FETCH_FROM_DATE
95 | end_date = FETCH_TO_DATE
96 | else:
97 | print("Exiting script due to user cancellation.", flush=True)
98 | sys.exit()
99 | else:
100 | print("No valid date range argument provided. Exiting script.", flush=True)
101 | sys.exit()
102 | print(f"Starting fetching data from {start_date} to {valid_end_date.strftime('%Y-%m-%d')}.", flush=True)
103 |
104 | # Authenticate with service account for BigQuery
105 | creds1 = service_account.Credentials.from_service_account_file(
106 | SERVICE_ACCOUNT_FILE,
107 | scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']
108 | )
109 | bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)
110 |
111 | # Authenticate for Analytics Data API
112 | if os.path.exists('token.pickle'):
113 | with open('token.pickle', 'rb') as token:
114 | creds = pickle.load(token)
115 | else:
116 | # Create the flow using the client secrets file
117 | flow = Flow.from_client_secrets_file(
118 | CLIENT_SECRET_FILE,
119 | scopes=SCOPES,
120 | redirect_uri='http://localhost:8080/'
121 | )
122 |
123 | # Generate the authorization URL
124 | auth_url, _ = flow.authorization_url(prompt='consent')
125 |
126 | print('Please go to this URL and finish the authentication: ', auth_url)
127 | code = input('Enter the authorization code: ')
128 | flow.fetch_token(code=code)
129 |
130 | creds = flow.credentials
131 |
132 | # Save the credentials for future use
133 | with open('token.pickle', 'wb') as token:
134 | pickle.dump(creds, token)
135 |
136 | print("Authentication successful!")
137 |
138 | client = BetaAnalyticsDataClient(credentials=creds)
139 |
140 | # Function to run report with pagination
141 | def run_report_with_pagination(client, request):
142 | all_rows = []
143 | offset = 0 # Initialize offset
144 | limit = 10000 # Set limit (maximum rows per request)
145 |
146 | while True:
147 | # Apply offset and limit to request
148 | request.offset = offset
149 | request.limit = limit
150 |
151 | response = client.run_report(request)
152 | all_rows.extend(response.rows)
153 |
154 | # Check if there are more rows to fetch
155 | if len(response.rows) == limit:
156 | offset += limit # Increase offset for the next iteration
157 | else:
158 | break # No more rows left, exit loop
159 |
160 | return all_rows
161 |
162 | # Requests for active users and events
163 | request_active_users = RunReportRequest(
164 | property=f'properties/{PROPERTY_ID}',
165 | date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
166 | dimensions=[
167 | Dimension(name='date'),
168 | Dimension(name='sessionDefaultChannelGroup')
169 | ],
170 | metrics=[Metric(name='sessions')],
171 | order_bys=[OrderBy({"dimension": {"dimension_name": "date"}})]
172 | )
173 |
174 | active_users = run_report_with_pagination(client, request_active_users)
175 |
176 | request_events = RunReportRequest(
177 | property=f'properties/{PROPERTY_ID}',
178 | date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
179 | dimensions=[Dimension(name='eventName'), Dimension(name='date'), Dimension(name='isConversionEvent'), Dimension(name='sessionDefaultChannelGroup')],
180 | metrics=[Metric(name='eventCount')]
181 | )
182 |
183 | all_events = run_report_with_pagination(client, request_events)
184 |
185 | # Process and write data to CSV
186 | rows_by_month = {}
187 |
188 | with open('output.csv', 'w', newline='', encoding='utf-8') as csvfile:
189 | csv_writer = csv.writer(csvfile)
190 | csv_writer.writerow(['Event Name', 'Event Date', 'Event Count', 'Is Conversion', 'Channel', 'Event_Type'])
191 |
192 | # Processing active users data
193 | for row in active_users:
194 | event_name = "ct_active_users"
195 | is_conversion = None
196 | event_date = row.dimension_values[0].value
197 | channel_group = row.dimension_values[1].value
198 | event_count = row.metric_values[0].value
199 | event_type = "Traffic"
200 |
201 | csv_writer.writerow([event_name, event_date, event_count, is_conversion, channel_group, event_type])
202 |
203 | # Check for existing records in BigQuery
204 | if not (args.yesterday and exists_in_bigquery(event_name, event_date, event_count, channel_group, DATASET_ID, bq_client)):
205 | year, month = event_date[:4], event_date[4:6]
206 | key = (year, month)
207 | rows_by_month.setdefault(key, []).append({
208 | "Event_Name": event_name,
209 | "Event_Date": event_date,
210 | "Event_Count": event_count,
211 | "Is_Conversion": is_conversion,
212 | "Channel": channel_group,
213 | "Event_Type": event_type
214 | })
215 |
216 | # Sort and process events data
217 | sorted_events = sorted(all_events, key=lambda x: x.dimension_values[1].value)
218 | for row in sorted_events:
219 | event_name = row.dimension_values[0].value
220 | event_date = row.dimension_values[1].value
221 | is_conversion = row.dimension_values[2].value
222 |
223 | if is_conversion == "(not set)":
224 | is_conversion = ""
225 |
226 | channel_group = row.dimension_values[3].value
227 | event_count = row.metric_values[0].value
228 |
229 |     is_conversion = is_conversion.lower() == "true"  # GA4 returns the strings "true"/"false"; bool() would treat "false" as truthy
230 | event_type = "Conversion" if is_conversion else "Event"
231 |
232 | csv_writer.writerow([event_name, event_date, event_count, is_conversion, channel_group, event_type])
233 |
234 | # Check for existing records in BigQuery
235 | if not (args.yesterday and exists_in_bigquery(event_name, event_date, event_count, channel_group, DATASET_ID, bq_client)):
236 | year, month = event_date[:4], event_date[4:6]
237 | key = (year, month)
238 | rows_by_month.setdefault(key, []).append({
239 | "Event_Name": event_name,
240 | "Event_Date": event_date,
241 | "Event_Count": event_count,
242 | "Is_Conversion": is_conversion,
243 | "Channel": channel_group,
244 | "Event_Type": event_type
245 | })
246 |
247 | print("Data saved to output.csv!", flush=True)
248 |
249 | def create_or_update_table_with_partition_and_cluster(dataset_id, simple_table_id, schema, partition_by=None, cluster_by=None):
250 | full_table_id = f"{bq_client.project}.{dataset_id}.{simple_table_id}" # Correctly construct the full table ID
251 | table = bigquery.Table(full_table_id, schema=schema)
252 |
253 | if partition_by:
254 | table.time_partitioning = bigquery.TimePartitioning(field=partition_by)
255 |
256 | if cluster_by:
257 | table.clustering_fields = [cluster_by]
258 |
259 | try:
260 | # Attempt to create the table, or if it exists, confirm it's updated
261 | created_table = bq_client.create_table(table, exists_ok=True)
262 | print(f"Table {created_table.full_table_id} created or confirmed existing with specified settings.")
263 | except Exception as e:
264 | print(f"Error creating or confirming table: {e}")
265 |
266 | TABLE_PREFIX = config.get('TABLE_PREFIX') # Handle potential absence of key
267 | DATASET_ID = config['DATASET_ID']
268 |
269 | schema = [
270 | bigquery.SchemaField("Event_Name", "STRING", mode="NULLABLE"),
271 | bigquery.SchemaField("Event_Date", "DATE", mode="NULLABLE"),
272 | bigquery.SchemaField("Event_Count", "INTEGER", mode="NULLABLE"),
273 | bigquery.SchemaField("Is_Conversion", "BOOLEAN", mode="NULLABLE"),
274 | bigquery.SchemaField("Channel", "STRING", mode="NULLABLE"),
275 | bigquery.SchemaField("Event_Type", "STRING", mode="NULLABLE"),
276 | ]
277 |
278 | def format_event_date(event_date):
279 | return f"{event_date[:4]}-{event_date[4:6]}-{event_date[6:]}"
280 |
281 | table_id = f"{bq_client.project}.{DATASET_ID}.{TABLE_PREFIX}"
282 |
283 | try:
284 | bq_client.get_table(table_id)
285 | print(f"Table {table_id} already exists.")
286 | except NotFound:
287 | # If table does not exist, create it
288 | print(f"Table {table_id} not found. Creating table...")
289 | table = bigquery.Table(table_id, schema=schema)
290 | table.time_partitioning = bigquery.TimePartitioning(
291 | field=config["PARTITION_BY"],
292 | type_=bigquery.TimePartitioningType.DAY
293 | )
294 | if "CLUSTER_BY" in config and config["CLUSTER_BY"]:
295 | table.clustering_fields = [config["CLUSTER_BY"]]
296 | bq_client.create_table(table)
297 | print(f"Created table {table_id}")
298 |
299 | all_rows_to_insert = []
300 | for _, month_data in rows_by_month.items():
301 | for row in month_data:
302 | # Format the 'Event_Date' to match BigQuery DATE format 'YYYY-MM-DD'
303 | if 'Event_Date' in row:
304 | row['Event_Date'] = format_event_date(row['Event_Date'])
305 | all_rows_to_insert.append(row)
306 |
307 | # Now, insert all rows into the single table
308 | if all_rows_to_insert:
309 | errors = bq_client.insert_rows_json(table_id, all_rows_to_insert) # Use insert_rows_json for better performance with dicts
310 | if errors:
311 | print("Errors:", errors, flush=True)
312 | else:
313 | print(f"Data saved to BigQuery!", flush=True)
314 | else:
315 | print("No data to insert.")
316 |
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "CLIENT_SECRET_FILE": "",
3 | "SERVICE_ACCOUNT_FILE": "",
4 | "SCOPES": ["https://www.googleapis.com/auth/analytics.readonly"],
5 | "PROPERTY_ID": "",
6 | "INITIAL_FETCH_FROM_DATE": "2022-01-01",
7 | "FETCH_TO_DATE": "today",
8 | "DATASET_ID": "",
9 | "TABLE_PREFIX": "_backfill_GA4",
10 | "PARTITION_BY": "Event_Date",
11 | "CLUSTER_BY": "Event_Name"
12 | }
13 |
--------------------------------------------------------------------------------
/tansfer_divar_data_from_huggingface_to_bigquery.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyOUqeVLqcnQdmK/K27isM7E",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | },
17 | "widgets": {
18 | "application/vnd.jupyter.widget-state+json": {
19 | "cf441a3e0a58460a902f8ece8095abcd": {
20 | "model_module": "@jupyter-widgets/controls",
21 | "model_name": "HBoxModel",
22 | "model_module_version": "1.5.0",
23 | "state": {
24 | "_dom_classes": [],
25 | "_model_module": "@jupyter-widgets/controls",
26 | "_model_module_version": "1.5.0",
27 | "_model_name": "HBoxModel",
28 | "_view_count": null,
29 | "_view_module": "@jupyter-widgets/controls",
30 | "_view_module_version": "1.5.0",
31 | "_view_name": "HBoxView",
32 | "box_style": "",
33 | "children": [
34 | "IPY_MODEL_f38efd6522cf47c2a80d4230a3673bb2",
35 | "IPY_MODEL_b90a72eeccaa4f1c889f652ec0f56a84",
36 | "IPY_MODEL_2d040286969446e1818a6b53616ca163"
37 | ],
38 | "layout": "IPY_MODEL_48ee6b5e634948a7bec5d3ae58e32dba"
39 | }
40 | },
41 | "f38efd6522cf47c2a80d4230a3673bb2": {
42 | "model_module": "@jupyter-widgets/controls",
43 | "model_name": "HTMLModel",
44 | "model_module_version": "1.5.0",
45 | "state": {
46 | "_dom_classes": [],
47 | "_model_module": "@jupyter-widgets/controls",
48 | "_model_module_version": "1.5.0",
49 | "_model_name": "HTMLModel",
50 | "_view_count": null,
51 | "_view_module": "@jupyter-widgets/controls",
52 | "_view_module_version": "1.5.0",
53 | "_view_name": "HTMLView",
54 | "description": "",
55 | "description_tooltip": null,
56 | "layout": "IPY_MODEL_561eeac4fa5e4134a143d5ee08e5bf21",
57 | "placeholder": "",
58 | "style": "IPY_MODEL_8b4bbb4cb2e144fd86cfc4c72fc21df6",
59 | "value": "real_estate_ads.csv: 100%"
60 | }
61 | },
62 | "b90a72eeccaa4f1c889f652ec0f56a84": {
63 | "model_module": "@jupyter-widgets/controls",
64 | "model_name": "FloatProgressModel",
65 | "model_module_version": "1.5.0",
66 | "state": {
67 | "_dom_classes": [],
68 | "_model_module": "@jupyter-widgets/controls",
69 | "_model_module_version": "1.5.0",
70 | "_model_name": "FloatProgressModel",
71 | "_view_count": null,
72 | "_view_module": "@jupyter-widgets/controls",
73 | "_view_module_version": "1.5.0",
74 | "_view_name": "ProgressView",
75 | "bar_style": "success",
76 | "description": "",
77 | "description_tooltip": null,
78 | "layout": "IPY_MODEL_3f6e48f483774947bf59bf143a89c76d",
79 | "max": 780721338,
80 | "min": 0,
81 | "orientation": "horizontal",
82 | "style": "IPY_MODEL_ee74e78455634b59a4d9cea414f1780f",
83 | "value": 780721338
84 | }
85 | },
86 | "2d040286969446e1818a6b53616ca163": {
87 | "model_module": "@jupyter-widgets/controls",
88 | "model_name": "HTMLModel",
89 | "model_module_version": "1.5.0",
90 | "state": {
91 | "_dom_classes": [],
92 | "_model_module": "@jupyter-widgets/controls",
93 | "_model_module_version": "1.5.0",
94 | "_model_name": "HTMLModel",
95 | "_view_count": null,
96 | "_view_module": "@jupyter-widgets/controls",
97 | "_view_module_version": "1.5.0",
98 | "_view_name": "HTMLView",
99 | "description": "",
100 | "description_tooltip": null,
101 | "layout": "IPY_MODEL_b4e7f4b921574c4cace9cafe6c13b34c",
102 | "placeholder": "",
103 | "style": "IPY_MODEL_66bca6fc67d34d1bb5c68bd645df0c59",
104 | "value": " 781M/781M [00:11<00:00, 117MB/s]"
105 | }
106 | },
107 | "48ee6b5e634948a7bec5d3ae58e32dba": {
108 | "model_module": "@jupyter-widgets/base",
109 | "model_name": "LayoutModel",
110 | "model_module_version": "1.2.0",
111 | "state": {
112 | "_model_module": "@jupyter-widgets/base",
113 | "_model_module_version": "1.2.0",
114 | "_model_name": "LayoutModel",
115 | "_view_count": null,
116 | "_view_module": "@jupyter-widgets/base",
117 | "_view_module_version": "1.2.0",
118 | "_view_name": "LayoutView",
119 | "align_content": null,
120 | "align_items": null,
121 | "align_self": null,
122 | "border": null,
123 | "bottom": null,
124 | "display": null,
125 | "flex": null,
126 | "flex_flow": null,
127 | "grid_area": null,
128 | "grid_auto_columns": null,
129 | "grid_auto_flow": null,
130 | "grid_auto_rows": null,
131 | "grid_column": null,
132 | "grid_gap": null,
133 | "grid_row": null,
134 | "grid_template_areas": null,
135 | "grid_template_columns": null,
136 | "grid_template_rows": null,
137 | "height": null,
138 | "justify_content": null,
139 | "justify_items": null,
140 | "left": null,
141 | "margin": null,
142 | "max_height": null,
143 | "max_width": null,
144 | "min_height": null,
145 | "min_width": null,
146 | "object_fit": null,
147 | "object_position": null,
148 | "order": null,
149 | "overflow": null,
150 | "overflow_x": null,
151 | "overflow_y": null,
152 | "padding": null,
153 | "right": null,
154 | "top": null,
155 | "visibility": null,
156 | "width": null
157 | }
158 | },
159 | "561eeac4fa5e4134a143d5ee08e5bf21": {
160 | "model_module": "@jupyter-widgets/base",
161 | "model_name": "LayoutModel",
162 | "model_module_version": "1.2.0",
163 | "state": {
164 | "_model_module": "@jupyter-widgets/base",
165 | "_model_module_version": "1.2.0",
166 | "_model_name": "LayoutModel",
167 | "_view_count": null,
168 | "_view_module": "@jupyter-widgets/base",
169 | "_view_module_version": "1.2.0",
170 | "_view_name": "LayoutView",
171 | "align_content": null,
172 | "align_items": null,
173 | "align_self": null,
174 | "border": null,
175 | "bottom": null,
176 | "display": null,
177 | "flex": null,
178 | "flex_flow": null,
179 | "grid_area": null,
180 | "grid_auto_columns": null,
181 | "grid_auto_flow": null,
182 | "grid_auto_rows": null,
183 | "grid_column": null,
184 | "grid_gap": null,
185 | "grid_row": null,
186 | "grid_template_areas": null,
187 | "grid_template_columns": null,
188 | "grid_template_rows": null,
189 | "height": null,
190 | "justify_content": null,
191 | "justify_items": null,
192 | "left": null,
193 | "margin": null,
194 | "max_height": null,
195 | "max_width": null,
196 | "min_height": null,
197 | "min_width": null,
198 | "object_fit": null,
199 | "object_position": null,
200 | "order": null,
201 | "overflow": null,
202 | "overflow_x": null,
203 | "overflow_y": null,
204 | "padding": null,
205 | "right": null,
206 | "top": null,
207 | "visibility": null,
208 | "width": null
209 | }
210 | },
211 | "8b4bbb4cb2e144fd86cfc4c72fc21df6": {
212 | "model_module": "@jupyter-widgets/controls",
213 | "model_name": "DescriptionStyleModel",
214 | "model_module_version": "1.5.0",
215 | "state": {
216 | "_model_module": "@jupyter-widgets/controls",
217 | "_model_module_version": "1.5.0",
218 | "_model_name": "DescriptionStyleModel",
219 | "_view_count": null,
220 | "_view_module": "@jupyter-widgets/base",
221 | "_view_module_version": "1.2.0",
222 | "_view_name": "StyleView",
223 | "description_width": ""
224 | }
225 | },
226 | "3f6e48f483774947bf59bf143a89c76d": {
227 | "model_module": "@jupyter-widgets/base",
228 | "model_name": "LayoutModel",
229 | "model_module_version": "1.2.0",
230 | "state": {
231 | "_model_module": "@jupyter-widgets/base",
232 | "_model_module_version": "1.2.0",
233 | "_model_name": "LayoutModel",
234 | "_view_count": null,
235 | "_view_module": "@jupyter-widgets/base",
236 | "_view_module_version": "1.2.0",
237 | "_view_name": "LayoutView",
238 | "align_content": null,
239 | "align_items": null,
240 | "align_self": null,
241 | "border": null,
242 | "bottom": null,
243 | "display": null,
244 | "flex": null,
245 | "flex_flow": null,
246 | "grid_area": null,
247 | "grid_auto_columns": null,
248 | "grid_auto_flow": null,
249 | "grid_auto_rows": null,
250 | "grid_column": null,
251 | "grid_gap": null,
252 | "grid_row": null,
253 | "grid_template_areas": null,
254 | "grid_template_columns": null,
255 | "grid_template_rows": null,
256 | "height": null,
257 | "justify_content": null,
258 | "justify_items": null,
259 | "left": null,
260 | "margin": null,
261 | "max_height": null,
262 | "max_width": null,
263 | "min_height": null,
264 | "min_width": null,
265 | "object_fit": null,
266 | "object_position": null,
267 | "order": null,
268 | "overflow": null,
269 | "overflow_x": null,
270 | "overflow_y": null,
271 | "padding": null,
272 | "right": null,
273 | "top": null,
274 | "visibility": null,
275 | "width": null
276 | }
277 | },
278 | "ee74e78455634b59a4d9cea414f1780f": {
279 | "model_module": "@jupyter-widgets/controls",
280 | "model_name": "ProgressStyleModel",
281 | "model_module_version": "1.5.0",
282 | "state": {
283 | "_model_module": "@jupyter-widgets/controls",
284 | "_model_module_version": "1.5.0",
285 | "_model_name": "ProgressStyleModel",
286 | "_view_count": null,
287 | "_view_module": "@jupyter-widgets/base",
288 | "_view_module_version": "1.2.0",
289 | "_view_name": "StyleView",
290 | "bar_color": null,
291 | "description_width": ""
292 | }
293 | },
294 | "b4e7f4b921574c4cace9cafe6c13b34c": {
295 | "model_module": "@jupyter-widgets/base",
296 | "model_name": "LayoutModel",
297 | "model_module_version": "1.2.0",
298 | "state": {
299 | "_model_module": "@jupyter-widgets/base",
300 | "_model_module_version": "1.2.0",
301 | "_model_name": "LayoutModel",
302 | "_view_count": null,
303 | "_view_module": "@jupyter-widgets/base",
304 | "_view_module_version": "1.2.0",
305 | "_view_name": "LayoutView",
306 | "align_content": null,
307 | "align_items": null,
308 | "align_self": null,
309 | "border": null,
310 | "bottom": null,
311 | "display": null,
312 | "flex": null,
313 | "flex_flow": null,
314 | "grid_area": null,
315 | "grid_auto_columns": null,
316 | "grid_auto_flow": null,
317 | "grid_auto_rows": null,
318 | "grid_column": null,
319 | "grid_gap": null,
320 | "grid_row": null,
321 | "grid_template_areas": null,
322 | "grid_template_columns": null,
323 | "grid_template_rows": null,
324 | "height": null,
325 | "justify_content": null,
326 | "justify_items": null,
327 | "left": null,
328 | "margin": null,
329 | "max_height": null,
330 | "max_width": null,
331 | "min_height": null,
332 | "min_width": null,
333 | "object_fit": null,
334 | "object_position": null,
335 | "order": null,
336 | "overflow": null,
337 | "overflow_x": null,
338 | "overflow_y": null,
339 | "padding": null,
340 | "right": null,
341 | "top": null,
342 | "visibility": null,
343 | "width": null
344 | }
345 | },
346 | "66bca6fc67d34d1bb5c68bd645df0c59": {
347 | "model_module": "@jupyter-widgets/controls",
348 | "model_name": "DescriptionStyleModel",
349 | "model_module_version": "1.5.0",
350 | "state": {
351 | "_model_module": "@jupyter-widgets/controls",
352 | "_model_module_version": "1.5.0",
353 | "_model_name": "DescriptionStyleModel",
354 | "_view_count": null,
355 | "_view_module": "@jupyter-widgets/base",
356 | "_view_module_version": "1.2.0",
357 | "_view_name": "StyleView",
358 | "description_width": ""
359 | }
360 | },
361 | "4beb968ea9bf4666bcc52ea171f4226b": {
362 | "model_module": "@jupyter-widgets/controls",
363 | "model_name": "HBoxModel",
364 | "model_module_version": "1.5.0",
365 | "state": {
366 | "_dom_classes": [],
367 | "_model_module": "@jupyter-widgets/controls",
368 | "_model_module_version": "1.5.0",
369 | "_model_name": "HBoxModel",
370 | "_view_count": null,
371 | "_view_module": "@jupyter-widgets/controls",
372 | "_view_module_version": "1.5.0",
373 | "_view_name": "HBoxView",
374 | "box_style": "",
375 | "children": [
376 | "IPY_MODEL_c88c0ef2a51144188cd5cd25747d4eea",
377 | "IPY_MODEL_5d9f8e6989324983895d0e6caf4c2c0a",
378 | "IPY_MODEL_b74d1074999c4c079db3fda33543b1ab"
379 | ],
380 | "layout": "IPY_MODEL_46a914846d9049ec93db190ff2b5ef40"
381 | }
382 | },
383 | "c88c0ef2a51144188cd5cd25747d4eea": {
384 | "model_module": "@jupyter-widgets/controls",
385 | "model_name": "HTMLModel",
386 | "model_module_version": "1.5.0",
387 | "state": {
388 | "_dom_classes": [],
389 | "_model_module": "@jupyter-widgets/controls",
390 | "_model_module_version": "1.5.0",
391 | "_model_name": "HTMLModel",
392 | "_view_count": null,
393 | "_view_module": "@jupyter-widgets/controls",
394 | "_view_module_version": "1.5.0",
395 | "_view_name": "HTMLView",
396 | "description": "",
397 | "description_tooltip": null,
398 | "layout": "IPY_MODEL_7c6edd53b4af41d7ad0eb79ae02bddcf",
399 | "placeholder": "",
400 | "style": "IPY_MODEL_62afe04f90034d8095b6fd2630d7845d",
401 | "value": "Generating train split: 100%"
402 | }
403 | },
404 | "5d9f8e6989324983895d0e6caf4c2c0a": {
405 | "model_module": "@jupyter-widgets/controls",
406 | "model_name": "FloatProgressModel",
407 | "model_module_version": "1.5.0",
408 | "state": {
409 | "_dom_classes": [],
410 | "_model_module": "@jupyter-widgets/controls",
411 | "_model_module_version": "1.5.0",
412 | "_model_name": "FloatProgressModel",
413 | "_view_count": null,
414 | "_view_module": "@jupyter-widgets/controls",
415 | "_view_module_version": "1.5.0",
416 | "_view_name": "ProgressView",
417 | "bar_style": "success",
418 | "description": "",
419 | "description_tooltip": null,
420 | "layout": "IPY_MODEL_981b459b97ad4d099873b23408970ac4",
421 | "max": 1000000,
422 | "min": 0,
423 | "orientation": "horizontal",
424 | "style": "IPY_MODEL_64fc088c40ee426c87a17421a14ea9b5",
425 | "value": 1000000
426 | }
427 | },
428 | "b74d1074999c4c079db3fda33543b1ab": {
429 | "model_module": "@jupyter-widgets/controls",
430 | "model_name": "HTMLModel",
431 | "model_module_version": "1.5.0",
432 | "state": {
433 | "_dom_classes": [],
434 | "_model_module": "@jupyter-widgets/controls",
435 | "_model_module_version": "1.5.0",
436 | "_model_name": "HTMLModel",
437 | "_view_count": null,
438 | "_view_module": "@jupyter-widgets/controls",
439 | "_view_module_version": "1.5.0",
440 | "_view_name": "HTMLView",
441 | "description": "",
442 | "description_tooltip": null,
443 | "layout": "IPY_MODEL_316f27747662462aaf689620788f71ab",
444 | "placeholder": "",
445 | "style": "IPY_MODEL_c0d04991fb754e23af1a3942dcf06d45",
446 | "value": " 1000000/1000000 [00:36<00:00, 30208.23 examples/s]"
447 | }
448 | },
449 | "46a914846d9049ec93db190ff2b5ef40": {
450 | "model_module": "@jupyter-widgets/base",
451 | "model_name": "LayoutModel",
452 | "model_module_version": "1.2.0",
453 | "state": {
454 | "_model_module": "@jupyter-widgets/base",
455 | "_model_module_version": "1.2.0",
456 | "_model_name": "LayoutModel",
457 | "_view_count": null,
458 | "_view_module": "@jupyter-widgets/base",
459 | "_view_module_version": "1.2.0",
460 | "_view_name": "LayoutView",
461 | "align_content": null,
462 | "align_items": null,
463 | "align_self": null,
464 | "border": null,
465 | "bottom": null,
466 | "display": null,
467 | "flex": null,
468 | "flex_flow": null,
469 | "grid_area": null,
470 | "grid_auto_columns": null,
471 | "grid_auto_flow": null,
472 | "grid_auto_rows": null,
473 | "grid_column": null,
474 | "grid_gap": null,
475 | "grid_row": null,
476 | "grid_template_areas": null,
477 | "grid_template_columns": null,
478 | "grid_template_rows": null,
479 | "height": null,
480 | "justify_content": null,
481 | "justify_items": null,
482 | "left": null,
483 | "margin": null,
484 | "max_height": null,
485 | "max_width": null,
486 | "min_height": null,
487 | "min_width": null,
488 | "object_fit": null,
489 | "object_position": null,
490 | "order": null,
491 | "overflow": null,
492 | "overflow_x": null,
493 | "overflow_y": null,
494 | "padding": null,
495 | "right": null,
496 | "top": null,
497 | "visibility": null,
498 | "width": null
499 | }
500 | },
501 | "7c6edd53b4af41d7ad0eb79ae02bddcf": {
502 | "model_module": "@jupyter-widgets/base",
503 | "model_name": "LayoutModel",
504 | "model_module_version": "1.2.0",
505 | "state": {
506 | "_model_module": "@jupyter-widgets/base",
507 | "_model_module_version": "1.2.0",
508 | "_model_name": "LayoutModel",
509 | "_view_count": null,
510 | "_view_module": "@jupyter-widgets/base",
511 | "_view_module_version": "1.2.0",
512 | "_view_name": "LayoutView",
513 | "align_content": null,
514 | "align_items": null,
515 | "align_self": null,
516 | "border": null,
517 | "bottom": null,
518 | "display": null,
519 | "flex": null,
520 | "flex_flow": null,
521 | "grid_area": null,
522 | "grid_auto_columns": null,
523 | "grid_auto_flow": null,
524 | "grid_auto_rows": null,
525 | "grid_column": null,
526 | "grid_gap": null,
527 | "grid_row": null,
528 | "grid_template_areas": null,
529 | "grid_template_columns": null,
530 | "grid_template_rows": null,
531 | "height": null,
532 | "justify_content": null,
533 | "justify_items": null,
534 | "left": null,
535 | "margin": null,
536 | "max_height": null,
537 | "max_width": null,
538 | "min_height": null,
539 | "min_width": null,
540 | "object_fit": null,
541 | "object_position": null,
542 | "order": null,
543 | "overflow": null,
544 | "overflow_x": null,
545 | "overflow_y": null,
546 | "padding": null,
547 | "right": null,
548 | "top": null,
549 | "visibility": null,
550 | "width": null
551 | }
552 | },
553 | "62afe04f90034d8095b6fd2630d7845d": {
554 | "model_module": "@jupyter-widgets/controls",
555 | "model_name": "DescriptionStyleModel",
556 | "model_module_version": "1.5.0",
557 | "state": {
558 | "_model_module": "@jupyter-widgets/controls",
559 | "_model_module_version": "1.5.0",
560 | "_model_name": "DescriptionStyleModel",
561 | "_view_count": null,
562 | "_view_module": "@jupyter-widgets/base",
563 | "_view_module_version": "1.2.0",
564 | "_view_name": "StyleView",
565 | "description_width": ""
566 | }
567 | },
568 | "981b459b97ad4d099873b23408970ac4": {
569 | "model_module": "@jupyter-widgets/base",
570 | "model_name": "LayoutModel",
571 | "model_module_version": "1.2.0",
572 | "state": {
573 | "_model_module": "@jupyter-widgets/base",
574 | "_model_module_version": "1.2.0",
575 | "_model_name": "LayoutModel",
576 | "_view_count": null,
577 | "_view_module": "@jupyter-widgets/base",
578 | "_view_module_version": "1.2.0",
579 | "_view_name": "LayoutView",
580 | "align_content": null,
581 | "align_items": null,
582 | "align_self": null,
583 | "border": null,
584 | "bottom": null,
585 | "display": null,
586 | "flex": null,
587 | "flex_flow": null,
588 | "grid_area": null,
589 | "grid_auto_columns": null,
590 | "grid_auto_flow": null,
591 | "grid_auto_rows": null,
592 | "grid_column": null,
593 | "grid_gap": null,
594 | "grid_row": null,
595 | "grid_template_areas": null,
596 | "grid_template_columns": null,
597 | "grid_template_rows": null,
598 | "height": null,
599 | "justify_content": null,
600 | "justify_items": null,
601 | "left": null,
602 | "margin": null,
603 | "max_height": null,
604 | "max_width": null,
605 | "min_height": null,
606 | "min_width": null,
607 | "object_fit": null,
608 | "object_position": null,
609 | "order": null,
610 | "overflow": null,
611 | "overflow_x": null,
612 | "overflow_y": null,
613 | "padding": null,
614 | "right": null,
615 | "top": null,
616 | "visibility": null,
617 | "width": null
618 | }
619 | },
620 | "64fc088c40ee426c87a17421a14ea9b5": {
621 | "model_module": "@jupyter-widgets/controls",
622 | "model_name": "ProgressStyleModel",
623 | "model_module_version": "1.5.0",
624 | "state": {
625 | "_model_module": "@jupyter-widgets/controls",
626 | "_model_module_version": "1.5.0",
627 | "_model_name": "ProgressStyleModel",
628 | "_view_count": null,
629 | "_view_module": "@jupyter-widgets/base",
630 | "_view_module_version": "1.2.0",
631 | "_view_name": "StyleView",
632 | "bar_color": null,
633 | "description_width": ""
634 | }
635 | },
636 | "316f27747662462aaf689620788f71ab": {
637 | "model_module": "@jupyter-widgets/base",
638 | "model_name": "LayoutModel",
639 | "model_module_version": "1.2.0",
640 | "state": {
641 | "_model_module": "@jupyter-widgets/base",
642 | "_model_module_version": "1.2.0",
643 | "_model_name": "LayoutModel",
644 | "_view_count": null,
645 | "_view_module": "@jupyter-widgets/base",
646 | "_view_module_version": "1.2.0",
647 | "_view_name": "LayoutView",
648 | "align_content": null,
649 | "align_items": null,
650 | "align_self": null,
651 | "border": null,
652 | "bottom": null,
653 | "display": null,
654 | "flex": null,
655 | "flex_flow": null,
656 | "grid_area": null,
657 | "grid_auto_columns": null,
658 | "grid_auto_flow": null,
659 | "grid_auto_rows": null,
660 | "grid_column": null,
661 | "grid_gap": null,
662 | "grid_row": null,
663 | "grid_template_areas": null,
664 | "grid_template_columns": null,
665 | "grid_template_rows": null,
666 | "height": null,
667 | "justify_content": null,
668 | "justify_items": null,
669 | "left": null,
670 | "margin": null,
671 | "max_height": null,
672 | "max_width": null,
673 | "min_height": null,
674 | "min_width": null,
675 | "object_fit": null,
676 | "object_position": null,
677 | "order": null,
678 | "overflow": null,
679 | "overflow_x": null,
680 | "overflow_y": null,
681 | "padding": null,
682 | "right": null,
683 | "top": null,
684 | "visibility": null,
685 | "width": null
686 | }
687 | },
688 | "c0d04991fb754e23af1a3942dcf06d45": {
689 | "model_module": "@jupyter-widgets/controls",
690 | "model_name": "DescriptionStyleModel",
691 | "model_module_version": "1.5.0",
692 | "state": {
693 | "_model_module": "@jupyter-widgets/controls",
694 | "_model_module_version": "1.5.0",
695 | "_model_name": "DescriptionStyleModel",
696 | "_view_count": null,
697 | "_view_module": "@jupyter-widgets/base",
698 | "_view_module_version": "1.2.0",
699 | "_view_name": "StyleView",
700 | "description_width": ""
701 | }
702 | }
703 | }
704 | }
705 | },
706 | "cells": [
707 | {
708 | "cell_type": "markdown",
709 | "metadata": {
710 | "id": "view-in-github",
711 | "colab_type": "text"
712 | },
713 | "source": [
714 | "
"
715 | ]
716 | },
717 | {
718 | "cell_type": "code",
719 | "execution_count": 6,
720 | "metadata": {
721 | "id": "edMbM4ZtMeFs"
722 | },
723 | "outputs": [],
724 | "source": [
725 | "# CELL 1: Install Libraries\n",
726 | "\n",
727 | "!pip install datasets pandas google-cloud-bigquery pyarrow db-dtypes -q"
728 | ]
729 | },
730 | {
731 | "cell_type": "code",
732 | "source": [
733 | "# CELL 2: Authenticate to Google Cloud\n",
734 | "from google.colab import auth\n",
735 | "auth.authenticate_user()\n",
736 | "print('✅ Authenticated')\n"
737 | ],
738 | "metadata": {
739 | "id": "L46Um3ftOETP",
740 | "colab": {
741 | "base_uri": "https://localhost:8080/"
742 | },
743 | "outputId": "ac104c9f-ff70-41ce-908e-6c66c846dae0"
744 | },
745 | "execution_count": 7,
746 | "outputs": [
747 | {
748 | "output_type": "stream",
749 | "name": "stdout",
750 | "text": [
751 | "✅ Authenticated\n"
752 | ]
753 | }
754 | ]
755 | },
756 | {
757 | "cell_type": "code",
758 | "source": [
759 | "# CELL 3: Download from HF & Load into BigQuery (with retries)\n",
760 | "\n",
761 | "# === CONFIGURATION: REPLACE THESE ===\n",
762 | "gcp_project_id = \"azw-ua\" # ← your GCP project ID\n",
763 | "bq_dataset_id = \"real_estate_data\" # ← your existing BigQuery dataset\n",
764 | "bq_table_id = \"divar_real_estate_ads\" # ← name for the new table\n",
765 | "hf_dataset = \"divaroffical/real_estate_ads\"\n",
766 | "hf_split = \"train\"\n",
767 | "bq_location = \"US\" # ← match your dataset location\n",
768 | "# ===================================\n",
769 | "\n",
770 | "import time\n",
771 | "import pandas as pd\n",
772 | "from datasets import load_dataset\n",
773 | "from google.cloud import bigquery\n",
774 | "\n",
775 | "# Full table reference\n",
776 | "table_ref = f\"{gcp_project_id}.{bq_dataset_id}.{bq_table_id}\"\n",
777 | "\n",
778 | "print(f\"→ HF dataset: {hf_dataset} [{hf_split}]\")\n",
779 | "print(f\"→ BQ table: {table_ref} (location={bq_location})\\n\")\n",
780 | "\n",
781 | "# 1) Download HF dataset\n",
782 | "print(\"1) Downloading Hugging Face dataset…\")\n",
783 | "hf_ds = load_dataset(hf_dataset, split=hf_split)\n",
784 | "df = hf_ds.to_pandas()\n",
785 | "print(f\" → Downloaded & converted to DataFrame: {df.shape[0]} rows, {df.shape[1]} cols\\n\")\n",
786 | "\n",
787 | "# 2) Initialize BQ client\n",
788 | "client = bigquery.Client(project=gcp_project_id, location=bq_location)\n",
789 | "job_config = bigquery.LoadJobConfig(\n",
790 | " write_disposition=\"WRITE_TRUNCATE\",\n",
791 | " autodetect=True,\n",
792 | ")\n",
793 | "\n",
794 | "# 3) Upload with retries\n",
795 | "max_retries = 5\n",
796 | "for attempt in range(1, max_retries+1):\n",
797 | " try:\n",
798 | " print(f\"{attempt=}: Starting load_job…\")\n",
799 | " job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)\n",
800 | " job.result() # wait for completion\n",
801 | " print(f\"✅ Loaded {job.output_rows} rows into {table_ref}\")\n",
802 | " break\n",
803 | " except Exception as err:\n",
804 | " print(f\"❌ Attempt {attempt} failed: {err}\")\n",
805 | " if attempt == max_retries:\n",
806 | " raise RuntimeError(\"All retries failed—aborting.\") from err\n",
807 | " backoff = 2 ** attempt\n",
808 | " print(f\" ↳ retrying in {backoff}s…\")\n",
809 | " time.sleep(backoff)\n",
810 | "\n",
811 | "print(\"\\n🎉 All done!\")\n"
812 | ],
813 | "metadata": {
814 | "id": "3UXzjvGlOJ74",
815 | "colab": {
816 | "base_uri": "https://localhost:8080/",
817 | "height": 255,
818 | "referenced_widgets": [
819 | "cf441a3e0a58460a902f8ece8095abcd",
820 | "f38efd6522cf47c2a80d4230a3673bb2",
821 | "b90a72eeccaa4f1c889f652ec0f56a84",
822 | "2d040286969446e1818a6b53616ca163",
823 | "48ee6b5e634948a7bec5d3ae58e32dba",
824 | "561eeac4fa5e4134a143d5ee08e5bf21",
825 | "8b4bbb4cb2e144fd86cfc4c72fc21df6",
826 | "3f6e48f483774947bf59bf143a89c76d",
827 | "ee74e78455634b59a4d9cea414f1780f",
828 | "b4e7f4b921574c4cace9cafe6c13b34c",
829 | "66bca6fc67d34d1bb5c68bd645df0c59",
830 | "4beb968ea9bf4666bcc52ea171f4226b",
831 | "c88c0ef2a51144188cd5cd25747d4eea",
832 | "5d9f8e6989324983895d0e6caf4c2c0a",
833 | "b74d1074999c4c079db3fda33543b1ab",
834 | "46a914846d9049ec93db190ff2b5ef40",
835 | "7c6edd53b4af41d7ad0eb79ae02bddcf",
836 | "62afe04f90034d8095b6fd2630d7845d",
837 | "981b459b97ad4d099873b23408970ac4",
838 | "64fc088c40ee426c87a17421a14ea9b5",
839 | "316f27747662462aaf689620788f71ab",
840 | "c0d04991fb754e23af1a3942dcf06d45"
841 | ]
842 | },
843 | "outputId": "7964477d-a590-43ea-9b31-6b39817e21c2"
844 | },
845 | "execution_count": 9,
846 | "outputs": [
847 | {
848 | "metadata": {
849 | "tags": null
850 | },
851 | "name": "stdout",
852 | "output_type": "stream",
853 | "text": [
854 | "→ HF dataset: divaroffical/real_estate_ads [train]\n",
855 | "→ BQ table: azw-ua.real_estate_data.divar_real_estate_ads (location=US)\n",
856 | "\n",
857 | "1) Downloading Hugging Face dataset…\n"
858 | ]
859 | },
860 | {
861 | "data": {
862 | "application/vnd.jupyter.widget-view+json": {
863 | "model_id": "cf441a3e0a58460a902f8ece8095abcd",
864 | "version_major": 2,
865 | "version_minor": 0
866 | },
867 | "text/plain": [
868 | "real_estate_ads.csv: 0%| | 0.00/781M [00:00, ?B/s]"
869 | ]
870 | },
871 | "metadata": {},
872 | "output_type": "display_data"
873 | },
874 | {
875 | "data": {
876 | "application/vnd.jupyter.widget-view+json": {
877 | "model_id": "4beb968ea9bf4666bcc52ea171f4226b",
878 | "version_major": 2,
879 | "version_minor": 0
880 | },
881 | "text/plain": [
882 | "Generating train split: 0%| | 0/1000000 [00:00, ? examples/s]"
883 | ]
884 | },
885 | "metadata": {},
886 | "output_type": "display_data"
887 | },
888 | {
889 | "output_type": "stream",
890 | "name": "stdout",
891 | "text": [
892 | " → Downloaded & converted to DataFrame: 1000000 rows, 60 cols\n",
893 | "\n",
894 | "attempt=1: Starting load_job…\n",
895 | "✅ Loaded 1000000 rows into azw-ua.real_estate_data.divar_real_estate_ads\n",
896 | "\n",
897 | "🎉 All done!\n"
898 | ]
899 | }
900 | ]
901 | }
902 | ]
903 | }
--------------------------------------------------------------------------------