├── .gitignore ├── GA4_tables_backfill.ipynb ├── README.md ├── backfill-GA4-schema.md ├── backfill-ga4.py ├── config.json └── tansfer_divar_data_from_huggingface_to_bigquery.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /GA4_tables_backfill.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "include_colab_link": true 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "id": "0WJNl6xTYm4_", 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "!pip install google-analytics-data==0.18.4\n", 38 | "!pip install google-cloud-bigquery\n", 39 | "!pip install google-auth==2.27.0\n", 40 | "!pip install google-auth-oauthlib\n", 41 | "!pip install google-auth-httplib2" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "source": [ 47 | "import pandas as pd\n", 48 | "from google.cloud import bigquery\n", 49 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n", 50 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n", 51 | "from google.oauth2 import service_account\n", 52 | "from google_auth_oauthlib.flow import Flow\n", 53 | "import json\n", 54 | "import os\n", 55 | "import pickle\n", 56 | "\n", 57 | "# Load configuration from a JSON file\n", 58 | "with open(\"config.json\", \"r\") as f:\n", 59 | " config = json.load(f)\n", 60 | "\n", 61 | "# Authenticate with service account for BigQuery\n", 62 | "creds1 = service_account.Credentials.from_service_account_file(\n", 63 | " config['SERVICE_ACCOUNT_FILE'],\n", 64 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n", 65 | ")\n", 66 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n", 67 | "\n", 68 | "# Authenticate for GA4 Analytics Data API using OAuth2\n", 69 | "def authenticate_ga4():\n", 70 | " creds = None\n", 71 | " if os.path.exists('token.pickle'):\n", 72 | " with open('token.pickle', 'rb') as token:\n", 73 | " creds = pickle.load(token)\n", 74 | " else:\n", 75 | " flow = 
Flow.from_client_secrets_file(\n", 76 | " config['CLIENT_SECRET_FILE'],\n", 77 | " scopes=config['SCOPES'],\n", 78 | " redirect_uri='http://localhost:8080/'\n", 79 | " )\n", 80 | " auth_url, _ = flow.authorization_url(prompt='consent')\n", 81 | " print('Please go to this URL and finish the authentication: ', auth_url)\n", 82 | " code = input('Enter the authorization code: ')\n", 83 | " flow.fetch_token(code=code)\n", 84 | " creds = flow.credentials\n", 85 | " with open('token.pickle', 'wb') as token:\n", 86 | " pickle.dump(creds, token)\n", 87 | " return creds\n", 88 | "\n", 89 | "# Function to paginate and fetch GA4 report data with logging\n", 90 | "def run_report_with_pagination(client, request, limit=10000):\n", 91 | " all_rows = []\n", 92 | " offset = 0\n", 93 | " page_number = 1\n", 94 | "\n", 95 | " while True:\n", 96 | " # Apply offset and limit to request\n", 97 | " request.offset = offset\n", 98 | " request.limit = limit\n", 99 | "\n", 100 | " # Fetch report data\n", 101 | " response = client.run_report(request)\n", 102 | " all_rows.extend(response.rows)\n", 103 | "\n", 104 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n", 105 | "\n", 106 | " # If fewer rows are fetched than the limit, we're done\n", 107 | " if len(response.rows) < limit:\n", 108 | " break\n", 109 | "\n", 110 | " # Update offset and page number to get the next set of rows\n", 111 | " offset += limit\n", 112 | " page_number += 1\n", 113 | "\n", 114 | " return all_rows\n", 115 | "\n", 116 | "# Function to fetch GA4 data using pagination\n", 117 | "def get_ga4_report(client):\n", 118 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n", 119 | " request = RunReportRequest(\n", 120 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n", 121 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n", 122 | " dimensions=[\n", 123 | " Dimension(name='transactionId'),\n", 124 | " Dimension(name='itemName'),\n", 125 | " Dimension(name='date') # Added 'date' dimension\n", 126 | " ],\n", 127 | " metrics=[\n", 128 | " Metric(name='itemPurchaseQuantity'),\n", 129 | " Metric(name='itemRevenue')\n", 130 | " ]\n", 131 | " )\n", 132 | " return run_report_with_pagination(client, request)\n", 133 | "\n", 134 | "# Function to convert GA4 response to a DataFrame\n", 135 | "def response_to_dataframe(response):\n", 136 | " list_rows = []\n", 137 | " for row in response:\n", 138 | " transaction_id = row.dimension_values[0].value\n", 139 | " item_name = row.dimension_values[1].value\n", 140 | " date_value = row.dimension_values[2].value # Added date handling\n", 141 | " list_rows.append({\n", 142 | " 'transactionId': transaction_id,\n", 143 | " 'itemName': item_name,\n", 144 | " 'date': date_value, # Added date column\n", 145 | " 'itemPurchaseQuantity': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n", 146 | " 'itemRevenue': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0\n", 147 | " })\n", 148 | " return pd.DataFrame(list_rows)\n", 149 | "\n", 150 | "# Function to upload data to BigQuery\n", 151 | "def upload_to_bigquery(df, table_id):\n", 152 | " # Define BigQuery schema\n", 153 | " schema = [\n", 154 | " bigquery.SchemaField(\"transactionId\", \"STRING\"),\n", 155 | " bigquery.SchemaField(\"itemName\", \"STRING\"),\n", 156 | " bigquery.SchemaField(\"date\", \"STRING\"), # Added date field in schema\n", 157 | " bigquery.SchemaField(\"itemPurchaseQuantity\", 
\"INTEGER\"),\n", 158 | " bigquery.SchemaField(\"itemRevenue\", \"FLOAT\")\n", 159 | " ]\n", 160 | "\n", 161 | " # Configure BigQuery job to partition the table by the 'transactionId' field\n", 162 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n", 163 | " job_config = bigquery.LoadJobConfig(\n", 164 | " schema=schema,\n", 165 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE\n", 166 | " )\n", 167 | "\n", 168 | " # Upload the DataFrame to BigQuery\n", 169 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n", 170 | " print(f\"Data uploaded to {table_ref}\")\n", 171 | "\n", 172 | "# Main function\n", 173 | "def main():\n", 174 | " try:\n", 175 | " # Authenticate GA4 using OAuth2\n", 176 | " creds = authenticate_ga4()\n", 177 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n", 178 | "\n", 179 | " # Fetch GA4 data\n", 180 | " ga4_response = get_ga4_report(client_ga4)\n", 181 | "\n", 182 | " # Convert the response to a DataFrame\n", 183 | " ga4_df = response_to_dataframe(ga4_response)\n", 184 | "\n", 185 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n", 186 | " table_id = 'ga4_transaction_items'\n", 187 | " csv_filename = f\"{table_id}.csv\"\n", 188 | "\n", 189 | " # Save the DataFrame to a CSV file\n", 190 | " ga4_df.to_csv(csv_filename, index=False)\n", 191 | " print(f\"Data saved to {csv_filename}\")\n", 192 | "\n", 193 | " # Upload the DataFrame to BigQuery\n", 194 | " upload_to_bigquery(ga4_df, table_id)\n", 195 | " except Exception as e:\n", 196 | " print(f\"Error occurred: {e}\")\n", 197 | "\n", 198 | "if __name__ == '__main__':\n", 199 | " main()\n" 200 | ], 201 | "metadata": { 202 | "id": "003OzBhNUl7b" 203 | }, 204 | "execution_count": null, 205 | "outputs": [] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "source": [ 210 | "import pandas as pd\n", 211 | "from google.cloud import bigquery\n", 212 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n", 213 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n", 214 | "from google.oauth2 import service_account\n", 215 | "from google_auth_oauthlib.flow import Flow\n", 216 | "import json\n", 217 | "import os\n", 218 | "import pickle\n", 219 | "\n", 220 | "# Load configuration from a JSON file\n", 221 | "with open(\"config.json\", \"r\") as f:\n", 222 | " config = json.load(f)\n", 223 | "\n", 224 | "# Authenticate with service account for BigQuery\n", 225 | "creds1 = service_account.Credentials.from_service_account_file(\n", 226 | " config['SERVICE_ACCOUNT_FILE'],\n", 227 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n", 228 | ")\n", 229 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n", 230 | "\n", 231 | "# Authenticate for GA4 Analytics Data API using OAuth2\n", 232 | "def authenticate_ga4():\n", 233 | " creds = None\n", 234 | " if os.path.exists('token.pickle'):\n", 235 | " with open('token.pickle', 'rb') as token:\n", 236 | " creds = pickle.load(token)\n", 237 | " else:\n", 238 | " flow = Flow.from_client_secrets_file(\n", 239 | " config['CLIENT_SECRET_FILE'],\n", 240 | " scopes=config['SCOPES'],\n", 241 | " redirect_uri='http://localhost:8080/'\n", 242 | " )\n", 243 | " auth_url, _ = flow.authorization_url(prompt='consent')\n", 244 | " print('Please go to this URL and finish the authentication: ', auth_url)\n", 245 | " code = input('Enter the authorization code: 
')\n", 246 | " flow.fetch_token(code=code)\n", 247 | " creds = flow.credentials\n", 248 | " with open('token.pickle', 'wb') as token:\n", 249 | " pickle.dump(creds, token)\n", 250 | " return creds\n", 251 | "\n", 252 | "# Function to paginate and fetch GA4 report data with logging\n", 253 | "def run_report_with_pagination(client, request, limit=10000):\n", 254 | " all_rows = []\n", 255 | " offset = 0\n", 256 | " page_number = 1\n", 257 | "\n", 258 | " while True:\n", 259 | " # Apply offset and limit to request\n", 260 | " request.offset = offset\n", 261 | " request.limit = limit\n", 262 | "\n", 263 | " # Fetch report data\n", 264 | " response = client.run_report(request)\n", 265 | " all_rows.extend(response.rows)\n", 266 | "\n", 267 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n", 268 | "\n", 269 | " # If fewer rows are fetched than the limit, we're done\n", 270 | " if len(response.rows) < limit:\n", 271 | " break\n", 272 | "\n", 273 | " # Update offset and page number to get the next set of rows\n", 274 | " offset += limit\n", 275 | " page_number += 1\n", 276 | "\n", 277 | " return all_rows\n", 278 | "\n", 279 | "# Function to fetch GA4 data using pagination\n", 280 | "def get_ga4_report(client):\n", 281 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n", 282 | " request = RunReportRequest(\n", 283 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n", 284 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n", 285 | " dimensions=[Dimension(name='date'), Dimension(name='sessionDefaultChannelGroup')],\n", 286 | " metrics=[\n", 287 | " Metric(name='sessions'),\n", 288 | " Metric(name='totalUsers'),\n", 289 | " Metric(name='newUsers'),\n", 290 | " Metric(name='ecommercePurchases'),\n", 291 | " Metric(name='purchaseRevenue'),\n", 292 | " ]\n", 293 | " )\n", 294 | " return run_report_with_pagination(client, request)\n", 295 | "\n", 296 | "# Function to convert GA4 response to a DataFrame\n", 297 | "def response_to_dataframe(response):\n", 298 | " list_rows = []\n", 299 | " for row in response:\n", 300 | " try:\n", 301 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n", 302 | " except ValueError:\n", 303 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n", 304 | " session_channel_group = row.dimension_values[1].value\n", 305 | " list_rows.append({\n", 306 | " 'date': date_value,\n", 307 | " 'sessionPrimaryChannelGroup': session_channel_group,\n", 308 | " 'sessions': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n", 309 | " 'totalUsers': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n", 310 | " 'newUsers': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n", 311 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0,\n", 312 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0\n", 313 | " })\n", 314 | " return pd.DataFrame(list_rows)\n", 315 | "\n", 316 | "# Function to upload data to BigQuery\n", 317 | "def upload_to_bigquery(df, table_id):\n", 318 | " # Define BigQuery schema\n", 319 | " schema = [\n", 320 | " bigquery.SchemaField(\"date\", \"DATE\"),\n", 321 | " bigquery.SchemaField(\"sessionPrimaryChannelGroup\", \"STRING\"),\n", 322 | " bigquery.SchemaField(\"sessions\", \"INTEGER\"),\n", 323 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n", 324 | " 
bigquery.SchemaField(\"newUsers\", \"INTEGER\"),\n", 325 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n", 326 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\")\n", 327 | " ]\n", 328 | "\n", 329 | " # Configure BigQuery job to partition the table by the 'date' column\n", 330 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n", 331 | " job_config = bigquery.LoadJobConfig(\n", 332 | " schema=schema,\n", 333 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n", 334 | " time_partitioning=bigquery.TimePartitioning(\n", 335 | " type_=bigquery.TimePartitioningType.DAY,\n", 336 | " field='date'\n", 337 | " )\n", 338 | " )\n", 339 | "\n", 340 | " # Upload the DataFrame to BigQuery\n", 341 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n", 342 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n", 343 | "\n", 344 | "# Main function\n", 345 | "def main():\n", 346 | " try:\n", 347 | " # Authenticate GA4 using OAuth2\n", 348 | " creds = authenticate_ga4()\n", 349 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n", 350 | "\n", 351 | " # Fetch GA4 data\n", 352 | " ga4_response = get_ga4_report(client_ga4)\n", 353 | "\n", 354 | " # Convert the response to a DataFrame\n", 355 | " ga4_df = response_to_dataframe(ga4_response)\n", 356 | "\n", 357 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n", 358 | " table_id = 'ga4_data_session_channel_group'\n", 359 | " csv_filename = f\"{table_id}.csv\"\n", 360 | "\n", 361 | " # Save the DataFrame to a CSV file\n", 362 | " ga4_df.to_csv(csv_filename, index=False)\n", 363 | " print(f\"Data saved to {csv_filename}\")\n", 364 | "\n", 365 | " # Upload the DataFrame to BigQuery\n", 366 | " upload_to_bigquery(ga4_df, table_id)\n", 367 | " except Exception as e:\n", 368 | " print(f\"Error occurred: {e}\")\n", 369 | "\n", 370 | "if __name__ == '__main__':\n", 371 | " main()" 372 | ], 373 | "metadata": { 374 | "id": "TaCbme6LYqD4", 375 | "collapsed": true 376 | }, 377 | "execution_count": null, 378 | "outputs": [] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "source": [ 383 | "import pandas as pd\n", 384 | "from google.cloud import bigquery\n", 385 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n", 386 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n", 387 | "from google.oauth2 import service_account\n", 388 | "from google_auth_oauthlib.flow import Flow\n", 389 | "import json\n", 390 | "import os\n", 391 | "import pickle\n", 392 | "\n", 393 | "# Load configuration from a JSON file\n", 394 | "with open(\"config.json\", \"r\") as f:\n", 395 | " config = json.load(f)\n", 396 | "\n", 397 | "# Authenticate with service account for BigQuery\n", 398 | "creds1 = service_account.Credentials.from_service_account_file(\n", 399 | " config['SERVICE_ACCOUNT_FILE'],\n", 400 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n", 401 | ")\n", 402 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n", 403 | "\n", 404 | "# Authenticate for GA4 Analytics Data API using OAuth2\n", 405 | "def authenticate_ga4():\n", 406 | " creds = None\n", 407 | " if os.path.exists('token.pickle'):\n", 408 | " with open('token.pickle', 'rb') as token:\n", 409 | " creds = pickle.load(token)\n", 410 | " else:\n", 411 | " flow = Flow.from_client_secrets_file(\n", 412 | " config['CLIENT_SECRET_FILE'],\n", 
413 | " scopes=config['SCOPES'],\n", 414 | " redirect_uri='http://localhost:8080/'\n", 415 | " )\n", 416 | " auth_url, _ = flow.authorization_url(prompt='consent')\n", 417 | " print('Please go to this URL and finish the authentication: ', auth_url)\n", 418 | " code = input('Enter the authorization code: ')\n", 419 | " flow.fetch_token(code=code)\n", 420 | " creds = flow.credentials\n", 421 | " with open('token.pickle', 'wb') as token:\n", 422 | " pickle.dump(creds, token)\n", 423 | " return creds\n", 424 | "\n", 425 | "# Function to paginate and fetch GA4 report data with logging\n", 426 | "def run_report_with_pagination(client, request, limit=10000):\n", 427 | " all_rows = []\n", 428 | " offset = 0\n", 429 | " page_number = 1\n", 430 | "\n", 431 | " while True:\n", 432 | " # Apply offset and limit to request\n", 433 | " request.offset = offset\n", 434 | " request.limit = limit\n", 435 | "\n", 436 | " # Fetch report data\n", 437 | " response = client.run_report(request)\n", 438 | " all_rows.extend(response.rows)\n", 439 | "\n", 440 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n", 441 | "\n", 442 | " # If fewer rows are fetched than the limit, we're done\n", 443 | " if len(response.rows) < limit:\n", 444 | " break\n", 445 | "\n", 446 | " # Update offset and page number to get the next set of rows\n", 447 | " offset += limit\n", 448 | " page_number += 1\n", 449 | "\n", 450 | " return all_rows\n", 451 | "\n", 452 | "# Function to fetch GA4 data using pagination\n", 453 | "def get_ga4_report(client):\n", 454 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n", 455 | " request = RunReportRequest(\n", 456 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n", 457 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n", 458 | " dimensions=[\n", 459 | " Dimension(name='date'),\n", 460 | " Dimension(name='sessionSource'),\n", 461 | " Dimension(name='sessionCampaignName'),\n", 462 | " Dimension(name='sessionMedium')\n", 463 | " ],\n", 464 | " metrics=[\n", 465 | " Metric(name='sessions'),\n", 466 | " Metric(name='totalUsers'),\n", 467 | " Metric(name='newUsers'),\n", 468 | " Metric(name='ecommercePurchases'),\n", 469 | " Metric(name='purchaseRevenue'),\n", 470 | " ]\n", 471 | " )\n", 472 | " return run_report_with_pagination(client, request)\n", 473 | "\n", 474 | "# Function to convert GA4 response to a DataFrame\n", 475 | "def response_to_dataframe(response):\n", 476 | " list_rows = []\n", 477 | " for row in response:\n", 478 | " try:\n", 479 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n", 480 | " except ValueError:\n", 481 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n", 482 | " session_source = row.dimension_values[1].value\n", 483 | " session_campaign_name = row.dimension_values[2].value\n", 484 | " session_medium = row.dimension_values[3].value\n", 485 | " list_rows.append({\n", 486 | " 'date': date_value,\n", 487 | " 'sessionSource': session_source,\n", 488 | " 'sessionCampaignName': session_campaign_name,\n", 489 | " 'sessionMedium': session_medium,\n", 490 | " 'sessions': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n", 491 | " 'totalUsers': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n", 492 | " 'newUsers': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n", 493 | " 'ecommercePurchases': 
pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0,\n", 494 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0\n", 495 | " })\n", 496 | " return pd.DataFrame(list_rows)\n", 497 | "\n", 498 | "# Function to upload data to BigQuery\n", 499 | "def upload_to_bigquery(df, table_id):\n", 500 | " # Define BigQuery schema\n", 501 | " schema = [\n", 502 | " bigquery.SchemaField(\"date\", \"DATE\"),\n", 503 | " bigquery.SchemaField(\"sessionSource\", \"STRING\"),\n", 504 | " bigquery.SchemaField(\"sessionCampaignName\", \"STRING\"),\n", 505 | " bigquery.SchemaField(\"sessionMedium\", \"STRING\"),\n", 506 | " bigquery.SchemaField(\"sessions\", \"INTEGER\"),\n", 507 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n", 508 | " bigquery.SchemaField(\"newUsers\", \"INTEGER\"),\n", 509 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n", 510 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\")\n", 511 | " ]\n", 512 | "\n", 513 | " # Configure BigQuery job to partition the table by the 'date' column\n", 514 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n", 515 | " job_config = bigquery.LoadJobConfig(\n", 516 | " schema=schema,\n", 517 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n", 518 | " time_partitioning=bigquery.TimePartitioning(\n", 519 | " type_=bigquery.TimePartitioningType.DAY,\n", 520 | " field='date'\n", 521 | " )\n", 522 | " )\n", 523 | "\n", 524 | " # Upload the DataFrame to BigQuery\n", 525 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n", 526 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n", 527 | "\n", 528 | "# Main function\n", 529 | "def main():\n", 530 | " try:\n", 531 | " # Authenticate GA4 using OAuth2\n", 532 | " creds = authenticate_ga4()\n", 533 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n", 534 | "\n", 535 | " # Fetch GA4 data\n", 536 | " ga4_response = get_ga4_report(client_ga4)\n", 537 | "\n", 538 | " # Convert the response to a DataFrame\n", 539 | " ga4_df = response_to_dataframe(ga4_response)\n", 540 | "\n", 541 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n", 542 | " table_id = 'ga4_data_session_source_campaign_medium'\n", 543 | " csv_filename = f\"{table_id}.csv\"\n", 544 | "\n", 545 | " # Save the DataFrame to a CSV file\n", 546 | " ga4_df.to_csv(csv_filename, index=False)\n", 547 | " print(f\"Data saved to {csv_filename}\")\n", 548 | "\n", 549 | " # Upload the DataFrame to BigQuery\n", 550 | " upload_to_bigquery(ga4_df, table_id)\n", 551 | " except Exception as e:\n", 552 | " print(f\"Error occurred: {e}\")\n", 553 | "\n", 554 | "if __name__ == '__main__':\n", 555 | " main()\n" 556 | ], 557 | "metadata": { 558 | "id": "Wz5wF6MHbIAC" 559 | }, 560 | "execution_count": null, 561 | "outputs": [] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "source": [ 566 | "import pandas as pd\n", 567 | "from google.cloud import bigquery\n", 568 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n", 569 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n", 570 | "from google.oauth2 import service_account\n", 571 | "from google_auth_oauthlib.flow import Flow\n", 572 | "import json\n", 573 | "import os\n", 574 | "import pickle\n", 575 | "\n", 576 | "# Load configuration from a JSON file\n", 577 | "with open(\"config.json\", \"r\") as f:\n", 578 | " config = json.load(f)\n", 579 | "\n", 580 | "# 
Authenticate with service account for BigQuery\n", 581 | "creds1 = service_account.Credentials.from_service_account_file(\n", 582 | " config['SERVICE_ACCOUNT_FILE'],\n", 583 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n", 584 | ")\n", 585 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n", 586 | "\n", 587 | "# Authenticate for GA4 Analytics Data API using OAuth2\n", 588 | "def authenticate_ga4():\n", 589 | " creds = None\n", 590 | " if os.path.exists('token.pickle'):\n", 591 | " with open('token.pickle', 'rb') as token:\n", 592 | " creds = pickle.load(token)\n", 593 | " else:\n", 594 | " flow = Flow.from_client_secrets_file(\n", 595 | " config['CLIENT_SECRET_FILE'],\n", 596 | " scopes=config['SCOPES'],\n", 597 | " redirect_uri='http://localhost:8080/'\n", 598 | " )\n", 599 | " auth_url, _ = flow.authorization_url(prompt='consent')\n", 600 | " print('Please go to this URL and finish the authentication: ', auth_url)\n", 601 | " code = input('Enter the authorization code: ')\n", 602 | " flow.fetch_token(code=code)\n", 603 | " creds = flow.credentials\n", 604 | " with open('token.pickle', 'wb') as token:\n", 605 | " pickle.dump(creds, token)\n", 606 | " return creds\n", 607 | "\n", 608 | "# Function to paginate and fetch GA4 report data with logging\n", 609 | "def run_report_with_pagination(client, request, limit=10000):\n", 610 | " all_rows = []\n", 611 | " offset = 0\n", 612 | " page_number = 1\n", 613 | "\n", 614 | " while True:\n", 615 | " # Apply offset and limit to request\n", 616 | " request.offset = offset\n", 617 | " request.limit = limit\n", 618 | "\n", 619 | " # Fetch report data\n", 620 | " response = client.run_report(request)\n", 621 | " all_rows.extend(response.rows)\n", 622 | "\n", 623 | " print(f\"Fetching data... 
Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n", 624 | "\n", 625 | " # If fewer rows are fetched than the limit, we're done\n", 626 | " if len(response.rows) < limit:\n", 627 | " break\n", 628 | "\n", 629 | " # Update offset and page number to get the next set of rows\n", 630 | " offset += limit\n", 631 | " page_number += 1\n", 632 | "\n", 633 | " return all_rows\n", 634 | "\n", 635 | "# Function to fetch GA4 data using pagination\n", 636 | "def get_ga4_report(client):\n", 637 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n", 638 | " request = RunReportRequest(\n", 639 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n", 640 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n", 641 | " dimensions=[\n", 642 | " Dimension(name='date'),\n", 643 | " Dimension(name='country'),\n", 644 | " Dimension(name='language'),\n", 645 | " Dimension(name='city')\n", 646 | " ],\n", 647 | " metrics=[\n", 648 | " Metric(name='sessions'),\n", 649 | " Metric(name='screenPageViews'),\n", 650 | " Metric(name='totalUsers'),\n", 651 | " Metric(name='newUsers'),\n", 652 | " Metric(name='ecommercePurchases'),\n", 653 | " Metric(name='purchaseRevenue')\n", 654 | " ]\n", 655 | " )\n", 656 | " return run_report_with_pagination(client, request)\n", 657 | "\n", 658 | "# Function to convert GA4 response to a DataFrame\n", 659 | "def response_to_dataframe(response):\n", 660 | " list_rows = []\n", 661 | " for row in response:\n", 662 | " try:\n", 663 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n", 664 | " except ValueError:\n", 665 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n", 666 | " country = row.dimension_values[1].value\n", 667 | " language = row.dimension_values[2].value\n", 668 | " city = row.dimension_values[3].value\n", 669 | " list_rows.append({\n", 670 | " 'date': date_value,\n", 671 | " 'country': country,\n", 672 | " 'language': language,\n", 673 | " 'city': city,\n", 674 | " 'sessions': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n", 675 | " 'screenPageViews': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n", 676 | " 'totalUsers': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n", 677 | " 'newUsers': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0,\n", 678 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0,\n", 679 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[5].value, errors='coerce') or 0\n", 680 | " })\n", 681 | " return pd.DataFrame(list_rows)\n", 682 | "\n", 683 | "# Function to upload data to BigQuery\n", 684 | "def upload_to_bigquery(df, table_id):\n", 685 | " # Define BigQuery schema\n", 686 | " schema = [\n", 687 | " bigquery.SchemaField(\"date\", \"DATE\"),\n", 688 | " bigquery.SchemaField(\"country\", \"STRING\"),\n", 689 | " bigquery.SchemaField(\"language\", \"STRING\"),\n", 690 | " bigquery.SchemaField(\"city\", \"STRING\"),\n", 691 | " bigquery.SchemaField(\"sessions\", \"INTEGER\"),\n", 692 | " bigquery.SchemaField(\"screenPageViews\", \"INTEGER\"),\n", 693 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n", 694 | " bigquery.SchemaField(\"newUsers\", \"INTEGER\"),\n", 695 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n", 696 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\")\n", 697 | " ]\n", 698 | "\n", 699 | " # Configure BigQuery job to partition the 
table by the 'date' column\n", 700 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n", 701 | " job_config = bigquery.LoadJobConfig(\n", 702 | " schema=schema,\n", 703 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n", 704 | " time_partitioning=bigquery.TimePartitioning(\n", 705 | " type_=bigquery.TimePartitioningType.DAY,\n", 706 | " field='date'\n", 707 | " )\n", 708 | " )\n", 709 | "\n", 710 | " # Upload the DataFrame to BigQuery\n", 711 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n", 712 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n", 713 | "\n", 714 | "# Main function\n", 715 | "def main():\n", 716 | " try:\n", 717 | " # Authenticate GA4 using OAuth2\n", 718 | " creds = authenticate_ga4()\n", 719 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n", 720 | "\n", 721 | " # Fetch GA4 data\n", 722 | " ga4_response = get_ga4_report(client_ga4)\n", 723 | "\n", 724 | " # Convert the response to a DataFrame\n", 725 | " ga4_df = response_to_dataframe(ga4_response)\n", 726 | "\n", 727 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n", 728 | " table_id = 'ga4_data_country_language_city'\n", 729 | " csv_filename = f\"{table_id}.csv\"\n", 730 | "\n", 731 | " # Save the DataFrame to a CSV file\n", 732 | " ga4_df.to_csv(csv_filename, index=False)\n", 733 | " print(f\"Data saved to {csv_filename}\")\n", 734 | "\n", 735 | " # Upload the DataFrame to BigQuery\n", 736 | " upload_to_bigquery(ga4_df, table_id)\n", 737 | " except Exception as e:\n", 738 | " print(f\"Error occurred: {e}\")\n", 739 | "\n", 740 | "if __name__ == '__main__':\n", 741 | " main()\n" 742 | ], 743 | "metadata": { 744 | "id": "e-Oqh-oNfbC1" 745 | }, 746 | "execution_count": null, 747 | "outputs": [] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "source": [ 752 | "import pandas as pd\n", 753 | "from google.cloud import bigquery\n", 754 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n", 755 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n", 756 | "from google.oauth2 import service_account\n", 757 | "from google_auth_oauthlib.flow import Flow\n", 758 | "import json\n", 759 | "import os\n", 760 | "import pickle\n", 761 | "\n", 762 | "# Load configuration from a JSON file\n", 763 | "with open(\"config.json\", \"r\") as f:\n", 764 | " config = json.load(f)\n", 765 | "\n", 766 | "# Authenticate with service account for BigQuery\n", 767 | "creds1 = service_account.Credentials.from_service_account_file(\n", 768 | " config['SERVICE_ACCOUNT_FILE'],\n", 769 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n", 770 | ")\n", 771 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n", 772 | "\n", 773 | "# Authenticate for GA4 Analytics Data API using OAuth2\n", 774 | "def authenticate_ga4():\n", 775 | " creds = None\n", 776 | " if os.path.exists('token.pickle'):\n", 777 | " with open('token.pickle', 'rb') as token:\n", 778 | " creds = pickle.load(token)\n", 779 | " else:\n", 780 | " flow = Flow.from_client_secrets_file(\n", 781 | " config['CLIENT_SECRET_FILE'],\n", 782 | " scopes=config['SCOPES'],\n", 783 | " redirect_uri='http://localhost:8080/'\n", 784 | " )\n", 785 | " auth_url, _ = flow.authorization_url(prompt='consent')\n", 786 | " print('Please go to this URL and finish the authentication: ', auth_url)\n", 787 | " code = input('Enter the 
authorization code: ')\n", 788 | " flow.fetch_token(code=code)\n", 789 | " creds = flow.credentials\n", 790 | " with open('token.pickle', 'wb') as token:\n", 791 | " pickle.dump(creds, token)\n", 792 | " return creds\n", 793 | "\n", 794 | "# Function to paginate and fetch GA4 report data with logging\n", 795 | "def run_report_with_pagination(client, request, limit=1000):\n", 796 | " all_rows = []\n", 797 | " offset = 0\n", 798 | " page_number = 1\n", 799 | "\n", 800 | " while True:\n", 801 | " # Apply offset and limit to request\n", 802 | " request.offset = offset\n", 803 | " request.limit = limit\n", 804 | "\n", 805 | " # Fetch report data\n", 806 | " response = client.run_report(request)\n", 807 | " all_rows.extend(response.rows)\n", 808 | "\n", 809 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n", 810 | "\n", 811 | " # If fewer rows are fetched than the limit, we're done\n", 812 | " if len(response.rows) < limit:\n", 813 | " break\n", 814 | "\n", 815 | " # Update offset and page number to get the next set of rows\n", 816 | " offset += limit\n", 817 | " page_number += 1\n", 818 | "\n", 819 | " return all_rows\n", 820 | "\n", 821 | "# Function to fetch GA4 data using pagination\n", 822 | "def get_ga4_report(client):\n", 823 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n", 824 | " request = RunReportRequest(\n", 825 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n", 826 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n", 827 | " dimensions=[\n", 828 | " Dimension(name='date'),\n", 829 | " Dimension(name='itemName')\n", 830 | " ],\n", 831 | " metrics=[\n", 832 | " Metric(name='itemPurchaseQuantity'),\n", 833 | " Metric(name='itemRevenue')\n", 834 | " ]\n", 835 | " )\n", 836 | " return run_report_with_pagination(client, request)\n", 837 | "\n", 838 | "# Function to convert GA4 response to a DataFrame\n", 839 | "def response_to_dataframe(response):\n", 840 | " list_rows = []\n", 841 | " for row in response:\n", 842 | " try:\n", 843 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n", 844 | " except ValueError:\n", 845 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n", 846 | " item_name = row.dimension_values[1].value\n", 847 | " list_rows.append({\n", 848 | " 'date': date_value,\n", 849 | " 'itemName': item_name,\n", 850 | " 'itemPurchaseQuantity': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n", 851 | " 'itemRevenue': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0\n", 852 | " })\n", 853 | " return pd.DataFrame(list_rows)\n", 854 | "\n", 855 | "# Function to upload data to BigQuery\n", 856 | "def upload_to_bigquery(df, table_id):\n", 857 | " # Define BigQuery schema\n", 858 | " schema = [\n", 859 | " bigquery.SchemaField(\"date\", \"DATE\"),\n", 860 | " bigquery.SchemaField(\"itemName\", \"STRING\"),\n", 861 | " bigquery.SchemaField(\"itemPurchaseQuantity\", \"INTEGER\"),\n", 862 | " bigquery.SchemaField(\"itemRevenue\", \"FLOAT\")\n", 863 | " ]\n", 864 | "\n", 865 | " # Configure BigQuery job to partition the table by the 'date' column\n", 866 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n", 867 | " job_config = bigquery.LoadJobConfig(\n", 868 | " schema=schema,\n", 869 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n", 870 | " time_partitioning=bigquery.TimePartitioning(\n", 871 | " 
type_=bigquery.TimePartitioningType.DAY,\n", 872 | " field='date'\n", 873 | " )\n", 874 | " )\n", 875 | "\n", 876 | " # Upload the DataFrame to BigQuery\n", 877 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n", 878 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n", 879 | "\n", 880 | "# Main function\n", 881 | "def main():\n", 882 | " try:\n", 883 | " # Authenticate GA4 using OAuth2\n", 884 | " creds = authenticate_ga4()\n", 885 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n", 886 | "\n", 887 | " # Fetch GA4 data\n", 888 | " ga4_response = get_ga4_report(client_ga4)\n", 889 | "\n", 890 | " # Convert the response to a DataFrame\n", 891 | " ga4_df = response_to_dataframe(ga4_response)\n", 892 | "\n", 893 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n", 894 | " table_id = 'ga4_data_item_name'\n", 895 | " csv_filename = f\"{table_id}.csv\"\n", 896 | "\n", 897 | " # Save the DataFrame to a CSV file\n", 898 | " ga4_df.to_csv(csv_filename, index=False)\n", 899 | " print(f\"Data saved to {csv_filename}\")\n", 900 | "\n", 901 | " # Upload the DataFrame to BigQuery\n", 902 | " upload_to_bigquery(ga4_df, table_id)\n", 903 | " except Exception as e:\n", 904 | " print(f\"Error occurred: {e}\")\n", 905 | "\n", 906 | "if __name__ == '__main__':\n", 907 | " main()\n" 908 | ], 909 | "metadata": { 910 | "id": "RKU2hiP7gynQ" 911 | }, 912 | "execution_count": null, 913 | "outputs": [] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "source": [ 918 | "import pandas as pd\n", 919 | "from google.cloud import bigquery\n", 920 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n", 921 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n", 922 | "from google.oauth2 import service_account\n", 923 | "from google_auth_oauthlib.flow import Flow\n", 924 | "import json\n", 925 | "import os\n", 926 | "import pickle\n", 927 | "\n", 928 | "# Load configuration from a JSON file\n", 929 | "with open(\"config.json\", \"r\") as f:\n", 930 | " config = json.load(f)\n", 931 | "\n", 932 | "# Authenticate with service account for BigQuery\n", 933 | "creds1 = service_account.Credentials.from_service_account_file(\n", 934 | " config['SERVICE_ACCOUNT_FILE'],\n", 935 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n", 936 | ")\n", 937 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n", 938 | "\n", 939 | "# Authenticate for GA4 Analytics Data API using OAuth2\n", 940 | "def authenticate_ga4():\n", 941 | " creds = None\n", 942 | " if os.path.exists('token.pickle'):\n", 943 | " with open('token.pickle', 'rb') as token:\n", 944 | " creds = pickle.load(token)\n", 945 | " else:\n", 946 | " flow = Flow.from_client_secrets_file(\n", 947 | " config['CLIENT_SECRET_FILE'],\n", 948 | " scopes=config['SCOPES'],\n", 949 | " redirect_uri='http://localhost:8080/'\n", 950 | " )\n", 951 | " auth_url, _ = flow.authorization_url(prompt='consent')\n", 952 | " print('Please go to this URL and finish the authentication: ', auth_url)\n", 953 | " code = input('Enter the authorization code: ')\n", 954 | " flow.fetch_token(code=code)\n", 955 | " creds = flow.credentials\n", 956 | " with open('token.pickle', 'wb') as token:\n", 957 | " pickle.dump(creds, token)\n", 958 | " return creds\n", 959 | "\n", 960 | "# Function to paginate and fetch GA4 report data with logging\n", 961 | "def 
run_report_with_pagination(client, request, limit=1000):\n", 962 | " all_rows = []\n", 963 | " offset = 0\n", 964 | " page_number = 1\n", 965 | "\n", 966 | " while True:\n", 967 | " # Apply offset and limit to request\n", 968 | " request.offset = offset\n", 969 | " request.limit = limit\n", 970 | "\n", 971 | " # Fetch report data\n", 972 | " response = client.run_report(request)\n", 973 | " all_rows.extend(response.rows)\n", 974 | "\n", 975 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n", 976 | "\n", 977 | " # If fewer rows are fetched than the limit, we're done\n", 978 | " if len(response.rows) < limit:\n", 979 | " break\n", 980 | "\n", 981 | " # Update offset and page number to get the next set of rows\n", 982 | " offset += limit\n", 983 | " page_number += 1\n", 984 | "\n", 985 | " return all_rows\n", 986 | "\n", 987 | "# Function to fetch GA4 data using pagination\n", 988 | "def get_ga4_report(client):\n", 989 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n", 990 | " request = RunReportRequest(\n", 991 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n", 992 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n", 993 | " dimensions=[\n", 994 | " Dimension(name='date'),\n", 995 | " Dimension(name='browser'),\n", 996 | " Dimension(name='operatingSystem'),\n", 997 | " Dimension(name='deviceCategory')\n", 998 | " ],\n", 999 | " metrics=[\n", 1000 | " Metric(name='sessions'),\n", 1001 | " Metric(name='screenPageViews'),\n", 1002 | " Metric(name='totalUsers'),\n", 1003 | " Metric(name='newUsers'),\n", 1004 | " Metric(name='ecommercePurchases'),\n", 1005 | " Metric(name='purchaseRevenue')\n", 1006 | " ]\n", 1007 | " )\n", 1008 | " return run_report_with_pagination(client, request)\n", 1009 | "\n", 1010 | "# Function to convert GA4 response to a DataFrame\n", 1011 | "def response_to_dataframe(response):\n", 1012 | " list_rows = []\n", 1013 | " for row in response:\n", 1014 | " try:\n", 1015 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n", 1016 | " except ValueError:\n", 1017 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n", 1018 | " browser = row.dimension_values[1].value\n", 1019 | " operating_system = row.dimension_values[2].value\n", 1020 | " device_category = row.dimension_values[3].value\n", 1021 | " list_rows.append({\n", 1022 | " 'date': date_value,\n", 1023 | " 'browser': browser,\n", 1024 | " 'operatingSystem': operating_system,\n", 1025 | " 'deviceCategory': device_category,\n", 1026 | " 'sessions': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n", 1027 | " 'screenPageViews': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n", 1028 | " 'totalUsers': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n", 1029 | " 'newUsers': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0,\n", 1030 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0,\n", 1031 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[5].value, errors='coerce') or 0\n", 1032 | " })\n", 1033 | " return pd.DataFrame(list_rows)\n", 1034 | "\n", 1035 | "# Function to upload data to BigQuery\n", 1036 | "def upload_to_bigquery(df, table_id):\n", 1037 | " # Define BigQuery schema\n", 1038 | " schema = [\n", 1039 | " bigquery.SchemaField(\"date\", \"DATE\"),\n", 1040 | " bigquery.SchemaField(\"browser\", 
\"STRING\"),\n", 1041 | " bigquery.SchemaField(\"operatingSystem\", \"STRING\"),\n", 1042 | " bigquery.SchemaField(\"deviceCategory\", \"STRING\"),\n", 1043 | " bigquery.SchemaField(\"sessions\", \"INTEGER\"),\n", 1044 | " bigquery.SchemaField(\"screenPageViews\", \"INTEGER\"),\n", 1045 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n", 1046 | " bigquery.SchemaField(\"newUsers\", \"INTEGER\"),\n", 1047 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n", 1048 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\")\n", 1049 | " ]\n", 1050 | "\n", 1051 | " # Configure BigQuery job to partition the table by the 'date' column\n", 1052 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n", 1053 | " job_config = bigquery.LoadJobConfig(\n", 1054 | " schema=schema,\n", 1055 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n", 1056 | " time_partitioning=bigquery.TimePartitioning(\n", 1057 | " type_=bigquery.TimePartitioningType.DAY,\n", 1058 | " field='date'\n", 1059 | " )\n", 1060 | " )\n", 1061 | "\n", 1062 | " # Upload the DataFrame to BigQuery\n", 1063 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n", 1064 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n", 1065 | "\n", 1066 | "# Main function\n", 1067 | "def main():\n", 1068 | " try:\n", 1069 | " # Authenticate GA4 using OAuth2\n", 1070 | " creds = authenticate_ga4()\n", 1071 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n", 1072 | "\n", 1073 | " # Fetch GA4 data\n", 1074 | " ga4_response = get_ga4_report(client_ga4)\n", 1075 | "\n", 1076 | " # Convert the response to a DataFrame\n", 1077 | " ga4_df = response_to_dataframe(ga4_response)\n", 1078 | "\n", 1079 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n", 1080 | " table_id = 'ga4_data_browser_os_device'\n", 1081 | " csv_filename = f\"{table_id}.csv\"\n", 1082 | "\n", 1083 | " # Save the DataFrame to a CSV file\n", 1084 | " ga4_df.to_csv(csv_filename, index=False)\n", 1085 | " print(f\"Data saved to {csv_filename}\")\n", 1086 | "\n", 1087 | " # Upload the DataFrame to BigQuery\n", 1088 | " upload_to_bigquery(ga4_df, table_id)\n", 1089 | " except Exception as e:\n", 1090 | " print(f\"Error occurred: {e}\")\n", 1091 | "\n", 1092 | "if __name__ == '__main__':\n", 1093 | " main()\n" 1094 | ], 1095 | "metadata": { 1096 | "id": "YpYm_kTLiqsy" 1097 | }, 1098 | "execution_count": null, 1099 | "outputs": [] 1100 | }, 1101 | { 1102 | "cell_type": "code", 1103 | "source": [ 1104 | "import pandas as pd\n", 1105 | "from google.cloud import bigquery\n", 1106 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n", 1107 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n", 1108 | "from google.oauth2 import service_account\n", 1109 | "from google_auth_oauthlib.flow import Flow\n", 1110 | "import json\n", 1111 | "import os\n", 1112 | "import pickle\n", 1113 | "\n", 1114 | "# Load configuration from a JSON file\n", 1115 | "with open(\"config.json\", \"r\") as f:\n", 1116 | " config = json.load(f)\n", 1117 | "\n", 1118 | "# Authenticate with service account for BigQuery\n", 1119 | "creds1 = service_account.Credentials.from_service_account_file(\n", 1120 | " config['SERVICE_ACCOUNT_FILE'],\n", 1121 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n", 1122 | ")\n", 1123 | "bq_client = bigquery.Client(credentials=creds1, 
project=creds1.project_id)\n", 1124 | "\n", 1125 | "# Authenticate for GA4 Analytics Data API using OAuth2\n", 1126 | "def authenticate_ga4():\n", 1127 | " creds = None\n", 1128 | " if os.path.exists('token.pickle'):\n", 1129 | " with open('token.pickle', 'rb') as token:\n", 1130 | " creds = pickle.load(token)\n", 1131 | " else:\n", 1132 | " flow = Flow.from_client_secrets_file(\n", 1133 | " config['CLIENT_SECRET_FILE'],\n", 1134 | " scopes=config['SCOPES'],\n", 1135 | " redirect_uri='http://localhost:8080/'\n", 1136 | " )\n", 1137 | " auth_url, _ = flow.authorization_url(prompt='consent')\n", 1138 | " print('Please go to this URL and finish the authentication: ', auth_url)\n", 1139 | " code = input('Enter the authorization code: ')\n", 1140 | " flow.fetch_token(code=code)\n", 1141 | " creds = flow.credentials\n", 1142 | " with open('token.pickle', 'wb') as token:\n", 1143 | " pickle.dump(creds, token)\n", 1144 | " return creds\n", 1145 | "\n", 1146 | "# Function to paginate and fetch GA4 report data with logging\n", 1147 | "def run_report_with_pagination(client, request, limit=10000):\n", 1148 | " all_rows = []\n", 1149 | " offset = 0\n", 1150 | " page_number = 1\n", 1151 | "\n", 1152 | " while True:\n", 1153 | " # Apply offset and limit to request\n", 1154 | " request.offset = offset\n", 1155 | " request.limit = limit\n", 1156 | "\n", 1157 | " # Fetch report data\n", 1158 | " response = client.run_report(request)\n", 1159 | " all_rows.extend(response.rows)\n", 1160 | "\n", 1161 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n", 1162 | "\n", 1163 | " # If fewer rows are fetched than the limit, we're done\n", 1164 | " if len(response.rows) < limit:\n", 1165 | " break\n", 1166 | "\n", 1167 | " # Update offset and page number to get the next set of rows\n", 1168 | " offset += limit\n", 1169 | " page_number += 1\n", 1170 | "\n", 1171 | " return all_rows\n", 1172 | "\n", 1173 | "# Function to fetch GA4 data using pagination\n", 1174 | "def get_ga4_report(client):\n", 1175 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n", 1176 | " request = RunReportRequest(\n", 1177 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n", 1178 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n", 1179 | " dimensions=[\n", 1180 | " Dimension(name='date'),\n", 1181 | " Dimension(name='firstUserMedium'),\n", 1182 | " Dimension(name='firstUserSource'),\n", 1183 | " Dimension(name='firstUserCampaignName')\n", 1184 | " ],\n", 1185 | " metrics=[\n", 1186 | " Metric(name='totalUsers'),\n", 1187 | " Metric(name='newUsers'),\n", 1188 | " Metric(name='ecommercePurchases'),\n", 1189 | " Metric(name='purchaseRevenue')\n", 1190 | " ]\n", 1191 | " )\n", 1192 | " return run_report_with_pagination(client, request)\n", 1193 | "\n", 1194 | "# Function to convert GA4 response to a DataFrame\n", 1195 | "def response_to_dataframe(response):\n", 1196 | " list_rows = []\n", 1197 | " for row in response:\n", 1198 | " try:\n", 1199 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n", 1200 | " except ValueError:\n", 1201 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n", 1202 | " first_user_medium = row.dimension_values[1].value\n", 1203 | " first_user_source = row.dimension_values[2].value\n", 1204 | " first_user_campaign_name = row.dimension_values[3].value\n", 1205 | " list_rows.append({\n", 1206 | " 'date': date_value,\n", 1207 | 
" 'firstUserMedium': first_user_medium,\n", 1208 | " 'firstUserSource': first_user_source,\n", 1209 | " 'firstUserCampaignName': first_user_campaign_name,\n", 1210 | " 'totalUsers': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n", 1211 | " 'newUsers': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n", 1212 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n", 1213 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0\n", 1214 | " })\n", 1215 | " return pd.DataFrame(list_rows)\n", 1216 | "\n", 1217 | "# Function to upload data to BigQuery\n", 1218 | "def upload_to_bigquery(df, table_id):\n", 1219 | " # Define BigQuery schema\n", 1220 | " schema = [\n", 1221 | " bigquery.SchemaField(\"date\", \"DATE\"),\n", 1222 | " bigquery.SchemaField(\"firstUserMedium\", \"STRING\"),\n", 1223 | " bigquery.SchemaField(\"firstUserSource\", \"STRING\"),\n", 1224 | " bigquery.SchemaField(\"firstUserCampaignName\", \"STRING\"),\n", 1225 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n", 1226 | " bigquery.SchemaField(\"newUsers\", \"INTEGER\"),\n", 1227 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n", 1228 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\")\n", 1229 | " ]\n", 1230 | "\n", 1231 | " # Configure BigQuery job to partition the table by the 'date' column\n", 1232 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n", 1233 | " job_config = bigquery.LoadJobConfig(\n", 1234 | " schema=schema,\n", 1235 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n", 1236 | " time_partitioning=bigquery.TimePartitioning(\n", 1237 | " type_=bigquery.TimePartitioningType.DAY,\n", 1238 | " field='date'\n", 1239 | " )\n", 1240 | " )\n", 1241 | "\n", 1242 | " # Upload the DataFrame to BigQuery\n", 1243 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n", 1244 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n", 1245 | "\n", 1246 | "# Main function\n", 1247 | "def main():\n", 1248 | " try:\n", 1249 | " # Authenticate GA4 using OAuth2\n", 1250 | " creds = authenticate_ga4()\n", 1251 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n", 1252 | "\n", 1253 | " # Fetch GA4 data\n", 1254 | " ga4_response = get_ga4_report(client_ga4)\n", 1255 | "\n", 1256 | " # Convert the response to a DataFrame\n", 1257 | " ga4_df = response_to_dataframe(ga4_response)\n", 1258 | "\n", 1259 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n", 1260 | " table_id = 'ga4_data_first_user_source_medium'\n", 1261 | " csv_filename = f\"{table_id}.csv\"\n", 1262 | "\n", 1263 | " # Save the DataFrame to a CSV file\n", 1264 | " ga4_df.to_csv(csv_filename, index=False)\n", 1265 | " print(f\"Data saved to {csv_filename}\")\n", 1266 | "\n", 1267 | " # Upload the DataFrame to BigQuery\n", 1268 | " upload_to_bigquery(ga4_df, table_id)\n", 1269 | " except Exception as e:\n", 1270 | " print(f\"Error occurred: {e}\")\n", 1271 | "\n", 1272 | "if __name__ == '__main__':\n", 1273 | " main()\n" 1274 | ], 1275 | "metadata": { 1276 | "id": "s5H79ndims88" 1277 | }, 1278 | "execution_count": null, 1279 | "outputs": [] 1280 | }, 1281 | { 1282 | "cell_type": "code", 1283 | "source": [ 1284 | "import pandas as pd\n", 1285 | "from google.cloud import bigquery\n", 1286 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n", 1287 | "from google.analytics.data_v1beta.types import DateRange, 
Dimension, Metric, RunReportRequest\n", 1288 | "from google.oauth2 import service_account\n", 1289 | "from google_auth_oauthlib.flow import Flow\n", 1290 | "import json\n", 1291 | "import os\n", 1292 | "import pickle\n", 1293 | "\n", 1294 | "# Load configuration from a JSON file\n", 1295 | "with open(\"config.json\", \"r\") as f:\n", 1296 | " config = json.load(f)\n", 1297 | "\n", 1298 | "# Authenticate with service account for BigQuery\n", 1299 | "creds1 = service_account.Credentials.from_service_account_file(\n", 1300 | " config['SERVICE_ACCOUNT_FILE'],\n", 1301 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n", 1302 | ")\n", 1303 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n", 1304 | "\n", 1305 | "# Authenticate for GA4 Analytics Data API using OAuth2\n", 1306 | "def authenticate_ga4():\n", 1307 | " creds = None\n", 1308 | " if os.path.exists('token.pickle'):\n", 1309 | " with open('token.pickle', 'rb') as token:\n", 1310 | " creds = pickle.load(token)\n", 1311 | " else:\n", 1312 | " flow = Flow.from_client_secrets_file(\n", 1313 | " config['CLIENT_SECRET_FILE'],\n", 1314 | " scopes=config['SCOPES'],\n", 1315 | " redirect_uri='http://localhost:8080/'\n", 1316 | " )\n", 1317 | " auth_url, _ = flow.authorization_url(prompt='consent')\n", 1318 | " print('Please go to this URL and finish the authentication: ', auth_url)\n", 1319 | " code = input('Enter the authorization code: ')\n", 1320 | " flow.fetch_token(code=code)\n", 1321 | " creds = flow.credentials\n", 1322 | " with open('token.pickle', 'wb') as token:\n", 1323 | " pickle.dump(creds, token)\n", 1324 | " return creds\n", 1325 | "\n", 1326 | "# Function to paginate and fetch GA4 report data with logging\n", 1327 | "def run_report_with_pagination(client, request, limit=10000):\n", 1328 | " all_rows = []\n", 1329 | " offset = 0\n", 1330 | " page_number = 1\n", 1331 | "\n", 1332 | " while True:\n", 1333 | " # Apply offset and limit to request\n", 1334 | " request.offset = offset\n", 1335 | " request.limit = limit\n", 1336 | "\n", 1337 | " # Fetch report data\n", 1338 | " response = client.run_report(request)\n", 1339 | " all_rows.extend(response.rows)\n", 1340 | "\n", 1341 | " print(f\"Fetching data... 
Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n", 1342 | "\n", 1343 | " # If fewer rows are fetched than the limit, we're done\n", 1344 | " if len(response.rows) < limit:\n", 1345 | " break\n", 1346 | "\n", 1347 | " # Update offset and page number to get the next set of rows\n", 1348 | " offset += limit\n", 1349 | " page_number += 1\n", 1350 | "\n", 1351 | " return all_rows\n", 1352 | "\n", 1353 | "# Function to fetch GA4 data using pagination\n", 1354 | "def get_ga4_report(client):\n", 1355 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n", 1356 | " request = RunReportRequest(\n", 1357 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n", 1358 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n", 1359 | " dimensions=[\n", 1360 | " Dimension(name='date'),\n", 1361 | " Dimension(name='firstUserDefaultChannelGroup')\n", 1362 | " ],\n", 1363 | " metrics=[\n", 1364 | " Metric(name='totalUsers'),\n", 1365 | " Metric(name='newUsers'),\n", 1366 | " Metric(name='ecommercePurchases'),\n", 1367 | " Metric(name='purchaseRevenue')\n", 1368 | " ]\n", 1369 | " )\n", 1370 | " return run_report_with_pagination(client, request)\n", 1371 | "\n", 1372 | "# Function to convert GA4 response to a DataFrame\n", 1373 | "def response_to_dataframe(response):\n", 1374 | " list_rows = []\n", 1375 | " for row in response:\n", 1376 | " try:\n", 1377 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n", 1378 | " except ValueError:\n", 1379 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n", 1380 | " first_user_channel_group = row.dimension_values[1].value\n", 1381 | " list_rows.append({\n", 1382 | " 'date': date_value,\n", 1383 | " 'firstUserDefaultChannelGroup': first_user_channel_group,\n", 1384 | " 'totalUsers': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n", 1385 | " 'newUsers': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n", 1386 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n", 1387 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0\n", 1388 | " })\n", 1389 | " return pd.DataFrame(list_rows)\n", 1390 | "\n", 1391 | "# Function to upload data to BigQuery\n", 1392 | "def upload_to_bigquery(df, table_id):\n", 1393 | " # Define BigQuery schema\n", 1394 | " schema = [\n", 1395 | " bigquery.SchemaField(\"date\", \"DATE\"),\n", 1396 | " bigquery.SchemaField(\"firstUserDefaultChannelGroup\", \"STRING\"),\n", 1397 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n", 1398 | " bigquery.SchemaField(\"newUsers\", \"INTEGER\"),\n", 1399 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n", 1400 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\")\n", 1401 | " ]\n", 1402 | "\n", 1403 | " # Configure BigQuery job to partition the table by the 'date' column\n", 1404 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n", 1405 | " job_config = bigquery.LoadJobConfig(\n", 1406 | " schema=schema,\n", 1407 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n", 1408 | " time_partitioning=bigquery.TimePartitioning(\n", 1409 | " type_=bigquery.TimePartitioningType.DAY,\n", 1410 | " field='date'\n", 1411 | " )\n", 1412 | " )\n", 1413 | "\n", 1414 | " # Upload the DataFrame to BigQuery\n", 1415 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n", 
1416 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n", 1417 | "\n", 1418 | "# Main function\n", 1419 | "def main():\n", 1420 | " try:\n", 1421 | " # Authenticate GA4 using OAuth2\n", 1422 | " creds = authenticate_ga4()\n", 1423 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n", 1424 | "\n", 1425 | " # Fetch GA4 data\n", 1426 | " ga4_response = get_ga4_report(client_ga4)\n", 1427 | "\n", 1428 | " # Convert the response to a DataFrame\n", 1429 | " ga4_df = response_to_dataframe(ga4_response)\n", 1430 | "\n", 1431 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n", 1432 | " table_id = 'ga4_data_first_user_channel_group'\n", 1433 | " csv_filename = f\"{table_id}.csv\"\n", 1434 | "\n", 1435 | " # Save the DataFrame to a CSV file\n", 1436 | " ga4_df.to_csv(csv_filename, index=False)\n", 1437 | " print(f\"Data saved to {csv_filename}\")\n", 1438 | "\n", 1439 | " # Upload the DataFrame to BigQuery\n", 1440 | " upload_to_bigquery(ga4_df, table_id)\n", 1441 | " except Exception as e:\n", 1442 | " print(f\"Error occurred: {e}\")\n", 1443 | "\n", 1444 | "if __name__ == '__main__':\n", 1445 | " main()\n" 1446 | ], 1447 | "metadata": { 1448 | "id": "Fv1ISKAUn-bf" 1449 | }, 1450 | "execution_count": null, 1451 | "outputs": [] 1452 | }, 1453 | { 1454 | "cell_type": "code", 1455 | "source": [ 1456 | "import pandas as pd\n", 1457 | "from google.cloud import bigquery\n", 1458 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n", 1459 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n", 1460 | "from google.oauth2 import service_account\n", 1461 | "from google_auth_oauthlib.flow import Flow\n", 1462 | "import json\n", 1463 | "import os\n", 1464 | "import pickle\n", 1465 | "\n", 1466 | "# Load configuration from a JSON file\n", 1467 | "with open(\"config.json\", \"r\") as f:\n", 1468 | " config = json.load(f)\n", 1469 | "\n", 1470 | "# Authenticate with service account for BigQuery\n", 1471 | "creds1 = service_account.Credentials.from_service_account_file(\n", 1472 | " config['SERVICE_ACCOUNT_FILE'],\n", 1473 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n", 1474 | ")\n", 1475 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n", 1476 | "\n", 1477 | "# Authenticate for GA4 Analytics Data API using OAuth2\n", 1478 | "def authenticate_ga4():\n", 1479 | " creds = None\n", 1480 | " if os.path.exists('token.pickle'):\n", 1481 | " with open('token.pickle', 'rb') as token:\n", 1482 | " creds = pickle.load(token)\n", 1483 | " else:\n", 1484 | " flow = Flow.from_client_secrets_file(\n", 1485 | " config['CLIENT_SECRET_FILE'],\n", 1486 | " scopes=config['SCOPES'],\n", 1487 | " redirect_uri='http://localhost:8080/'\n", 1488 | " )\n", 1489 | " auth_url, _ = flow.authorization_url(prompt='consent')\n", 1490 | " print('Please go to this URL and finish the authentication: ', auth_url)\n", 1491 | " code = input('Enter the authorization code: ')\n", 1492 | " flow.fetch_token(code=code)\n", 1493 | " creds = flow.credentials\n", 1494 | " with open('token.pickle', 'wb') as token:\n", 1495 | " pickle.dump(creds, token)\n", 1496 | " return creds\n", 1497 | "\n", 1498 | "# Function to paginate and fetch GA4 report data with logging\n", 1499 | "def run_report_with_pagination(client, request, limit=10000):\n", 1500 | " all_rows = []\n", 1501 | " offset = 0\n", 1502 | " page_number = 1\n", 1503 | "\n", 1504 | " while 
True:\n", 1505 | " # Apply offset and limit to request\n", 1506 | " request.offset = offset\n", 1507 | " request.limit = limit\n", 1508 | "\n", 1509 | " # Fetch report data\n", 1510 | " response = client.run_report(request)\n", 1511 | " all_rows.extend(response.rows)\n", 1512 | "\n", 1513 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n", 1514 | "\n", 1515 | " # If fewer rows are fetched than the limit, we're done\n", 1516 | " if len(response.rows) < limit:\n", 1517 | " break\n", 1518 | "\n", 1519 | " # Update offset and page number to get the next set of rows\n", 1520 | " offset += limit\n", 1521 | " page_number += 1\n", 1522 | "\n", 1523 | " return all_rows\n", 1524 | "\n", 1525 | "# Function to fetch GA4 data using pagination\n", 1526 | "def get_ga4_report(client):\n", 1527 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n", 1528 | " request = RunReportRequest(\n", 1529 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n", 1530 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n", 1531 | " dimensions=[\n", 1532 | " Dimension(name='date'),\n", 1533 | " Dimension(name='sessionSource'),\n", 1534 | " Dimension(name='sessionMedium'),\n", 1535 | " Dimension(name='sessionCampaignName')\n", 1536 | " ],\n", 1537 | " metrics=[\n", 1538 | " Metric(name='ecommercePurchases'),\n", 1539 | " Metric(name='averagePurchaseRevenue'),\n", 1540 | " Metric(name='purchaseRevenue'),\n", 1541 | " Metric(name='advertiserAdClicks'),\n", 1542 | " Metric(name='advertiserAdCost'),\n", 1543 | " Metric(name='advertiserAdCostPerClick'),\n", 1544 | " Metric(name='returnOnAdSpend')\n", 1545 | " ]\n", 1546 | " )\n", 1547 | " return run_report_with_pagination(client, request)\n", 1548 | "\n", 1549 | "# Function to convert GA4 response to a DataFrame\n", 1550 | "def response_to_dataframe(response):\n", 1551 | " list_rows = []\n", 1552 | " for row in response:\n", 1553 | " try:\n", 1554 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n", 1555 | " except ValueError:\n", 1556 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n", 1557 | " session_source = row.dimension_values[1].value\n", 1558 | " session_medium = row.dimension_values[2].value\n", 1559 | " session_campaign_name = row.dimension_values[3].value\n", 1560 | " list_rows.append({\n", 1561 | " 'date': date_value,\n", 1562 | " 'sessionSource': session_source,\n", 1563 | " 'sessionMedium': session_medium,\n", 1564 | " 'sessionCampaignName': session_campaign_name,\n", 1565 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n", 1566 | " 'averagePurchaseRevenue': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n", 1567 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n", 1568 | " 'advertiserAdClicks': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0,\n", 1569 | " 'advertiserAdCost': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0,\n", 1570 | " 'advertiserAdCostPerClick': pd.to_numeric(row.metric_values[5].value, errors='coerce') or 0,\n", 1571 | " 'returnOnAdSpend': pd.to_numeric(row.metric_values[6].value, errors='coerce') or 0\n", 1572 | " })\n", 1573 | " return pd.DataFrame(list_rows)\n", 1574 | "\n", 1575 | "# Function to upload data to BigQuery\n", 1576 | "def upload_to_bigquery(df, table_id):\n", 1577 | " # Define BigQuery schema\n", 1578 | 
" schema = [\n", 1579 | " bigquery.SchemaField(\"date\", \"DATE\"),\n", 1580 | " bigquery.SchemaField(\"sessionSource\", \"STRING\"),\n", 1581 | " bigquery.SchemaField(\"sessionMedium\", \"STRING\"),\n", 1582 | " bigquery.SchemaField(\"sessionCampaignName\", \"STRING\"),\n", 1583 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n", 1584 | " bigquery.SchemaField(\"averagePurchaseRevenue\", \"FLOAT\"),\n", 1585 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\"),\n", 1586 | " bigquery.SchemaField(\"advertiserAdClicks\", \"INTEGER\"),\n", 1587 | " bigquery.SchemaField(\"advertiserAdCost\", \"FLOAT\"),\n", 1588 | " bigquery.SchemaField(\"advertiserAdCostPerClick\", \"FLOAT\"),\n", 1589 | " bigquery.SchemaField(\"returnOnAdSpend\", \"FLOAT\")\n", 1590 | " ]\n", 1591 | "\n", 1592 | " # Configure BigQuery job to partition the table by the 'date' column\n", 1593 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n", 1594 | " job_config = bigquery.LoadJobConfig(\n", 1595 | " schema=schema,\n", 1596 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n", 1597 | " time_partitioning=bigquery.TimePartitioning(\n", 1598 | " type_=bigquery.TimePartitioningType.DAY,\n", 1599 | " field='date'\n", 1600 | " )\n", 1601 | " )\n", 1602 | "\n", 1603 | " # Upload the DataFrame to BigQuery\n", 1604 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n", 1605 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n", 1606 | "\n", 1607 | "# Main function\n", 1608 | "def main():\n", 1609 | " try:\n", 1610 | " # Authenticate GA4 using OAuth2\n", 1611 | " creds = authenticate_ga4()\n", 1612 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n", 1613 | "\n", 1614 | " # Fetch GA4 data\n", 1615 | " ga4_response = get_ga4_report(client_ga4)\n", 1616 | "\n", 1617 | " # Convert the response to a DataFrame\n", 1618 | " ga4_df = response_to_dataframe(ga4_response)\n", 1619 | "\n", 1620 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n", 1621 | " table_id = 'ga4_ads_data'\n", 1622 | " csv_filename = f\"{table_id}.csv\"\n", 1623 | "\n", 1624 | " # Save the DataFrame to a CSV file\n", 1625 | " ga4_df.to_csv(csv_filename, index=False)\n", 1626 | " print(f\"Data saved to {csv_filename}\")\n", 1627 | "\n", 1628 | " # Upload the DataFrame to BigQuery\n", 1629 | " upload_to_bigquery(ga4_df, table_id)\n", 1630 | " except Exception as e:\n", 1631 | " print(f\"Error occurred: {e}\")\n", 1632 | "\n", 1633 | "if __name__ == '__main__':\n", 1634 | " main()\n" 1635 | ], 1636 | "metadata": { 1637 | "id": "TB_tq1b0rvkt" 1638 | }, 1639 | "execution_count": null, 1640 | "outputs": [] 1641 | }, 1642 | { 1643 | "cell_type": "code", 1644 | "source": [ 1645 | "import pandas as pd\n", 1646 | "from google.cloud import bigquery\n", 1647 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n", 1648 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n", 1649 | "from google.oauth2 import service_account\n", 1650 | "from google_auth_oauthlib.flow import Flow\n", 1651 | "import json\n", 1652 | "import os\n", 1653 | "import pickle\n", 1654 | "\n", 1655 | "# Load configuration from a JSON file\n", 1656 | "with open(\"config.json\", \"r\") as f:\n", 1657 | " config = json.load(f)\n", 1658 | "\n", 1659 | "# Authenticate with service account for BigQuery\n", 1660 | "creds1 = service_account.Credentials.from_service_account_file(\n", 1661 | " 
config['SERVICE_ACCOUNT_FILE'],\n", 1662 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n", 1663 | ")\n", 1664 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n", 1665 | "\n", 1666 | "# Authenticate for GA4 Analytics Data API using OAuth2\n", 1667 | "def authenticate_ga4():\n", 1668 | " creds = None\n", 1669 | " if os.path.exists('token.pickle'):\n", 1670 | " with open('token.pickle', 'rb') as token:\n", 1671 | " creds = pickle.load(token)\n", 1672 | " else:\n", 1673 | " flow = Flow.from_client_secrets_file(\n", 1674 | " config['CLIENT_SECRET_FILE'],\n", 1675 | " scopes=config['SCOPES'],\n", 1676 | " redirect_uri='http://localhost:8080/'\n", 1677 | " )\n", 1678 | " auth_url, _ = flow.authorization_url(prompt='consent')\n", 1679 | " print('Please go to this URL and finish the authentication: ', auth_url)\n", 1680 | " code = input('Enter the authorization code: ')\n", 1681 | " flow.fetch_token(code=code)\n", 1682 | " creds = flow.credentials\n", 1683 | " with open('token.pickle', 'wb') as token:\n", 1684 | " pickle.dump(creds, token)\n", 1685 | " return creds\n", 1686 | "\n", 1687 | "# Function to paginate and fetch GA4 report data with logging\n", 1688 | "def run_report_with_pagination(client, request, limit=10000):\n", 1689 | " all_rows = []\n", 1690 | " offset = 0\n", 1691 | " page_number = 1\n", 1692 | "\n", 1693 | " while True:\n", 1694 | " # Apply offset and limit to request\n", 1695 | " request.offset = offset\n", 1696 | " request.limit = limit\n", 1697 | "\n", 1698 | " # Fetch report data\n", 1699 | " response = client.run_report(request)\n", 1700 | " all_rows.extend(response.rows)\n", 1701 | "\n", 1702 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n", 1703 | "\n", 1704 | " # If fewer rows are fetched than the limit, we're done\n", 1705 | " if len(response.rows) < limit:\n", 1706 | " break\n", 1707 | "\n", 1708 | " # Update offset and page number to get the next set of rows\n", 1709 | " offset += limit\n", 1710 | " page_number += 1\n", 1711 | "\n", 1712 | " return all_rows\n", 1713 | "\n", 1714 | "# Function to fetch GA4 data using pagination\n", 1715 | "def get_ga4_report(client):\n", 1716 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n", 1717 | " request = RunReportRequest(\n", 1718 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n", 1719 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n", 1720 | " dimensions=[\n", 1721 | " Dimension(name='transactionId'),\n", 1722 | " Dimension(name='itemName')\n", 1723 | " ],\n", 1724 | " metrics=[\n", 1725 | " Metric(name='itemPurchaseQuantity'),\n", 1726 | " Metric(name='itemRevenue')\n", 1727 | " ]\n", 1728 | " )\n", 1729 | " return run_report_with_pagination(client, request)\n", 1730 | "\n", 1731 | "# Function to convert GA4 response to a DataFrame\n", 1732 | "def response_to_dataframe(response):\n", 1733 | " list_rows = []\n", 1734 | " for row in response:\n", 1735 | " transaction_id = row.dimension_values[0].value\n", 1736 | " item_name = row.dimension_values[1].value\n", 1737 | " list_rows.append({\n", 1738 | " 'transactionId': transaction_id,\n", 1739 | " 'itemName': item_name,\n", 1740 | " 'itemPurchaseQuantity': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n", 1741 | " 'itemRevenue': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0\n", 1742 | " })\n", 
1743 | " return pd.DataFrame(list_rows)\n", 1744 | "\n", 1745 | "# Function to upload data to BigQuery\n", 1746 | "def upload_to_bigquery(df, table_id):\n", 1747 | " # Define BigQuery schema\n", 1748 | " schema = [\n", 1749 | " bigquery.SchemaField(\"transactionId\", \"STRING\"),\n", 1750 | " bigquery.SchemaField(\"itemName\", \"STRING\"),\n", 1751 | " bigquery.SchemaField(\"itemPurchaseQuantity\", \"INTEGER\"),\n", 1752 | " bigquery.SchemaField(\"itemRevenue\", \"FLOAT\")\n", 1753 | " ]\n", 1754 | "\n", 1755 | " # Configure BigQuery job to partition the table by the 'transactionId' field\n", 1756 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n", 1757 | " job_config = bigquery.LoadJobConfig(\n", 1758 | " schema=schema,\n", 1759 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE\n", 1760 | " )\n", 1761 | "\n", 1762 | " # Upload the DataFrame to BigQuery\n", 1763 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n", 1764 | " print(f\"Data uploaded to {table_ref}\")\n", 1765 | "\n", 1766 | "# Main function\n", 1767 | "def main():\n", 1768 | " try:\n", 1769 | " # Authenticate GA4 using OAuth2\n", 1770 | " creds = authenticate_ga4()\n", 1771 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n", 1772 | "\n", 1773 | " # Fetch GA4 data\n", 1774 | " ga4_response = get_ga4_report(client_ga4)\n", 1775 | "\n", 1776 | " # Convert the response to a DataFrame\n", 1777 | " ga4_df = response_to_dataframe(ga4_response)\n", 1778 | "\n", 1779 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n", 1780 | " table_id = 'ga4_transaction_items'\n", 1781 | " csv_filename = f\"{table_id}.csv\"\n", 1782 | "\n", 1783 | " # Save the DataFrame to a CSV file\n", 1784 | " ga4_df.to_csv(csv_filename, index=False)\n", 1785 | " print(f\"Data saved to {csv_filename}\")\n", 1786 | "\n", 1787 | " # Upload the DataFrame to BigQuery\n", 1788 | " upload_to_bigquery(ga4_df, table_id)\n", 1789 | " except Exception as e:\n", 1790 | " print(f\"Error occurred: {e}\")\n", 1791 | "\n", 1792 | "if __name__ == '__main__':\n", 1793 | " main()\n" 1794 | ], 1795 | "metadata": { 1796 | "id": "Dt6n-vqwt1NB" 1797 | }, 1798 | "execution_count": null, 1799 | "outputs": [] 1800 | }, 1801 | { 1802 | "cell_type": "code", 1803 | "source": [ 1804 | "import pandas as pd\n", 1805 | "from google.cloud import bigquery\n", 1806 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n", 1807 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n", 1808 | "from google.oauth2 import service_account\n", 1809 | "from google_auth_oauthlib.flow import Flow\n", 1810 | "import json\n", 1811 | "import os\n", 1812 | "import pickle\n", 1813 | "\n", 1814 | "# Load configuration from a JSON file\n", 1815 | "with open(\"config.json\", \"r\") as f:\n", 1816 | " config = json.load(f)\n", 1817 | "\n", 1818 | "# Authenticate with service account for BigQuery\n", 1819 | "creds1 = service_account.Credentials.from_service_account_file(\n", 1820 | " config['SERVICE_ACCOUNT_FILE'],\n", 1821 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n", 1822 | ")\n", 1823 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n", 1824 | "\n", 1825 | "# Authenticate for GA4 Analytics Data API using OAuth2\n", 1826 | "def authenticate_ga4():\n", 1827 | " creds = None\n", 1828 | " if os.path.exists('token.pickle'):\n", 1829 | " with 
open('token.pickle', 'rb') as token:\n", 1830 | " creds = pickle.load(token)\n", 1831 | " else:\n", 1832 | " flow = Flow.from_client_secrets_file(\n", 1833 | " config['CLIENT_SECRET_FILE'],\n", 1834 | " scopes=config['SCOPES'],\n", 1835 | " redirect_uri='http://localhost:8080/'\n", 1836 | " )\n", 1837 | " auth_url, _ = flow.authorization_url(prompt='consent')\n", 1838 | " print('Please go to this URL and finish the authentication: ', auth_url)\n", 1839 | " code = input('Enter the authorization code: ')\n", 1840 | " flow.fetch_token(code=code)\n", 1841 | " creds = flow.credentials\n", 1842 | " with open('token.pickle', 'wb') as token:\n", 1843 | " pickle.dump(creds, token)\n", 1844 | " return creds\n", 1845 | "\n", 1846 | "# Function to paginate and fetch GA4 report data with logging\n", 1847 | "def run_report_with_pagination(client, request, limit=10000):\n", 1848 | " all_rows = []\n", 1849 | " offset = 0\n", 1850 | " page_number = 1\n", 1851 | "\n", 1852 | " while True:\n", 1853 | " # Apply offset and limit to request\n", 1854 | " request.offset = offset\n", 1855 | " request.limit = limit\n", 1856 | "\n", 1857 | " # Fetch report data\n", 1858 | " response = client.run_report(request)\n", 1859 | " all_rows.extend(response.rows)\n", 1860 | "\n", 1861 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n", 1862 | "\n", 1863 | " # If fewer rows are fetched than the limit, we're done\n", 1864 | " if len(response.rows) < limit:\n", 1865 | " break\n", 1866 | "\n", 1867 | " # Update offset and page number to get the next set of rows\n", 1868 | " offset += limit\n", 1869 | " page_number += 1\n", 1870 | "\n", 1871 | " return all_rows\n", 1872 | "\n", 1873 | "# Function to fetch GA4 data using pagination\n", 1874 | "def get_ga4_report(client):\n", 1875 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n", 1876 | " request = RunReportRequest(\n", 1877 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n", 1878 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n", 1879 | " dimensions=[Dimension(name='date')],\n", 1880 | " metrics=[\n", 1881 | " Metric(name='sessions'),\n", 1882 | " Metric(name='totalUsers'),\n", 1883 | " Metric(name='newUsers'),\n", 1884 | " Metric(name='ecommercePurchases'),\n", 1885 | " Metric(name='purchaseRevenue'),\n", 1886 | " Metric(name='screenPageViews'),\n", 1887 | " Metric(name='eventCount'),\n", 1888 | " Metric(name='averageSessionDuration'),\n", 1889 | " Metric(name='engagedSessions'),\n", 1890 | " Metric(name='engagementRate')\n", 1891 | " ]\n", 1892 | " )\n", 1893 | " return run_report_with_pagination(client, request)\n", 1894 | "\n", 1895 | "# Function to convert GA4 response to a DataFrame\n", 1896 | "def response_to_dataframe(response):\n", 1897 | " list_rows = []\n", 1898 | " for row in response:\n", 1899 | " try:\n", 1900 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n", 1901 | " except ValueError:\n", 1902 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n", 1903 | " list_rows.append({\n", 1904 | " 'date': date_value,\n", 1905 | " 'sessions': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n", 1906 | " 'totalUsers': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n", 1907 | " 'newUsers': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n", 1908 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[3].value, 
errors='coerce') or 0,\n", 1909 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0,\n", 1910 | " 'screenPageViews': pd.to_numeric(row.metric_values[5].value, errors='coerce') or 0,\n", 1911 | " 'eventCount': pd.to_numeric(row.metric_values[6].value, errors='coerce') or 0,\n", 1912 | " 'averageSessionDuration': pd.to_numeric(row.metric_values[7].value, errors='coerce') or 0,\n", 1913 | " 'engagedSessions': pd.to_numeric(row.metric_values[8].value, errors='coerce') or 0,\n", 1914 | " 'engagementRate': pd.to_numeric(row.metric_values[9].value, errors='coerce') or 0\n", 1915 | " })\n", 1916 | " return pd.DataFrame(list_rows)\n", 1917 | "\n", 1918 | "# Function to upload data to BigQuery\n", 1919 | "def upload_to_bigquery(df, table_id):\n", 1920 | " # Define BigQuery schema\n", 1921 | " schema = [\n", 1922 | " bigquery.SchemaField(\"date\", \"DATE\"),\n", 1923 | " bigquery.SchemaField(\"sessions\", \"INTEGER\"),\n", 1924 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n", 1925 | " bigquery.SchemaField(\"newUsers\", \"INTEGER\"),\n", 1926 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n", 1927 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\"),\n", 1928 | " bigquery.SchemaField(\"screenPageViews\", \"INTEGER\"),\n", 1929 | " bigquery.SchemaField(\"eventCount\", \"INTEGER\"),\n", 1930 | " bigquery.SchemaField(\"averageSessionDuration\", \"FLOAT\"),\n", 1931 | " bigquery.SchemaField(\"engagedSessions\", \"INTEGER\"),\n", 1932 | " bigquery.SchemaField(\"engagementRate\", \"FLOAT\")\n", 1933 | " ]\n", 1934 | "\n", 1935 | " # Configure BigQuery job to partition the table by the 'date' column\n", 1936 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n", 1937 | " job_config = bigquery.LoadJobConfig(\n", 1938 | " schema=schema,\n", 1939 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n", 1940 | " time_partitioning=bigquery.TimePartitioning(\n", 1941 | " type_=bigquery.TimePartitioningType.DAY,\n", 1942 | " field='date'\n", 1943 | " )\n", 1944 | " )\n", 1945 | "\n", 1946 | " # Upload the DataFrame to BigQuery\n", 1947 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n", 1948 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n", 1949 | "\n", 1950 | "# Main function\n", 1951 | "def main():\n", 1952 | " try:\n", 1953 | " # Authenticate GA4 using OAuth2\n", 1954 | " creds = authenticate_ga4()\n", 1955 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n", 1956 | "\n", 1957 | " # Fetch GA4 data\n", 1958 | " ga4_response = get_ga4_report(client_ga4)\n", 1959 | "\n", 1960 | " # Convert the response to a DataFrame\n", 1961 | " ga4_df = response_to_dataframe(ga4_response)\n", 1962 | "\n", 1963 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n", 1964 | " table_id = 'ga4_all_metrics_data'\n", 1965 | " csv_filename = f\"{table_id}.csv\"\n", 1966 | "\n", 1967 | " # Save the DataFrame to a CSV file\n", 1968 | " ga4_df.to_csv(csv_filename, index=False)\n", 1969 | " print(f\"Data saved to {csv_filename}\")\n", 1970 | "\n", 1971 | " # Upload the DataFrame to BigQuery\n", 1972 | " upload_to_bigquery(ga4_df, table_id)\n", 1973 | " except Exception as e:\n", 1974 | " print(f\"Error occurred: {e}\")\n", 1975 | "\n", 1976 | "if __name__ == '__main__':\n", 1977 | " main()\n" 1978 | ], 1979 | "metadata": { 1980 | "id": "Wb8umpjezZsU" 1981 | }, 1982 | "execution_count": null, 1983 | "outputs": [] 1984 | }, 1985 | { 1986 | "cell_type": 
"code", 1987 | "source": [ 1988 | "import pandas as pd\n", 1989 | "from google.cloud import bigquery\n", 1990 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n", 1991 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n", 1992 | "from google.oauth2 import service_account\n", 1993 | "from google_auth_oauthlib.flow import Flow\n", 1994 | "import json\n", 1995 | "import os\n", 1996 | "import pickle\n", 1997 | "\n", 1998 | "# Load configuration from a JSON file\n", 1999 | "with open(\"config.json\", \"r\") as f:\n", 2000 | " config = json.load(f)\n", 2001 | "\n", 2002 | "# Authenticate with service account for BigQuery\n", 2003 | "creds1 = service_account.Credentials.from_service_account_file(\n", 2004 | " config['SERVICE_ACCOUNT_FILE'],\n", 2005 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n", 2006 | ")\n", 2007 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n", 2008 | "\n", 2009 | "# Authenticate for GA4 Analytics Data API using OAuth2\n", 2010 | "def authenticate_ga4():\n", 2011 | " creds = None\n", 2012 | " if os.path.exists('token.pickle'):\n", 2013 | " with open('token.pickle', 'rb') as token:\n", 2014 | " creds = pickle.load(token)\n", 2015 | " else:\n", 2016 | " flow = Flow.from_client_secrets_file(\n", 2017 | " config['CLIENT_SECRET_FILE'],\n", 2018 | " scopes=config['SCOPES'],\n", 2019 | " redirect_uri='http://localhost:8080/'\n", 2020 | " )\n", 2021 | " auth_url, _ = flow.authorization_url(prompt='consent')\n", 2022 | " print('Please go to this URL and finish the authentication: ', auth_url)\n", 2023 | " code = input('Enter the authorization code: ')\n", 2024 | " flow.fetch_token(code=code)\n", 2025 | " creds = flow.credentials\n", 2026 | " with open('token.pickle', 'wb') as token:\n", 2027 | " pickle.dump(creds, token)\n", 2028 | " return creds\n", 2029 | "\n", 2030 | "# Function to paginate and fetch GA4 report data with logging\n", 2031 | "def run_report_with_pagination(client, request, limit=10000):\n", 2032 | " all_rows = []\n", 2033 | " offset = 0\n", 2034 | " page_number = 1\n", 2035 | "\n", 2036 | " while True:\n", 2037 | " # Apply offset and limit to request\n", 2038 | " request.offset = offset\n", 2039 | " request.limit = limit\n", 2040 | "\n", 2041 | " # Fetch report data\n", 2042 | " response = client.run_report(request)\n", 2043 | " all_rows.extend(response.rows)\n", 2044 | "\n", 2045 | " print(f\"Fetching data... 
Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n", 2046 | "\n", 2047 | " # If fewer rows are fetched than the limit, we're done\n", 2048 | " if len(response.rows) < limit:\n", 2049 | " break\n", 2050 | "\n", 2051 | " # Update offset and page number to get the next set of rows\n", 2052 | " offset += limit\n", 2053 | " page_number += 1\n", 2054 | "\n", 2055 | " return all_rows\n", 2056 | "\n", 2057 | "# Function to fetch GA4 data using pagination\n", 2058 | "def get_ga4_report(client):\n", 2059 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n", 2060 | " request = RunReportRequest(\n", 2061 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n", 2062 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n", 2063 | " dimensions=[\n", 2064 | " Dimension(name='date'),\n", 2065 | " Dimension(name='eventName')\n", 2066 | " ],\n", 2067 | " metrics=[\n", 2068 | " Metric(name='eventCount'),\n", 2069 | " Metric(name='eventCountPerUser'),\n", 2070 | " Metric(name='eventValue')\n", 2071 | " ]\n", 2072 | " )\n", 2073 | " return run_report_with_pagination(client, request)\n", 2074 | "\n", 2075 | "# Function to convert GA4 response to a DataFrame\n", 2076 | "def response_to_dataframe(response):\n", 2077 | " list_rows = []\n", 2078 | " for row in response:\n", 2079 | " try:\n", 2080 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n", 2081 | " except ValueError:\n", 2082 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n", 2083 | " event_name = row.dimension_values[1].value\n", 2084 | " list_rows.append({\n", 2085 | " 'date': date_value,\n", 2086 | " 'eventName': event_name,\n", 2087 | " 'eventCount': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n", 2088 | " 'eventCountPerUser': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n", 2089 | " 'eventValue': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0\n", 2090 | " })\n", 2091 | " return pd.DataFrame(list_rows)\n", 2092 | "\n", 2093 | "# Function to upload data to BigQuery\n", 2094 | "def upload_to_bigquery(df, table_id):\n", 2095 | " # Define BigQuery schema\n", 2096 | " schema = [\n", 2097 | " bigquery.SchemaField(\"date\", \"DATE\"),\n", 2098 | " bigquery.SchemaField(\"eventName\", \"STRING\"),\n", 2099 | " bigquery.SchemaField(\"eventCount\", \"INTEGER\"),\n", 2100 | " bigquery.SchemaField(\"eventCountPerUser\", \"FLOAT\"),\n", 2101 | " bigquery.SchemaField(\"eventValue\", \"FLOAT\")\n", 2102 | " ]\n", 2103 | "\n", 2104 | " # Configure BigQuery job to partition the table by the 'date' column\n", 2105 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n", 2106 | " job_config = bigquery.LoadJobConfig(\n", 2107 | " schema=schema,\n", 2108 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n", 2109 | " time_partitioning=bigquery.TimePartitioning(\n", 2110 | " type_=bigquery.TimePartitioningType.DAY,\n", 2111 | " field='date'\n", 2112 | " )\n", 2113 | " )\n", 2114 | "\n", 2115 | " # Upload the DataFrame to BigQuery\n", 2116 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n", 2117 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n", 2118 | "\n", 2119 | "# Main function\n", 2120 | "def main():\n", 2121 | " try:\n", 2122 | " # Authenticate GA4 using OAuth2\n", 2123 | " creds = authenticate_ga4()\n", 2124 | " client_ga4 = 
BetaAnalyticsDataClient(credentials=creds)\n", 2125 | "\n", 2126 | " # Fetch GA4 data\n", 2127 | " ga4_response = get_ga4_report(client_ga4)\n", 2128 | "\n", 2129 | " # Convert the response to a DataFrame\n", 2130 | " ga4_df = response_to_dataframe(ga4_response)\n", 2131 | "\n", 2132 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n", 2133 | " table_id = 'ga4_event_metrics_data'\n", 2134 | " csv_filename = f\"{table_id}.csv\"\n", 2135 | "\n", 2136 | " # Save the DataFrame to a CSV file\n", 2137 | " ga4_df.to_csv(csv_filename, index=False)\n", 2138 | " print(f\"Data saved to {csv_filename}\")\n", 2139 | "\n", 2140 | " # Upload the DataFrame to BigQuery\n", 2141 | " upload_to_bigquery(ga4_df, table_id)\n", 2142 | " except Exception as e:\n", 2143 | " print(f\"Error occurred: {e}\")\n", 2144 | "\n", 2145 | "if __name__ == '__main__':\n", 2146 | " main()\n" 2147 | ], 2148 | "metadata": { 2149 | "id": "CgQ4MPAf1A_b" 2150 | }, 2151 | "execution_count": null, 2152 | "outputs": [] 2153 | }, 2154 | { 2155 | "cell_type": "code", 2156 | "source": [ 2157 | "import pandas as pd\n", 2158 | "from google.cloud import bigquery\n", 2159 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n", 2160 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n", 2161 | "from google.oauth2 import service_account\n", 2162 | "from google_auth_oauthlib.flow import Flow\n", 2163 | "import json\n", 2164 | "import os\n", 2165 | "import pickle\n", 2166 | "\n", 2167 | "# Load configuration from a JSON file\n", 2168 | "with open(\"config.json\", \"r\") as f:\n", 2169 | " config = json.load(f)\n", 2170 | "\n", 2171 | "# Authenticate with service account for BigQuery\n", 2172 | "creds1 = service_account.Credentials.from_service_account_file(\n", 2173 | " config['SERVICE_ACCOUNT_FILE'],\n", 2174 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n", 2175 | ")\n", 2176 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n", 2177 | "\n", 2178 | "# Authenticate for GA4 Analytics Data API using OAuth2\n", 2179 | "def authenticate_ga4():\n", 2180 | " creds = None\n", 2181 | " if os.path.exists('token.pickle'):\n", 2182 | " with open('token.pickle', 'rb') as token:\n", 2183 | " creds = pickle.load(token)\n", 2184 | " else:\n", 2185 | " flow = Flow.from_client_secrets_file(\n", 2186 | " config['CLIENT_SECRET_FILE'],\n", 2187 | " scopes=config['SCOPES'],\n", 2188 | " redirect_uri='http://localhost:8080/'\n", 2189 | " )\n", 2190 | " auth_url, _ = flow.authorization_url(prompt='consent')\n", 2191 | " print('Please go to this URL and finish the authentication: ', auth_url)\n", 2192 | " code = input('Enter the authorization code: ')\n", 2193 | " flow.fetch_token(code=code)\n", 2194 | " creds = flow.credentials\n", 2195 | " with open('token.pickle', 'wb') as token:\n", 2196 | " pickle.dump(creds, token)\n", 2197 | " return creds\n", 2198 | "\n", 2199 | "# Function to paginate and fetch GA4 report data with logging\n", 2200 | "def run_report_with_pagination(client, request, limit=250000):\n", 2201 | " all_rows = []\n", 2202 | " offset = 0\n", 2203 | " page_number = 1\n", 2204 | "\n", 2205 | " while True:\n", 2206 | " # Apply offset and limit to request\n", 2207 | " request.offset = offset\n", 2208 | " request.limit = limit\n", 2209 | "\n", 2210 | " # Fetch report data\n", 2211 | " response = client.run_report(request)\n", 2212 | " all_rows.extend(response.rows)\n", 2213 | "\n", 
2214 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n", 2215 | "\n", 2216 | " # If fewer rows are fetched than the limit, we're done\n", 2217 | " if len(response.rows) < limit:\n", 2218 | " break\n", 2219 | "\n", 2220 | " # Update offset and page number to get the next set of rows\n", 2221 | " offset += limit\n", 2222 | " page_number += 1\n", 2223 | "\n", 2224 | " return all_rows\n", 2225 | "\n", 2226 | "# Function to fetch GA4 data using pagination\n", 2227 | "def get_ga4_report(client):\n", 2228 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n", 2229 | " request = RunReportRequest(\n", 2230 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n", 2231 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n", 2232 | " dimensions=[\n", 2233 | " Dimension(name='date'),\n", 2234 | " Dimension(name='pageLocation') # New dimension\n", 2235 | " ],\n", 2236 | " metrics=[\n", 2237 | " Metric(name='totalUsers'),\n", 2238 | " Metric(name='ecommercePurchases'),\n", 2239 | " Metric(name='purchaseRevenue'),\n", 2240 | " Metric(name='screenPageViews'),\n", 2241 | " Metric(name='eventCount'),\n", 2242 | " Metric(name='engagementRate') # New metrics\n", 2243 | " ]\n", 2244 | " )\n", 2245 | " return run_report_with_pagination(client, request)\n", 2246 | "\n", 2247 | "# Function to convert GA4 response to a DataFrame\n", 2248 | "def response_to_dataframe(response):\n", 2249 | " list_rows = []\n", 2250 | " for row in response:\n", 2251 | " try:\n", 2252 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n", 2253 | " except ValueError:\n", 2254 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n", 2255 | " page_location = row.dimension_values[1].value # New dimension\n", 2256 | " list_rows.append({\n", 2257 | " 'date': date_value,\n", 2258 | " 'pageLocation': page_location, # New dimension\n", 2259 | " 'totalUsers': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n", 2260 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n", 2261 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n", 2262 | " 'screenPageViews': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0,\n", 2263 | " 'eventCount': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0,\n", 2264 | " 'engagementRate': pd.to_numeric(row.metric_values[5].value, errors='coerce') or 0 # New metric\n", 2265 | " })\n", 2266 | " return pd.DataFrame(list_rows)\n", 2267 | "\n", 2268 | "# Function to upload data to BigQuery\n", 2269 | "def upload_to_bigquery(df, table_id):\n", 2270 | " # Define BigQuery schema\n", 2271 | " schema = [\n", 2272 | " bigquery.SchemaField(\"date\", \"DATE\"),\n", 2273 | " bigquery.SchemaField(\"pageLocation\", \"STRING\"), # New dimension\n", 2274 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n", 2275 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n", 2276 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\"),\n", 2277 | " bigquery.SchemaField(\"screenPageViews\", \"INTEGER\"), # New metric\n", 2278 | " bigquery.SchemaField(\"eventCount\", \"INTEGER\"), # New metric\n", 2279 | " bigquery.SchemaField(\"engagementRate\", \"FLOAT\") # New metric\n", 2280 | " ]\n", 2281 | "\n", 2282 | " # Configure BigQuery job to partition the table by the 'date' column\n", 2283 | " table_ref = 
f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n", 2284 | " job_config = bigquery.LoadJobConfig(\n", 2285 | " schema=schema,\n", 2286 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n", 2287 | " time_partitioning=bigquery.TimePartitioning(\n", 2288 | " type_=bigquery.TimePartitioningType.DAY,\n", 2289 | " field='date'\n", 2290 | " )\n", 2291 | " )\n", 2292 | "\n", 2293 | " # Upload the DataFrame to BigQuery\n", 2294 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n", 2295 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n", 2296 | "\n", 2297 | "# Main function\n", 2298 | "def main():\n", 2299 | " try:\n", 2300 | " # Authenticate GA4 using OAuth2\n", 2301 | " creds = authenticate_ga4()\n", 2302 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n", 2303 | "\n", 2304 | " # Fetch GA4 data\n", 2305 | " ga4_response = get_ga4_report(client_ga4)\n", 2306 | "\n", 2307 | " # Convert the response to a DataFrame\n", 2308 | " ga4_df = response_to_dataframe(ga4_response)\n", 2309 | "\n", 2310 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n", 2311 | " table_id = 'ga4_page_location_data' # New table name\n", 2312 | " csv_filename = f\"{table_id}.csv\"\n", 2313 | "\n", 2314 | " # Save the DataFrame to a CSV file\n", 2315 | " ga4_df.to_csv(csv_filename, index=False)\n", 2316 | " print(f\"Data saved to {csv_filename}\")\n", 2317 | "\n", 2318 | " # Upload the DataFrame to BigQuery\n", 2319 | " upload_to_bigquery(ga4_df, table_id)\n", 2320 | " except Exception as e:\n", 2321 | " print(f\"Error occurred: {e}\")\n", 2322 | "\n", 2323 | "if __name__ == '__main__':\n", 2324 | " main()" 2325 | ], 2326 | "metadata": { 2327 | "id": "dTobTzk1nDNi" 2328 | }, 2329 | "execution_count": null, 2330 | "outputs": [] 2331 | }, 2332 | { 2333 | "cell_type": "code", 2334 | "source": [ 2335 | "import pandas as pd\n", 2336 | "from google.cloud import bigquery\n", 2337 | "from google.analytics.data_v1beta import BetaAnalyticsDataClient\n", 2338 | "from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest\n", 2339 | "from google.oauth2 import service_account\n", 2340 | "from google_auth_oauthlib.flow import Flow\n", 2341 | "import json\n", 2342 | "import os\n", 2343 | "import pickle\n", 2344 | "\n", 2345 | "# Load configuration from a JSON file\n", 2346 | "with open(\"config.json\", \"r\") as f:\n", 2347 | " config = json.load(f)\n", 2348 | "\n", 2349 | "# Authenticate with service account for BigQuery\n", 2350 | "creds1 = service_account.Credentials.from_service_account_file(\n", 2351 | " config['SERVICE_ACCOUNT_FILE'],\n", 2352 | " scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery']\n", 2353 | ")\n", 2354 | "bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id)\n", 2355 | "\n", 2356 | "# Authenticate for GA4 Analytics Data API using OAuth2\n", 2357 | "def authenticate_ga4():\n", 2358 | " creds = None\n", 2359 | " if os.path.exists('token.pickle'):\n", 2360 | " with open('token.pickle', 'rb') as token:\n", 2361 | " creds = pickle.load(token)\n", 2362 | " else:\n", 2363 | " flow = Flow.from_client_secrets_file(\n", 2364 | " config['CLIENT_SECRET_FILE'],\n", 2365 | " scopes=config['SCOPES'],\n", 2366 | " redirect_uri='http://localhost:8080/'\n", 2367 | " )\n", 2368 | " auth_url, _ = flow.authorization_url(prompt='consent')\n", 2369 | " print('Please go to this URL and finish the authentication: ', 
auth_url)\n", 2370 | " code = input('Enter the authorization code: ')\n", 2371 | " flow.fetch_token(code=code)\n", 2372 | " creds = flow.credentials\n", 2373 | " with open('token.pickle', 'wb') as token:\n", 2374 | " pickle.dump(creds, token)\n", 2375 | " return creds\n", 2376 | "\n", 2377 | "# Function to paginate and fetch GA4 report data with logging\n", 2378 | "def run_report_with_pagination(client, request, limit=250000):\n", 2379 | " all_rows = []\n", 2380 | " offset = 0\n", 2381 | " page_number = 1\n", 2382 | "\n", 2383 | " while True:\n", 2384 | " # Apply offset and limit to request\n", 2385 | " request.offset = offset\n", 2386 | " request.limit = limit\n", 2387 | "\n", 2388 | " # Fetch report data\n", 2389 | " response = client.run_report(request)\n", 2390 | " all_rows.extend(response.rows)\n", 2391 | "\n", 2392 | " print(f\"Fetching data... Page {page_number}, Offset: {offset}, Rows fetched: {len(response.rows)}\")\n", 2393 | "\n", 2394 | " # If fewer rows are fetched than the limit, we're done\n", 2395 | " if len(response.rows) < limit:\n", 2396 | " break\n", 2397 | "\n", 2398 | " # Update offset and page number to get the next set of rows\n", 2399 | " offset += limit\n", 2400 | " page_number += 1\n", 2401 | "\n", 2402 | " return all_rows\n", 2403 | "\n", 2404 | "# Function to fetch GA4 data using pagination\n", 2405 | "def get_ga4_report(client):\n", 2406 | " \"\"\"Fetches GA4 data based on the defined dimensions and metrics.\"\"\"\n", 2407 | " request = RunReportRequest(\n", 2408 | " property=f'properties/{config[\"PROPERTY_ID\"]}',\n", 2409 | " date_ranges=[DateRange(start_date=config['INITIAL_FETCH_FROM_DATE'], end_date=config['FETCH_TO_DATE'])],\n", 2410 | " dimensions=[\n", 2411 | " Dimension(name='date'),\n", 2412 | " Dimension(name='landingPage')\n", 2413 | " ],\n", 2414 | " metrics=[\n", 2415 | " Metric(name='totalUsers'),\n", 2416 | " Metric(name='ecommercePurchases'),\n", 2417 | " Metric(name='purchaseRevenue'),\n", 2418 | " Metric(name='sessions'),\n", 2419 | " Metric(name='eventCount'),\n", 2420 | " Metric(name='engagementRate')\n", 2421 | " ]\n", 2422 | " )\n", 2423 | " return run_report_with_pagination(client, request)\n", 2424 | "\n", 2425 | "# Function to convert GA4 response to a DataFrame\n", 2426 | "def response_to_dataframe(response):\n", 2427 | " list_rows = []\n", 2428 | " for row in response:\n", 2429 | " try:\n", 2430 | " date_value = pd.to_datetime(row.dimension_values[0].value, format='%Y%m%d')\n", 2431 | " except ValueError:\n", 2432 | " date_value = pd.NaT # Use Not-a-Time for dates that fail to convert\n", 2433 | " landing_page = row.dimension_values[1].value\n", 2434 | " list_rows.append({\n", 2435 | " 'date': date_value,\n", 2436 | " 'landingPage': landing_page,\n", 2437 | " 'totalUsers': pd.to_numeric(row.metric_values[0].value, errors='coerce') or 0,\n", 2438 | " 'ecommercePurchases': pd.to_numeric(row.metric_values[1].value, errors='coerce') or 0,\n", 2439 | " 'purchaseRevenue': pd.to_numeric(row.metric_values[2].value, errors='coerce') or 0,\n", 2440 | " 'sessions': pd.to_numeric(row.metric_values[3].value, errors='coerce') or 0,\n", 2441 | " 'eventCount': pd.to_numeric(row.metric_values[4].value, errors='coerce') or 0,\n", 2442 | " 'engagementRate': pd.to_numeric(row.metric_values[5].value, errors='coerce') or 0\n", 2443 | " })\n", 2444 | " return pd.DataFrame(list_rows)\n", 2445 | "\n", 2446 | "# Function to upload data to BigQuery\n", 2447 | "def upload_to_bigquery(df, table_id):\n", 2448 | " # Define BigQuery schema\n", 2449 | " schema = 
[\n", 2450 | " bigquery.SchemaField(\"date\", \"DATE\"),\n", 2451 | " bigquery.SchemaField(\"landingPage\", \"STRING\"),\n", 2452 | " bigquery.SchemaField(\"totalUsers\", \"INTEGER\"),\n", 2453 | " bigquery.SchemaField(\"ecommercePurchases\", \"INTEGER\"),\n", 2454 | " bigquery.SchemaField(\"purchaseRevenue\", \"FLOAT\"),\n", 2455 | " bigquery.SchemaField(\"sessions\", \"INTEGER\"),\n", 2456 | " bigquery.SchemaField(\"eventCount\", \"INTEGER\"),\n", 2457 | " bigquery.SchemaField(\"engagementRate\", \"FLOAT\")\n", 2458 | " ]\n", 2459 | "\n", 2460 | " # Configure BigQuery job to partition the table by the 'date' column\n", 2461 | " table_ref = f\"{bq_client.project}.{config['DATASET_ID']}.{table_id}\"\n", 2462 | " job_config = bigquery.LoadJobConfig(\n", 2463 | " schema=schema,\n", 2464 | " write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n", 2465 | " time_partitioning=bigquery.TimePartitioning(\n", 2466 | " type_=bigquery.TimePartitioningType.DAY,\n", 2467 | " field='date'\n", 2468 | " )\n", 2469 | " )\n", 2470 | "\n", 2471 | " # Upload the DataFrame to BigQuery\n", 2472 | " bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config).result()\n", 2473 | " print(f\"Data uploaded and partitioned by date to {table_ref}\")\n", 2474 | "\n", 2475 | "# Main function\n", 2476 | "def main():\n", 2477 | " try:\n", 2478 | " # Authenticate GA4 using OAuth2\n", 2479 | " creds = authenticate_ga4()\n", 2480 | " client_ga4 = BetaAnalyticsDataClient(credentials=creds)\n", 2481 | "\n", 2482 | " # Fetch GA4 data\n", 2483 | " ga4_response = get_ga4_report(client_ga4)\n", 2484 | "\n", 2485 | " # Convert the response to a DataFrame\n", 2486 | " ga4_df = response_to_dataframe(ga4_response)\n", 2487 | "\n", 2488 | " # Define the BigQuery table ID and CSV filename (same as table ID)\n", 2489 | " table_id = 'ga4_landing_page_data'\n", 2490 | " csv_filename = f\"{table_id}.csv\"\n", 2491 | "\n", 2492 | " # Save the DataFrame to a CSV file\n", 2493 | " ga4_df.to_csv(csv_filename, index=False)\n", 2494 | " print(f\"Data saved to {csv_filename}\")\n", 2495 | "\n", 2496 | " # Upload the DataFrame to BigQuery\n", 2497 | " upload_to_bigquery(ga4_df, table_id)\n", 2498 | " except Exception as e:\n", 2499 | " print(f\"Error occurred: {e}\")\n", 2500 | "\n", 2501 | "if __name__ == '__main__':\n", 2502 | " main()" 2503 | ], 2504 | "metadata": { 2505 | "id": "T5zaPVqat6kl" 2506 | }, 2507 | "execution_count": null, 2508 | "outputs": [] 2509 | } 2510 | ] 2511 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Backfill-GA4-to-BigQuery 2 | Backfill-GA4-to-BigQuery" repository offers a solution for users to backfill their GA4 data into BigQuery. This is useful for those who need historical data from the start of their GA4 property, as GA4 data is typically only available in BigQuery after linking the two services. my solution provides a Game-Changer backfill of data to BigQuery. It uses OAuth 2.0 credentials for desktop applications, making authentication easier and well-suited for IDEs like Google Colab. 3 | 4 | ## What's New 5 | 6 | I've added a **notebook version** of the code for working with GA4 data using Python and BigQuery! 7 | 8 | - If you prefer **straightforward, ready-to-use scripts** for creating GA4-like tables with minimal effort, the notebook provides a streamlined approach for quick setup. 
9 | - For those looking to **customize the dimensions, metrics, or data handling processes**, the original main code remains your go-to option for flexibility and control. 10 | 11 | 12 | 13 | ## Table of Contents 14 | 1. [Features](#features) 15 | 2. [Prerequisites](#prerequisites) 16 | 3. [Setup and Installation](#setup-and-installation) 17 |    - [Step 1: Create a New Project and Activate Analytics API](#step-1-create-a-new-project-and-activate-analytics-api) 18 |    - [Step 2: Creating a Service Account](#step-2-creating-a-service-account) 19 |    - [Step 3: Setting Up OAuth for Desktop App](#step-3-setting-up-oauth-for-desktop-app) 20 |    - [Step 4: Configuration File](#step-4-configuration-file) 21 |    - [Step 5: Installation of Dependencies](#step-5-installation-of-dependencies) 22 |    - [Step 6: Running the Script](#step-6-running-the-script) 23 |    - [Step 7: Authentication](#step-7-authentication) 24 |    - [Step 8: QA](#step-8-qa) 25 | 4. [Using the Pre-Built Notebook for GA4 Reports](#using-the-pre-built-notebook-for-ga4-reports) 26 | 5. [Troubleshooting](#troubleshooting) 27 | 6. [Customization](#customization) 28 | 7. [Contributing](#contributing) 29 | 8. [Contact](#contact) 30 | 31 | 32 | 33 | ## Features 34 | 35 | - **OAuth 2.0 Authentication**: Simplifies authentication with OAuth 2.0 credentials, ideal for desktop apps and environments like Google Colab. 36 | 37 | - **Service Account Integration**: Securely connects to Google Cloud services using service accounts for enhanced security. 38 | 39 | - **Data Extraction from GA4**: Fetches comprehensive GA4 data from a specified start date, ideal for historical data backfilling. 40 | 41 | - **Customizable Configuration**: Offers a `config.json` file for user-specific settings like table prefixes and property IDs. 42 | 43 | - **BigQuery Integration**: Efficiently processes and stores data in BigQuery with proper schema management. 44 | 45 | - **Export Functionality**: Enables exporting GA4 data to CSV format for external use. 46 | 47 | - **Duplicate Check**: Incorporates mechanisms to avoid duplicate data entries in BigQuery. 48 | 49 | - **Flexible Data Retrieval**: Allows data fetching from a specific date or the previous day. 50 | 51 | - **Robust Error Handling**: Includes effective error handling and logging for smooth operation. 52 | 53 | - **Partitioning and Clustering**: Dynamic partitioning and clustering for optimized query performance and cost management. 54 | 55 | - **Configurable End Date Range**: Precise control over the data retrieval period, making it easier to manage data quotas and perform historical data analysis within a specific timeframe. 56 | 57 | 58 | 59 | ## Prerequisites 60 | - Google Cloud account with billing enabled. 61 | - Access to Google Analytics 4 and Google BigQuery. 62 | - Python environment (Python 3.x recommended). 63 | 64 | ## Setup and Installation 65 | 66 | ### Step 1: Create a New Project and Activate Analytics API 67 | - Go to [Google Cloud Console](https://console.cloud.google.com/apis/api/analyticsdata.googleapis.com/metrics) to activate the Analytics API in your selected project. 68 | 69 | ### Step 2: Creating a Service Account 70 | 71 | 1. **Access Google Cloud Console**: Visit the [Google Cloud Console](https://console.cloud.google.com/). 72 | 73 | 2. **Create a Service Account**: 74 |    - Navigate to "IAM & Admin > Service Accounts". 75 |    - Click "Create Service Account", enter a name, description, and click "Create".
76 | - Grant necessary roles to the service account (e.g., Owner or BigQuery Admin + BigQuery Job User). 77 | 78 | 3. **Generate a Service Account Key**: 79 | - Click on the created service account to manage it. 80 | - Go to the "Keys" tab and click "Add Key", then "Create new key". 81 | - Choose "JSON" as the key type and click "Create". 82 | - A JSON key file will be downloaded. Store it securely. 83 | 84 | 85 | ### Step 3: Setting Up OAuth for Desktop App 86 | 87 | To set up OAuth for a desktop application, you need to create an OAuth client ID in Google Cloud Console. Before creating an OAuth client ID, make sure to configure your consent screen if you don't have one already. 88 | 89 | #### Configure the Consent Screen: 90 | 91 | 1. **Access Consent Screen Configuration**: 92 | - In the Google Cloud Console, navigate to "APIs & Services > OAuth consent screen". 93 | - Select the external user type. 94 | 95 | 2. **Fill in Consent Screen Details**: 96 | - Provide the necessary information, such as the app name, user support email, and developer contact information. 97 | - Add your email (and others, if needed) in the "Test users" section. 98 | 99 | 3. **Publish the App**: 100 | - Once all necessary information is provided, save and publish your consent screen. 101 | 102 | #### Create OAuth 2.0 Client ID: 103 | 104 | 1. **Navigate to Credentials**: 105 | - Go to "APIs & Services > Credentials". 106 | 107 | 2. **Create OAuth Client ID**: 108 | - Click "Create Credentials" and select "OAuth client ID". 109 | - Choose "Desktop app" as the Application type. 110 | - Provide a name for the client ID and click "Create". 111 | 112 | 3. **Download Client Configuration**: 113 | - After the OAuth client ID is created, download the client configuration JSON file. 114 | - This file contains your client ID and secret, which are essential for the OAuth flow. 115 | 116 | #### Note: 117 | 118 | - The script uses a `token.pickle` file to store access tokens and refresh tokens. Once authenticated, you won't need to repeat the authentication process unless the token is revoked or expired. 119 | - Ensure that the JSON file is stored securely and referenced correctly in your project. 120 | 121 | 122 | ### Step 4: Configuration File 123 | Fill out and save a `config.json` file with your specific parameters. 124 | Example: 125 | ```json 126 | { 127 | "CLIENT_SECRET_FILE": "", 128 | "SERVICE_ACCOUNT_FILE": "", 129 | "SCOPES": ["https://www.googleapis.com/auth/analytics.readonly"], 130 | "PROPERTY_ID": "", 131 | "INITIAL_FETCH_FROM_DATE": "2022-01-01", 132 | "FETCH_TO_DATE": "today", 133 | "DATASET_ID": "", 134 | "TABLE_PREFIX": "_backfill_GA4", 135 | "PARTITION_BY": "Event_Date", 136 | "CLUSTER_BY": "Event_Name" 137 | } 138 | ``` 139 | - **Client Secret and Service Account File**: Replace the placeholders with the actual paths to your OAuth client secret and service account JSON files. 140 | 141 | - **Property ID and Dataset ID**: Insert your Google Analytics Property ID and the BigQuery Dataset ID where data will be stored. 142 | 143 | - **Initial Fetch Date**: Set the initial date from which to fetch historical data in `YYYY-MM-DD` format. 144 | 145 | - **FETCH_TO_DATE**: Specify the end date for data fetching. Defaults to today's date. Format: `YYYY-MM-DD`. 146 | 147 | - **Table Prefix**: Specify the prefix for your BigQuery tables. If the specified prefix does not exist, the script will create tables with this prefix in BigQuery. 148 | 149 | - **PARTITION_BY**: Specifies the column for table partitioning. 
Default is Event_Date, which is highly recommended for optimal data management. 150 | 151 | - **CLUSTER_BY**: Specifies the column(s) for table clustering. Default is Event_Name, aligning with common querying patterns. While this choice is optimal for many use cases, you may customize this field. 152 | ### Step 5: Installation of Dependencies 153 | Install the necessary Python packages: 154 | ```bash 155 | !pip install google-analytics-data==0.18.4 156 | !pip install google-cloud-bigquery 157 | !pip install google-auth==2.27.0 158 | !pip install google-auth-oauthlib 159 | !pip install google-auth-httplib2 160 | ``` 161 | 162 | ### Step 6: Running the Script 163 | 164 | After configuring the `config.json` file and saving the source code as `backfill-ga4.py`, run the script with the desired flags. 165 | 166 | - **Execute the Script with Flags**: 167 | - Use the `%run` command followed by the script name and the desired flag. 168 | - For fetching data from yesterday, use: 169 | ```bash 170 | %run backfill-ga4.py --yesterday 171 | ``` 172 | - For fetching data from the initial fetch date specified in your `config.json`, use: 173 | ```bash 174 | %run backfill-ga4.py --initial_fetch 175 | ``` 176 | - This will start the authentication flow. 177 | 178 | ### Step 7: Authentication 179 | 180 | - **Run the Script**: 181 | - Execute your Python script. 182 | - It will prompt you to open a URL for authentication. 183 | - Ensure that you choose a Google account that has the necessary access to the selected property. 184 | - If you did not verify your app earlier, select "Go to 'YourPublishedAPP' (unsafe)" to access the authentication code on localhost. 185 | - The authorization code appears in the redirect URL between "code=" and the next ampersand (see the screenshot below). 186 | [![Ga4-bigquery-Script Authentication.png](https://i.postimg.cc/5N2T2Hkj/authentication-image.png)](https://postimg.cc/6TFYHQKN) 187 | - Copy and paste this code back into the script. 188 | - The data retrieval process for the specified date range will then run to completion. The exported table should be 189 | visible both as a BigQuery table and as a downloadable CSV file. 190 | 191 | ### Step 8: QA 192 | 193 | - **Check for Successful Setup**: 194 | - Upon successful completion of the script, it should indicate that the authentication process is complete and data fetching has started. 195 | - Now, you should be able to see the new tables in your BigQuery dataset (the `DATASET_ID` specified in your `config.json`). 196 | - Additionally, the `output.csv` file in your project directory should contain the fetched data. 197 | - If the tables are visible and the CSV file has data, everything is set up correctly. 198 | 199 | 200 | ## Using the Pre-Built Notebook for GA4 Reports 201 | 202 | This repository now includes a **custom notebook** for exporting **13 of the most useful GA4 reports** into BigQuery and CSV format. This notebook simplifies the process, eliminating the need to dive into the source code. Follow the steps below to configure and run the notebook. 
Here is a clear breakdown of the tables that will be exported after running the notebook : 203 | 204 | 205 | | **Table Name** | **Dimensions** | **Metrics** | 206 | |-----------------------------------|---------------------------------------------|-------------------------------------------------------------------------------------------------| 207 | | `ga4_transaction_items` | `transactionId`, `itemName`, `date` | `itemPurchaseQuantity`, `itemRevenue` | 208 | | `ga4_data_session_channel_group` | `date`, `sessionDefaultChannelGroup` | `sessions`, `totalUsers`, `newUsers`, `ecommercePurchases`, `purchaseRevenue` | 209 | | `ga4_data_session_source_campaign_medium` | `date`, `sessionSource`, `sessionCampaignName`, `sessionMedium` | `sessions`, `totalUsers`, `newUsers`, `ecommercePurchases`, `purchaseRevenue` | 210 | | `ga4_data_country_language_city` | `date`, `country`, `language`, `city` | `sessions`, `screenPageViews`, `totalUsers`, `newUsers`, `ecommercePurchases`, `purchaseRevenue` | 211 | | `ga4_data_item_name` | `date`, `itemName` | `itemPurchaseQuantity`, `itemRevenue` | 212 | | `ga4_data_browser_os_device` | `date`, `browser`, `operatingSystem`, `deviceCategory` | `sessions`, `screenPageViews`, `totalUsers`, `newUsers`, `ecommercePurchases`, `purchaseRevenue` | 213 | | `ga4_data_first_user_source_medium` | `date`, `firstUserMedium`, `firstUserSource`, `firstUserCampaignName` | `totalUsers`, `newUsers`, `ecommercePurchases`, `purchaseRevenue` | 214 | | `ga4_data_first_user_channel_group` | `date`, `firstUserDefaultChannelGroup` | `totalUsers`, `newUsers`, `ecommercePurchases`, `purchaseRevenue` | 215 | | `ga4_ads_data` | `date`, `sessionSource`, `sessionMedium`, `sessionCampaignName` | `ecommercePurchases`, `averagePurchaseRevenue`, `purchaseRevenue`, `advertiserAdClicks`, `advertiserAdCost`, `advertiserAdCostPerClick`, `returnOnAdSpend` | 216 | | `ga4_all_metrics_data` | `date` | `sessions`, `totalUsers`, `newUsers`, `ecommercePurchases`, `purchaseRevenue`, `screenPageViews`, `eventCount`, `averageSessionDuration`, `engagedSessions`, `engagementRate` | 217 | | `ga4_event_metrics_data` | `date`, `eventName` | `eventCount`, `eventCountPerUser`, `eventValue` | 218 | | `ga4_page_location_data` | `date`, `pageLocation` | `totalUsers`, `ecommercePurchases`, `purchaseRevenue`, `screenPageViews`, `eventCount`, `engagementRate` | 219 | | `ga4_landing_page_data` | `date`, `landingPage` | `totalUsers`, `ecommercePurchases`, `purchaseRevenue`, `sessions`, `eventCount`, `engagementRate` | 220 | 221 | 222 | 223 | ### Steps to Use the Notebook 224 | 225 | 1. **Initial Steps**: 226 | The first three steps (creating a dataset, activating the Analytics API, and setting up OAuth) remain the same as detailed in the [Setup and Installation](#setup-and-installation) section. 227 | 228 | 2. **Prepare the Configuration File (`config.json`)**: 229 | Use the following template for the `config.json` file: 230 | ```json 231 | { 232 | "CLIENT_SECRET_FILE": "/path/to/your/client_secret.json", 233 | "SERVICE_ACCOUNT_FILE": "/path/to/your/service_account.json", 234 | "PROPERTY_ID": "", 235 | "INITIAL_FETCH_FROM_DATE": "YYYY-MM-DD", 236 | "FETCH_TO_DATE": "today", 237 | "DATASET_ID": "", 238 | "SCOPES": ["https://www.googleapis.com/auth/analytics.readonly", "https://www.googleapis.com/auth/bigquery"] 239 | } 240 | ``` 241 | Replace placeholders with your project-specific details. 242 | 243 | 3. **Run the Notebook**: 244 | - Upload the `config.json` file to the notebook directory. 
245 | - Open and execute the cells in the notebook sequentially. 246 | - During execution, you will be prompted to authorize access. Follow the instructions to complete the OAuth flow. 247 | - Once authorized, the script will fetch the data and save it to BigQuery and a downloadable CSV. 248 | 249 | 250 | ## Troubleshooting 251 | 252 | ### AttributeError on Script Execution 253 | 254 | **Issue:** Encountering an `AttributeError` related to `credentials.universe_domain` when running the script. 255 | 256 | **Solution:** This is likely due to version mismatches in the `google-auth` and `google-analytics-data` libraries. Resolve it by upgrading both libraries: 257 | 258 | ```shell 259 | pip install --upgrade google-analytics-data google-auth 260 | ``` 261 | 262 | Run this command in your terminal or command prompt to ensure you're using compatible versions, which should fix the issue. 263 | 264 | 265 | ## Customization 266 | 267 | Your project can be customized to fetch different metrics and dimensions based on your specific needs. Use the [Google Analytics Data API schema](https://developers.google.com/analytics/devguides/reporting/data/v1/api-schema) to understand the available metrics and dimensions. You can then modify the script to query different sets of data from your Google Analytics account. 268 | 269 | - **Tailor Metrics and Dimensions**: In the script, identify the sections where API requests are constructed and modify the `metrics` and `dimensions` according to your requirements (a sketch of such a change appears after the Contributing section below). 270 | - **Consult API Schema**: The API schema documentation provides a comprehensive list of all available metrics and dimensions, along with their descriptions and usage. 271 | 272 | 273 | ## Contributing 274 | 275 | Contributions to this project are welcome! Here's how you can help: 276 | 277 | - **Reporting Issues**: Report issues or bugs by opening a new issue in the GitHub repository. 278 | - **Feature Requests**: If you have ideas for new features or improvements, feel free to create an issue describing your suggestion. 279 | - **Submitting Pull Requests**: You can contribute directly to the codebase. Please ensure your code adheres to the project's coding standards and include tests for new features. 
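
### Example: Customizing a Report Request

As mentioned in the Customization section above, tailoring the exported data comes down to editing the `RunReportRequest` objects in `backfill-ga4.py`. The sketch below is illustrative and not part of the original script: it assumes the variables the script already defines (`PROPERTY_ID`, `start_date`, `end_date`, `client`, and the `run_report_with_pagination` helper), and the dimensions and metrics shown (`country`, `deviceCategory`, `sessions`, `totalUsers`) are only example field names from the GA4 API schema.

```python
from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest

# Example request: daily sessions and users split by country and device category.
# PROPERTY_ID, start_date, end_date, client, and run_report_with_pagination are
# assumed to be the objects already defined in backfill-ga4.py.
request_custom = RunReportRequest(
    property=f'properties/{PROPERTY_ID}',
    date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
    dimensions=[
        Dimension(name='date'),
        Dimension(name='country'),
        Dimension(name='deviceCategory'),
    ],
    metrics=[
        Metric(name='sessions'),
        Metric(name='totalUsers'),
    ],
)

# Reuse the script's pagination helper so large result sets are fetched completely.
custom_rows = run_report_with_pagination(client, request_custom)
```

If you change the fields here, remember to adjust the CSV columns and the BigQuery schema the script writes to, so the downstream table matches the new report.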
280 | 281 | 282 | -------------------------------------------------------------------------------- /backfill-GA4-schema.md: -------------------------------------------------------------------------------- 1 | [![Schema of Result Table](https://i.postimg.cc/SQgmXLpp/image.png)](https://postimg.cc/hhQk6djC) 2 | -------------------------------------------------------------------------------- /backfill-ga4.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import argparse 3 | import datetime 4 | import sys 5 | import json 6 | import os 7 | import pickle 8 | from google.analytics.data_v1beta import BetaAnalyticsDataClient, OrderBy 9 | from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest 10 | from google.cloud import bigquery 11 | from google.oauth2 import service_account 12 | from google.cloud.exceptions import NotFound 13 | from google_auth_oauthlib.flow import Flow 14 | 15 | # Load configuration 16 | with open("config.json", "r") as f: 17 | config = json.load(f) 18 | 19 | # Function to validate and ensure FETCH_TO_DATE is correct 20 | def get_valid_end_date(end_date_str): 21 | try: 22 | valid_end_date = datetime.datetime.strptime(end_date_str, '%Y-%m-%d').date() 23 | except (TypeError, ValueError): 24 | valid_end_date = datetime.date.today() 25 | return valid_end_date 26 | 27 | # Validate or default FETCH_TO_DATE from the config 28 | valid_end_date = get_valid_end_date(config.get('FETCH_TO_DATE')) 29 | FETCH_TO_DATE = valid_end_date.strftime('%Y-%m-%d') 30 | 31 | # Function to check if an event exists in BigQuery 32 | def exists_in_bigquery(event_name, event_date, event_count, channel_group, dataset_id, bq_client): 33 | year = event_date[:4] 34 | month = event_date[4:6] 35 | table_id = f'{TABLE_PREFIX}{year}{month}01' 36 | table_ref = bq_client.dataset(dataset_id).table(table_id) 37 | 38 | try: 39 | bq_client.get_table(table_ref) 40 | except NotFound: 41 | return False 42 | # NOTE: `query` is used below but was never defined in the original file; the statement on the next line is an assumed reconstruction based on the table schema and query parameters. 43 | query = f"SELECT COUNT(*) FROM `{bq_client.project}.{dataset_id}.{table_id}` WHERE Event_Name = @event_name AND Event_Date = PARSE_DATE('%Y%m%d', CAST(@event_date AS STRING)) AND Event_Count = @event_count AND Channel = @channel_group" 44 | params = [ 45 | bigquery.ScalarQueryParameter('event_name', 'STRING', event_name), 46 | bigquery.ScalarQueryParameter('event_date', 'INTEGER', event_date), 47 | bigquery.ScalarQueryParameter('event_count', 'INTEGER', event_count), 48 | bigquery.ScalarQueryParameter('channel_group', 'STRING', channel_group) 49 | ] 50 | 51 | job_config = bigquery.QueryJobConfig() 52 | job_config.query_parameters = params 53 | 54 | result = bq_client.query(query, job_config=job_config).result() 55 | count = list(result)[0][0] 56 | 57 | if count > 0: 58 | print(f"..record already exists in BigQuery ({count})", flush=True) 59 | 60 | return count > 0 61 | 62 | def get_table_ref(year, month): 63 | table_id = f'{TABLE_PREFIX}{year}{month}01' 64 | return bq_client.dataset(DATASET_ID).table(table_id) 65 | 66 | # Configuration parameters 67 | CLIENT_SECRET_FILE = config['CLIENT_SECRET_FILE'] 68 | SCOPES = config['SCOPES'] 69 | TABLE_PREFIX = config['TABLE_PREFIX'] 70 | PROPERTY_ID = config['PROPERTY_ID'] 71 | DATASET_ID = config['DATASET_ID'] 72 | INITIAL_FETCH_FROM_DATE = config['INITIAL_FETCH_FROM_DATE'] 73 | SERVICE_ACCOUNT_FILE = config['SERVICE_ACCOUNT_FILE'] 74 | PARTITION_BY = config.get('PARTITION_BY', 'Event_Date') # Default to Event_Date 75 | CLUSTER_BY = config.get('CLUSTER_BY', 'Event_Name') 76 | FETCH_TO_DATE = config.get('FETCH_TO_DATE', datetime.date.today().strftime('%Y-%m-%d')) 77 | 78 | 79 | # Command line arguments for date range 80 | parser = argparse.ArgumentParser(description='Fetch data based on date range.') 81 | 
parser.add_argument('--yesterday', action='store_true', help='Fetch data from yesterday only.') 82 | parser.add_argument('--initial_fetch', action='store_true', help='Fetch data from a wide date range.') 83 | args = parser.parse_args() 84 | 85 | # Determine date range 86 | start_date = None 87 | end_date = None 88 | if args.yesterday: 89 | date = datetime.date.today() - datetime.timedelta(days=1) 90 | start_date = end_date = date.strftime('%Y-%m-%d') 91 | elif args.initial_fetch: 92 | confirmation = input("Using the initial_fetch might result in duplicated records. Do you want to proceed? (yes/no): ").strip().lower() 93 | if confirmation == 'yes': 94 | start_date = INITIAL_FETCH_FROM_DATE 95 | end_date = FETCH_TO_DATE 96 | else: 97 | print("Exiting script due to user cancellation.", flush=True) 98 | sys.exit() 99 | else: 100 | print("No valid date range argument provided. Exiting script.", flush=True) 101 | sys.exit() 102 | print(f"Starting fetching data from {start_date} to {valid_end_date.strftime('%Y-%m-%d')}.", flush=True) 103 | 104 | # Authenticate with service account for BigQuery 105 | creds1 = service_account.Credentials.from_service_account_file( 106 | SERVICE_ACCOUNT_FILE, 107 | scopes=['https://www.googleapis.com/auth/analytics.readonly', 'https://www.googleapis.com/auth/bigquery'] 108 | ) 109 | bq_client = bigquery.Client(credentials=creds1, project=creds1.project_id) 110 | 111 | # Authenticate for Analytics Data API 112 | if os.path.exists('token.pickle'): 113 | with open('token.pickle', 'rb') as token: 114 | creds = pickle.load(token) 115 | else: 116 | # Create the flow using the client secrets file 117 | flow = Flow.from_client_secrets_file( 118 | CLIENT_SECRET_FILE, 119 | scopes=SCOPES, 120 | redirect_uri='http://localhost:8080/' 121 | ) 122 | 123 | # Generate the authorization URL 124 | auth_url, _ = flow.authorization_url(prompt='consent') 125 | 126 | print('Please go to this URL and finish the authentication: ', auth_url) 127 | code = input('Enter the authorization code: ') 128 | flow.fetch_token(code=code) 129 | 130 | creds = flow.credentials 131 | 132 | # Save the credentials for future use 133 | with open('token.pickle', 'wb') as token: 134 | pickle.dump(creds, token) 135 | 136 | print("Authentication successful!") 137 | 138 | client = BetaAnalyticsDataClient(credentials=creds) 139 | 140 | # Function to run report with pagination 141 | def run_report_with_pagination(client, request): 142 | all_rows = [] 143 | offset = 0 # Initialize offset 144 | limit = 10000 # Set limit (maximum rows per request) 145 | 146 | while True: 147 | # Apply offset and limit to request 148 | request.offset = offset 149 | request.limit = limit 150 | 151 | response = client.run_report(request) 152 | all_rows.extend(response.rows) 153 | 154 | # Check if there are more rows to fetch 155 | if len(response.rows) == limit: 156 | offset += limit # Increase offset for the next iteration 157 | else: 158 | break # No more rows left, exit loop 159 | 160 | return all_rows 161 | 162 | # Requests for active users and events 163 | request_active_users = RunReportRequest( 164 | property=f'properties/{PROPERTY_ID}', 165 | date_ranges=[DateRange(start_date=start_date, end_date=end_date)], 166 | dimensions=[ 167 | Dimension(name='date'), 168 | Dimension(name='sessionDefaultChannelGroup') 169 | ], 170 | metrics=[Metric(name='sessions')], 171 | order_bys=[OrderBy({"dimension": {"dimension_name": "date"}})] 172 | ) 173 | 174 | active_users = run_report_with_pagination(client, request_active_users) 175 | 176 | 
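# Second report: event-level counts broken down by event name, date, conversion flag, and session channel group, fetched with the same pagination helper as the traffic report above.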
request_events = RunReportRequest( 177 | property=f'properties/{PROPERTY_ID}', 178 | date_ranges=[DateRange(start_date=start_date, end_date=end_date)], 179 | dimensions=[Dimension(name='eventName'), Dimension(name='date'), Dimension(name='isConversionEvent'), Dimension(name='sessionDefaultChannelGroup')], 180 | metrics=[Metric(name='eventCount')] 181 | ) 182 | 183 | all_events = run_report_with_pagination(client, request_events) 184 | 185 | # Process and write data to CSV 186 | rows_by_month = {} 187 | 188 | with open('output.csv', 'w', newline='', encoding='utf-8') as csvfile: 189 | csv_writer = csv.writer(csvfile) 190 | csv_writer.writerow(['Event Name', 'Event Date', 'Event Count', 'Is Conversion', 'Channel', 'Event_Type']) 191 | 192 | # Processing active users data 193 | for row in active_users: 194 | event_name = "ct_active_users" 195 | is_conversion = None 196 | event_date = row.dimension_values[0].value 197 | channel_group = row.dimension_values[1].value 198 | event_count = row.metric_values[0].value 199 | event_type = "Traffic" 200 | 201 | csv_writer.writerow([event_name, event_date, event_count, is_conversion, channel_group, event_type]) 202 | 203 | # Check for existing records in BigQuery 204 | if not (args.yesterday and exists_in_bigquery(event_name, event_date, event_count, channel_group, DATASET_ID, bq_client)): 205 | year, month = event_date[:4], event_date[4:6] 206 | key = (year, month) 207 | rows_by_month.setdefault(key, []).append({ 208 | "Event_Name": event_name, 209 | "Event_Date": event_date, 210 | "Event_Count": event_count, 211 | "Is_Conversion": is_conversion, 212 | "Channel": channel_group, 213 | "Event_Type": event_type 214 | }) 215 | 216 | # Sort and process events data 217 | sorted_events = sorted(all_events, key=lambda x: x.dimension_values[1].value) 218 | for row in sorted_events: 219 | event_name = row.dimension_values[0].value 220 | event_date = row.dimension_values[1].value 221 | is_conversion = row.dimension_values[2].value 222 | 223 | if is_conversion == "(not set)": 224 | is_conversion = "" 225 | 226 | channel_group = row.dimension_values[3].value 227 | event_count = row.metric_values[0].value 228 | 229 | is_conversion = bool(is_conversion) 230 | event_type = "Conversion" if is_conversion else "Event" 231 | 232 | csv_writer.writerow([event_name, event_date, event_count, is_conversion, channel_group, event_type]) 233 | 234 | # Check for existing records in BigQuery 235 | if not (args.yesterday and exists_in_bigquery(event_name, event_date, event_count, channel_group, DATASET_ID, bq_client)): 236 | year, month = event_date[:4], event_date[4:6] 237 | key = (year, month) 238 | rows_by_month.setdefault(key, []).append({ 239 | "Event_Name": event_name, 240 | "Event_Date": event_date, 241 | "Event_Count": event_count, 242 | "Is_Conversion": is_conversion, 243 | "Channel": channel_group, 244 | "Event_Type": event_type 245 | }) 246 | 247 | print("Data saved to output.csv!", flush=True) 248 | 249 | def create_or_update_table_with_partition_and_cluster(dataset_id, simple_table_id, schema, partition_by=None, cluster_by=None): 250 | full_table_id = f"{bq_client.project}.{dataset_id}.{simple_table_id}" # Correctly construct the full table ID 251 | table = bigquery.Table(full_table_id, schema=schema) 252 | 253 | if partition_by: 254 | table.time_partitioning = bigquery.TimePartitioning(field=partition_by) 255 | 256 | if cluster_by: 257 | table.clustering_fields = [cluster_by] 258 | 259 | try: 260 | # Attempt to create the table, or if it exists, confirm it's updated 261 
| created_table = bq_client.create_table(table, exists_ok=True) 262 | print(f"Table {created_table.full_table_id} created or confirmed existing with specified settings.") 263 | except Exception as e: 264 | print(f"Error creating or confirming table: {e}") 265 | 266 | TABLE_PREFIX = config.get('TABLE_PREFIX') # Handle potential absence of key 267 | DATASET_ID = config['DATASET_ID'] 268 | 269 | schema = [ 270 | bigquery.SchemaField("Event_Name", "STRING", mode="NULLABLE"), 271 | bigquery.SchemaField("Event_Date", "DATE", mode="NULLABLE"), 272 | bigquery.SchemaField("Event_Count", "INTEGER", mode="NULLABLE"), 273 | bigquery.SchemaField("Is_Conversion", "BOOLEAN", mode="NULLABLE"), 274 | bigquery.SchemaField("Channel", "STRING", mode="NULLABLE"), 275 | bigquery.SchemaField("Event_Type", "STRING", mode="NULLABLE"), 276 | ] 277 | 278 | def format_event_date(event_date): 279 | return f"{event_date[:4]}-{event_date[4:6]}-{event_date[6:]}" 280 | 281 | table_id = f"{bq_client.project}.{DATASET_ID}.{TABLE_PREFIX}" 282 | 283 | try: 284 | bq_client.get_table(table_id) 285 | print(f"Table {table_id} already exists.") 286 | except NotFound: 287 | # If table does not exist, create it 288 | print(f"Table {table_id} not found. Creating table...") 289 | table = bigquery.Table(table_id, schema=schema) 290 | table.time_partitioning = bigquery.TimePartitioning( 291 | field=config["PARTITION_BY"], 292 | type_=bigquery.TimePartitioningType.DAY 293 | ) 294 | if "CLUSTER_BY" in config and config["CLUSTER_BY"]: 295 | table.clustering_fields = [config["CLUSTER_BY"]] 296 | bq_client.create_table(table) 297 | print(f"Created table {table_id}") 298 | 299 | all_rows_to_insert = [] 300 | for _, month_data in rows_by_month.items(): 301 | for row in month_data: 302 | # Format the 'Event_Date' to match BigQuery DATE format 'YYYY-MM-DD' 303 | if 'Event_Date' in row: 304 | row['Event_Date'] = format_event_date(row['Event_Date']) 305 | all_rows_to_insert.append(row) 306 | 307 | # Now, insert all rows into the single table 308 | if all_rows_to_insert: 309 | errors = bq_client.insert_rows_json(table_id, all_rows_to_insert) # Use insert_rows_json for better performance with dicts 310 | if errors: 311 | print("Errors:", errors, flush=True) 312 | else: 313 | print(f"Data saved to BigQuery!", flush=True) 314 | else: 315 | print("No data to insert.") 316 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "CLIENT_SECRET_FILE": "", 3 | "SERVICE_ACCOUNT_FILE": "", 4 | "SCOPES": ["https://www.googleapis.com/auth/analytics.readonly"], 5 | "PROPERTY_ID": "", 6 | "INITIAL_FETCH_FROM_DATE": "2022-01-01", 7 | "FETCH_TO_DATE": "today", 8 | "DATASET_ID": "", 9 | "TABLE_PREFIX": "_backfill_GA4", 10 | "PARTITION_BY": "Event_Date", 11 | "CLUSTER_BY": "Event_Name" 12 | } 13 | -------------------------------------------------------------------------------- /tansfer_divar_data_from_huggingface_to_bigquery.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyOUqeVLqcnQdmK/K27isM7E", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "widgets": { 18 | "application/vnd.jupyter.widget-state+json": { 19 | 
"cf441a3e0a58460a902f8ece8095abcd": { 20 | "model_module": "@jupyter-widgets/controls", 21 | "model_name": "HBoxModel", 22 | "model_module_version": "1.5.0", 23 | "state": { 24 | "_dom_classes": [], 25 | "_model_module": "@jupyter-widgets/controls", 26 | "_model_module_version": "1.5.0", 27 | "_model_name": "HBoxModel", 28 | "_view_count": null, 29 | "_view_module": "@jupyter-widgets/controls", 30 | "_view_module_version": "1.5.0", 31 | "_view_name": "HBoxView", 32 | "box_style": "", 33 | "children": [ 34 | "IPY_MODEL_f38efd6522cf47c2a80d4230a3673bb2", 35 | "IPY_MODEL_b90a72eeccaa4f1c889f652ec0f56a84", 36 | "IPY_MODEL_2d040286969446e1818a6b53616ca163" 37 | ], 38 | "layout": "IPY_MODEL_48ee6b5e634948a7bec5d3ae58e32dba" 39 | } 40 | }, 41 | "f38efd6522cf47c2a80d4230a3673bb2": { 42 | "model_module": "@jupyter-widgets/controls", 43 | "model_name": "HTMLModel", 44 | "model_module_version": "1.5.0", 45 | "state": { 46 | "_dom_classes": [], 47 | "_model_module": "@jupyter-widgets/controls", 48 | "_model_module_version": "1.5.0", 49 | "_model_name": "HTMLModel", 50 | "_view_count": null, 51 | "_view_module": "@jupyter-widgets/controls", 52 | "_view_module_version": "1.5.0", 53 | "_view_name": "HTMLView", 54 | "description": "", 55 | "description_tooltip": null, 56 | "layout": "IPY_MODEL_561eeac4fa5e4134a143d5ee08e5bf21", 57 | "placeholder": "​", 58 | "style": "IPY_MODEL_8b4bbb4cb2e144fd86cfc4c72fc21df6", 59 | "value": "real_estate_ads.csv: 100%" 60 | } 61 | }, 62 | "b90a72eeccaa4f1c889f652ec0f56a84": { 63 | "model_module": "@jupyter-widgets/controls", 64 | "model_name": "FloatProgressModel", 65 | "model_module_version": "1.5.0", 66 | "state": { 67 | "_dom_classes": [], 68 | "_model_module": "@jupyter-widgets/controls", 69 | "_model_module_version": "1.5.0", 70 | "_model_name": "FloatProgressModel", 71 | "_view_count": null, 72 | "_view_module": "@jupyter-widgets/controls", 73 | "_view_module_version": "1.5.0", 74 | "_view_name": "ProgressView", 75 | "bar_style": "success", 76 | "description": "", 77 | "description_tooltip": null, 78 | "layout": "IPY_MODEL_3f6e48f483774947bf59bf143a89c76d", 79 | "max": 780721338, 80 | "min": 0, 81 | "orientation": "horizontal", 82 | "style": "IPY_MODEL_ee74e78455634b59a4d9cea414f1780f", 83 | "value": 780721338 84 | } 85 | }, 86 | "2d040286969446e1818a6b53616ca163": { 87 | "model_module": "@jupyter-widgets/controls", 88 | "model_name": "HTMLModel", 89 | "model_module_version": "1.5.0", 90 | "state": { 91 | "_dom_classes": [], 92 | "_model_module": "@jupyter-widgets/controls", 93 | "_model_module_version": "1.5.0", 94 | "_model_name": "HTMLModel", 95 | "_view_count": null, 96 | "_view_module": "@jupyter-widgets/controls", 97 | "_view_module_version": "1.5.0", 98 | "_view_name": "HTMLView", 99 | "description": "", 100 | "description_tooltip": null, 101 | "layout": "IPY_MODEL_b4e7f4b921574c4cace9cafe6c13b34c", 102 | "placeholder": "​", 103 | "style": "IPY_MODEL_66bca6fc67d34d1bb5c68bd645df0c59", 104 | "value": " 781M/781M [00:11<00:00, 117MB/s]" 105 | } 106 | }, 107 | "48ee6b5e634948a7bec5d3ae58e32dba": { 108 | "model_module": "@jupyter-widgets/base", 109 | "model_name": "LayoutModel", 110 | "model_module_version": "1.2.0", 111 | "state": { 112 | "_model_module": "@jupyter-widgets/base", 113 | "_model_module_version": "1.2.0", 114 | "_model_name": "LayoutModel", 115 | "_view_count": null, 116 | "_view_module": "@jupyter-widgets/base", 117 | "_view_module_version": "1.2.0", 118 | "_view_name": "LayoutView", 119 | "align_content": null, 120 | "align_items": null, 121 | 
"align_self": null, 122 | "border": null, 123 | "bottom": null, 124 | "display": null, 125 | "flex": null, 126 | "flex_flow": null, 127 | "grid_area": null, 128 | "grid_auto_columns": null, 129 | "grid_auto_flow": null, 130 | "grid_auto_rows": null, 131 | "grid_column": null, 132 | "grid_gap": null, 133 | "grid_row": null, 134 | "grid_template_areas": null, 135 | "grid_template_columns": null, 136 | "grid_template_rows": null, 137 | "height": null, 138 | "justify_content": null, 139 | "justify_items": null, 140 | "left": null, 141 | "margin": null, 142 | "max_height": null, 143 | "max_width": null, 144 | "min_height": null, 145 | "min_width": null, 146 | "object_fit": null, 147 | "object_position": null, 148 | "order": null, 149 | "overflow": null, 150 | "overflow_x": null, 151 | "overflow_y": null, 152 | "padding": null, 153 | "right": null, 154 | "top": null, 155 | "visibility": null, 156 | "width": null 157 | } 158 | }, 159 | "561eeac4fa5e4134a143d5ee08e5bf21": { 160 | "model_module": "@jupyter-widgets/base", 161 | "model_name": "LayoutModel", 162 | "model_module_version": "1.2.0", 163 | "state": { 164 | "_model_module": "@jupyter-widgets/base", 165 | "_model_module_version": "1.2.0", 166 | "_model_name": "LayoutModel", 167 | "_view_count": null, 168 | "_view_module": "@jupyter-widgets/base", 169 | "_view_module_version": "1.2.0", 170 | "_view_name": "LayoutView", 171 | "align_content": null, 172 | "align_items": null, 173 | "align_self": null, 174 | "border": null, 175 | "bottom": null, 176 | "display": null, 177 | "flex": null, 178 | "flex_flow": null, 179 | "grid_area": null, 180 | "grid_auto_columns": null, 181 | "grid_auto_flow": null, 182 | "grid_auto_rows": null, 183 | "grid_column": null, 184 | "grid_gap": null, 185 | "grid_row": null, 186 | "grid_template_areas": null, 187 | "grid_template_columns": null, 188 | "grid_template_rows": null, 189 | "height": null, 190 | "justify_content": null, 191 | "justify_items": null, 192 | "left": null, 193 | "margin": null, 194 | "max_height": null, 195 | "max_width": null, 196 | "min_height": null, 197 | "min_width": null, 198 | "object_fit": null, 199 | "object_position": null, 200 | "order": null, 201 | "overflow": null, 202 | "overflow_x": null, 203 | "overflow_y": null, 204 | "padding": null, 205 | "right": null, 206 | "top": null, 207 | "visibility": null, 208 | "width": null 209 | } 210 | }, 211 | "8b4bbb4cb2e144fd86cfc4c72fc21df6": { 212 | "model_module": "@jupyter-widgets/controls", 213 | "model_name": "DescriptionStyleModel", 214 | "model_module_version": "1.5.0", 215 | "state": { 216 | "_model_module": "@jupyter-widgets/controls", 217 | "_model_module_version": "1.5.0", 218 | "_model_name": "DescriptionStyleModel", 219 | "_view_count": null, 220 | "_view_module": "@jupyter-widgets/base", 221 | "_view_module_version": "1.2.0", 222 | "_view_name": "StyleView", 223 | "description_width": "" 224 | } 225 | }, 226 | "3f6e48f483774947bf59bf143a89c76d": { 227 | "model_module": "@jupyter-widgets/base", 228 | "model_name": "LayoutModel", 229 | "model_module_version": "1.2.0", 230 | "state": { 231 | "_model_module": "@jupyter-widgets/base", 232 | "_model_module_version": "1.2.0", 233 | "_model_name": "LayoutModel", 234 | "_view_count": null, 235 | "_view_module": "@jupyter-widgets/base", 236 | "_view_module_version": "1.2.0", 237 | "_view_name": "LayoutView", 238 | "align_content": null, 239 | "align_items": null, 240 | "align_self": null, 241 | "border": null, 242 | "bottom": null, 243 | "display": null, 244 | "flex": null, 245 | 
"flex_flow": null, 246 | "grid_area": null, 247 | "grid_auto_columns": null, 248 | "grid_auto_flow": null, 249 | "grid_auto_rows": null, 250 | "grid_column": null, 251 | "grid_gap": null, 252 | "grid_row": null, 253 | "grid_template_areas": null, 254 | "grid_template_columns": null, 255 | "grid_template_rows": null, 256 | "height": null, 257 | "justify_content": null, 258 | "justify_items": null, 259 | "left": null, 260 | "margin": null, 261 | "max_height": null, 262 | "max_width": null, 263 | "min_height": null, 264 | "min_width": null, 265 | "object_fit": null, 266 | "object_position": null, 267 | "order": null, 268 | "overflow": null, 269 | "overflow_x": null, 270 | "overflow_y": null, 271 | "padding": null, 272 | "right": null, 273 | "top": null, 274 | "visibility": null, 275 | "width": null 276 | } 277 | }, 278 | "ee74e78455634b59a4d9cea414f1780f": { 279 | "model_module": "@jupyter-widgets/controls", 280 | "model_name": "ProgressStyleModel", 281 | "model_module_version": "1.5.0", 282 | "state": { 283 | "_model_module": "@jupyter-widgets/controls", 284 | "_model_module_version": "1.5.0", 285 | "_model_name": "ProgressStyleModel", 286 | "_view_count": null, 287 | "_view_module": "@jupyter-widgets/base", 288 | "_view_module_version": "1.2.0", 289 | "_view_name": "StyleView", 290 | "bar_color": null, 291 | "description_width": "" 292 | } 293 | }, 294 | "b4e7f4b921574c4cace9cafe6c13b34c": { 295 | "model_module": "@jupyter-widgets/base", 296 | "model_name": "LayoutModel", 297 | "model_module_version": "1.2.0", 298 | "state": { 299 | "_model_module": "@jupyter-widgets/base", 300 | "_model_module_version": "1.2.0", 301 | "_model_name": "LayoutModel", 302 | "_view_count": null, 303 | "_view_module": "@jupyter-widgets/base", 304 | "_view_module_version": "1.2.0", 305 | "_view_name": "LayoutView", 306 | "align_content": null, 307 | "align_items": null, 308 | "align_self": null, 309 | "border": null, 310 | "bottom": null, 311 | "display": null, 312 | "flex": null, 313 | "flex_flow": null, 314 | "grid_area": null, 315 | "grid_auto_columns": null, 316 | "grid_auto_flow": null, 317 | "grid_auto_rows": null, 318 | "grid_column": null, 319 | "grid_gap": null, 320 | "grid_row": null, 321 | "grid_template_areas": null, 322 | "grid_template_columns": null, 323 | "grid_template_rows": null, 324 | "height": null, 325 | "justify_content": null, 326 | "justify_items": null, 327 | "left": null, 328 | "margin": null, 329 | "max_height": null, 330 | "max_width": null, 331 | "min_height": null, 332 | "min_width": null, 333 | "object_fit": null, 334 | "object_position": null, 335 | "order": null, 336 | "overflow": null, 337 | "overflow_x": null, 338 | "overflow_y": null, 339 | "padding": null, 340 | "right": null, 341 | "top": null, 342 | "visibility": null, 343 | "width": null 344 | } 345 | }, 346 | "66bca6fc67d34d1bb5c68bd645df0c59": { 347 | "model_module": "@jupyter-widgets/controls", 348 | "model_name": "DescriptionStyleModel", 349 | "model_module_version": "1.5.0", 350 | "state": { 351 | "_model_module": "@jupyter-widgets/controls", 352 | "_model_module_version": "1.5.0", 353 | "_model_name": "DescriptionStyleModel", 354 | "_view_count": null, 355 | "_view_module": "@jupyter-widgets/base", 356 | "_view_module_version": "1.2.0", 357 | "_view_name": "StyleView", 358 | "description_width": "" 359 | } 360 | }, 361 | "4beb968ea9bf4666bcc52ea171f4226b": { 362 | "model_module": "@jupyter-widgets/controls", 363 | "model_name": "HBoxModel", 364 | "model_module_version": "1.5.0", 365 | "state": { 366 | "_dom_classes": 
[], 367 | "_model_module": "@jupyter-widgets/controls", 368 | "_model_module_version": "1.5.0", 369 | "_model_name": "HBoxModel", 370 | "_view_count": null, 371 | "_view_module": "@jupyter-widgets/controls", 372 | "_view_module_version": "1.5.0", 373 | "_view_name": "HBoxView", 374 | "box_style": "", 375 | "children": [ 376 | "IPY_MODEL_c88c0ef2a51144188cd5cd25747d4eea", 377 | "IPY_MODEL_5d9f8e6989324983895d0e6caf4c2c0a", 378 | "IPY_MODEL_b74d1074999c4c079db3fda33543b1ab" 379 | ], 380 | "layout": "IPY_MODEL_46a914846d9049ec93db190ff2b5ef40" 381 | } 382 | }, 383 | "c88c0ef2a51144188cd5cd25747d4eea": { 384 | "model_module": "@jupyter-widgets/controls", 385 | "model_name": "HTMLModel", 386 | "model_module_version": "1.5.0", 387 | "state": { 388 | "_dom_classes": [], 389 | "_model_module": "@jupyter-widgets/controls", 390 | "_model_module_version": "1.5.0", 391 | "_model_name": "HTMLModel", 392 | "_view_count": null, 393 | "_view_module": "@jupyter-widgets/controls", 394 | "_view_module_version": "1.5.0", 395 | "_view_name": "HTMLView", 396 | "description": "", 397 | "description_tooltip": null, 398 | "layout": "IPY_MODEL_7c6edd53b4af41d7ad0eb79ae02bddcf", 399 | "placeholder": "​", 400 | "style": "IPY_MODEL_62afe04f90034d8095b6fd2630d7845d", 401 | "value": "Generating train split: 100%" 402 | } 403 | }, 404 | "5d9f8e6989324983895d0e6caf4c2c0a": { 405 | "model_module": "@jupyter-widgets/controls", 406 | "model_name": "FloatProgressModel", 407 | "model_module_version": "1.5.0", 408 | "state": { 409 | "_dom_classes": [], 410 | "_model_module": "@jupyter-widgets/controls", 411 | "_model_module_version": "1.5.0", 412 | "_model_name": "FloatProgressModel", 413 | "_view_count": null, 414 | "_view_module": "@jupyter-widgets/controls", 415 | "_view_module_version": "1.5.0", 416 | "_view_name": "ProgressView", 417 | "bar_style": "success", 418 | "description": "", 419 | "description_tooltip": null, 420 | "layout": "IPY_MODEL_981b459b97ad4d099873b23408970ac4", 421 | "max": 1000000, 422 | "min": 0, 423 | "orientation": "horizontal", 424 | "style": "IPY_MODEL_64fc088c40ee426c87a17421a14ea9b5", 425 | "value": 1000000 426 | } 427 | }, 428 | "b74d1074999c4c079db3fda33543b1ab": { 429 | "model_module": "@jupyter-widgets/controls", 430 | "model_name": "HTMLModel", 431 | "model_module_version": "1.5.0", 432 | "state": { 433 | "_dom_classes": [], 434 | "_model_module": "@jupyter-widgets/controls", 435 | "_model_module_version": "1.5.0", 436 | "_model_name": "HTMLModel", 437 | "_view_count": null, 438 | "_view_module": "@jupyter-widgets/controls", 439 | "_view_module_version": "1.5.0", 440 | "_view_name": "HTMLView", 441 | "description": "", 442 | "description_tooltip": null, 443 | "layout": "IPY_MODEL_316f27747662462aaf689620788f71ab", 444 | "placeholder": "​", 445 | "style": "IPY_MODEL_c0d04991fb754e23af1a3942dcf06d45", 446 | "value": " 1000000/1000000 [00:36<00:00, 30208.23 examples/s]" 447 | } 448 | }, 449 | "46a914846d9049ec93db190ff2b5ef40": { 450 | "model_module": "@jupyter-widgets/base", 451 | "model_name": "LayoutModel", 452 | "model_module_version": "1.2.0", 453 | "state": { 454 | "_model_module": "@jupyter-widgets/base", 455 | "_model_module_version": "1.2.0", 456 | "_model_name": "LayoutModel", 457 | "_view_count": null, 458 | "_view_module": "@jupyter-widgets/base", 459 | "_view_module_version": "1.2.0", 460 | "_view_name": "LayoutView", 461 | "align_content": null, 462 | "align_items": null, 463 | "align_self": null, 464 | "border": null, 465 | "bottom": null, 466 | "display": null, 467 | "flex": 
null, 468 | "flex_flow": null, 469 | "grid_area": null, 470 | "grid_auto_columns": null, 471 | "grid_auto_flow": null, 472 | "grid_auto_rows": null, 473 | "grid_column": null, 474 | "grid_gap": null, 475 | "grid_row": null, 476 | "grid_template_areas": null, 477 | "grid_template_columns": null, 478 | "grid_template_rows": null, 479 | "height": null, 480 | "justify_content": null, 481 | "justify_items": null, 482 | "left": null, 483 | "margin": null, 484 | "max_height": null, 485 | "max_width": null, 486 | "min_height": null, 487 | "min_width": null, 488 | "object_fit": null, 489 | "object_position": null, 490 | "order": null, 491 | "overflow": null, 492 | "overflow_x": null, 493 | "overflow_y": null, 494 | "padding": null, 495 | "right": null, 496 | "top": null, 497 | "visibility": null, 498 | "width": null 499 | } 500 | }, 501 | "7c6edd53b4af41d7ad0eb79ae02bddcf": { 502 | "model_module": "@jupyter-widgets/base", 503 | "model_name": "LayoutModel", 504 | "model_module_version": "1.2.0", 505 | "state": { 506 | "_model_module": "@jupyter-widgets/base", 507 | "_model_module_version": "1.2.0", 508 | "_model_name": "LayoutModel", 509 | "_view_count": null, 510 | "_view_module": "@jupyter-widgets/base", 511 | "_view_module_version": "1.2.0", 512 | "_view_name": "LayoutView", 513 | "align_content": null, 514 | "align_items": null, 515 | "align_self": null, 516 | "border": null, 517 | "bottom": null, 518 | "display": null, 519 | "flex": null, 520 | "flex_flow": null, 521 | "grid_area": null, 522 | "grid_auto_columns": null, 523 | "grid_auto_flow": null, 524 | "grid_auto_rows": null, 525 | "grid_column": null, 526 | "grid_gap": null, 527 | "grid_row": null, 528 | "grid_template_areas": null, 529 | "grid_template_columns": null, 530 | "grid_template_rows": null, 531 | "height": null, 532 | "justify_content": null, 533 | "justify_items": null, 534 | "left": null, 535 | "margin": null, 536 | "max_height": null, 537 | "max_width": null, 538 | "min_height": null, 539 | "min_width": null, 540 | "object_fit": null, 541 | "object_position": null, 542 | "order": null, 543 | "overflow": null, 544 | "overflow_x": null, 545 | "overflow_y": null, 546 | "padding": null, 547 | "right": null, 548 | "top": null, 549 | "visibility": null, 550 | "width": null 551 | } 552 | }, 553 | "62afe04f90034d8095b6fd2630d7845d": { 554 | "model_module": "@jupyter-widgets/controls", 555 | "model_name": "DescriptionStyleModel", 556 | "model_module_version": "1.5.0", 557 | "state": { 558 | "_model_module": "@jupyter-widgets/controls", 559 | "_model_module_version": "1.5.0", 560 | "_model_name": "DescriptionStyleModel", 561 | "_view_count": null, 562 | "_view_module": "@jupyter-widgets/base", 563 | "_view_module_version": "1.2.0", 564 | "_view_name": "StyleView", 565 | "description_width": "" 566 | } 567 | }, 568 | "981b459b97ad4d099873b23408970ac4": { 569 | "model_module": "@jupyter-widgets/base", 570 | "model_name": "LayoutModel", 571 | "model_module_version": "1.2.0", 572 | "state": { 573 | "_model_module": "@jupyter-widgets/base", 574 | "_model_module_version": "1.2.0", 575 | "_model_name": "LayoutModel", 576 | "_view_count": null, 577 | "_view_module": "@jupyter-widgets/base", 578 | "_view_module_version": "1.2.0", 579 | "_view_name": "LayoutView", 580 | "align_content": null, 581 | "align_items": null, 582 | "align_self": null, 583 | "border": null, 584 | "bottom": null, 585 | "display": null, 586 | "flex": null, 587 | "flex_flow": null, 588 | "grid_area": null, 589 | "grid_auto_columns": null, 590 | "grid_auto_flow": null, 591 
| "grid_auto_rows": null, 592 | "grid_column": null, 593 | "grid_gap": null, 594 | "grid_row": null, 595 | "grid_template_areas": null, 596 | "grid_template_columns": null, 597 | "grid_template_rows": null, 598 | "height": null, 599 | "justify_content": null, 600 | "justify_items": null, 601 | "left": null, 602 | "margin": null, 603 | "max_height": null, 604 | "max_width": null, 605 | "min_height": null, 606 | "min_width": null, 607 | "object_fit": null, 608 | "object_position": null, 609 | "order": null, 610 | "overflow": null, 611 | "overflow_x": null, 612 | "overflow_y": null, 613 | "padding": null, 614 | "right": null, 615 | "top": null, 616 | "visibility": null, 617 | "width": null 618 | } 619 | }, 620 | "64fc088c40ee426c87a17421a14ea9b5": { 621 | "model_module": "@jupyter-widgets/controls", 622 | "model_name": "ProgressStyleModel", 623 | "model_module_version": "1.5.0", 624 | "state": { 625 | "_model_module": "@jupyter-widgets/controls", 626 | "_model_module_version": "1.5.0", 627 | "_model_name": "ProgressStyleModel", 628 | "_view_count": null, 629 | "_view_module": "@jupyter-widgets/base", 630 | "_view_module_version": "1.2.0", 631 | "_view_name": "StyleView", 632 | "bar_color": null, 633 | "description_width": "" 634 | } 635 | }, 636 | "316f27747662462aaf689620788f71ab": { 637 | "model_module": "@jupyter-widgets/base", 638 | "model_name": "LayoutModel", 639 | "model_module_version": "1.2.0", 640 | "state": { 641 | "_model_module": "@jupyter-widgets/base", 642 | "_model_module_version": "1.2.0", 643 | "_model_name": "LayoutModel", 644 | "_view_count": null, 645 | "_view_module": "@jupyter-widgets/base", 646 | "_view_module_version": "1.2.0", 647 | "_view_name": "LayoutView", 648 | "align_content": null, 649 | "align_items": null, 650 | "align_self": null, 651 | "border": null, 652 | "bottom": null, 653 | "display": null, 654 | "flex": null, 655 | "flex_flow": null, 656 | "grid_area": null, 657 | "grid_auto_columns": null, 658 | "grid_auto_flow": null, 659 | "grid_auto_rows": null, 660 | "grid_column": null, 661 | "grid_gap": null, 662 | "grid_row": null, 663 | "grid_template_areas": null, 664 | "grid_template_columns": null, 665 | "grid_template_rows": null, 666 | "height": null, 667 | "justify_content": null, 668 | "justify_items": null, 669 | "left": null, 670 | "margin": null, 671 | "max_height": null, 672 | "max_width": null, 673 | "min_height": null, 674 | "min_width": null, 675 | "object_fit": null, 676 | "object_position": null, 677 | "order": null, 678 | "overflow": null, 679 | "overflow_x": null, 680 | "overflow_y": null, 681 | "padding": null, 682 | "right": null, 683 | "top": null, 684 | "visibility": null, 685 | "width": null 686 | } 687 | }, 688 | "c0d04991fb754e23af1a3942dcf06d45": { 689 | "model_module": "@jupyter-widgets/controls", 690 | "model_name": "DescriptionStyleModel", 691 | "model_module_version": "1.5.0", 692 | "state": { 693 | "_model_module": "@jupyter-widgets/controls", 694 | "_model_module_version": "1.5.0", 695 | "_model_name": "DescriptionStyleModel", 696 | "_view_count": null, 697 | "_view_module": "@jupyter-widgets/base", 698 | "_view_module_version": "1.2.0", 699 | "_view_name": "StyleView", 700 | "description_width": "" 701 | } 702 | } 703 | } 704 | } 705 | }, 706 | "cells": [ 707 | { 708 | "cell_type": "markdown", 709 | "metadata": { 710 | "id": "view-in-github", 711 | "colab_type": "text" 712 | }, 713 | "source": [ 714 | "\"Open" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": 6, 720 | "metadata": { 721 | "id": 
"edMbM4ZtMeFs" 722 | }, 723 | "outputs": [], 724 | "source": [ 725 | "# CELL 1: Install Libraries\n", 726 | "\n", 727 | "!pip install datasets pandas google-cloud-bigquery pyarrow db-dtypes -q" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "source": [ 733 | "# CELL 2: Authenticate to Google Cloud\n", 734 | "from google.colab import auth\n", 735 | "auth.authenticate_user()\n", 736 | "print('✅ Authenticated')\n" 737 | ], 738 | "metadata": { 739 | "id": "L46Um3ftOETP", 740 | "colab": { 741 | "base_uri": "https://localhost:8080/" 742 | }, 743 | "outputId": "ac104c9f-ff70-41ce-908e-6c66c846dae0" 744 | }, 745 | "execution_count": 7, 746 | "outputs": [ 747 | { 748 | "output_type": "stream", 749 | "name": "stdout", 750 | "text": [ 751 | "✅ Authenticated\n" 752 | ] 753 | } 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "source": [ 759 | "# CELL 3: Download from HF & Load into BigQuery (with retries)\n", 760 | "\n", 761 | "# === CONFIGURATION: REPLACE THESE ===\n", 762 | "gcp_project_id = \"azw-ua\" # ← your GCP project ID\n", 763 | "bq_dataset_id = \"real_estate_data\" # ← your existing BigQuery dataset\n", 764 | "bq_table_id = \"divar_real_estate_ads\" # ← name for the new table\n", 765 | "hf_dataset = \"divaroffical/real_estate_ads\"\n", 766 | "hf_split = \"train\"\n", 767 | "bq_location = \"US\" # ← match your dataset location\n", 768 | "# ===================================\n", 769 | "\n", 770 | "import time\n", 771 | "import pandas as pd\n", 772 | "from datasets import load_dataset\n", 773 | "from google.cloud import bigquery\n", 774 | "\n", 775 | "# Full table reference\n", 776 | "table_ref = f\"{gcp_project_id}.{bq_dataset_id}.{bq_table_id}\"\n", 777 | "\n", 778 | "print(f\"→ HF dataset: {hf_dataset} [{hf_split}]\")\n", 779 | "print(f\"→ BQ table: {table_ref} (location={bq_location})\\n\")\n", 780 | "\n", 781 | "# 1) Download HF dataset\n", 782 | "print(\"1) Downloading Hugging Face dataset…\")\n", 783 | "hf_ds = load_dataset(hf_dataset, split=hf_split)\n", 784 | "df = hf_ds.to_pandas()\n", 785 | "print(f\" → Downloaded & converted to DataFrame: {df.shape[0]} rows, {df.shape[1]} cols\\n\")\n", 786 | "\n", 787 | "# 2) Initialize BQ client\n", 788 | "client = bigquery.Client(project=gcp_project_id, location=bq_location)\n", 789 | "job_config = bigquery.LoadJobConfig(\n", 790 | " write_disposition=\"WRITE_TRUNCATE\",\n", 791 | " autodetect=True,\n", 792 | ")\n", 793 | "\n", 794 | "# 3) Upload with retries\n", 795 | "max_retries = 5\n", 796 | "for attempt in range(1, max_retries+1):\n", 797 | " try:\n", 798 | " print(f\"{attempt=}: Starting load_job…\")\n", 799 | " job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)\n", 800 | " job.result() # wait for completion\n", 801 | " print(f\"✅ Loaded {job.output_rows} rows into {table_ref}\")\n", 802 | " break\n", 803 | " except Exception as err:\n", 804 | " print(f\"❌ Attempt {attempt} failed: {err}\")\n", 805 | " if attempt == max_retries:\n", 806 | " raise RuntimeError(\"All retries failed—aborting.\") from err\n", 807 | " backoff = 2 ** attempt\n", 808 | " print(f\" ↳ retrying in {backoff}s…\")\n", 809 | " time.sleep(backoff)\n", 810 | "\n", 811 | "print(\"\\n🎉 All done!\")\n" 812 | ], 813 | "metadata": { 814 | "id": "3UXzjvGlOJ74", 815 | "colab": { 816 | "base_uri": "https://localhost:8080/", 817 | "height": 255, 818 | "referenced_widgets": [ 819 | "cf441a3e0a58460a902f8ece8095abcd", 820 | "f38efd6522cf47c2a80d4230a3673bb2", 821 | "b90a72eeccaa4f1c889f652ec0f56a84", 822 | 
"2d040286969446e1818a6b53616ca163", 823 | "48ee6b5e634948a7bec5d3ae58e32dba", 824 | "561eeac4fa5e4134a143d5ee08e5bf21", 825 | "8b4bbb4cb2e144fd86cfc4c72fc21df6", 826 | "3f6e48f483774947bf59bf143a89c76d", 827 | "ee74e78455634b59a4d9cea414f1780f", 828 | "b4e7f4b921574c4cace9cafe6c13b34c", 829 | "66bca6fc67d34d1bb5c68bd645df0c59", 830 | "4beb968ea9bf4666bcc52ea171f4226b", 831 | "c88c0ef2a51144188cd5cd25747d4eea", 832 | "5d9f8e6989324983895d0e6caf4c2c0a", 833 | "b74d1074999c4c079db3fda33543b1ab", 834 | "46a914846d9049ec93db190ff2b5ef40", 835 | "7c6edd53b4af41d7ad0eb79ae02bddcf", 836 | "62afe04f90034d8095b6fd2630d7845d", 837 | "981b459b97ad4d099873b23408970ac4", 838 | "64fc088c40ee426c87a17421a14ea9b5", 839 | "316f27747662462aaf689620788f71ab", 840 | "c0d04991fb754e23af1a3942dcf06d45" 841 | ] 842 | }, 843 | "outputId": "7964477d-a590-43ea-9b31-6b39817e21c2" 844 | }, 845 | "execution_count": 9, 846 | "outputs": [ 847 | { 848 | "metadata": { 849 | "tags": null 850 | }, 851 | "name": "stdout", 852 | "output_type": "stream", 853 | "text": [ 854 | "→ HF dataset: divaroffical/real_estate_ads [train]\n", 855 | "→ BQ table: azw-ua.real_estate_data.divar_real_estate_ads (location=US)\n", 856 | "\n", 857 | "1) Downloading Hugging Face dataset…\n" 858 | ] 859 | }, 860 | { 861 | "data": { 862 | "application/vnd.jupyter.widget-view+json": { 863 | "model_id": "cf441a3e0a58460a902f8ece8095abcd", 864 | "version_major": 2, 865 | "version_minor": 0 866 | }, 867 | "text/plain": [ 868 | "real_estate_ads.csv: 0%| | 0.00/781M [00:00