├── .gitignore
├── requirements.txt
├── bqfetch
│   ├── __init__.py
│   ├── utils.py
│   └── bqfetch.py
├── main.py
├── LICENSE
├── .github
│   └── logo.svg
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
__pycache__
secrets
bqfetch.egg-info
dist
pyproject.toml
setup.cfg

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
psutil==5.8.0
pandas
google-cloud-bigquery-storage>=2.1.0
google-cloud-bigquery>=2.1.0
billiard>=3.6.4.0
joblib>=1.0.1
pyarrow>=1.0.1

--------------------------------------------------------------------------------
/bqfetch/__init__.py:
--------------------------------------------------------------------------------
from .bqfetch import (
    BigQueryTable,
    FetchingChunk,
    BigQueryClient,
    InvalidChunkRangeException,
    BigQueryFetcher,
)

__all__ = [
    "BigQueryTable",
    "FetchingChunk",
    "BigQueryClient",
    "InvalidChunkRangeException",
    "BigQueryFetcher",
]

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from bqfetch.bqfetch import BigQueryFetcher, BigQueryTable

if __name__ == '__main__':
    table = BigQueryTable(
        "PROJECT",
        "DATASET",
        "TABLE"
    )
    fetcher = BigQueryFetcher(table)
    chunks = fetcher.chunks(
        column='id',
        by_chunk_size_in_GB=15,
        verbose=True
    )
    for chunk in chunks:
        df = fetcher.fetch(chunk=chunk, nb_cores=1, parallel_backend='billiard', verbose=True)

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Tristan Bilot

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/bqfetch/utils.py:
--------------------------------------------------------------------------------
from math import ceil
import multiprocessing

import numpy as np
import billiard as billiard_multiprocessing
from typing import Iterable, List
from joblib import Parallel, delayed

def scope_splitter(target_scope: list, chunk_reference_size: int) -> List[list]:
    '''
    Split a list into chunks of at most `chunk_reference_size` elements each.
    '''
    nb_elements = len(target_scope)
    nb_chunks = ceil(nb_elements / chunk_reference_size)
    chunks = np.array_split(target_scope, nb_chunks)
    return list(chunks)

def divide_in_chunks(seq: Iterable, n: int) -> Iterable:
    '''
    Divide a sequence into `n` parts of approximately the same length.
    '''
    avg = len(seq) / float(n)
    out = []
    last = 0.0
    while last < len(seq):
        out.append(seq[int(last):int(last + avg)])
        last += avg
    return out

def do_parallel_billiard(function, num_cores, partition_list):
    '''
    Run `function` in parallel using `num_cores` processes, each
    call taking as parameter the tuple at the ith index of `partition_list`.

    The billiard library is a fork of the Python multiprocessing lib
    which allows forking processes from a daemon process.
    '''
    pool = billiard_multiprocessing.Pool(processes=num_cores)
    try:
        return pool.map(function, partition_list)
    finally:
        pool.close()
        pool.join()

def do_parallel_joblib(function, num_cores, partition_list):
    '''
    Run `function` in parallel using `num_cores` processes, each
    call taking as parameter the tuple at the ith index of `partition_list`.

    Warning: joblib does not allow creating child processes from
    a daemon process (because joblib uses multiprocessing as backend).
    So in many cases (ex: running this function from Airflow),
    using this function will fall back to n_jobs=1, i.e. no parallel
    processing at all.
    Prefer `do_parallel_billiard()`, which uses billiard (a fork of
    multiprocessing that allows spawning processes from a daemon).
    '''
    return Parallel(n_jobs=num_cores)(delayed(function)(item) for item in partition_list)

def do_parallel_multiprocessing(function, num_cores, partition_list):
    '''
    Run `function` in parallel using `num_cores` processes, each
    call taking as parameter the tuple at the ith index of `partition_list`.
    '''
    pool = multiprocessing.Pool(processes=num_cores)
    try:
        return pool.map(function, partition_list)
    finally:
        pool.close()
        pool.join()

def log(*args):
    print()
    for arg in args:
        print(f'>>> {arg}')

def ft(size_in_GB: float) -> str:
    '''
    Formats gigabytes, e.g. 2.3892... -> '2.39GB'
    '''
    return f'{round(size_in_GB, 2)}GB'

--------------------------------------------------------------------------------
/.github/logo.svg:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
<!-- centered logo (.github/logo.svg) and badges: Last commit, Languages, Release date, Python version -->

# bqfetch

**A lightweight tool to fetch tables from BigQuery as pandas DataFrames very fast, using the BigQuery Storage API combined with multiprocessing. It also aims at fetching large tables that cannot fit into memory, by chunking the table in a smart and scalable way.**


## Installation
```
pip install bqfetch
pip install -r requirements.txt
```

## Algorithm
* Fetch all distinct values of the given index `column`.
* Divide these values into `chunks` based on the available memory and the number of cores on the machine.
* `If multiprocessing`:
  * Each chunk is divided into multiple sub-chunks based on the `nb_cores` parameter and the available memory.
  * For each sub-chunk, create a temporary table containing all the matching rows of the whole table.
  * Fetch these temporary tables as dataframes using BigQuery Storage.
  * Merge the dataframes.
  * Delete the temporary tables.
* `If !multiprocessing`:
  * Same process, with only one temporary table and no parallel processes created.

## Use case

### Fetching a huge table of users using multiple cores
| id | Name | Age |
|:---:|:-------:|:---:|
| 187 | Bartolomé | 30 |
| 188 | Tristan | 22 |
| ... | ... | ... |

```python
>>> table = BigQueryTable("PROJECT", "DATASET", "TABLE")
>>> fetcher = BigQueryFetcher(table, '/path/to/service_account.json')
>>> chunks = fetcher.chunks('id', by_chunk_size_in_GB=5)

>>> for chunk in chunks:
        df = fetcher.fetch(chunk, nb_cores=-1, parallel_backend='billiard')
        # ...
```

* First, we create a `BigQueryTable` object, which contains the path to the BigQuery table stored in GCP.
* A fetcher is then created from the table and the absolute path to the service_account.json file; this file is used to authenticate with GCP (if omitted, the default credentials of the environment are used).
* Chunk the whole table, given the `column` name and the chunk size. In this case, the **id** column is a perfect choice because each of its values appears the same number of times: exactly once. Regarding chunk size: with by_chunk_size_in_GB=5, each chunk fetched on the machine will be about 5GB, so it has to fit into memory. Plan roughly 1/3 extra memory, because a DataFrame object is larger than the raw fetched data.
* For each chunk, fetch it.
  * `nb_cores=-1` uses the number of cores available on the machine.
  * `parallel_backend='billiard' | 'joblib' | 'multiprocessing'` specifies the backend framework to use.

## Fetch by number of chunks
It is also possible to use `by_nb_chunks` instead of `by_chunk_size_in_GB`. It divides the table into N chunks, so you have less flexible control over the size of each chunk.

```python
>>> table = BigQueryTable("PROJECT", "DATASET", "TABLE")
>>> fetcher = BigQueryFetcher(table, '/path/to/service_account.json')
>>> chunks = fetcher.chunks('id', by_nb_chunks=10)

>>> for chunk in chunks:
        df = fetcher.fetch(chunk, nb_cores=-1, parallel_backend='billiard')
        # ...
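        # Illustrative only: one way to process each chunk without keeping the
        # whole table in memory is to persist it as it arrives, e.g.:
        # df.to_parquet(f'users_chunk_{chunk.elements[0]}.parquet')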
```

## Verbose mode

```python
>>> chunks = fetcher.chunks(column='id', by_nb_chunks=1, verbose=True)
# Available memory on device: 7.04GB
# Size of table: 2.19GB
# Preferred size of chunk: 3GB
# Size per chunk: 3GB
# Nb chunks: 1

# Nb values in "id": 96
# Chunk size: 3GB
# Nb chunks: 1

>>> for chunk in chunks:
>>>     df = fetcher.fetch(chunk=chunk, nb_cores=1, parallel_backend='joblib', verbose=True)
# Use multiprocessing : False
# Nb cores: 1
# Parallel backend: joblib

# Time to fetch: 43.21s
# Nb lines in dataframe: 3375875
# Size of dataframe: 2.83GB
```

## Warning
We recommend using this tool only when the table to fetch contains a column that can easily be chunked (divided into small parts). The ideal column for this contains distinct values, or values that appear approximately the same number of times. **If some values appear thousands of times and others only a few times, the chunking will not be reliable**, because we have to assume that each chunk is approximately the same size in order to estimate the memory needed to fetch the table efficiently.

### A good index column:
This column contains distinct values, so it can easily be divided into chunks.

| Card number |
|:---:|
| 4390 3849 ... |
| 2903 1182 ... |
| 0562 7205 ... |
| ... |

### A bad index column:
The frequency of values in this column can vary a lot, so the chunking will not be reliable.

| Age |
|:---:|
| 18 |
| 18 |
| 64 |
| 18 |
| ... |

### More cores != faster
Keep in mind that adding more cores to the fetching process will not necessarily improve performance; most of the time it will even be slower. The reason is that fetching is directly bound by the Internet bandwidth available on your network, not by the number of working cores or the compute power. However, you can run your own tests: in some cases multiprocessing does save time (ex: when cloud machines only allocate a given amount of bandwidth per core, multiplying the number of cores also multiplies the bandwidth, as on GCP compute engines).

## Contribution
bqfetch is open to new contributors, especially for bug fixing or the implementation of new features. Do not hesitate to open an issue/pull request :)

## License
MIT

Copyright (c) 2021-present, Tristan Bilot

--------------------------------------------------------------------------------
/bqfetch/bqfetch.py:
--------------------------------------------------------------------------------
from time import time
import os
import psutil
import math
from typing import Iterator, List, Tuple

import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account

from bqfetch.utils import *

CREDS_SCOPES = [
    "https://www.googleapis.com/auth/drive",
    "https://www.googleapis.com/auth/bigquery",
    "https://www.googleapis.com/auth/devstorage.full_control"
]
DEFAULT_CHUNK_SIZE_PER_CORE_IN_GB = 2

class BigQueryTable:
    '''
    A simple object containing the path to the requested table.
    `project_id` is the name of the BigQuery project, `dataset`
    the BigQuery dataset entry and `table` the name of the
    requested table.
    '''
    def __init__(
        self,
        project_id: str,
        dataset: str,
        table: str,
    ) -> None:
        self._variables = {
            "PROJECT_ID": project_id,
            "DATASET": dataset,
            "TABLE": table,
        }

    @property
    def variables(self):
        return self._variables

class FetchingChunk:
    '''
    Wrapper object used to store the elements to select in the
    given column.
    '''
    def __init__(self, elements: List[str], column: str,) -> None:
        self.elements = elements
        self.column = column

class BigQueryClient:
    '''
    Wrapper around the BigQuery Client object containing credentials.

    Parameters:
    ----------
    service_account_path: str
        The path and file name of the credentials file bq_service_account.json.
        The path should be absolute.
    '''
    def __init__(
        self,
        service_account_path: str=None,
        creds_scope: str=None,
    ) -> None:
        if isinstance(service_account_path, str):
            creds_scope = creds_scope if creds_scope is not None \
                else CREDS_SCOPES
            credentials = service_account.Credentials.from_service_account_file(
                service_account_path, scopes=creds_scope
            )
            bq_client = bigquery.Client(
                credentials=credentials,
                project=credentials.project_id
            )
        elif service_account_path is None:
            bq_client = bigquery.Client()
        else:
            raise ValueError('`service_account_path` should be of type str or None')

        self._client = bq_client

    def run(
        self,
        request: str
    ) -> bigquery.table.RowIterator:
        """
        Run a BigQuery SQL request.
        """
        job = self._client.query(request)
        return job.result()

    def delete_table(
        self,
        table_name: str,
        not_found_ok: bool=True,
    ):
        '''
        Delete a BigQuery table.
        '''
        self._client.delete_table(table_name, not_found_ok=not_found_ok)

    def get_nb_occurences_for_column(
        self,
        table: BigQueryTable,
        column: str,
    ) -> List[int]:
        '''
        For each distinct element in `column`, counts the number of occurrences
        and returns a list containing all the counts.
        Ex: for a column containing: John, John, Louis
        >>> [2, 1]
        '''
        var = table.variables
        nb_occurences_query = f'''
            SELECT COUNT(*)
            FROM `{var["PROJECT_ID"]}.{var["DATASET"]}.{var["TABLE"]}`
            GROUP BY {column}
        '''
        nb_occurences = [nb_occurences[0] for nb_occurences in self.run(nb_occurences_query)]
        return nb_occurences

    def get_table_size_in_GB(
        self,
        table: BigQueryTable,
    ) -> float:
        '''
        Returns the size in GB of `table`.
        '''
        var = table.variables
        size_of_table_query = f'''
            SELECT SUM(size_bytes)/{1024**3} AS size_GB
            FROM {var["PROJECT_ID"]}.{var["DATASET"]}.__TABLES__
            WHERE table_id = '{var["TABLE"]}'
        '''
        size_of_table_in_GB = next(self.run(size_of_table_query).__iter__())[0]
        return size_of_table_in_GB

    def get_column_values(
        self,
        table: BigQueryTable,
        column: str,
    ) -> pd.DataFrame:
        '''
        Returns a DataFrame (with a single column) containing the distinct elements
        of `column` in `table`.
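
        Example (illustrative; `user_id` is a hypothetical column name):
        >>> distinct_ids = client.get_column_values(table, 'user_id')  # one-column DataFrame of distinct values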
        '''
        var = table.variables
        query = f'''
            SELECT DISTINCT `{column}`
            FROM `{var["PROJECT_ID"]}.{var["DATASET"]}.{var["TABLE"]}`
        '''
        query_results = self.run(query)
        return query_results.to_dataframe()

    def create_partitioned_table(
        self,
        table: BigQueryTable,
        chunk: FetchingChunk,
        partitioned_table_name: str,
    ):
        '''
        Create a temporary table used to store one `chunk` of data,
        extracted from the main table to fetch. This step is necessary
        in order to improve performance and avoid network bottlenecks.
        The table is created with the name `partitioned_table_name` in the
        same dataset as `bq_table`.
        '''
        sqlify_chunk_elements = ','.join(list(map(lambda x: f'"{x}"', chunk.elements)))
        var = table.variables
        query = f'''
            CREATE OR REPLACE TABLE
            `{var["PROJECT_ID"]}.{var["DATASET"]}.{partitioned_table_name}` AS
            SELECT
                *
            FROM `{var["PROJECT_ID"]}.{var["DATASET"]}.{var["TABLE"]}`
            WHERE {chunk.column} IN ({sqlify_chunk_elements})
        '''
        self.run(query)

    def delete_partitioned_table(
        self,
        table: BigQueryTable,
        partitioned_table_name: str,
    ):
        '''
        Delete the temporary table used to chunk the table.
        '''
        var = table.variables
        table = f'{var["PROJECT_ID"]}.{var["DATASET"]}.{partitioned_table_name}'
        self.delete_table(table)


class BigQueryFetcher:
    '''
    An object used to fetch BigQuery tables easily and progressively
    in order to handle huge tables that do not fit into memory.
    The fetcher divides the table into chunks of size `by_chunk_size_in_GB`
    based on the `column` parameter. Each chunk is then fetched
    using the BigQuery Storage API, sequentially or in parallel using
    child processes running on multiple cores.

    Ex: fetch a huge table of users: first, all the 'user_id' values are
    fetched and divided into chunks that should each fit into memory.
    Then, each chunk is fetched separately using multiprocessing
    with the number of cores available on the machine.

    >>> table = BigQueryTable("my_project", "dataset1", "users_table")
    >>> fetcher = BigQueryFetcher(table, 'path/to/service_account.json')
    >>> chunks = fetcher.chunks('user_id', by_chunk_size_in_GB=5)
    >>> for chunk in chunks:
            df = fetcher.fetch(chunk, nb_cores=-1)
            # compute df...
    '''
    def __init__(
        self,
        bq_table: BigQueryTable,
        service_account_filename: str=None,
        existing_client: BigQueryClient=None,
        creds_scope: str=None,
    ):
        self._client = existing_client if existing_client is not None \
            else BigQueryClient(service_account_filename, creds_scope=creds_scope)
        self._bq_table = bq_table
        self._service_account_filename = service_account_filename
        self._creds_scopes = CREDS_SCOPES
        self._cache = {}
        self._first_fetch = True

    def chunks(
        self,
        column: str,
        by_nb_chunks: int=None,
        by_chunk_size_in_GB: int=None,
        verbose: bool=False,
    ) -> Iterator:
        '''
        Returns a list of chunks of `column` items to iterate on, computed from
        either `by_nb_chunks` or `by_chunk_size_in_GB`.
        It allows fetching the whole table in multiple chunks that each fit in memory.
        The chosen column can be of any type, not only String or Int.
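
        Example (illustrative, assuming the table has an `id` column):
        >>> chunks = fetcher.chunks(column='id', by_chunk_size_in_GB=5)
        >>> len(chunks)  # number of chunks the table was divided into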
        '''
        assert isinstance(column, str)

        if (by_nb_chunks is None and by_chunk_size_in_GB is None) \
            or (by_nb_chunks is not None and by_chunk_size_in_GB is not None):
            raise ValueError('Only one parameter `by_nb_chunks` or `by_chunk_size_in_GB` has to be set')
        if not ((by_nb_chunks is not None and by_nb_chunks > 0) \
            or (by_chunk_size_in_GB is not None and by_chunk_size_in_GB > 0)):
            raise ValueError('Value has to be greater than 0')

        by_nb_chunks = by_nb_chunks if by_nb_chunks is not None else \
            self.get_nb_chunks_approximation(column, verbose=verbose, chunk_size_in_GB=by_chunk_size_in_GB)

        indexes = self._client.get_column_values(self._bq_table, column)
        chunks = divide_in_chunks(indexes, by_nb_chunks)
        chunks = [FetchingChunk(x[column].tolist(), column) for x in chunks]

        if verbose:
            log(
                'Chunking',
                f'Nb values in "{column}":\t {len(indexes)}',
                f'Nb chunks:\t\t\t {len(chunks)}')
        return chunks

    def fetch(
        self,
        chunk: FetchingChunk=None,
        nb_cores: int=1,
        memory_to_save: float = 1.0,
        parallel_backend: str='billiard',
        partitioned_table_name: str='TMP_TABLE',
        verbose: bool=False,
    ) -> pd.DataFrame:
        '''
        Fetch a `chunk` as a pandas DataFrame using the BigQuery Storage API.
        The `chunk` can be obtained using the `chunks()` method.

        Parameters:
        ----------
        chunk: FetchingChunk
            The selection of rows that we want to fetch.
        nb_cores: int
            The number of processes to create. By default, each process
            will run on a separate core. It is not recommended to set `nb_cores`
            to a value larger than the number of vCPUs on the machine.
            Setting this parameter to `-1` will use the number of vCPUs on
            the machine.
        memory_to_save: float
            The amount of memory in GB to leave unused on the machine to avoid overflows.
        parallel_backend: str
            The framework used to parallelize the fetching.
            >>> Choose 'billiard' to use a fork of the Python multiprocessing lib
                which allows using multiprocessing from a process launched as a daemon
                (ex: Airflow).
            >>> Choose 'joblib' to use the joblib backend.
            >>> Choose 'multiprocessing' to use the current version of the Python
                multiprocessing lib.
        partitioned_table_name: str
            The name of the temporary table that will be created, in the same dataset as the
            fetched `bq_table`, at each call to fetch() in order to divide the whole table
            into small chunked tables that can be fetched extremely fast.
            This table is deleted at the end of each call, so there is no need to delete it
            manually afterwards.

        Returns:
        -------
        pd.DataFrame
            A DataFrame containing all the data fetched from the chunk.
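
        Example (illustrative):
        >>> chunks = fetcher.chunks('id', by_nb_chunks=10)
        >>> df = fetcher.fetch(chunks[0], nb_cores=2, parallel_backend='billiard')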
        '''
        assert nb_cores == -1 or nb_cores > 0
        assert isinstance(chunk, FetchingChunk)
        assert parallel_backend in ['billiard', 'joblib', 'multiprocessing']
        assert memory_to_save > 0

        vcpu_count = os.cpu_count()
        if nb_cores > vcpu_count:
            print(f'Warning: `nb_cores` ({nb_cores}) greater than cpus on machine ({vcpu_count})')
        if nb_cores == -1:
            nb_cores = vcpu_count

        if verbose and self._first_fetch:
            log(
                'Fetching',
                f'Use multiprocessing : \t{nb_cores > 1}',
                f'Nb cores: \t\t\t{nb_cores}',
                f'Parallel backend: \t\t{parallel_backend}')
            self._first_fetch = False

        start = time()
        df = None
        column = chunk.column

        if nb_cores == 1:
            partitioned_table_name = f'{partitioned_table_name}0'
            self._client.create_partitioned_table(self._bq_table, chunk, partitioned_table_name)
            df = _fetch_in_parallel(
                (self._service_account_filename, self._creds_scopes, \
                    partitioned_table_name, self._bq_table, column, chunk.elements)
            )
            self._client.delete_partitioned_table(self._bq_table, partitioned_table_name)
        else:
            chunks_per_core = divide_in_chunks(chunk.elements, nb_cores)
            for i, small_chunk in enumerate(chunks_per_core):
                small_chunk = FetchingChunk(small_chunk, chunk.column)
                self._client.create_partitioned_table(self._bq_table, small_chunk, f'{partitioned_table_name}{i}')

            partition_list = [(self._service_account_filename, self._creds_scopes, \
                f'{partitioned_table_name}{i}', self._bq_table, column, item) for i, item in enumerate(chunks_per_core)]

            parallel_backends = {
                'billiard': do_parallel_billiard,
                'joblib': do_parallel_joblib,
                'multiprocessing': do_parallel_multiprocessing,
            }
            parallel_function = parallel_backends[parallel_backend]
            df = pd.concat(parallel_function(
                _fetch_in_parallel,
                len(chunks_per_core),
                partition_list
            ))
            for i in range(len(chunks_per_core)):
                self._client.delete_partitioned_table(self._bq_table, f'{partitioned_table_name}{i}')
        end = time() - start

        if verbose:
            log(
                f'Time to fetch:\t\t {round(end, 2)}s',
                f'Nb lines in dataframe:\t {len(df)}',
                f'Size of dataframe:\t\t {ft(df.memory_usage(deep=True).sum() / 1024**3)}')
        return df

    def get_nb_chunks_approximation(
        self,
        column: str,
        nb_cores: int=1,
        nb_GB_to_save: int = 1,
        chunk_size_in_GB: int = DEFAULT_CHUNK_SIZE_PER_CORE_IN_GB,
        verbose: bool=False,
    ) -> int:
        '''
        Tries to estimate the number of chunks to use in order to divide the table.
        The approximation uses the free memory available on the machine.
        It only works when each distinct value in `column` appears approximately
        the same number of times: we have to assume an average chunk size, so if
        the sizes differ too much, the prediction will not be accurate.
        Ex: if one value of `column` appears in thousands of rows and another one in only
        ten rows, the approximation will not work because there is too much variance
        between the value counts.
        Ex: if `column` contains IDs, each row is unique, so it is a perfect use case
        for this function.
        The function emits a warning if more than 25% of the values are more than 25%
        away from the mean count.
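        Worked example (illustrative): for value counts [10, 10, 10, 100], the mean is 32.5
        and all four counts fall outside the ±25% band [24.4, 40.6], so 100% of the values
        are dispersed and the warning is printed.
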
        Parameters:
        ----------
        column: str
            The column name of the table on which to do the approximation.
        nb_cores: int
            The number of cores that will be used.
        nb_GB_to_save: int
            The amount of memory in GB to leave unused on the machine.
        chunk_size_in_GB: int
            The amount of memory of one chunk; this amount should fit in memory and thus be
            less than the free memory available on the machine.

        Returns:
        -------
        nb_chunks: int
            The approximated number of chunks, based on the free space and the size of the table.
        '''
        nb_occurences = self._client.get_nb_occurences_for_column(self._bq_table, column)
        mean = sum(nb_occurences) / len(nb_occurences)
        coeff = 0.25
        nb_dispersed_values = sum(not (mean * (1 - coeff) < count < mean * (1 + coeff)) \
            for count in nb_occurences)
        dispersion_quotient = nb_dispersed_values / len(nb_occurences)

        if dispersion_quotient > coeff:
            print(f'''Warning: the value counts of column {column} are too dispersed: \
{(dispersion_quotient * 100):.2f}% of the elements are more than {coeff * 100}% away from the mean.''')

        # psutil.virtual_memory()[1] is the available memory in bytes
        available_memory_in_GB = psutil.virtual_memory()[1] / 1024**3 - nb_GB_to_save
        if chunk_size_in_GB >= available_memory_in_GB:
            print(f'WARNING: you are using a chunk size bigger than the available memory ({ft(chunk_size_in_GB)}>{ft(available_memory_in_GB)})')
        nb_chunks, size_of_table_in_GB = self._nb_chunks_approximation_formula(nb_cores, chunk_size_in_GB, \
            available_memory_in_GB)
        size_per_chunk_in_GB = math.ceil(size_of_table_in_GB / nb_chunks)

        if verbose:
            log(
                'Chunk size approximation',
                f'Available memory on device:\t {ft(available_memory_in_GB)}',
                f'Size of table:\t\t {ft(size_of_table_in_GB)}',
                f'Preferred size of chunk:\t {ft(chunk_size_in_GB)}',
                f'Size per chunk:\t\t {ft(size_per_chunk_in_GB)}',
                f'Nb chunks approximation:\t {nb_chunks}')
        return nb_chunks

    def _nb_chunks_approximation_formula(
        self,
        nb_cores: int,
        preferred_chunk_size_in_GB: int,
        available_memory_in_GB: int,
    ):
        '''
        Returns an estimated number of chunks in which to divide the whole table.
        The estimation is based on the free memory and the number of cores.
        Also returns the size of the table, for caching and performance reasons.
        '''
        if 'size_of_table_in_GB' not in self._cache:
            size_of_table_in_GB = self._client.get_table_size_in_GB(self._bq_table)
            self._cache['size_of_table_in_GB'] = size_of_table_in_GB
        sum_of_GB_for_cores = preferred_chunk_size_in_GB * nb_cores
        nb_chunks = math.ceil(self._cache['size_of_table_in_GB'] / min(sum_of_GB_for_cores, available_memory_in_GB))
        return nb_chunks, self._cache['size_of_table_in_GB']


def _fetch_in_parallel(
    pickled_parameters: Tuple,
) -> pd.DataFrame:
    '''
    Fetch a BigQuery table using the Storage API.
    If `chunk` is given, the fetching returns only
    the rows matching the given list, based on `column`.
    Warning: the imports should not be removed from the inner function,
    because the dependencies could otherwise not be found when running
    in child processes.
    This function should be global, not inside a class.
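    The expected `pickled_parameters` layout (as built by BigQueryFetcher.fetch) is:
    (service_account_filename, creds_scopes, partitioned_table_name, bq_table, column, chunk).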
    '''
    from google.cloud.bigquery_storage import BigQueryReadClient, ReadSession, DataFormat

    service_account_filename, creds_scopes, partitioned_table_name, bq_table, column, chunk = pickled_parameters

    credentials = service_account.Credentials.from_service_account_file(
        service_account_filename, scopes=creds_scopes) \
        if service_account_filename is not None else None
    var = bq_table.variables
    bqstorageclient = BigQueryReadClient(credentials=credentials)
    stringify_table = f"projects/{var['PROJECT_ID']}/datasets/{var['DATASET']}/tables/{partitioned_table_name}"
    parent = "projects/{}".format(var['PROJECT_ID'])

    requested_session = None
    if chunk is not None:
        sqlify_indexes = ','.join(list(map(lambda x: f'"{x}"', chunk)))
        row_filter = ReadSession.TableReadOptions(row_restriction=f'{column} IN ({sqlify_indexes})')
        requested_session = ReadSession(
            table=stringify_table,
            data_format=DataFormat.ARROW,
            read_options=row_filter,
        )
    else:
        requested_session = ReadSession(
            table=stringify_table,
            data_format=DataFormat.ARROW,
        )

    session = bqstorageclient.create_read_session(
        parent=parent,
        read_session=requested_session,
        max_stream_count=1,
    )
    reader = bqstorageclient.read_rows(session.streams[0].name, timeout=10000)
    return reader.to_dataframe(session)

--------------------------------------------------------------------------------