├── .gitignore
├── requirements.txt
├── bqfetch
│   ├── __init__.py
│   ├── utils.py
│   └── bqfetch.py
├── main.py
├── LICENSE
├── .github
│   └── logo.svg
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
__pycache__
secrets
bqfetch.egg-info
dist
pyproject.toml
setup.cfg

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
psutil==5.8.0
pandas
google-cloud-bigquery-storage>=2.1.0
google-cloud-bigquery>=2.1.0
billiard>=3.6.4.0
joblib>=1.0.1
pyarrow>=1.0.1

--------------------------------------------------------------------------------
/bqfetch/__init__.py:
--------------------------------------------------------------------------------
from .bqfetch import (
    BigQueryTable,
    FetchingChunk,
    BigQueryClient,
    InvalidChunkRangeException,
    BigQueryFetcher,
)

__all__ = [
    "BigQueryTable",
    "FetchingChunk",
    "BigQueryClient",
    "InvalidChunkRangeException",
    "BigQueryFetcher",
]

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from bqfetch.bqfetch import BigQueryFetcher, BigQueryTable

if __name__ == '__main__':
    table = BigQueryTable(
        "PROJECT",
        "DATASET",
        "TABLE"
    )
    fetcher = BigQueryFetcher(table)
    chunks = fetcher.chunks(
        column='id',
        by_chunk_size_in_GB=15,
        verbose=True
    )
    for chunk in chunks:
        df = fetcher.fetch(chunk=chunk, nb_cores=1, parallel_backend='billiard', verbose=True)

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Tristan Bilot

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/bqfetch/utils.py:
--------------------------------------------------------------------------------
from math import ceil
import multiprocessing

import numpy as np
import billiard as billiard_multiprocessing
from typing import Iterable, List
from joblib import Parallel, delayed

def scope_splitter(target_scope: list, chunk_reference_size: int) -> List[list]:
    '''
    Split a list into chunks of at most `chunk_reference_size` elements each.
    '''
    nb_elements = len(target_scope)
    nb_chunks = ceil(nb_elements / chunk_reference_size)
    chunks = np.array_split(target_scope, nb_chunks)
    return list(chunks)

def divide_in_chunks(seq: Iterable, n: int) -> Iterable:
    '''
    Divide a sequence into `n` parts of approximately the same length.
    '''
    avg = len(seq) / float(n)
    out = []
    last = 0.0
    while last < len(seq):
        out.append(seq[int(last):int(last + avg)])
        last += avg
    return out

def do_parallel_billiard(function, num_cores, partition_list):
    '''
    Run `function` in parallel using `num_cores` processes, each
    call taking as parameter the tuple at the ith index of `partition_list`.

    The billiard library is a fork of the Python multiprocessing lib
    which allows forking processes from a daemon process.
    '''
    pool = billiard_multiprocessing.Pool(processes=num_cores)
    try:
        return pool.map(function, partition_list)
    finally:
        pool.close()
        pool.join()

def do_parallel_joblib(function, num_cores, partition_list):
    '''
    Run `function` in parallel using `num_cores` processes, each
    call taking as parameter the tuple at the ith index of `partition_list`.

    Warning: joblib does not allow creating child processes from
    a daemon process (because joblib uses multiprocessing as backend).
    So in many cases (ex: running this function from Airflow),
    using this function will fall back to n_jobs=1, i.e. no parallel
    processing at all.
    Prefer `do_parallel_billiard()`, which uses billiard (a fork of
    multiprocessing that allows spawning processes from a daemon).
    '''
    return Parallel(n_jobs=num_cores)(delayed(function)(item) for item in partition_list)

def do_parallel_multiprocessing(function, num_cores, partition_list):
    '''
    Run `function` in parallel using `num_cores` processes, each
    call taking as parameter the tuple at the ith index of `partition_list`.
    '''
    pool = multiprocessing.Pool(processes=num_cores)
    try:
        return pool.map(function, partition_list)
    finally:
        pool.close()
        pool.join()

def log(*args):
    print()
    for arg in args:
        print(f'>>> {arg}')

def ft(size_in_GB: float) -> str:
    '''
    Formats gigabytes, e.g. 2.3892... -> '2.39GB'
    '''
    return f'{round(size_in_GB, 2)}GB'

--------------------------------------------------------------------------------
/.github/logo.svg:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
<!-- centered logo (.github/logo.svg) and badges: Last commit, Languages, Release date, Python version -->

# bqfetch

**A lightweight tool to fetch tables from BigQuery as pandas DataFrames very fast, using the BigQuery Storage API combined with multiprocessing. It also aims at fetching large tables that cannot fit into memory, by chunking the table in a smart and scalable way.**


## Installation
```
pip install bqfetch
pip install -r requirements.txt
```

## Algorithm
* Fetch all distinct values of the given index `column`.
* Divide these values into `chunks` based on the available memory and the number of cores on the machine.
* `If multiprocessing`:
  * Each chunk is divided into multiple sub-chunks based on the `nb_cores` parameter and the available memory.
  * For each sub-chunk, create a temporary table containing all the matching rows of the whole table.
  * Fetch these temporary tables as dataframes using BigQuery Storage.
  * Merge the dataframes.
  * Delete the temporary tables.
* `If !multiprocessing`:
  * Same process, with only one temporary table and no parallel processes created.

## Use case

### Fetching a huge table of users using multiple cores
| id | Name | Age |
|:---:|:-------:|:---:|
| 187 | Bartolomé | 30 |
| 188 | Tristan | 22 |
| ... | ... | ... |

```python
>>> table = BigQueryTable("PROJECT", "DATASET", "TABLE")
>>> fetcher = BigQueryFetcher(table, '/path/to/service_account.json')
>>> chunks = fetcher.chunks('id', by_chunk_size_in_GB=5)

>>> for chunk in chunks:
        df = fetcher.fetch(chunk, nb_cores=-1, parallel_backend='billiard')
        # ...
```

* First, we create a `BigQueryTable` object, which contains the path to the BigQuery table stored in GCP.
* A fetcher is then created from the table and the absolute path to the service_account.json file; this file is used to authenticate with GCP (if omitted, the default credentials of the environment are used).
* Chunk the whole table, given the `column` name and the chunk size. In this case, the **id** column is a perfect choice because each of its values appears the same number of times: exactly once. Regarding chunk size: with by_chunk_size_in_GB=5, each chunk fetched on the machine will be about 5GB, so it has to fit into memory. Plan roughly 1/3 extra memory, because a DataFrame object is larger than the raw fetched data.
* For each chunk, fetch it.
  * `nb_cores=-1` uses the number of cores available on the machine.
  * `parallel_backend='billiard' | 'joblib' | 'multiprocessing'` specifies the backend framework to use.

## Fetch by number of chunks
It is also possible to use `by_nb_chunks` instead of `by_chunk_size_in_GB`. It divides the table into N chunks, so you have less flexible control over the size of each chunk.

```python
>>> table = BigQueryTable("PROJECT", "DATASET", "TABLE")
>>> fetcher = BigQueryFetcher(table, '/path/to/service_account.json')
>>> chunks = fetcher.chunks('id', by_nb_chunks=10)

>>> for chunk in chunks:
        df = fetcher.fetch(chunk, nb_cores=-1, parallel_backend='billiard')
        # ...
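        # Illustrative only: one way to process each chunk without keeping the
        # whole table in memory is to persist it as it arrives, e.g.:
        # df.to_parquet(f'users_chunk_{chunk.elements[0]}.parquet')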
```

## Verbose mode

```python
>>> chunks = fetcher.chunks(column='id', by_nb_chunks=1, verbose=True)
# Available memory on device: 7.04GB
# Size of table: 2.19GB
# Preferred size of chunk: 3GB
# Size per chunk: 3GB
# Nb chunks: 1

# Nb values in "id": 96
# Chunk size: 3GB
# Nb chunks: 1

>>> for chunk in chunks:
>>>     df = fetcher.fetch(chunk=chunk, nb_cores=1, parallel_backend='joblib', verbose=True)
# Use multiprocessing : False
# Nb cores: 1
# Parallel backend: joblib

# Time to fetch: 43.21s
# Nb lines in dataframe: 3375875
# Size of dataframe: 2.83GB
```

## Warning
We recommend using this tool only when the table to fetch contains a column that can easily be chunked (divided into small parts). The ideal column for this contains distinct values, or values that appear approximately the same number of times. **If some values appear thousands of times and others only a few times, the chunking will not be reliable**, because we have to assume that each chunk is approximately the same size in order to estimate the memory needed to fetch the table efficiently.

### A good index column:
This column contains distinct values, so it can easily be divided into chunks.

| Card number |
|:---:|
| 4390 3849 ... |
| 2903 1182 ... |
| 0562 7205 ... |
| ... |

### A bad index column:
The frequency of values in this column can vary a lot, so the chunking will not be reliable.

| Age |
|:---:|
| 18 |
| 18 |
| 64 |
| 18 |
| ... |

### More cores != faster
Keep in mind that adding more cores to the fetching process will not necessarily improve performance; most of the time it will even be slower. The reason is that fetching is directly bound by the Internet bandwidth available on your network, not by the number of working cores or the compute power. However, you can run your own tests: in some cases multiprocessing does save time (ex: when cloud machines only allocate a given amount of bandwidth per core, multiplying the number of cores also multiplies the bandwidth, as on GCP compute engines).

## Contribution
bqfetch is open to new contributors, especially for bug fixing or the implementation of new features. Do not hesitate to open an issue/pull request :)

## License
MIT

Copyright (c) 2021-present, Tristan Bilot

--------------------------------------------------------------------------------
/bqfetch/bqfetch.py:
--------------------------------------------------------------------------------
from time import time
import os
import psutil
import math
from typing import Iterator, List, Tuple

import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account

from bqfetch.utils import *

CREDS_SCOPES = [
    "https://www.googleapis.com/auth/drive",
    "https://www.googleapis.com/auth/bigquery",
    "https://www.googleapis.com/auth/devstorage.full_control"
]
DEFAULT_CHUNK_SIZE_PER_CORE_IN_GB = 2

class BigQueryTable:
    '''
    A simple object containing the path to the requested table.
    `project_id` is the name of the BigQuery project, `dataset`
    the BigQuery dataset entry and `table` the name of the
    requested table.
    '''
    def __init__(
        self,
        project_id: str,
        dataset: str,
        table: str,
    ) -> None:
        self._variables = {
            "PROJECT_ID": project_id,
            "DATASET": dataset,
            "TABLE": table,
        }

    @property
    def variables(self):
        return self._variables

class FetchingChunk:
    '''
    Wrapper object used to store the elements to select in the
    given column.
    '''
    def __init__(self, elements: List[str], column: str,) -> None:
        self.elements = elements
        self.column = column

class BigQueryClient:
    '''
    Wrapper around the BigQuery Client object containing credentials.

    Parameters:
    ----------
    service_account_path: str
        The path and file name of the credentials file bq_service_account.json.
        The path should be absolute.
    '''
    def __init__(
        self,
        service_account_path: str=None,
        creds_scope: str=None,
    ) -> None:
        if isinstance(service_account_path, str):
            creds_scope = creds_scope if creds_scope is not None \
                else CREDS_SCOPES
            credentials = service_account.Credentials.from_service_account_file(
                service_account_path, scopes=creds_scope
            )
            bq_client = bigquery.Client(
                credentials=credentials,
                project=credentials.project_id
            )
        elif service_account_path is None:
            bq_client = bigquery.Client()
        else:
            raise ValueError('`service_account_path` should be of type str or None')

        self._client = bq_client

    def run(
        self,
        request: str
    ) -> bigquery.table.RowIterator:
        """
        Run a BigQuery SQL request.
        """
        job = self._client.query(request)
        return job.result()

    def delete_table(
        self,
        table_name: str,
        not_found_ok: bool=True,
    ):
        '''
        Delete a BigQuery table.
        '''
        self._client.delete_table(table_name, not_found_ok=not_found_ok)

    def get_nb_occurences_for_column(
        self,
        table: BigQueryTable,
        column: str,
    ) -> List[int]:
        '''
        For each distinct element in `column`, counts the number of occurrences
        and returns a list containing all the counts.
        Ex: for a column containing: John, John, Louis
        >>> [2, 1]
        '''
        var = table.variables
        nb_occurences_query = f'''
            SELECT COUNT(*)
            FROM `{var["PROJECT_ID"]}.{var["DATASET"]}.{var["TABLE"]}`
            GROUP BY {column}
        '''
        nb_occurences = [nb_occurences[0] for nb_occurences in self.run(nb_occurences_query)]
        return nb_occurences

    def get_table_size_in_GB(
        self,
        table: BigQueryTable,
    ) -> float:
        '''
        Returns the size in GB of `table`.
        '''
        var = table.variables
        size_of_table_query = f'''
            SELECT SUM(size_bytes)/{1024**3} AS size_GB
            FROM {var["PROJECT_ID"]}.{var["DATASET"]}.__TABLES__
            WHERE table_id = '{var["TABLE"]}'
        '''
        size_of_table_in_GB = next(self.run(size_of_table_query).__iter__())[0]
        return size_of_table_in_GB

    def get_column_values(
        self,
        table: BigQueryTable,
        column: str,
    ) -> pd.DataFrame:
        '''
        Returns a DataFrame (with a single column) containing the distinct elements
        of `column` in `table`.
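
        Example (illustrative; `user_id` is a hypothetical column name):
        >>> distinct_ids = client.get_column_values(table, 'user_id')  # one-column DataFrame of distinct values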
        '''
        var = table.variables
        query = f'''
            SELECT DISTINCT `{column}`
            FROM `{var["PROJECT_ID"]}.{var["DATASET"]}.{var["TABLE"]}`
        '''
        query_results = self.run(query)
        return query_results.to_dataframe()

    def create_partitioned_table(
        self,
        table: BigQueryTable,
        chunk: FetchingChunk,
        partitioned_table_name: str,
    ):
        '''
        Create a temporary table used to store one `chunk` of data,
        extracted from the main table to fetch. This step is necessary
        in order to improve performance and avoid network bottlenecks.
        The table is created with the name `partitioned_table_name` in the
        same dataset as `bq_table`.
        '''
        sqlify_chunk_elements = ','.join(list(map(lambda x: f'"{x}"', chunk.elements)))
        var = table.variables
        query = f'''
            CREATE OR REPLACE TABLE
            `{var["PROJECT_ID"]}.{var["DATASET"]}.{partitioned_table_name}` AS
            SELECT
                *
            FROM `{var["PROJECT_ID"]}.{var["DATASET"]}.{var["TABLE"]}`
            WHERE {chunk.column} IN ({sqlify_chunk_elements})
        '''
        self.run(query)

    def delete_partitioned_table(
        self,
        table: BigQueryTable,
        partitioned_table_name: str,
    ):
        '''
        Delete the temporary table used to chunk the table.
        '''
        var = table.variables
        table = f'{var["PROJECT_ID"]}.{var["DATASET"]}.{partitioned_table_name}'
        self.delete_table(table)


class BigQueryFetcher:
    '''
    An object used to fetch BigQuery tables easily and progressively
    in order to handle huge tables that do not fit into memory.
    The fetcher divides the table into chunks of size `by_chunk_size_in_GB`
    based on the `column` parameter. Each chunk is then fetched
    using the BigQuery Storage API, sequentially or in parallel using
    child processes running on multiple cores.

    Ex: fetch a huge table of users: first, all the 'user_id' values are
    fetched and divided into chunks that should each fit into memory.
    Then, each chunk is fetched separately using multiprocessing
    with the number of cores available on the machine.

    >>> table = BigQueryTable("my_project", "dataset1", "users_table")
    >>> fetcher = BigQueryFetcher(table, 'path/to/service_account.json')
    >>> chunks = fetcher.chunks('user_id', by_chunk_size_in_GB=5)
    >>> for chunk in chunks:
            df = fetcher.fetch(chunk, nb_cores=-1)
            # compute df...
    '''
    def __init__(
        self,
        bq_table: BigQueryTable,
        service_account_filename: str=None,
        existing_client: BigQueryClient=None,
        creds_scope: str=None,
    ):
        self._client = existing_client if existing_client is not None \
            else BigQueryClient(service_account_filename, creds_scope=creds_scope)
        self._bq_table = bq_table
        self._service_account_filename = service_account_filename
        self._creds_scopes = CREDS_SCOPES
        self._cache = {}
        self._first_fetch = True

    def chunks(
        self,
        column: str,
        by_nb_chunks: int=None,
        by_chunk_size_in_GB: int=None,
        verbose: bool=False,
    ) -> Iterator:
        '''
        Returns a list of chunks of `column` items to iterate on, computed from
        either `by_nb_chunks` or `by_chunk_size_in_GB`.
        It allows fetching the whole table in multiple chunks that each fit in memory.
        The chosen column can be of any type, not only String or Int.
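
        Example (illustrative, assuming the table has an `id` column):
        >>> chunks = fetcher.chunks(column='id', by_chunk_size_in_GB=5)
        >>> len(chunks)  # number of chunks the table was divided into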
        '''
        assert isinstance(column, str)

        if (by_nb_chunks is None and by_chunk_size_in_GB is None) \
            or (by_nb_chunks is not None and by_chunk_size_in_GB is not None):
            raise ValueError('Only one parameter `by_nb_chunks` or `by_chunk_size_in_GB` has to be set')
        if not ((by_nb_chunks is not None and by_nb_chunks > 0) \
            or (by_chunk_size_in_GB is not None and by_chunk_size_in_GB > 0)):
            raise ValueError('Value has to be greater than 0')

        by_nb_chunks = by_nb_chunks if by_nb_chunks is not None else \
            self.get_nb_chunks_approximation(column, verbose=verbose, chunk_size_in_GB=by_chunk_size_in_GB)

        indexes = self._client.get_column_values(self._bq_table, column)
        chunks = divide_in_chunks(indexes, by_nb_chunks)
        chunks = [FetchingChunk(x[column].tolist(), column) for x in chunks]

        if verbose:
            log(
                'Chunking',
                f'Nb values in "{column}":\t {len(indexes)}',
                f'Nb chunks:\t\t\t {len(chunks)}')
        return chunks

    def fetch(
        self,
        chunk: FetchingChunk=None,
        nb_cores: int=1,
        memory_to_save: float = 1.0,
        parallel_backend: str='billiard',
        partitioned_table_name: str='TMP_TABLE',
        verbose: bool=False,
    ) -> pd.DataFrame:
        '''
        Fetch a `chunk` as a pandas DataFrame using the BigQuery Storage API.
        The `chunk` can be obtained using the `chunks()` method.

        Parameters:
        ----------
        chunk: FetchingChunk
            The selection of rows that we want to fetch.
        nb_cores: int
            The number of processes to create. By default, each process
            will run on a separate core. It is not recommended to set `nb_cores`
            to a value larger than the number of vCPUs on the machine.
            Setting this parameter to `-1` will use the number of vCPUs on
            the machine.
        memory_to_save: float
            The amount of memory in GB to leave unused on the machine to avoid overflows.
        parallel_backend: str
            The framework used to parallelize the fetching.
            >>> Choose 'billiard' to use a fork of the Python multiprocessing lib
                which allows using multiprocessing from a process launched as a daemon
                (ex: Airflow).
            >>> Choose 'joblib' to use the joblib backend.
            >>> Choose 'multiprocessing' to use the current version of the Python
                multiprocessing lib.
        partitioned_table_name: str
            The name of the temporary table that will be created, in the same dataset as the
            fetched `bq_table`, at each call to fetch() in order to divide the whole table
            into small chunked tables that can be fetched extremely fast.
            This table is deleted at the end of each call, so there is no need to delete it
            manually afterwards.

        Returns:
        -------
        pd.DataFrame
            A DataFrame containing all the data fetched from the chunk.
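
        Example (illustrative):
        >>> chunks = fetcher.chunks('id', by_nb_chunks=10)
        >>> df = fetcher.fetch(chunks[0], nb_cores=2, parallel_backend='billiard')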
        '''
        assert nb_cores == -1 or nb_cores > 0
        assert isinstance(chunk, FetchingChunk)
        assert parallel_backend in ['billiard', 'joblib', 'multiprocessing']
        assert memory_to_save > 0

        vcpu_count = os.cpu_count()
        if nb_cores > vcpu_count:
            print(f'Warning: `nb_cores` ({nb_cores}) greater than cpus on machine ({vcpu_count})')
        if nb_cores == -1:
            nb_cores = vcpu_count

        if verbose and self._first_fetch:
            log(
                'Fetching',
                f'Use multiprocessing : \t{nb_cores > 1}',
                f'Nb cores: \t\t\t{nb_cores}',
                f'Parallel backend: \t\t{parallel_backend}')
            self._first_fetch = False

        start = time()
        df = None
        column = chunk.column

        if nb_cores == 1:
            partitioned_table_name = f'{partitioned_table_name}0'
            self._client.create_partitioned_table(self._bq_table, chunk, partitioned_table_name)
            df = _fetch_in_parallel(
                (self._service_account_filename, self._creds_scopes, \
                    partitioned_table_name, self._bq_table, column, chunk.elements)
            )
            self._client.delete_partitioned_table(self._bq_table, partitioned_table_name)
        else:
            chunks_per_core = divide_in_chunks(chunk.elements, nb_cores)
            for i, small_chunk in enumerate(chunks_per_core):
                small_chunk = FetchingChunk(small_chunk, chunk.column)
                self._client.create_partitioned_table(self._bq_table, small_chunk, f'{partitioned_table_name}{i}')

            partition_list = [(self._service_account_filename, self._creds_scopes, \
                f'{partitioned_table_name}{i}', self._bq_table, column, item) for i, item in enumerate(chunks_per_core)]

            parallel_backends = {
                'billiard': do_parallel_billiard,
                'joblib': do_parallel_joblib,
                'multiprocessing': do_parallel_multiprocessing,
            }
            parallel_function = parallel_backends[parallel_backend]
            df = pd.concat(parallel_function(
                _fetch_in_parallel,
                len(chunks_per_core),
                partition_list
            ))
            for i in range(len(chunks_per_core)):
                self._client.delete_partitioned_table(self._bq_table, f'{partitioned_table_name}{i}')
        end = time() - start

        if verbose:
            log(
                f'Time to fetch:\t\t {round(end, 2)}s',
                f'Nb lines in dataframe:\t {len(df)}',
                f'Size of dataframe:\t\t {ft(df.memory_usage(deep=True).sum() / 1024**3)}')
        return df

    def get_nb_chunks_approximation(
        self,
        column: str,
        nb_cores: int=1,
        nb_GB_to_save: int = 1,
        chunk_size_in_GB: int = DEFAULT_CHUNK_SIZE_PER_CORE_IN_GB,
        verbose: bool=False,
    ) -> int:
        '''
        Tries to estimate the number of chunks to use in order to divide the table.
        The approximation uses the free memory available on the machine.
        It only works when each distinct value in `column` appears approximately
        the same number of times: we have to assume an average chunk size, so if
        the sizes differ too much, the prediction will not be accurate.
        Ex: if one value of `column` appears in thousands of rows and another one in only
        ten rows, the approximation will not work because there is too much variance
        between the value counts.
        Ex: if `column` contains IDs, each row is unique, so it is a perfect use case
        for this function.
        The function emits a warning if more than 25% of the values are more than 25%
        away from the mean count.
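        Worked example (illustrative): for value counts [10, 10, 10, 100], the mean is 32.5
        and all four counts fall outside the ±25% band [24.4, 40.6], so 100% of the values
        are dispersed and the warning is printed.
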
        Parameters:
        ----------
        column: str
            The column name of the table on which to do the approximation.
        nb_cores: int
            The number of cores that will be used.
        nb_GB_to_save: int
            The amount of memory in GB to leave unused on the machine.
        chunk_size_in_GB: int
            The amount of memory of one chunk; this amount should fit in memory and thus be
            less than the free memory available on the machine.

        Returns:
        -------
        nb_chunks: int
            The approximated number of chunks, based on the free space and the size of the table.
        '''
        nb_occurences = self._client.get_nb_occurences_for_column(self._bq_table, column)
        mean = sum(nb_occurences) / len(nb_occurences)
        coeff = 0.25
        nb_dispersed_values = sum(not (mean * (1 - coeff) < count < mean * (1 + coeff)) \
            for count in nb_occurences)
        dispersion_quotient = nb_dispersed_values / len(nb_occurences)

        if dispersion_quotient > coeff:
            print(f'''Warning: the value counts of column {column} are too dispersed: \
{(dispersion_quotient * 100):.2f}% of the elements are more than {coeff * 100}% away from the mean.''')

        # psutil.virtual_memory()[1] is the available memory in bytes
        available_memory_in_GB = psutil.virtual_memory()[1] / 1024**3 - nb_GB_to_save
        if chunk_size_in_GB >= available_memory_in_GB:
            print(f'WARNING: you are using a chunk size bigger than the available memory ({ft(chunk_size_in_GB)}>{ft(available_memory_in_GB)})')
        nb_chunks, size_of_table_in_GB = self._nb_chunks_approximation_formula(nb_cores, chunk_size_in_GB, \
            available_memory_in_GB)
        size_per_chunk_in_GB = math.ceil(size_of_table_in_GB / nb_chunks)

        if verbose:
            log(
                'Chunk size approximation',
                f'Available memory on device:\t {ft(available_memory_in_GB)}',
                f'Size of table:\t\t {ft(size_of_table_in_GB)}',
                f'Preferred size of chunk:\t {ft(chunk_size_in_GB)}',
                f'Size per chunk:\t\t {ft(size_per_chunk_in_GB)}',
                f'Nb chunks approximation:\t {nb_chunks}')
        return nb_chunks

    def _nb_chunks_approximation_formula(
        self,
        nb_cores: int,
        preferred_chunk_size_in_GB: int,
        available_memory_in_GB: int,
    ):
        '''
        Returns an estimated number of chunks in which to divide the whole table.
        The estimation is based on the free memory and the number of cores.
        Also returns the size of the table, for caching and performance reasons.
        '''
        if 'size_of_table_in_GB' not in self._cache:
            size_of_table_in_GB = self._client.get_table_size_in_GB(self._bq_table)
            self._cache['size_of_table_in_GB'] = size_of_table_in_GB
        sum_of_GB_for_cores = preferred_chunk_size_in_GB * nb_cores
        nb_chunks = math.ceil(self._cache['size_of_table_in_GB'] / min(sum_of_GB_for_cores, available_memory_in_GB))
        return nb_chunks, self._cache['size_of_table_in_GB']


def _fetch_in_parallel(
    pickled_parameters: Tuple,
) -> pd.DataFrame:
    '''
    Fetch a BigQuery table using the Storage API.
    If `chunk` is given, the fetching returns only
    the rows matching the given list, based on `column`.
    Warning: the imports should not be removed from the inner function,
    because the dependencies could otherwise not be found when running
    in child processes.
    This function should be global, not inside a class.
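    The expected `pickled_parameters` layout (as built by BigQueryFetcher.fetch) is:
    (service_account_filename, creds_scopes, partitioned_table_name, bq_table, column, chunk).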
    '''
    from google.cloud.bigquery_storage import BigQueryReadClient, ReadSession, DataFormat

    service_account_filename, creds_scopes, partitioned_table_name, bq_table, column, chunk = pickled_parameters

    credentials = service_account.Credentials.from_service_account_file(
        service_account_filename, scopes=creds_scopes) \
        if service_account_filename is not None else None
    var = bq_table.variables
    bqstorageclient = BigQueryReadClient(credentials=credentials)
    stringify_table = f"projects/{var['PROJECT_ID']}/datasets/{var['DATASET']}/tables/{partitioned_table_name}"
    parent = "projects/{}".format(var['PROJECT_ID'])

    requested_session = None
    if chunk is not None:
        sqlify_indexes = ','.join(list(map(lambda x: f'"{x}"', chunk)))
        row_filter = ReadSession.TableReadOptions(row_restriction=f'{column} IN ({sqlify_indexes})')
        requested_session = ReadSession(
            table=stringify_table,
            data_format=DataFormat.ARROW,
            read_options=row_filter,
        )
    else:
        requested_session = ReadSession(
            table=stringify_table,
            data_format=DataFormat.ARROW,
        )

    session = bqstorageclient.create_read_session(
        parent=parent,
        read_session=requested_session,
        max_stream_count=1,
    )
    reader = bqstorageclient.read_rows(session.streams[0].name, timeout=10000)
    return reader.to_dataframe(session)

--------------------------------------------------------------------------------