├── .gitignore ├── README.md ├── condaenv.yml ├── data_etl ├── __init__.py ├── checks.py ├── connections.py ├── data_files.py └── general_functions.py ├── examples ├── 00_create_data.ipynb ├── 01_example.ipynb ├── 02_example.ipynb ├── 03_example.ipynb ├── 04_example.ipynb ├── 04_example.py ├── README.md ├── data │ └── .gitkeep ├── logs │ └── .gitkeep └── test_scripts │ ├── .config │ ├── alter_cols.py │ ├── checks_1.py │ ├── convert_columns.py │ ├── main.py │ ├── reporting_1.py │ └── test_reading_in.py ├── setup.py └── tests └── 00_pytest.py /.gitignore: -------------------------------------------------------------------------------- 1 | logs/*.log 2 | .idea/* 3 | pickles/* 4 | *~* 5 | data/processed/* 6 | data/deliverables/* 7 | *.pkl 8 | *.tsv 9 | *.db 10 | *.csv 11 | *.xlsx 12 | *.log 13 | .ipynb_checkpoints/* 14 | */.ipynb_checkpoints/* 15 | *.pyc 16 | docs/* 17 | data_etl.egg-info/* 18 | logs/* 19 | !logs/README_logs.md 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data ETL 2 | 3 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 4 | 5 | A package for data curation, transformation and checks. 6 | 7 | This can mean reading data in, converting it to the correct dtypes and making suitable alterations to bring it into a uniform format, or simply taking an existing data set and running checks against it. 8 | 9 | The aim is to help with data that is supplied regularly by other people or by systems. Such data may arrive as flat files, it may not be logically consistent, it may have missing values; there can be any number of problems. An issue report that states clearly what is wrong, and where, gives you something concrete to hand back to the data creators. Checks can therefore be run in bulk and quickly, and the issue reports put the responsibility for corrections back on the data creator. 10 | 11 | The checks are not limited to single columns or single values; they can consider the whole data set, or even use it in conjunction with extra data sets, because that is how data often behaves. 12 | 13 | If a downstream model relies on certain assumptions about the data, those assumptions can be written as checks and tested. 14 | 15 | Running all the checks in bulk, even when some of them raise issues, also avoids a stop-start process of fixing one problem only to discover the next. 16 | 17 | To use this package you should already have a good understanding of how the `pandas` package works. 18 | 19 | ## How to use this repository 20 | 21 | ### Setup environment 22 | 23 | There is a YML file, `condaenv.yml`, for the main requirements. 24 | 25 | ``` 26 | conda env create --file condaenv.yml 27 | ``` 28 | 29 | Then use `pip` to install the `data_etl` module: navigate to the directory containing the `setup.py` file and run: 30 | 31 | ``` 32 | pip install -e . 33 | ``` 34 | 35 | You can now import `data_etl` from within the environment. 36 | 37 | ## Examples 38 | 39 | There are multiple examples in the repository under the `examples` directory. 40 | 41 | Use the `00_create_data.ipynb` notebook first to create the data the examples run on, along with the SQLite database file that will hold any recorded issues or written-out data. 42 | 43 | The other files, both `*.ipynb` and `*.py`, are the example files themselves.
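The example notebooks walk through the wider flow: curating tables with `DataCuration`, applying `Checks`, and persisting any recorded issues with `Connections`. A condensed sketch of that flow is below; the input file, the `value` column it checks, and the database path are placeholders rather than files shipped with the repository, see the example notebooks for working values.

```python
import pandas as pd

from data_etl import Checks, Connections, DataCuration

# Curate: register the file(s) and read them in with a user supplied function
dc = DataCuration('grouping_label', 'key_1')
dc.set_file_list('data/example.csv')  # placeholder file name
dc.reading_in(
    function=lambda list_files, **kwargs: {f: pd.read_csv(f) for f in list_files})

# Check: apply a dictionary of checks to the curated tables
ch = Checks('grouping_label', 'key_1')
ch.apply_checks(
    dc.tables,
    dictionary={
        'Value should not be null': {
            'calc_condition': lambda df, col, **kwargs: df['value'].isnull()
        }
    })

# Connect: write any recorded issues out to a SQLite file
cnx = Connections()
cnx.add_cnx(
    cnx_key='issues', cnx_type='sqlite3', table_name='df_issues',
    file_path='data/issues.db', sqlite_df_issues_create=True)
if ch.get_issue_count() > 0:
    cnx.write_to_db('issues', ch.df_issues)
```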
44 | 45 | A brief code example of how to use: 46 | 47 | ```python 48 | from data_etl import Checks 49 | import pandas as pd 50 | 51 | data = pd.DataFrame([1, -3, 2], columns=['number']) 52 | 53 | # Initialise the Checks class 54 | ch_simple = Checks('grouping_label', 'key_1', 'key_2', 'key_3') 55 | 56 | # Define a simple check 57 | dict_checks = { 58 | 'Number should be greater than 0': { 59 | 'calc_condition': lambda df, col, **kwargs: df['number'] <= 0 60 | } 61 | } 62 | # Apply the checks to the tables 63 | ch_simple.apply_checks(data, dictionary=dict_checks) 64 | 65 | # If any issues are found then they are stored internal to the class as a Pandas DataFrame 66 | ch_simple.df_issues 67 | ``` 68 | -------------------------------------------------------------------------------- /condaenv.yml: -------------------------------------------------------------------------------- 1 | name: data_etl 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.6 6 | - pandas=0.24.0 7 | - pytest=5.0.1 8 | - jupyter=1.0.0 9 | - matplotlib=3.0.3 10 | - xlrd=1.2.0 11 | - pyodbc=4.0.27 12 | - openpyxl=3.0.3 13 | -------------------------------------------------------------------------------- /data_etl/__init__.py: -------------------------------------------------------------------------------- 1 | from data_etl.data_files import DataCuration 2 | from data_etl.checks import Checks 3 | from data_etl.connections import Connections 4 | from data_etl.general_functions import func_check_for_issues, \ 5 | func_initialise_logging, import_attr 6 | 7 | __all__ = [ 8 | DataCuration, Checks, Connections, func_check_for_issues, 9 | func_initialise_logging, import_attr 10 | ] 11 | __version__ = '0.1.0dev' -------------------------------------------------------------------------------- /data_etl/checks.py: -------------------------------------------------------------------------------- 1 | # Here we are defining a class that will deal with checking data sets 2 | import logging 3 | from inspect import getfullargspec 4 | from copy import deepcopy 5 | from inspect import getsourcelines 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | from data_etl.general_functions import import_attr 11 | 12 | module_logger = logging.getLogger(__name__) 13 | 14 | dict_checks_defaults = { 15 | 'columns': [np.nan], 16 | 'check_condition': 17 | lambda df, col, condition, **kwargs: condition.sum() > 0, 18 | 'count_condition': lambda df, col, condition, **kwargs: condition.sum(), 19 | 'index_position': lambda df, col, condition, **kwargs: condition, 20 | 'relevant_columns': lambda df, col, condition, **kwargs: col, 21 | 'long_description': lambda df, col, condition, **kwargs: "", 22 | 'idx_flag': True, 23 | 'category': np.nan 24 | } 25 | 26 | 27 | class Checks: 28 | __step_no = 0 29 | __key_1 = None 30 | __key_2 = None 31 | __key_3 = None 32 | __grouping = None 33 | df_issues = None 34 | __key_separator = " -:- " 35 | __checks_defaults = None 36 | 37 | def __init__(self, grouping, key_1, key_2=None, key_3=None): 38 | module_logger.info("Initialising `Checks` object") 39 | # Three keys, all good things come in threes 40 | self.__key_1 = str(key_1) 41 | self.__key_2 = str(key_2) 42 | self.__key_3 = str(key_3) 43 | self.__grouping = grouping 44 | self.__checks_defaults = dict(dict_checks_defaults) 45 | # Initialise the `df_issues` table 46 | df_issues = pd.DataFrame( 47 | columns=[ 48 | "key_1", "key_2", "key_3", "file", "sub_file", "step_number", 49 | "category", "issue_short_desc", "issue_long_desc", "column", 50 | "issue_count", 
"issue_idx", "grouping" 51 | ] 52 | ) 53 | df_issues["step_number"] = df_issues["step_number"].astype(int) 54 | self.df_issues = df_issues 55 | module_logger.info("Initialising `Checks` object complete") 56 | 57 | def error_handling(self, file, subfile, issue_short_desc, issue_long_desc, 58 | column, issue_count, issue_idx, category=np.nan): 59 | """ 60 | If an error is handled, as they all should be, we need to specify what 61 | happens with the error. By putting it into a single function it will 62 | hopefully make the code briefer. 63 | """ 64 | # TODO work out how to add in `file` and `subfile` where data is a 65 | # dictionary 66 | module_logger.info("Logging an error with `error_handling`") 67 | df = self.df_issues.copy() 68 | list_vals = [ 69 | self.__key_1, self.__key_2, self.__key_3, file, subfile, 70 | self.__step_no, category, issue_short_desc, issue_long_desc, column, 71 | issue_count, issue_idx, self.__grouping 72 | ] 73 | try: 74 | df.loc[df.shape[0]] = list_vals 75 | self.df_issues = df.copy() 76 | except: 77 | var_msg = f"Logging the issue failed, values: {list_vals}" 78 | module_logger.error(var_msg) 79 | raise ValueError(var_msg) 80 | module_logger.info(f"Error logged: {list_vals}") 81 | 82 | def set_defaults( 83 | self, columns=None, check_condition=None, count_condition=None, 84 | index_position=None, relevant_columns=None, long_description=None, 85 | idx_flag=None): 86 | module_logger.info("Starting `set_defaults`") 87 | if columns is not None: 88 | if type(columns).__name__ != 'list': 89 | var_msg = 'The `columns` argument is not a list as required' 90 | module_logger.error(var_msg) 91 | raise ValueError(var_msg) 92 | if len(columns) == 0: 93 | var_msg = ('The `columns` argument is empty, it needs to be ' 94 | 'at least length 1, this can be a null') 95 | module_logger.error(var_msg) 96 | raise ValueError(var_msg) 97 | self.__checks_defaults['columns'] = columns 98 | if check_condition is not None: 99 | self.__set_defaults_check(check_condition, 'check_condition') 100 | self.__checks_defaults['check_condition'] = check_condition 101 | if count_condition is not None: 102 | self.__set_defaults_check(count_condition, 'count_condition') 103 | self.__checks_defaults['count_condition'] = count_condition 104 | if index_position is not None: 105 | self.__set_defaults_check(index_position, 'index_position') 106 | self.__checks_defaults['index_position'] = index_position 107 | if relevant_columns is not None: 108 | self.__set_defaults_check(relevant_columns, 'relevant_columns') 109 | self.__checks_defaults['relevant_columns'] = relevant_columns 110 | if long_description is not None: 111 | self.__set_defaults_check(long_description, 'long_descriptions') 112 | self.__checks_defaults['long_description'] = long_description 113 | if idx_flag is not None: 114 | if idx_flag not in [True, False]: 115 | var_msg = 'The value of `idx_flag` need to be True or False' 116 | module_logger.error(var_msg) 117 | raise ValueError(var_msg) 118 | self.__checks_defaults['idx_flag'] = idx_flag 119 | module_logger.info("Completed `set_defaults`") 120 | 121 | @staticmethod 122 | def __set_defaults_check(function, label): 123 | module_logger.info("Starting `__set_defaults_check`") 124 | if type(function).__name__ != 'function': 125 | var_msg = f'The passed value for `{label}` is not a function' 126 | module_logger.error(var_msg) 127 | raise ValueError(var_msg) 128 | arg_spec = getfullargspec(function) 129 | if arg_spec.args != ['df', 'col', 'condition']: 130 | var_msg = ( 131 | f'The arguments passed in 
for the function `{label}` does not ' 132 | f'match with the required args: df, col, condition') 133 | module_logger.error(var_msg) 134 | raise ValueError(var_msg) 135 | if arg_spec.varkw != 'kwargs': 136 | var_msg = (f'The **kwargs argument has not been provided for ' 137 | f'`{label}` and is required') 138 | module_logger.error(var_msg) 139 | raise ValueError(var_msg) 140 | module_logger.info("Completed `__set_defaults_check`") 141 | 142 | def set_key_separator(self, separator): 143 | module_logger.info("Starting `set_key_separator`") 144 | if (type(separator).__name__ != "str") | (len(separator) == 0): 145 | var_msg = ("The argument `separator` for function " 146 | "`set_key_separator` should be a string of length " 147 | "greater than 0") 148 | module_logger.error(var_msg) 149 | raise ValueError(var_msg) 150 | self.__key_separator = separator 151 | module_logger.info(f"Completed `set_key_separator`, the key separator " 152 | f"is: {self.__key_separator}") 153 | 154 | def apply_checks( 155 | self, tables, path=None, script_name=None, 156 | object_name="dict_checks", dictionary=None, **kwargs): 157 | module_logger.info("Starting `apply_checks`") 158 | if (script_name is not None) & (object_name is not None): 159 | dict_checks = import_attr(path, script_name, object_name) 160 | elif dictionary is not None: 161 | if type(dictionary).__name__ != "dict": 162 | var_msg = "The `dictionary` argument is not a dictionary" 163 | module_logger.error(var_msg) 164 | raise ValueError(var_msg) 165 | dict_checks = dictionary 166 | else: 167 | var_msg = ("Either `dictionary` or both of `script_name` and " 168 | "`path` need to be none null") 169 | module_logger.error(var_msg) 170 | raise ValueError(var_msg) 171 | 172 | if type(tables).__name__ == "dict": 173 | for table_key in tables.keys(): 174 | for check_key in dict_checks.keys(): 175 | self.__apply_the_check( 176 | tables[table_key], dict_checks[check_key], check_key, 177 | table_key, **kwargs) 178 | elif type(tables).__name__ == "DataFrame": 179 | for check_key in dict_checks.keys(): 180 | self.__apply_the_check(tables, dict_checks[check_key], 181 | check_key, np.nan, **kwargs) 182 | 183 | module_logger.info("Completed `apply_checks`") 184 | 185 | def __apply_the_check( 186 | self, df, dict_check_info, check_key, table_key, **kwargs): 187 | module_logger.info(f"Starting check `{check_key}`") 188 | if "calc_condition" not in dict_check_info: 189 | var_msg = "The check requires a value for key `calc_condition`" 190 | module_logger.error(var_msg) 191 | raise AttributeError(var_msg) 192 | func_calc_condition = dict_check_info["calc_condition"] 193 | func_long_description = ( 194 | self.__checks_defaults['long_description'] if 195 | "long_description" not in dict_check_info else 196 | dict_check_info["long_description"]) 197 | func_check_condition = ( 198 | self.__checks_defaults['check_condition'] if 199 | "check_condition" not in dict_check_info else 200 | dict_check_info["check_condition"]) 201 | list_columns = ( 202 | self.__checks_defaults['columns'] if 203 | "columns" not in dict_check_info else 204 | dict_check_info["columns"]) 205 | if type(list_columns).__name__ == 'str': 206 | list_columns = [list_columns] 207 | func_count_condition = ( 208 | self.__checks_defaults['count_condition'] if 209 | "count_condition" not in dict_check_info else 210 | dict_check_info["count_condition"]) 211 | func_index_position = ( 212 | self.__checks_defaults['index_position'] if 213 | "index_position" not in dict_check_info else 214 | 
dict_check_info["index_position"]) 215 | func_relevant_columns = ( 216 | self.__checks_defaults['relevant_columns'] if 217 | "relevant_columns" not in dict_check_info else 218 | dict_check_info["relevant_columns"]) 219 | var_idx_flag = ( 220 | self.__checks_defaults['idx_flag'] if 221 | "idx_flag" not in dict_check_info else 222 | dict_check_info['idx_flag']) 223 | var_category = ( 224 | self.__checks_defaults['category'] if 225 | "category" not in dict_check_info else 226 | dict_check_info['category']) 227 | if len(list_columns) == 0: 228 | var_msg = ('The `list_columns` value somehow has length 0, needs ' 229 | 'to have at least one element, which can be `np.nan`') 230 | module_logger.error(var_msg) 231 | raise ValueError(var_msg) 232 | for col in list_columns: 233 | self.__evaluate_check( 234 | check_key, df, col, func_calc_condition, 235 | func_check_condition, func_count_condition, func_index_position, 236 | func_relevant_columns, func_long_description, var_idx_flag, 237 | var_category, table_key, **kwargs) 238 | 239 | module_logger.info(f"Completed check `{check_key}`") 240 | 241 | def __evaluate_check( 242 | self, check_key, df, col, func_calc_condition, func_check_condition, 243 | func_count_condition, func_index_position, func_relevant_columns, 244 | func_long_description, var_idx_flag, var_category, table_key, 245 | **kwargs): 246 | module_logger.info( 247 | f"Starting evaluating check `{check_key}` for column {col}") 248 | s_calc_condition = func_calc_condition(df, col, **kwargs) 249 | var_check_condition = func_check_condition( 250 | df, col, s_calc_condition, **kwargs) 251 | var_count_condition = func_count_condition( 252 | df, col, s_calc_condition, **kwargs) 253 | s_index_conditions = func_index_position( 254 | df, col, s_calc_condition, **kwargs) 255 | if var_idx_flag is False: 256 | s_index_conditions = s_index_conditions.map( 257 | {True: False, False: True}) 258 | var_relevant_columns = func_relevant_columns( 259 | df, col, s_calc_condition, **kwargs) 260 | var_long_description = func_long_description( 261 | df, col, s_calc_condition, **kwargs) 262 | if type(var_long_description).__name__ != "str": 263 | var_msg = ( 264 | f"The variable `var_long_description` is not a string! It is a" 265 | f" {type(var_long_description).__name__}") 266 | module_logger.warning(var_msg) 267 | if ( 268 | (type(var_relevant_columns).__name__ != "str") & 269 | (pd.isnull(var_relevant_columns) is False) 270 | ): 271 | var_msg = ( 272 | f"The variable `var_relevant_columns` is not a string or null! " 273 | f"It is a {type(var_relevant_columns).__name__}") 274 | module_logger.warning(var_msg) 275 | if "int" not in type(var_count_condition).__name__: 276 | var_msg = ( 277 | f"The variable `var_count_condition` is not an integer! It is a" 278 | f" {type(var_count_condition).__name__}") 279 | module_logger.warning(var_msg) 280 | if type(s_calc_condition).__name__ != "Series": 281 | var_msg = ( 282 | f"The variable `s_calc_condition` is not a Series! It is a " 283 | f"{type(s_calc_condition).__name__}") 284 | module_logger.warning(var_msg) 285 | if type(s_index_conditions).__name__ != "Series": 286 | var_msg = ( 287 | f"The variable `s_index_conditions` is not a Series! It is a " 288 | f"{type(s_index_conditions).__name__}") 289 | module_logger.warning(var_msg) 290 | if ( 291 | (type(var_category).__name__ != 'str') & 292 | (pd.isnull(var_category) is False) 293 | ): 294 | var_msg = (f'The variable `category` is not a string or null! 
It ' 295 | f'is a {type(var_category).__name__}') 296 | module_logger.warning(var_msg) 297 | if var_check_condition: 298 | if pd.isnull(table_key): 299 | var_file = np.nan 300 | var_subfile = np.nan 301 | else: 302 | var_file = table_key.split(self.__key_separator)[0] 303 | var_subfile = (table_key.split(self.__key_separator)[1] if 304 | self.__key_separator in table_key else np.nan) 305 | self.error_handling( 306 | var_file, var_subfile, check_key, var_long_description, 307 | var_relevant_columns, var_count_condition, 308 | ", ".join( 309 | [ 310 | str(item) for item in 311 | s_index_conditions.loc[ 312 | s_index_conditions].index.tolist() 313 | ] 314 | ), 315 | var_category 316 | ) 317 | module_logger.info( 318 | f"Completed evaluating check `{check_key}` for column {col}") 319 | 320 | def get_issue_count(self, issue_number_min=None, issue_number_max=None): 321 | module_logger.info("Starting `get_issue_count`") 322 | df = self.df_issues.copy() 323 | if issue_number_min is not None: 324 | df = df.loc[df["step_number"] >= issue_number_min].copy() 325 | if issue_number_max is not None: 326 | df = df.loc[df["step_number"] <= issue_number_max].copy() 327 | var_count = df.shape[0] 328 | module_logger.info("Completed `get_issue_count`") 329 | return var_count 330 | 331 | def table_look(self, table, issue_idx): 332 | module_logger.info("Starting `table_look`") 333 | if issue_idx not in self.df_issues.index.tolist(): 334 | var_msg = (f"The requested issue index, {issue_idx}, is not " 335 | f"present in the `df_issues` table") 336 | module_logger.error(var_msg) 337 | raise AttributeError(var_msg) 338 | if type(table).__name__ != 'DataFrame': 339 | var_msg = 'The `table` argument is not a DataFrame as required' 340 | module_logger.error(var_msg) 341 | raise ValueError(var_msg) 342 | df_check = table.loc[ 343 | [ 344 | int(item) for item in 345 | self.df_issues.loc[issue_idx, "issue_idx"].split(", ") 346 | ] 347 | ] 348 | module_logger.info("Completed `table_look`") 349 | return self.df_issues.loc[[issue_idx]], df_check 350 | 351 | @staticmethod 352 | def __func_summary_(key_value): 353 | if type(key_value).__name__ == 'function': 354 | var_out = ''.join([ 355 | x.strip().strip("['\\n']") for x in 356 | getsourcelines(key_value)[0] 357 | ]) 358 | if (var_out.strip()[-1] == ':') | (var_out.strip()[-1] == '('): 359 | return ('raise Exception("The definition does not allow for' 360 | ' this info to be retrieved")') 361 | var_out = var_out.split(':')[-1].strip() 362 | if var_out[-1] == ',': 363 | var_out = var_out[:-1] 364 | return var_out 365 | else: 366 | return key_value 367 | 368 | def summary(self, path=None, script_name=None, 369 | object_name="dict_checks", dictionary=None): 370 | if (script_name is not None) & (object_name is not None): 371 | dict_checks = import_attr(path, script_name, object_name) 372 | elif dictionary is not None: 373 | if type(dictionary).__name__ != "dict": 374 | var_msg = "The `dictionary` argument is not a dictionary" 375 | module_logger.error(var_msg) 376 | raise ValueError(var_msg) 377 | dict_checks = dictionary 378 | else: 379 | var_msg = ("Either `dictionary` or both of `script_name` and " 380 | "`path` need to be none null") 381 | module_logger.error(var_msg) 382 | raise ValueError(var_msg) 383 | 384 | list_keys = [ 385 | 'calc_condition', 'long_description', 'check_condition', 'columns', 386 | 'count_condition', 'index_position', 'relevant_columns', 'idx_flag', 387 | 'category' 388 | ] 389 | 390 | dict_checks_values = deepcopy(dict_checks) 391 | for check in [key 
for key in dict_checks_values.keys()]: 392 | for key in [key for key in list_keys if 393 | key not in dict_checks_values[check].keys()]: 394 | dict_checks_values[check][key] = self.__checks_defaults[key] 395 | 396 | for check in [key for key in dict_checks_values.keys()]: 397 | for key in [key for key in dict_checks_values[check].keys()]: 398 | dict_checks_values[check][key] = self.__func_summary_( 399 | dict_checks_values[check][key]) 400 | 401 | df_summary = pd.DataFrame( 402 | dict_checks_values 403 | ).T.reset_index().rename(columns={'index': 'check'}) 404 | 405 | return {'df': df_summary, 'dict': dict_checks} 406 | 407 | def set_step_no(self, step_no): 408 | """ 409 | Set the step number, this allows errors to be recorded against a 410 | specific step which in turn can help with issue tracking and checking 411 | once issues are recorded. 412 | 413 | The argument step_no needs to be convertible to integer format. 414 | """ 415 | module_logger.info("Starting `set_step_no`") 416 | try: 417 | self.__step_no = int(step_no) 418 | except ValueError: 419 | var_msg = (f"Function set_step_no: The value {step_no} can not be " 420 | f"converted to int.") 421 | module_logger.error(var_msg) 422 | raise ValueError(var_msg) 423 | module_logger.info( 424 | f"Completed `set_step_no`, the step number is {self.__step_no}") 425 | 426 | def get_step_no(self): 427 | module_logger.info("Starting `get_step_no`") 428 | module_logger.info("Completed `get_step_no`") 429 | return self.__step_no 430 | -------------------------------------------------------------------------------- /data_etl/connections.py: -------------------------------------------------------------------------------- 1 | # Here we are defining a class that will deal with the various connections 2 | # required by the pipeline 3 | import logging 4 | import sqlite3 5 | import os 6 | import configparser 7 | 8 | import pandas as pd 9 | import pyodbc 10 | 11 | from data_etl.general_functions import func_to_sql 12 | 13 | module_logger = logging.getLogger(__name__) 14 | # TODO account for tables not existing and existing when writing to the cnx, 15 | # ideally any tables used should have been pre-emptively setup in the required 16 | # databases 17 | # TODO add MSSQL connection handling 18 | 19 | 20 | class Connections: 21 | __step_no = 0 22 | __df_issues = None 23 | __dict_cnx = None 24 | 25 | def __init__(self, step_no=None): 26 | module_logger.info("Initialising `Connections` object") 27 | if step_no is not None: 28 | self.set_step_no(step_no) 29 | self.__dict_cnx = { 30 | 'blank': {'cnx_type': 'blank'} 31 | } 32 | module_logger.info("Initialising `Connections` object complete") 33 | 34 | def set_step_no(self, step_no): 35 | module_logger.info(f"Starting `set_step_no`") 36 | self.__step_no = step_no 37 | module_logger.info(f"Completed `set_step_no`") 38 | 39 | def get_step_no(self): 40 | module_logger.info("Starting `get_step_no`") 41 | module_logger.info("Completed `get_step_no`") 42 | return self.__step_no 43 | 44 | def add_cnx(self, cnx_key, cnx_type, table_name, cnx_string=None, 45 | file_path=None, config_section=None, overwrite=False, 46 | timestamp_format='%Y-%m-%d', **kwargs): 47 | module_logger.info(f"Starting `add_cnx` for cnx key `{cnx_key}`") 48 | # TODO query is the file existing, if not then error out 49 | if (cnx_key in self.__dict_cnx) & (overwrite is False): 50 | var_msg = ('This connection string is already set, use the ' 51 | 'argument `overwrite=True` to overwrite') 52 | module_logger.error(var_msg) 53 | raise ValueError(var_msg) 
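        # Validate the connection type and its supporting arguments before anything is stored:
        # only 'sqlite3' and 'db' (pyodbc) connections are handled at present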
54 | if cnx_type not in ['sqlite3', 'db']: 55 | var_msg = ( 56 | 'The `cnx_type` argument only takes values `sqlite3`, `db`') 57 | module_logger.error(var_msg) 58 | raise AttributeError(var_msg) 59 | if (table_name is None) & (cnx_type in ['sqlite3', 'db']): 60 | var_msg = 'The argument `table_name` is required' 61 | module_logger.error(var_msg) 62 | raise AttributeError(var_msg) 63 | if (file_path is None) & (cnx_type in ['sqlite3', 'db']): 64 | var_msg = 'The argument `file_path` is required' 65 | module_logger.error(var_msg) 66 | raise AttributeError(var_msg) 67 | if ( 68 | (not os.path.exists(file_path)) & 69 | (cnx_string is None) & 70 | (cnx_type in ['db']) 71 | ): 72 | var_msg = ( 73 | f'The `file_path` to the config file {file_path} is not valid, ' 74 | f'the `file_path` is expected since the `cnx_string` is None' 75 | ) 76 | module_logger.error(var_msg) 77 | raise AttributeError(var_msg) 78 | if ( 79 | (not os.path.exists(os.path.dirname(file_path))) & 80 | (cnx_type in ['sqlite3']) 81 | ): 82 | var_msg = ( 83 | f'The folder path {os.path.dirname(file_path)} is not valid') 84 | module_logger.error(var_msg) 85 | raise AttributeError(var_msg) 86 | if (not os.path.exists(file_path)) & (cnx_type in ['sqlite3']): 87 | var_msg = (f'The `file_path` {file_path} is not valid so this ' 88 | f'file will be created') 89 | module_logger.warning(var_msg) 90 | if cnx_type == 'sqlite3': 91 | module_logger.info( 92 | f'The information is: {cnx_type}, {file_path}, {table_name}') 93 | self.__dict_cnx[cnx_key] = { 94 | 'cnx_type': cnx_type, 95 | 'file_path': file_path, 96 | 'table_name': table_name 97 | } 98 | elif cnx_type == 'db': 99 | if (config_section is None) & (cnx_string is None): 100 | var_msg = ('The argument `config_section` or `cnx_string` is ' 101 | 'required for `cnx_type=db`') 102 | module_logger.error(var_msg) 103 | raise AttributeError(var_msg) 104 | if config_section is not None: 105 | dict_config = configparser.ConfigParser() 106 | dict_config.read(file_path) 107 | var_cnx_string = ''.join( 108 | [ 109 | f"{key}={dict_config[config_section][key]};" for 110 | key in dict_config[config_section] 111 | ] 112 | ) 113 | self.__dict_cnx[cnx_key] = { 114 | 'cnx_type': cnx_type, 115 | 'file_path': file_path, 116 | 'cnx_string': var_cnx_string , 117 | 'table_name': table_name, 118 | 'timestamp_format': timestamp_format 119 | } 120 | elif cnx_string is not None: 121 | self.__dict_cnx[cnx_key] = { 122 | 'cnx_type': cnx_type, 123 | 'file_path': file_path, 124 | 'cnx_string': cnx_string, 125 | 'table_name': table_name, 126 | 'timestamp_format': timestamp_format 127 | } 128 | self.test_cnx(cnx_key, **kwargs) 129 | module_logger.info("Completed `add_cnx`") 130 | 131 | def test_cnx(self, cnx_key, **kwargs): 132 | module_logger.info(f"Starting `test_cnx` for cnx key `{cnx_key}`") 133 | if cnx_key not in self.__dict_cnx: 134 | var_msg = f'The key {cnx_key} is not present' 135 | module_logger.error(var_msg) 136 | raise AttributeError(var_msg) 137 | dict_cnx = self.__dict_cnx[cnx_key] 138 | var_cnx_type = dict_cnx['cnx_type'] 139 | if var_cnx_type == 'sqlite3': 140 | cnx = sqlite3.connect(dict_cnx['file_path']) 141 | if kwargs.get('sqlite_df_issues_create') is True: 142 | var_create_table_sql = """ 143 | CREATE TABLE IF NOT EXISTS {} ( 144 | key_1 text, 145 | key_2 text, 146 | key_3 text, 147 | file text, 148 | sub_file text, 149 | step_number integer, 150 | category text, 151 | issue_short_desc text, 152 | issue_long_desc text, 153 | column text, 154 | issue_count integer, 155 | issue_idx text, 156 | 
grouping text 157 | ); 158 | """.format(dict_cnx['table_name']) 159 | cnx.execute(var_create_table_sql) 160 | try: 161 | pd.read_sql( 162 | f"SELECT * FROM {dict_cnx['table_name']} LIMIT 0;", 163 | cnx 164 | ) 165 | cnx.close() 166 | except: 167 | cnx.close() 168 | var_msg = 'Reading in from the table has not worked' 169 | module_logger.error(var_msg) 170 | raise AttributeError(var_msg) 171 | elif var_cnx_type == 'db': 172 | cnx = pyodbc.connect(dict_cnx['cnx_string']) 173 | try: 174 | pd.read_sql( 175 | f"SELECT TOP (0) * FROM {dict_cnx['table_name']};", 176 | cnx 177 | ) 178 | cnx.close() 179 | except: 180 | cnx.close() 181 | module_logger.info("Completed `test_cnx`") 182 | 183 | def read_from_db(self, cnx_key, sql_stmt): 184 | module_logger.info("Starting `read_from_db`") 185 | module_logger.info(f'Sql statement: {sql_stmt}') 186 | dict_cnx = self.__dict_cnx[cnx_key] 187 | var_cnx_type = dict_cnx['cnx_type'] 188 | df = pd.DataFrame() 189 | if var_cnx_type == 'blank': 190 | var_msg = 'Trying to use `read_from_db` using a blank connection' 191 | module_logger.error(var_msg) 192 | raise ValueError(var_msg) 193 | elif var_cnx_type == 'sqlite3': 194 | cnx = sqlite3.connect(dict_cnx['file_path']) 195 | try: 196 | df = pd.read_sql(sql_stmt, cnx) 197 | cnx.close() 198 | except: 199 | cnx.close() 200 | var_msg = 'Reading in using a `sqlite3` connection has failed' 201 | module_logger.error(var_msg) 202 | raise ValueError(var_msg) 203 | elif var_cnx_type == 'db': 204 | cnx = pyodbc.connect(dict_cnx['cnx_string']) 205 | try: 206 | df = pd.read_sql(sql_stmt, cnx) 207 | cnx.close() 208 | except: 209 | cnx.close() 210 | var_msg = 'Reading in using a `db` connection has failed' 211 | module_logger.error(var_msg) 212 | raise ValueError(var_msg) 213 | module_logger.info("Completed `read_from_db`") 214 | return df 215 | 216 | def write_to_db(self, cnx_key, table, batch_size=None, 217 | flag_sql_logging=False): 218 | module_logger.info("Starting `write_to_db`") 219 | dict_cnx = self.__dict_cnx[cnx_key] 220 | var_cnx_type = dict_cnx['cnx_type'] 221 | # Temp table first 222 | var_write_works = 0 223 | if var_cnx_type == 'blank': 224 | var_write_works += 1 225 | elif var_cnx_type == 'sqlite3': 226 | cnx = sqlite3.connect(dict_cnx['file_path']) 227 | cursor = cnx.cursor() 228 | var_sql = (f"CREATE TEMP TABLE temp.{dict_cnx['table_name']} AS " 229 | f"SELECT * FROM {dict_cnx['table_name']} LIMIT 0;") 230 | module_logger.info(var_sql) 231 | cursor.execute(var_sql) 232 | cnx.commit() 233 | for idx in table.index.tolist(): 234 | var_sql = "INSERT INTO temp.{} VALUES ({});".format( 235 | dict_cnx['table_name'], 236 | ', '.join( 237 | table.loc[idx].map( 238 | lambda value: 'NULL' if pd.isnull(value) else 239 | f"'{str(value)}'" 240 | ).astype(str).values.tolist() 241 | ) 242 | ) 243 | if flag_sql_logging: 244 | module_logger.info(var_sql) 245 | cursor.execute(var_sql) 246 | cnx.commit() 247 | 248 | df_test = pd.read_sql( 249 | f"SELECT * FROM temp.{dict_cnx['table_name']}", cnx) 250 | 251 | if df_test.shape[0] == table.shape[0]: 252 | var_write_works += 1 253 | 254 | cnx.close() 255 | elif var_cnx_type == 'db': 256 | cnx = pyodbc.connect(dict_cnx['cnx_string']) 257 | cursor = cnx.cursor() 258 | 259 | var_sql = (f"DROP TABLE IF EXISTS #Temp " 260 | f"SELECT TOP(0) * INTO #Temp " 261 | f"FROM {dict_cnx['table_name']}") 262 | module_logger.info(var_sql) 263 | cursor.execute(var_sql) 264 | cnx.commit() 265 | 266 | var_sql_template = "INSERT INTO #Temp ([{}]) VALUES {}".format( 267 | "], 
[".join(table.columns.tolist()), 268 | '{}' 269 | ) 270 | module_logger.info(var_sql_template) 271 | s_sql_values = table.apply( 272 | lambda s: s.map( 273 | lambda x: func_to_sql(x, dict_cnx['timestamp_format'])) 274 | ).apply( 275 | lambda r: f"({', '.join(r)})", axis=1) 276 | var_iloc_min = 0 277 | for i in range(1, int(s_sql_values.shape[0] / batch_size) + 2): 278 | s_filtered = s_sql_values.iloc[ 279 | var_iloc_min:(i * batch_size)] 280 | var_sql = var_sql_template.format( 281 | ", ".join(s_filtered.values.tolist())) 282 | if flag_sql_logging: 283 | module_logger.info(var_sql) 284 | cursor.execute(var_sql) 285 | cnx.commit() 286 | var_iloc_min = i * batch_size 287 | 288 | df_test = pd.read_sql("SELECT * FROM #Temp", cnx) 289 | 290 | if df_test.shape[0] == table.shape[0]: 291 | var_write_works += 1 292 | 293 | cnx.close() 294 | 295 | if var_write_works == 0: 296 | var_msg = ('The writing to a temporary table has not worked, ' 297 | 'will not try writing to main table') 298 | module_logger.error(var_msg) 299 | raise ValueError(var_msg) 300 | if var_write_works > 1: 301 | var_msg = ('The writing to a temporary table has happened ' 302 | 'multiple times, will not try writing to main table') 303 | module_logger.error(var_msg) 304 | raise ValueError(var_msg) 305 | # Then move to the main table only if the temporary table worked 306 | if var_cnx_type == 'blank': 307 | pass 308 | elif var_cnx_type == 'sqlite3': 309 | cnx = sqlite3.connect(dict_cnx['file_path']) 310 | try: 311 | table.to_sql(dict_cnx['table_name'], cnx, 312 | index=False, if_exists='append') 313 | cnx.close() 314 | except: 315 | cnx.close() 316 | var_msg = 'Writing to the table has not worked' 317 | module_logger.error(var_msg) 318 | raise ValueError(var_msg) 319 | elif var_cnx_type == 'db': 320 | cnx = pyodbc.connect(dict_cnx['cnx_string']) 321 | cursor = cnx.cursor() 322 | try: 323 | var_sql_template = "INSERT INTO {} ([{}]) VALUES {}".format( 324 | dict_cnx['table_name'], 325 | "], [".join(table.columns.tolist()), 326 | '{}' 327 | ) 328 | s_sql_values = table.apply( 329 | lambda s: s.map( 330 | lambda x: func_to_sql(x, dict_cnx['timestamp_format'])) 331 | ).apply( 332 | lambda r: f"({', '.join(r)})", axis=1) 333 | var_iloc_min = 0 334 | for i in range(1, int(s_sql_values.shape[0] / batch_size) + 2): 335 | s_filtered = s_sql_values.iloc[ 336 | var_iloc_min:(i * batch_size)] 337 | var_sql = var_sql_template.format( 338 | ", ".join(s_filtered.values.tolist())) 339 | if flag_sql_logging: 340 | module_logger.info(var_sql) 341 | cursor.execute(var_sql) 342 | cnx.commit() 343 | var_iloc_min = i * batch_size 344 | cnx.close() 345 | except: 346 | cnx.close() 347 | var_msg = 'Writing to the table has not worked' 348 | module_logger.error(var_msg) 349 | raise ValueError(var_msg) 350 | 351 | module_logger.info("Completed `write_to_db`") 352 | 353 | def get_cnx_keys(self): 354 | module_logger.info("Starting `get_cnx_keys`") 355 | module_logger.info("Completed `get_cnx_keys`") 356 | return [x for x in self.__dict_cnx.keys()] 357 | -------------------------------------------------------------------------------- /data_etl/data_files.py: -------------------------------------------------------------------------------- 1 | # Here we are defining a class that will deal with all the data storage and 2 | # manipulations 3 | import logging 4 | 5 | import pandas as pd 6 | import numpy as np 7 | 8 | from data_etl.general_functions import import_attr 9 | 10 | module_logger = logging.getLogger(__name__) 11 | 12 | 13 | class DataCuration: 14 | 
__step_no = 0 15 | df_issues = None 16 | headers = None 17 | __key_1 = None 18 | __key_2 = None 19 | __key_3 = None 20 | __grouping = None 21 | tables = None 22 | formed_tables = None 23 | list_files = None 24 | __key_separator = " -:- " 25 | __link_headers = None 26 | 27 | def __init__(self, grouping, key_1, key_2=None, key_3=None): 28 | """ 29 | All data actions are taken on all tables, the aim is to process data to 30 | end up with a uniform data set that can be utilised and is consistent. 31 | 32 | The three arguments are individual identifiers for the data. 33 | 34 | The end form would be a pipeline that has regular data ingests. 35 | """ 36 | module_logger.info("Initialising `DataCuration` object") 37 | # Three keys, all good things come in threes 38 | self.__key_1 = str(key_1) 39 | self.__key_2 = str(key_2) 40 | self.__key_3 = str(key_3) 41 | self.__grouping = grouping 42 | # sub_file, e.g. sheet for a spreadsheet, may not always be applicable 43 | df_issues = pd.DataFrame( 44 | columns=[ 45 | "key_1", "key_2", "key_3", "file", "sub_file", "step_number", 46 | "category", "issue_short_desc", "issue_long_desc", "column", 47 | "issue_count", "issue_idx", "grouping" 48 | ] 49 | ) 50 | df_issues["step_number"] = df_issues["step_number"].astype(int) 51 | self.df_issues = df_issues 52 | self.tables = dict() 53 | self.formed_tables = dict() 54 | self.list_files = list() 55 | self.__link_headers = dict() 56 | module_logger.info("Initialising `DataCuration` object complete") 57 | 58 | def error_handling(self, file, subfile, issue_short_desc, issue_long_desc, 59 | column, issue_count, issue_idx, category=np.nan): 60 | """ 61 | If an error is handled, as they all should be, we need to specify what 62 | happens with the error. By putting it into a single function it will 63 | hopefully make the code briefer. 64 | """ 65 | module_logger.info("Logging an error with `error_handling`") 66 | df = self.df_issues.copy() 67 | list_vals = [ 68 | self.__key_1, self.__key_2, self.__key_3, file, subfile, 69 | self.__step_no, category, issue_short_desc, issue_long_desc, column, 70 | issue_count, issue_idx, self.__grouping 71 | ] 72 | try: 73 | df.loc[df.shape[0]] = list_vals 74 | self.df_issues = df.copy() 75 | except: 76 | var_msg = f"Logging the issue failed for values: {list_vals}" 77 | module_logger.error(var_msg) 78 | raise ValueError(var_msg) 79 | module_logger.info(f"Error logged: {list_vals}") 80 | 81 | def set_step_no(self, step_no): 82 | """ 83 | Set the step number, this allows errors to be recorded against a 84 | specific step which in turn can help with issue tracking and checking 85 | once issues are recorded. 86 | 87 | The argument step_no needs to be convertible to integer format. 88 | """ 89 | module_logger.info("Starting `set_step_no`") 90 | try: 91 | self.__step_no = int(step_no) 92 | except ValueError: 93 | var_msg = (f"Function set_step_no: The value {step_no} can not be " 94 | f"converted to int.") 95 | module_logger.error(var_msg) 96 | raise ValueError(var_msg) 97 | module_logger.info( 98 | f"Completed `set_step_no`, the step number is {self.__step_no}") 99 | 100 | def set_key_separator(self, separator): 101 | """ 102 | The key separator is used in the error handling section to split out the 103 | file and sub file portions of the dictionary keys of the files read in. 104 | 105 | So if you have a key of 'file name -:- sheet name', for tables read in 106 | from an Excel file, and an issue is found. 
The associated issues log 107 | entry will then have a file value of 'file name' and a sub file value of 108 | 'sheet name'. 109 | """ 110 | module_logger.info("Starting `set_key_separator`") 111 | if (type(separator).__name__ != "str") | (len(separator) == 0): 112 | var_msg = ("The argument `separator` for function " 113 | "`set_key_separator` should be a string of length " 114 | "greater than 0") 115 | module_logger.error(var_msg) 116 | raise ValueError(var_msg) 117 | self.__key_separator = separator 118 | module_logger.info(f"Completed `set_key_separator`, the key separator " 119 | f"is: {self.__key_separator}") 120 | 121 | def set_file_list(self, list_files, append=False): 122 | """ 123 | If there is a know list of files then define them here rather than 124 | setting a function to find the files. 125 | """ 126 | module_logger.info("Starting `set_file_list`") 127 | var_type = type(list_files).__name__ 128 | if (var_type != "list") & (var_type != "str"): 129 | var_msg = ("The type of the `list_files` argument is not a list or " 130 | "a string.") 131 | module_logger.error(var_msg) 132 | raise ValueError(var_msg) 133 | elif var_type == "str": 134 | if len(list_files) == 0: 135 | var_msg = ("The length of the `list_files` argument is 0, it " 136 | "needs to be a valid value.") 137 | module_logger.error(var_msg) 138 | raise ValueError(var_msg) 139 | list_files = [list_files] 140 | elif var_type == 'list': 141 | if len(list_files) == 0: 142 | var_msg = ("The length of the `list_files` argument is 0, it " 143 | "needs to be a valid value.") 144 | module_logger.error(var_msg) 145 | raise ValueError(var_msg) 146 | list_files = list_files 147 | else: 148 | var_msg = (f"Unhandled type for function `set_file_list`: " 149 | f"{var_type}") 150 | module_logger.error(var_msg) 151 | raise ValueError(var_msg) 152 | 153 | if append: 154 | self.list_files += list_files 155 | else: 156 | self.list_files = list_files 157 | module_logger.info(f"Completed `set_file_list`, the list of files is: " 158 | f"{self.list_files}") 159 | 160 | def find_files(self, path=None, script_name=None, 161 | func_name="list_the_files", function=None, files_path='.', 162 | append=False, **kwargs): 163 | """ 164 | Using an externally defined function, as specified in the module 165 | argument script, acquire a list of files to be read in. 166 | 167 | In the case that we want to accumulate a list of files from different 168 | main paths there is an append option. 169 | """ 170 | module_logger.info("Starting `find_files`") 171 | # TODO move this to an internal function as it's used so often! 
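        # Resolve the file-listing function: import it from the named script when `script_name`
        # is given, otherwise fall back to the callable passed in via `function`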
172 | if script_name is not None: 173 | function = import_attr(path, script_name, func_name) 174 | elif function is not None: 175 | if type(function).__name__ != "function": 176 | var_msg = "The `function` argument needs to be a function" 177 | module_logger.error(var_msg) 178 | raise ValueError(var_msg) 179 | else: 180 | var_msg = ("One of `script_name` or `function` needs to be not " 181 | "None in the function `find_files`") 182 | module_logger.error(var_msg) 183 | raise ValueError(var_msg) 184 | list_files = function(files_path, **kwargs) 185 | # TODO move these to be calls on the self.set_file_list function instead 186 | # of setting the value here 187 | if append: 188 | self.list_files += list_files 189 | else: 190 | self.list_files = list_files 191 | module_logger.info( 192 | f"Completed `find_files`, the list of files is: {self.list_files}") 193 | 194 | def reading_in(self, path=None, script_name=None, func_name="read_files", 195 | function=None, overwrite=True, **kwargs): 196 | """ 197 | Using an externally defined reading in function, and the internally 198 | defined list of files, read in each of the tables required. 199 | 200 | `path` being the relative script file path 201 | """ 202 | module_logger.info("Starting `reading_in`") 203 | if type(self.tables).__name__ != "dict": 204 | var_msg = ("The tables need to be in dictionary format for this " 205 | "`self.reading_in` step") 206 | module_logger.error(var_msg) 207 | raise ValueError(var_msg) 208 | if function is not None: 209 | if type(function).__name__ != "function": 210 | var_msg = ("The function passed to `self.reading_in` is not a " 211 | "function.") 212 | module_logger.error(var_msg) 213 | raise ValueError(var_msg) 214 | elif script_name is not None: 215 | function = import_attr(path, script_name, func_name) 216 | else: 217 | var_msg = ("One of the `function` or `script_name` arguments needs " 218 | "to be completed. 
And if `script name is then `path` " 219 | "needs to be too.") 220 | module_logger.error(var_msg) 221 | raise ValueError(var_msg) 222 | 223 | try: 224 | dfs = function(self.list_files, **kwargs) 225 | except AttributeError: 226 | if len([x for x in kwargs.keys()]) > 0: 227 | var_msg = (f"Function reading_in, kwargs may have been passed " 228 | f"when the function {func_name} in the script " 229 | f"{script_name} does not take kwargs") 230 | else: 231 | var_msg = (f"Function reading in: The {func_name} function " 232 | f"does not exist in the {script_name} script.") 233 | module_logger.error(var_msg) 234 | raise AttributeError(var_msg) 235 | if overwrite is False: 236 | df_org = self.tables.copy() 237 | df_org.update(dfs) 238 | elif overwrite is True: 239 | pass 240 | else: 241 | var_msg = ("The attribute `overwrite` in the function " 242 | "`reading_in` needs to be `True` or `False`") 243 | module_logger.error(var_msg) 244 | raise ValueError(var_msg) 245 | self.set_table(dfs, overwrite=overwrite) 246 | if type(dfs).__name__ == "DataFrame": 247 | module_logger.info(f"The table has shape '{dfs.shape}'") 248 | else: 249 | for key in dfs: 250 | module_logger.info( 251 | f"The table with key '{key}' has shape '{dfs[key].shape}'") 252 | 253 | module_logger.info("Completed `reading_in`") 254 | 255 | def set_table(self, tables, dict_key=None, overwrite=True): 256 | """ 257 | If self.tables is a dictionary set df to key else overwrite existing 258 | table if argument is True 259 | """ 260 | module_logger.info("Starting `set_table`") 261 | if (overwrite is True) & (dict_key is None): 262 | self.tables = tables 263 | elif ( 264 | (overwrite is True) & 265 | (dict_key is not None) & 266 | (type(self.tables).__name__ == 'dict') & 267 | (type(tables).__name__ == 'DataFrame') 268 | ): 269 | self.tables[dict_key] = tables 270 | elif ( 271 | (overwrite is False) & 272 | (dict_key is not None) & 273 | (type(self.tables).__name__ == 'dict') & 274 | (type(tables).__name__ == 'DataFrame') 275 | ): 276 | if dict_key not in [key for key in self.tables.keys()]: 277 | self.tables[dict_key] = tables 278 | else: 279 | var_msg = ( 280 | f'The combination of attributes has resulted in no change: ' 281 | f'`self.tables` type - {type(self.tables).__name__}, ' 282 | f'`tables` type - {type(tables).__name__}, `dict_key` - ' 283 | f'{dict_key}, `overwrite` - {overwrite}') 284 | module_logger.error(var_msg) 285 | raise AttributeError(var_msg) 286 | else: 287 | var_msg = ( 288 | f'The combination of attributes has resulted in no change: ' 289 | f'`self.tables` type - {type(self.tables).__name__}, `tables` ' 290 | f'type - {type(tables).__name__}, `dict_key` - {dict_key}, ' 291 | f'`overwrite` - {overwrite}') 292 | module_logger.error(var_msg) 293 | raise AttributeError(var_msg) 294 | module_logger.info("Completed `set_table`") 295 | 296 | def concatenate_tables(self): 297 | """ 298 | Where the tables are in a dictionary format put them into a DataFrame 299 | """ 300 | module_logger.info("Starting `concatenate_tables`") 301 | if type(self.tables).__name__ != "dict": 302 | var_msg = ("For the function `concatenate_tables` the `tables` " 303 | "should be in dictionary format") 304 | module_logger.error(var_msg) 305 | raise ValueError(var_msg) 306 | if len([key for key in self.tables.keys()]) > 1: 307 | df = pd.concat(self.tables, axis=1) 308 | elif len([key for key in self.tables.keys()]) == 1: 309 | dict_df = self.tables.copy() 310 | dict_key = [key for key in dict_df.keys()][0] 311 | df = dict_df[dict_key].copy() 312 | 
df['level_0'] = dict_key 313 | else: 314 | var_msg = "The dictionary `self.tables` is empty" 315 | module_logger.error(var_msg) 316 | raise AttributeError(var_msg) 317 | self.set_table(df, overwrite=True) 318 | module_logger.info("Completed `concatenate_tables`") 319 | 320 | def dictionary_tables(self, key=None): 321 | """ 322 | Where the tables are in a DataFrame format put them in a dictionary, 323 | using the values in the key column as the new dictionary keys 324 | """ 325 | module_logger.info("Starting `dictionary_tables`") 326 | if type(self.tables).__name__ != "DataFrame": 327 | var_msg = ("For the function `dictionary_tables` the `tables` " 328 | "should be in DataFrame format.") 329 | module_logger.error(var_msg) 330 | raise ValueError(var_msg) 331 | df = self.tables 332 | dict_dfs = dict() 333 | 334 | if key is not None: 335 | var_cycle = key 336 | else: 337 | var_cycle = "level_0" 338 | if var_cycle not in self.tables.columns.tolist(): 339 | var_msg = f"There is no {var_cycle} column present in the table" 340 | module_logger.error(var_msg) 341 | raise ValueError(var_msg) 342 | for val in df[var_cycle].unique().tolist(): 343 | dict_dfs[val] = df.loc[df[var_cycle] == val].copy() 344 | self.set_table(dict_dfs) 345 | 346 | module_logger.info("Completed `dictionary_tables`") 347 | 348 | def set_comparison_headers( 349 | self, path=None, script_name=None, func_name="read_headers", 350 | function=None, dictionary=None, **kwargs): 351 | # TODO Need to see if we can isolate just a set of new tables? Maybe 352 | # have a list of dictionary keys that have had their headers done 353 | # already? 354 | module_logger.info("Starting `set_comparison_headers`") 355 | 356 | if function is not None: 357 | if type(function).__name__ != "function": 358 | var_msg = ("The function passed to " 359 | "`self.set_comparison_headers` is not a function.") 360 | module_logger.error(var_msg) 361 | raise ValueError(var_msg) 362 | elif script_name is not None: 363 | function = import_attr(path, script_name, func_name) 364 | elif dictionary is not None: 365 | def function(**kwargs): return dictionary 366 | else: 367 | var_msg = ("One of the `function` or `script_name` arguments needs " 368 | "to be completed. 
And if `script name is then `path` " 369 | "needs to be too.") 370 | module_logger.error(var_msg) 371 | raise ValueError(var_msg) 372 | 373 | try: 374 | dict_headers = function(**kwargs) 375 | except AttributeError: 376 | if len([x for x in kwargs.keys()]) > 0: 377 | var_msg = ( 378 | f"Function set_comparison_headers, kwargs may have been " 379 | f"passed when the function {func_name} in the script " 380 | f"{script_name} does not take kwargs") 381 | else: 382 | var_msg = ( 383 | f"Function set_comparison_headers: The {func_name} function" 384 | f" does not exist in the {script_name} script.") 385 | module_logger.error(var_msg) 386 | raise AttributeError(var_msg) 387 | 388 | if type(dict_headers).__name__ != 'dict': 389 | var_msg = 'The headers output should be a dictionary' 390 | module_logger.error(var_msg) 391 | raise Exception(var_msg) 392 | list_keys = [ 393 | key for key in dict_headers.keys() if key != 'ideal_headers'] 394 | list_keys = [ 395 | key for key in list_keys if 396 | (dict_headers[key].get('expected_headers') is None) | 397 | (dict_headers[key].get('new_headers') is None) | 398 | (dict_headers[key].get('remove') is None) 399 | ] 400 | if len(list_keys) > 0: 401 | var_msg = ( 402 | f'There are dictionary keys that do not have all the required ' 403 | f'values: {", ".join([str(key) for key in list_keys])}') 404 | module_logger.error(var_msg) 405 | raise Exception(var_msg) 406 | if dict_headers.get('ideal_headers') is None: 407 | var_msg = ('There needs to be a key to the headers dictionary that' 408 | ' is "ideal_headers"') 409 | module_logger.error(var_msg) 410 | raise Exception(var_msg) 411 | if type(dict_headers.get('ideal_headers')).__name__ != 'list': 412 | var_msg = 'The value of key "ideal_headers" needs to be a list' 413 | module_logger.error(var_msg) 414 | raise Exception(var_msg) 415 | 416 | self.headers = dict(dict_headers) 417 | 418 | module_logger.info( 419 | f"There are {len(dict_headers)} header keys and they are: " 420 | f"{', '.join([key for key in dict_headers.keys()])}") 421 | 422 | module_logger.info("Completed `set_comparison_headers`") 423 | 424 | @staticmethod 425 | def _link_headers(tables, headers, **kwargs): 426 | dict_link = dict() 427 | list_headers_keys = [ 428 | key for key in headers.keys() if key != 'ideal_headers'] 429 | if type(tables).__name__ == 'dict': 430 | for df_key in [key for key in tables.keys()]: 431 | for header_set in list_headers_keys: 432 | list_expected = headers[header_set]['expected_headers'] 433 | if list_expected == tables[ 434 | df_key].iloc[:len(list_expected)].values.tolist()[0]: 435 | dict_link[df_key] = header_set 436 | break 437 | else: 438 | for header_set in list_headers_keys: 439 | list_expected = headers[header_set]['expected_headers'] 440 | if list_expected == tables.iloc[ 441 | :len(list_expected)].values.tolist()[0]: 442 | dict_link['combined'] = header_set 443 | break 444 | return dict_link 445 | 446 | def link_headers(self, path=None, script_name=None, 447 | func_name="link_headers", function=None, **kwargs): 448 | # TODO Need to see if we can isolate just a set of new tables? Maybe 449 | # have a list of dictionary keys that have had their headers 450 | # done already? 
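        # Decide how to match header sets to tables: use the supplied `function`, or import one
        # from `script_name`, otherwise fall back to the default matcher `self._link_headers`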
451 | module_logger.info("Starting `link_headers`") 452 | 453 | if function is not None: 454 | if type(function).__name__ != "function": 455 | var_msg = ("The function passed to `self.link_headers` is " 456 | "not a function.") 457 | module_logger.error(var_msg) 458 | raise ValueError(var_msg) 459 | elif script_name is not None: 460 | function = import_attr(path, script_name, func_name) 461 | else: 462 | function = self._link_headers 463 | 464 | try: 465 | dict_link = function(self.tables, self.headers, **kwargs) 466 | except AttributeError: 467 | if len([x for x in kwargs.keys()]) > 0: 468 | var_msg = ( 469 | f"Function link_headers, kwargs may have been passed when " 470 | f"the function {func_name} in the script {script_name} does" 471 | f" not take kwargs") 472 | else: 473 | var_msg = (f"Function link_headers: The {func_name} function " 474 | f"does not exist in the {script_name} script.") 475 | module_logger.error(var_msg) 476 | raise AttributeError(var_msg) 477 | 478 | list_unallocated_keys = set(self.tables.keys()) - set(dict_link.keys()) 479 | if len(list_unallocated_keys) != 0: 480 | var_msg = (f"Not all the headers are linked, the unlinked tables " 481 | f"are: {list_unallocated_keys}") 482 | module_logger.error(var_msg) 483 | raise ValueError(var_msg) 484 | 485 | self.__link_headers = dict(dict_link) 486 | 487 | module_logger.info("Completed `link_headers`") 488 | 489 | @staticmethod 490 | def __assert_linked_headers( 491 | list_ideal_headers, dict_header, df, remove_header_rows, reset_index): 492 | list_expected_headers = dict_header['expected_headers'] 493 | list_new_names = dict_header['new_headers'] 494 | list_remove = [ 495 | dict_header['new_headers'][i] for i in range(len(dict_header['remove'])) 496 | if dict_header['remove'][i] == 'remove' 497 | ] 498 | 499 | # Remove the expected headers rows 500 | if remove_header_rows: 501 | df.drop( 502 | [i for i in range(len(list_expected_headers))], 503 | axis=0, 504 | inplace=True) 505 | if reset_index: 506 | df.reset_index(drop=True, inplace=True) 507 | 508 | # Set the new headers 509 | df.columns = list_new_names 510 | 511 | # Remove the columns to remove 512 | if len(list_remove) > 0: 513 | df.drop(list_remove, axis=1, inplace=True) 514 | 515 | # Fill in missing columns and reorder columns 516 | list_df_cols = df.columns.tolist() 517 | list_cols = [ 518 | col for col in list_ideal_headers if col not in list_df_cols] 519 | for col in list_cols: 520 | df[col] = np.nan 521 | 522 | df = df[list_ideal_headers].copy() 523 | 524 | return df 525 | 526 | def assert_linked_headers( 527 | self, remove_header_rows=False, reset_index=False): 528 | module_logger.info("Starting `assert_linked_headers`") 529 | 530 | if type(self.tables).__name__ == 'dict': 531 | dict_dfs = dict(self.tables) 532 | for key in [key for key in self.__link_headers.keys()]: 533 | dict_dfs[key] = self.__assert_linked_headers( 534 | self.headers['ideal_headers'], 535 | self.headers[self.__link_headers[key]], 536 | dict_dfs[key], 537 | remove_header_rows, 538 | reset_index 539 | ) 540 | self.set_table(dict(dict_dfs)) 541 | else: 542 | key = [key for key in self.__link_headers.keys()][0] 543 | df = self.__assert_linked_headers( 544 | self.headers['ideal_headers'], 545 | self.headers[self.__link_headers[key]], 546 | self.tables, 547 | remove_header_rows, 548 | reset_index 549 | ) 550 | self.set_table(df.copy()) 551 | 552 | module_logger.info("Completed `assert_linked_headers`") 553 | 554 | def set_headers( 555 | self, path=None, script_name=None, func_name=None, 
list_cols=None, 556 | function=None, ideal_headers=None, required_headers=None): 557 | module_logger.info("Starting `set_headers`") 558 | if list_cols is not None: 559 | if type(list_cols).__name__ != "list": 560 | var_msg = ("The argument `list_cols` of function `set_headers` " 561 | "needs to be a list") 562 | module_logger.error(var_msg) 563 | raise ValueError(var_msg) 564 | elif function is not None: 565 | if type(function).__name__ != "function": 566 | var_msg = ("The argument `function` of function `set_headers` " 567 | "needs to be a function") 568 | module_logger.error(var_msg) 569 | raise ValueError(var_msg) 570 | elif script_name is not None: 571 | function = import_attr(path, script_name, func_name) 572 | elif ideal_headers is not None: 573 | if type(ideal_headers).__name__ != 'list': 574 | var_msg = ("The argument `ideal_headers` of function " 575 | "`set_headers` needs to be a list") 576 | module_logger.error(var_msg) 577 | raise ValueError(var_msg) 578 | elif required_headers is not None: 579 | if type(required_headers).__name__ != 'list': 580 | var_msg = ("The argument `required_headers` of function " 581 | "`set_headers` needs to be a list") 582 | module_logger.error(var_msg) 583 | raise ValueError(var_msg) 584 | var_type = type(self.tables).__name__ 585 | if var_type == "dict": 586 | dict_dfs = self.tables.copy() 587 | var_cond = len( 588 | set([dict_dfs[key].shape[1] for key in dict_dfs.keys()])) 589 | var_cond = var_cond != 1 590 | if var_cond: 591 | var_msg = ("There are an inconsistent number of columns " 592 | "present in the dictionary of tables") 593 | module_logger.error(var_msg) 594 | raise ValueError(var_msg) 595 | if list_cols is not None: 596 | if (len(list_cols) != 597 | dict_dfs[[x for x in dict_dfs.keys()][0]].shape[1]): 598 | var_msg = ("The length of `list_cols` is different to the " 599 | "number of columns present in the table") 600 | module_logger.error(var_msg) 601 | raise ValueError(var_msg) 602 | elif function is not None: 603 | list_cols_org = dict_dfs[ 604 | [x for x in dict_dfs.keys()][0] 605 | ].columns.tolist() 606 | list_cols = [function(x) for x in list_cols_org] 607 | for key in dict_dfs.keys(): 608 | if list_cols is not None: 609 | dict_dfs[key].columns = list_cols 610 | elif function is not None: 611 | dict_dfs[key].columns = list_cols 612 | elif ideal_headers is not None: 613 | for col in [ 614 | col for col in ideal_headers if 615 | col not in dict_dfs[key].columns.tolist() 616 | ]: 617 | dict_dfs[key][col] = np.nan 618 | dict_dfs[key] = dict_dfs[key][ideal_headers].copy() 619 | elif required_headers is not None: 620 | for col in [ 621 | col for col in required_headers if 622 | col not in dict_dfs[key].columns.tolist() 623 | ]: 624 | dict_dfs[key][col] = np.nan 625 | self.set_table(dict_dfs, overwrite=True) 626 | elif var_type == "DataFrame": 627 | if len(list_cols) != self.tables.shape[1]: 628 | var_msg = ("The length of `list_cols` is different to the " 629 | "number of columns present in the table") 630 | module_logger.error(var_msg) 631 | raise ValueError(var_msg) 632 | df = self.tables.copy() 633 | if list_cols is not None: 634 | df.columns = list_cols 635 | elif function is not None: 636 | df.columns = [function(x) for x in df.columns.tolist()] 637 | elif ideal_headers is not None: 638 | for col in [ 639 | col for col in ideal_headers if 640 | col not in df.columns.tolist() 641 | ]: 642 | df[col] = np.nan 643 | df = df[ideal_headers].copy() 644 | elif required_headers is not None: 645 | for col in [ 646 | col for col in 
required_headers if 647 | col not in df.columns.tolist() 648 | ]: 649 | df[col] = np.nan 650 | self.set_table(df, overwrite=True) 651 | else: 652 | var_msg = ("Somehow the tables are not a dictionary or a DataFrame " 653 | "for function `set_headers`") 654 | module_logger.error(var_msg) 655 | raise ValueError(var_msg) 656 | 657 | module_logger.info("Completed `set_headers`") 658 | 659 | def alter_tables(self, path=None, script_name=None, 660 | object_name="dict_alter", dictionary=None, **kwargs): 661 | """ 662 | Use this functionality to make alterations to the table(s) 663 | """ 664 | module_logger.info("Starting `alter_tables`") 665 | # TODO move this check to own function (applies to convert_columns too) 666 | if (script_name is not None) & (object_name is not None): 667 | dict_alter = import_attr(path, script_name, object_name) 668 | elif dictionary is not None: 669 | if type(dictionary).__name__ != "dict": 670 | var_msg = "The `dictionary` argument is not a dictionary" 671 | module_logger.error(var_msg) 672 | raise ValueError(var_msg) 673 | dict_alter = dictionary 674 | else: 675 | var_msg = ("Either `dictionary` or both of `script_name` and " 676 | "`path` need to be none null") 677 | module_logger.error(var_msg) 678 | raise ValueError(var_msg) 679 | 680 | if type(self.tables).__name__ == "DataFrame": 681 | df = self.tables.copy() 682 | df_new = self.__alter_cols( 683 | df, dict_alter, [self.__key_1, self.__key_2, self.__key_3], 684 | np.nan, **kwargs) 685 | self.set_table(df_new) 686 | elif type(self.tables).__name__ == "dict": 687 | dfs = self.tables 688 | for key in self.tables.keys(): 689 | df = dfs[key].copy() 690 | df_new = self.__alter_cols( 691 | df, dict_alter, [self.__key_1, self.__key_2, self.__key_3], 692 | key, **kwargs) 693 | self.set_table(df_new, key) 694 | else: 695 | var_msg = ("The tables are in neither a DataFrame or dictionary " 696 | "format, which means something is seriously wrong...") 697 | module_logger.error(var_msg) 698 | raise ValueError(var_msg) 699 | 700 | module_logger.info("Completed `alter_tables`") 701 | 702 | def __alter_cols(self, df, dict_alter, keys, dict_key, **kwargs): 703 | module_logger.info("Starting `__alter_cols`") 704 | if pd.isnull(dict_key): 705 | var_file = np.nan 706 | var_subfile = np.nan 707 | else: 708 | var_file = dict_key.split(self.__key_separator)[0] 709 | var_subfile = (dict_key.split(self.__key_separator)[1] if 710 | self.__key_separator in dict_key else np.nan) 711 | for alter_key in dict_alter.keys(): 712 | var_type = dict_alter[alter_key]["type"] 713 | function = dict_alter[alter_key]["function"] 714 | if var_type == "new_col": 715 | var_col_name = dict_alter[alter_key]["col_name"] 716 | if var_col_name in df.columns.tolist(): 717 | var_msg = ( 718 | f"The column {var_col_name} is present in the " 719 | f"table so should not be overwritten") 720 | module_logger.error(var_msg) 721 | self.error_handling(var_file, var_subfile, "", var_msg, 722 | var_col_name, np.nan, np.nan) 723 | continue 724 | try: 725 | s = function(df, keys, **kwargs) 726 | df[var_col_name] = s 727 | except KeyError: 728 | var_msg = ( 729 | f"For type new_col the function for alter_key " 730 | f"{alter_key} has not worked with a KeyError") 731 | module_logger.error(var_msg) 732 | self.error_handling(var_file, var_subfile, "", var_msg, 733 | var_col_name, np.nan, np.nan) 734 | continue 735 | except: 736 | var_msg = (f"For type new_col the function for " 737 | f"alter_key {alter_key} has not worked") 738 | module_logger.error(var_msg) 739 | 740 | var_idx = 
np.nan 741 | var_issue_count = np.nan 742 | if "idx_function" in dict_alter[alter_key]: 743 | func_idx = dict_alter[alter_key]['idx_function'] 744 | if type(func_idx).__name__ != 'function': 745 | var_msg = '' 746 | module_logger.error(var_msg) 747 | s_idx = func_idx(df, keys, **kwargs) 748 | var_idx = ', '.join( 749 | [ 750 | str(item) for item in 751 | s_idx.loc[s_idx].index.tolist() 752 | ] 753 | ) 754 | var_issue_count = s_idx.sum() 755 | self.error_handling(var_file, var_subfile, "", var_msg, 756 | var_col_name, var_issue_count, var_idx) 757 | continue 758 | elif var_type == "map_df": 759 | try: 760 | df = function(df, keys, **kwargs) 761 | except: 762 | var_msg = (f"For type map_df the function for " 763 | f"alter_key {alter_key} has not worked") 764 | module_logger.error(var_msg) 765 | 766 | var_idx = np.nan 767 | var_issue_count = np.nan 768 | if "idx_function" in dict_alter[alter_key]: 769 | func_idx = dict_alter[alter_key]['idx_function'] 770 | if type(func_idx).__name__ != 'function': 771 | var_msg = '' 772 | module_logger.error(var_msg) 773 | s_idx = func_idx(df, keys, **kwargs) 774 | var_idx = ', '.join( 775 | [ 776 | str(item) for item in 777 | s_idx.loc[s_idx].index.tolist() 778 | ] 779 | ) 780 | var_issue_count = s_idx.sum() 781 | self.error_handling(var_file, var_subfile, "", var_msg, 782 | np.nan, var_issue_count, var_idx) 783 | continue 784 | 785 | module_logger.info("Completed `__alter_cols`") 786 | return df 787 | 788 | def convert_columns(self, path=None, script_name=None, 789 | object_name="dict_convert", dictionary=None, **kwargs): 790 | module_logger.info("Starting `convert_columns`") 791 | if (script_name is not None) & (object_name is not None): 792 | dict_convert = import_attr(path, script_name, object_name) 793 | elif dictionary is not None: 794 | if type(dictionary).__name__ != "dict": 795 | var_msg = "The `dictionary` argument is not a dictionary" 796 | module_logger.error(var_msg) 797 | raise ValueError(var_msg) 798 | dict_convert = dictionary 799 | else: 800 | var_msg = ("Either `dictionary` or both of `script_name` and " 801 | "`path` need to be none null") 802 | module_logger.error(var_msg) 803 | raise ValueError(var_msg) 804 | 805 | if type(self.tables).__name__ == "DataFrame": 806 | df = self.tables.copy() 807 | df_new = self.__convert_col(df, dict_convert, "", **kwargs) 808 | self.set_table(df_new, overwrite=True) 809 | elif type(self.tables).__name__ == "dict": 810 | dfs = self.tables 811 | for key in self.tables.keys(): 812 | df = dfs[key].copy() 813 | df_new = self.__convert_col(df, dict_convert, key, **kwargs) 814 | dfs[key] = df_new.copy() 815 | self.set_table(dfs, overwrite=True) 816 | else: 817 | var_msg = ("The tables are in neither a DataFrame or dictionary " 818 | "format, which means something is seriously wrong...") 819 | module_logger.error(var_msg) 820 | raise ValueError(var_msg) 821 | 822 | module_logger.info("Completed `convert_columns`") 823 | 824 | def __convert_col(self, df, dict_convert, dict_key, **kwargs): 825 | module_logger.info("Starting `__convert_col`") 826 | for convert_key in dict_convert.keys(): 827 | cols = dict_convert[convert_key]["columns"] 828 | if type(cols).__name__ == 'function': 829 | cols = cols(df, **kwargs) 830 | list_cols = list(cols) 831 | list_stops = dict_convert[convert_key]["dtypes"] 832 | dict_functions = dict_convert[convert_key]["functions"] 833 | for col in list_cols: 834 | if col not in df.columns.tolist(): 835 | var_msg = f"The column {col} is not present" 836 | module_logger.error(var_msg) 837 | 
raise ValueError(var_msg) 838 | dtype_flag = 0 839 | var_dtype = df[col].dtype.name 840 | for dtype in list_stops: 841 | if dtype in var_dtype: 842 | dtype_flag = 1 843 | break 844 | if dtype_flag == 1: 845 | continue 846 | converted_flag = 0 847 | for key in dict_functions.keys(): 848 | func_use = dict_functions[key] 849 | if type(func_use).__name__ != "function": 850 | var_msg = (f"The function for converting is not a " 851 | f"function! For keys {convert_key}, {key}") 852 | module_logger.error(var_msg) 853 | raise ValueError(var_msg) 854 | try: 855 | s = func_use(df, col, **kwargs) 856 | df[col] = s.copy() 857 | converted_flag = 1 858 | break 859 | except: 860 | var_msg = (f"The conversion failed for keys " 861 | f"{convert_key}, {key}, trying next") 862 | module_logger.warning(var_msg) 863 | continue 864 | if converted_flag == 0: 865 | var_idx = np.nan 866 | var_issue_count = np.nan 867 | if "idx_function" in dict_convert[convert_key]: 868 | func_idx = dict_convert[convert_key]['idx_function'] 869 | if type(func_idx).__name__ != 'function': 870 | var_msg = ( 871 | f'The `idx_function` argument is not a function' 872 | f' it is a {type(func_idx).__name__}') 873 | module_logger.error(var_msg) 874 | raise ValueError(var_msg) 875 | s_idx = func_idx(df, col, **kwargs) 876 | var_idx = ', '.join( 877 | [ 878 | str(item) for item in 879 | s_idx.loc[s_idx].index.tolist() 880 | ] 881 | ) 882 | var_issue_count = s_idx.sum() 883 | var_msg = (f"The conversion for column {col} for " 884 | f"convert_key {convert_key} failed.") 885 | module_logger.error(var_msg) 886 | self.error_handling( 887 | dict_key.split(self.__key_separator)[0], 888 | (dict_key.split(self.__key_separator)[1] if 889 | self.__key_separator in dict_key else np.nan), 890 | "", 891 | f"The conversion failed to format {convert_key}", 892 | col, 893 | var_issue_count, 894 | var_idx 895 | ) 896 | 897 | module_logger.info("Completed `__convert_col`") 898 | return df 899 | 900 | def assert_nulls(self, list_nulls=None, list_exclude_cols=None): 901 | module_logger.info("Starting `assert_nulls`") 902 | if list_nulls is None: 903 | list_nulls_use = ["nan", ""] 904 | else: 905 | list_nulls_use = list_nulls 906 | if list_exclude_cols is None: 907 | list_exclude_cols_use = [] 908 | else: 909 | list_exclude_cols_use = list_exclude_cols 910 | module_logger.info(f"The nulls being used are: {list_nulls_use}") 911 | module_logger.info( 912 | f"The columns being excluded are: {list_exclude_cols_use}") 913 | df = self.tables.copy() 914 | if type(df).__name__ == "dict": 915 | list_keys = [x for x in df.keys()] 916 | for key in list_keys: 917 | for null in list_nulls_use: 918 | if len(list_exclude_cols_use) == 0: 919 | df[key] = df[key].replace(null, np.nan) 920 | else: 921 | for col in [ 922 | col for col in df[key].columns.tolist() if 923 | col not in list_exclude_cols_use 924 | ]: 925 | df[key][col] = df[key][col].replace(null, np.nan) 926 | else: 927 | for null in list_nulls_use: 928 | if len(list_exclude_cols_use) == 0: 929 | df = df.replace(null, np.nan) 930 | else: 931 | for col in [ 932 | col for col in df.columns.tolist() if 933 | col not in list_exclude_cols_use 934 | ]: 935 | df[col] = df[col].replace(null, np.nan) 936 | self.set_table(df, overwrite=True) 937 | module_logger.info("Completed `assert_nulls`") 938 | 939 | def get_issue_count(self, issue_number_min=None, issue_number_max=None): 940 | module_logger.info("Starting `get_issue_count`") 941 | df = self.df_issues.copy() 942 | if issue_number_min is not None: 943 | df = 
df.loc[df["step_number"] >= issue_number_min].copy() 944 | if issue_number_max is not None: 945 | df = df.loc[df["step_number"] <= issue_number_max].copy() 946 | var_count = df.shape[0] 947 | module_logger.info("Completed `get_issue_count`") 948 | return var_count 949 | 950 | def form_summary_tables(self, path=None, script_name=None, 951 | func_name="form_tables", function=None, **kwargs): 952 | """ 953 | Use a function to create summaries off the main table set. 954 | 955 | The function is passed the arguments: 956 | self.tables, self.formed_tables, self.__grouping, self.__key_1, 957 | self.__key_2, self.__key_3, self.__key_separator, **kwargs 958 | """ 959 | module_logger.info("Starting `form_summary_tables`") 960 | 961 | if function is not None: 962 | if type(function).__name__ != "function": 963 | var_msg = ("The function passed to `self.form_summary_tables` " 964 | "is not a function.") 965 | module_logger.error(var_msg) 966 | raise ValueError(var_msg) 967 | elif script_name is not None: 968 | function = import_attr(path, script_name, func_name) 969 | else: 970 | var_msg = ("One of the `function` or `script_name` arguments needs " 971 | "to be completed. And if `script name is then `path` " 972 | "needs to be too.") 973 | module_logger.error(var_msg) 974 | raise ValueError(var_msg) 975 | 976 | dict_formed_tables = function( 977 | self.tables, self.formed_tables, self.__grouping, self.__key_1, 978 | self.__key_2, self.__key_3, self.__key_separator, **kwargs) 979 | if type(dict_formed_tables).__name__ != 'dict': 980 | var_msg = ('The output of the function for `form_summary_table` ' 981 | 'is not a dictionary and it needs to be') 982 | module_logger.error(var_msg) 983 | raise ValueError(var_msg) 984 | self.formed_tables = dict_formed_tables 985 | 986 | module_logger.info("Completed `form_summary_tables`") 987 | 988 | def get_step_no(self): 989 | module_logger.info("Starting `get_step_no`") 990 | module_logger.info("Completed `get_step_no`") 991 | return self.__step_no 992 | 993 | def _repr_html_(self): 994 | module_logger.info("Starting `_repr__html_`") 995 | var_key_3 = "" if self.__key_3 == "None" else self.__key_3 996 | var_out_keys = f""" 997 | 998 | 999 | 1000 | 1001 | 1002 | 1003 | 1004 | 1005 | 1006 | 1007 | 1008 | 1009 | 1010 | 1011 | 1012 | 1013 | 1014 |
                <tr><td>Grouping</td><td>{self.__grouping}</td></tr>
                <tr><td>Key 1</td><td>{self.__key_1}</td></tr>
                <tr><td>Key 2</td><td>{self.__key_2}</td></tr>
                <tr><td>Key 3</td><td>{var_key_3}</td></tr>
1015 | """ 1016 | if type(self.tables).__name__ == 'dict': 1017 | var_out_tbl_info = """ 1018 | 1019 | 1020 | 1021 | 1022 | 1023 | 1024 | 1025 | 1026 | {} 1027 |
                <tr><th>Dictionary key</th><th>Dataframe shape</th><th>Count numeric columns</th><th>Count date columns</th><th>Count object columns</th></tr>
1028 | """ 1029 | for key in [key for key in self.tables.keys()]: 1030 | var_out_tbl_info = var_out_tbl_info.replace( 1031 | '{}', 1032 | f""" 1033 | 1034 | {key} 1035 | {self.tables[key].shape} 1036 | {self.tables[key].select_dtypes(include=[np.number]).shape[1]} 1037 | {self.tables[key].select_dtypes(include=[np.datetime64, np.timedelta64]).shape[1]} 1038 | {self.tables[key].select_dtypes(exclude=[np.number, np.datetime64, np.timedelta64]).shape[1]} 1039 | 1040 | {{}} 1041 | """ 1042 | ) 1043 | var_out_tbl_info = var_out_tbl_info.replace('{}', '') 1044 | else: 1045 | var_out_tbl_info = f""" 1046 | 1047 | 1048 | 1049 | 1050 | 1051 | 1052 | 1053 | 1054 | 1055 | 1056 | 1057 | 1058 | 1059 |
                <tr><th>Dataframe shape</th><th>Count numeric columns</th><th>Count date columns</th><th>Count object columns</th></tr>
                <tr><td>{self.tables.shape}</td><td>{self.tables.select_dtypes(include=[np.number]).shape[1]}</td><td>{self.tables.select_dtypes(include=[np.datetime64, np.timedelta64]).shape[1]}</td><td>{self.tables.select_dtypes(exclude=[np.number, np.datetime64, np.timedelta64]).shape[1]}</td></tr>
1060 | """ 1061 | var_out_issues = """ 1062 | """ 1063 | var_out = f"{var_out_keys}

{var_out_tbl_info}

{var_out_issues}" 1064 | module_logger.info("Completed `_repr_html_`") 1065 | return var_out 1066 | -------------------------------------------------------------------------------- /data_etl/general_functions.py: -------------------------------------------------------------------------------- 1 | # Here functions that are typically used when using these scripts or writing 2 | # these data curation scripts are predefined here 3 | import logging 4 | import os 5 | from datetime import datetime 6 | import importlib 7 | 8 | import pandas as pd 9 | 10 | module_logger = logging.getLogger(__name__) 11 | 12 | 13 | def func_initialise_logging( 14 | script_name, log_folder_path, key_1, key_2, key_3, start_time): 15 | var_log_name = os.path.abspath( 16 | os.path.join( 17 | log_folder_path, 18 | (f"{script_name}_{key_1}_{key_2}_{key_3}_" 19 | f"{start_time.strftime('%Y%m%d_%H%M%S')}.log") 20 | ) 21 | ) 22 | logging.basicConfig( 23 | filename=var_log_name, filemode="a", datefmt="%H:%M:%S", 24 | level=logging.DEBUG, 25 | format="%(asctime)s|%(name)s|%(levelname)s|%(message)s") 26 | 27 | logging.info(f"Starting the process at " 28 | f"{start_time.strftime('%Y-%m-%d %H:%M:%S')}") 29 | 30 | 31 | def func_check_for_issues(issue_count, cnx, cnx_key, table, step_no, 32 | override=False, start_time=None): 33 | if (issue_count > 0) & (override is not True): 34 | cnx.write_to_db(cnx_key, table) 35 | var_msg = f'There were {issue_count} issues found at step {step_no}' 36 | module_logger.error(var_msg) 37 | if start_time is not None: 38 | module_logger.info("Script time taken: {}".format( 39 | str(datetime.now() - start_time))) 40 | raise ValueError(var_msg) 41 | 42 | 43 | def func_to_sql(x, datetime_format='%Y-%m-%d'): 44 | if pd.isnull(x): 45 | return "NULL" 46 | elif type(x).__name__ == 'Timestamp': 47 | return f"'{x.strftime(datetime_format)}'" 48 | else: 49 | return f"'{str(x)}'" 50 | 51 | 52 | def import_attr(path, script_name, attr_name): 53 | if (path is None) | (path == '.'): 54 | mod = importlib.import_module(script_name) 55 | else: 56 | var_script_path = os.path.join(path, f"{script_name}.py") 57 | if not os.path.exists(var_script_path): 58 | var_msg = f"The script does not exist: {script_name}.py" 59 | module_logger.error(var_msg) 60 | raise ValueError(var_msg) 61 | spec = importlib.util.spec_from_file_location( 62 | script_name, var_script_path) 63 | mod = importlib.util.module_from_spec(spec) 64 | spec.loader.exec_module(mod) 65 | attr = getattr(mod, attr_name) 66 | 67 | return attr 68 | -------------------------------------------------------------------------------- /examples/00_create_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Create data\n", 8 | "\n", 9 | "This notebook creates the data that is used in the examples\n", 10 | "\n", 11 | "There is a data set that will process without problems in the examples and one that will have issues to see the difference. 
There are also some excel outputs for the scripts example.\n", 12 | "\n", 13 | "The specific sections for creating tables are: \n", 14 | "+ [Conversions](#Conversions), converting column dtypes\n", 15 | "+ [Altering](#Altering), changing the values in the DataFrame, adding new columns, dropping rows or columns etc\n", 16 | "+ [Checks](#Checks), looking for outliers or rows that data does not follow the prescribed rules\n", 17 | "+ [For summary tables](#For-summary-tables), there is one table here and it's for a summary output\n", 18 | "\n", 19 | "## Setup\n", 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Import and settings options" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import sqlite3\n", 37 | "import pickle\n", 38 | "import datetime\n", 39 | "\n", 40 | "import pandas as pd\n", 41 | "import numpy as np" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "pd.set_option('display.max_rows', 10)\n", 51 | "pd.set_option('display.max_columns', 10)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Create tables\n", 59 | "
\n", 60 | "\n", 61 | "There are lots of different but small tables used in the examples\n", 62 | "\n", 63 | "### Conversions\n", 64 | "
" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "df_convert = pd.DataFrame(\n", 74 | " [\n", 75 | " ('A', '1', '0.6', '2019-01-01'),\n", 76 | " ('B', '4', '5.2', '2019-02-05'),\n", 77 | " ('C', '1', '5.6', '2018-12-17'),\n", 78 | " ('D', '10', '15.9', '2019-07-18'),\n", 79 | " ('E', '-8', '4.7', '2018-03-09')\n", 80 | " ],\n", 81 | " columns=['object', 'int', 'float', 'date']\n", 82 | ")" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "df_convert_issues = pd.DataFrame(\n", 92 | " [\n", 93 | " ('A', '1', '0.6', '2019-02-29'),\n", 94 | " ('B', '4.5', 'A', '2019-22-05'),\n", 95 | " ('C', '1', '5.6', '2018-12-17'),\n", 96 | " ('D', 'b', '15.9', '2019-09-31'),\n", 97 | " (5, '-8', '4.7', '2018-03-09')\n", 98 | " ],\n", 99 | " columns=['object', 'int', 'float', 'date']\n", 100 | ")" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "### Altering\n", 108 | "
" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 5, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "df_alterations = pd.DataFrame(\n", 118 | " [\n", 119 | " ('A', 2, 'key_1'),\n", 120 | " ('B', 199, 'key_2'),\n", 121 | " ('C', -1, 'key_1'),\n", 122 | " ('D', 20, 'key_3'),\n", 123 | " ('E', 6, 'key_2')\n", 124 | " ],\n", 125 | " columns=['to_map', 'add_1', 'merge_key']\n", 126 | ")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 6, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "df_alterations_issues = pd.DataFrame(\n", 136 | " [\n", 137 | " ('A', 2, 'key_1'),\n", 138 | " ('B', 199, 2),\n", 139 | " ('C', -1, 'key_1'),\n", 140 | " (['D'], 'a', 'key_3'),\n", 141 | " ('E', 6, 'key_2')\n", 142 | " ],\n", 143 | " columns=['to_map', 'add_1', 'merge_key']\n", 144 | ")" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "### Checks\n", 152 | "
" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 7, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "df_checks = pd.DataFrame(\n", 162 | " [\n", 163 | " (3, 'A', 'a'),\n", 164 | " (10, 'A', 'z'),\n", 165 | " (9, 'B', 'b'),\n", 166 | " (4, 'D', 'd'),\n", 167 | " (7, 'C', 'c')\n", 168 | " ],\n", 169 | " columns=['number', 'category_1', 'category_2']\n", 170 | ")" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 8, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "df_checks_issues = pd.DataFrame(\n", 180 | " [\n", 181 | " (1, 'Z', 'y'),\n", 182 | " (10, 'A', 'a'),\n", 183 | " (9, 'Y', 'b'),\n", 184 | " (4, 'B', 'b'),\n", 185 | " (-1, 'C', 'c')\n", 186 | " ],\n", 187 | " columns=['number', 'category_1', 'category_2']\n", 188 | ")" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "### For summary tables\n", 196 | "
" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 9, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "df_summary = pd.DataFrame(\n", 206 | " [\n", 207 | " ('b', 'c', 1, 6),\n", 208 | " ('d', 'b', 1, 9),\n", 209 | " ('c', 'b', 1, 0),\n", 210 | " ('d', 'd', 1, 9),\n", 211 | " ('c', 'b', 1, 1),\n", 212 | " ('a', 'd', 1, 3),\n", 213 | " ('c', 'c', 1, 0),\n", 214 | " ('c', 'd', 1, 0),\n", 215 | " ('c', 'c', 1, 0),\n", 216 | " ('a', 'e', 1, 4),\n", 217 | " ('b', 'e', 1, 7),\n", 218 | " ('a', 'd', 1, 4),\n", 219 | " ('b', 'e', 1, 6),\n", 220 | " ('b', 'c', 1, 8),\n", 221 | " ('b', 'c', 1, 7),\n", 222 | " ('d', 'e', 1, 9),\n", 223 | " ('a', 'b', 1, 5),\n", 224 | " ('a', 'd', 1, 5),\n", 225 | " ('a', 'b', 1, 4),\n", 226 | " ('d', 'b', 1, 10),\n", 227 | " ('b', 'c', 1, 6),\n", 228 | " ('b', 'e', 1, 7),\n", 229 | " ('a', 'e', 1, 4),\n", 230 | " ('a', 'c', 1, 3),\n", 231 | " ('c', 'c', 1, 0),\n", 232 | " ('c', 'd', 1, 2),\n", 233 | " ('a', 'b', 1, 3),\n", 234 | " ('a', 'e', 1, 5),\n", 235 | " ('a', 'c', 1, 3),\n", 236 | " ('a', 'e', 1, 4),\n", 237 | " ('b', 'd', 1, 6),\n", 238 | " ('c', 'e', 1, 1),\n", 239 | " ('b', 'e', 1, 7),\n", 240 | " ('c', 'c', 1, 0),\n", 241 | " ('a', 'c', 1, 5),\n", 242 | " ('c', 'b', 1, 0),\n", 243 | " ('d', 'b', 1, 8),\n", 244 | " ('d', 'e', 1, 10),\n", 245 | " ('d', 'c', 1, 8),\n", 246 | " ('a', 'd', 1, 3),\n", 247 | " ('d', 'e', 1, 10),\n", 248 | " ('d', 'c', 1, 8),\n", 249 | " ('d', 'e', 1, 10),\n", 250 | " ('a', 'c', 1, 4),\n", 251 | " ('d', 'b', 1, 8),\n", 252 | " ('d', 'b', 1, 10),\n", 253 | " ('d', 'e', 1, 10),\n", 254 | " ('a', 'c', 1, 5),\n", 255 | " ('a', 'd', 1, 5),\n", 256 | " ('d', 'c', 1, 10)\n", 257 | " ],\n", 258 | " columns=['str', 'str_2', 'count', 'int_max']\n", 259 | ")" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "### For scripts\n", 267 | "
" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 10, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "df_data = pd.DataFrame(\n", 277 | " [\n", 278 | " (1, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2018, 7, 7, 0, 0), \n", 279 | " 'A string this is', 51.5074, 0.1278),\n", 280 | " (1, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2018, 4, 9, 0, 0), \n", 281 | " 'Test', 51.5084, 0.1268),\n", 282 | " (1, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2018, 1, 10, 0, 0), \n", 283 | " 'testing', 51.5094, 0.1258),\n", 284 | " (3, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2017, 10, 13, 0, 0),\n", 285 | " 'test test test', 51.5104, 0.1248),\n", 286 | " (4, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2017, 7, 16, 0, 0),\n", 287 | " np.nan, 51.5114, 0.1238),\n", 288 | " (5, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2017, 4, 18, 0, 0), \n", 289 | " np.nan, 51.5124, 0.1228),\n", 290 | " (6, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2017, 1, 19, 0, 0),\n", 291 | " 'Blah', 51.5134, 0.1218),\n", 292 | " (7, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2016, 10, 22, 0, 0),\n", 293 | " 'Dah', 51.5144, 0.1208),\n", 294 | " (1234, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2016, 7, 25, 0, 0), \n", 295 | " 'Doh', 51.5154, 0.1198),\n", 296 | " (3, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2016, 4, 27, 0, 0),\n", 297 | " 'Boh', 51.5164, 0.1188),\n", 298 | " (2341243, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2016, 1, 29, 0, 0),\n", 299 | " 'Pho', 51.5174, 0.1178)\n", 300 | " ],\n", 301 | " columns=['Number', 'A date', 'Another date£', ' StringStringString ', 'lat', 'lng']\n", 302 | ")" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 11, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "df_headers_1 = pd.DataFrame(\n", 312 | " [\n", 313 | " ('Header', 'Number', 'A date', 'Another date£', ' StringStringString ', 'lat', 'lng'), \n", 314 | " ('New name', 'a_number', 'date_1', 'date_2', 'string', 'lat', 'lng'),\n", 315 | " ('Remove', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan),\n", 316 | " ('Notes', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)\n", 317 | " ]\n", 318 | ")" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 12, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "df_ideal_headers = pd.DataFrame(\n", 328 | " [\n", 329 | " ('a_number', 'date_1', 'date_2', 'string', 'testing', 'a', 'b', 'lat', 'lng')\n", 330 | " ]\n", 331 | ")" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "## Write out data\n", 339 | "
" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 13, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "df_convert.to_csv('data/df_convert.tsv', sep='\\t', index=False)\n", 349 | "df_convert_issues.to_csv('data/df_convert_issues.tsv', sep='\\t', index=False)\n", 350 | "\n", 351 | "df_alterations.to_csv('data/df_alterations.tsv', sep='\\t', index=False)\n", 352 | "df_alterations_issues.to_csv('data/df_alterations_issues.tsv', sep='\\t', index=False)\n", 353 | "\n", 354 | "pickle.dump(df_checks, open('data/df_checks.pkl', 'wb'))\n", 355 | "pickle.dump(df_checks_issues, open('data/df_checks_issues.pkl', 'wb'))\n", 356 | "\n", 357 | "pickle.dump(df_summary, open('data/df_summary.pkl', 'wb'))\n", 358 | "\n", 359 | "df_data.to_excel('data/A.xlsx', index=False)\n", 360 | "xl_writer = pd.ExcelWriter('data/headers.xlsx')\n", 361 | "df_headers_1.to_excel(xl_writer, index=False, sheet_name='A 1', header=None)\n", 362 | "df_ideal_headers.to_excel(xl_writer, index=False, sheet_name='IdealHeaders', header=None)\n", 363 | "xl_writer.save()\n", 364 | "xl_writer.close()" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "---\n", 372 | "\n", 373 | "**GigiSR**" 374 | ] 375 | } 376 | ], 377 | "metadata": { 378 | "kernelspec": { 379 | "display_name": "Python 3", 380 | "language": "python", 381 | "name": "python3" 382 | }, 383 | "language_info": { 384 | "codemirror_mode": { 385 | "name": "ipython", 386 | "version": 3 387 | }, 388 | "file_extension": ".py", 389 | "mimetype": "text/x-python", 390 | "name": "python", 391 | "nbconvert_exporter": "python", 392 | "pygments_lexer": "ipython3", 393 | "version": "3.6.10" 394 | } 395 | }, 396 | "nbformat": 4, 397 | "nbformat_minor": 2 398 | } 399 | -------------------------------------------------------------------------------- /examples/03_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Example notebook 03\n", 8 | "\n", 9 | "Using the data generated from notebook `00_create_data.ipynb` this notebook takes you through some of the basic functionality using the `Connections` class:\n", 10 | "\n", 11 | "+ [Initialise a SqliteDB connection](#Initialise-a-SqliteDB-connection)\n", 12 | "+ [Read from cnx](#Read-from-cnx)\n", 13 | "+ [Write to a table](#Write-to-a-table)\n", 14 | "\n", 15 | "## Setup\n", 16 | "
\n", 17 | "\n", 18 | "Imports and setting options" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from datetime import datetime\n", 28 | "import pickle\n", 29 | "\n", 30 | "from data_etl import Connections, Checks" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Examples\n", 38 | "
" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Initialise the class" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "cnxs = Connections()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "### Initialise a SqliteDB connection\n", 62 | "
" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Initialise the SqliteDB, it doesn't already exist so a warning message is output that a file is being created\n", 70 | "\n", 71 | "The optional kwarg `sqlite_df_issues_create` creates a table structure to match the issues tables present in `DataCuration` and `Checks` objects" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stderr", 81 | "output_type": "stream", 82 | "text": [ 83 | "The `file_path` data/00_db.db is not valid so this file will be created\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "cnxs.add_cnx(\n", 89 | " cnx_key='df_issues', \n", 90 | " cnx_type='sqlite3',\n", 91 | " table_name='df_issues',\n", 92 | " file_path='data/00_db.db',\n", 93 | " sqlite_df_issues_create=True\n", 94 | ")" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "### Read from cnx\n", 102 | "
" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "Using `read_from_db` you can read data out from a table, or from a database on the same connection" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 4, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/html": [ 120 | "
\n", 121 | "\n", 134 | "\n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | "
        <tr><th></th><th>key_1</th><th>key_2</th><th>key_3</th><th>file</th><th>sub_file</th><th>step_number</th><th>category</th><th>issue_short_desc</th><th>issue_long_desc</th><th>column</th><th>issue_count</th><th>issue_idx</th><th>grouping</th></tr>
\n", 156 | "
" 157 | ], 158 | "text/plain": [ 159 | "Empty DataFrame\n", 160 | "Columns: [key_1, key_2, key_3, file, sub_file, step_number, category, issue_short_desc, issue_long_desc, column, issue_count, issue_idx, grouping]\n", 161 | "Index: []" 162 | ] 163 | }, 164 | "execution_count": 4, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "cnxs.read_from_db('df_issues', 'SELECT * FROM df_issues')" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "### Write to a table\n", 178 | "
" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "We needs some issues to write to the table" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 5, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/html": [ 196 | "
\n", 197 | "\n", 210 | "\n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | "
        <tr><th></th><th>key_1</th><th>key_2</th><th>key_3</th><th>file</th><th>sub_file</th><th>step_number</th><th>category</th><th>issue_short_desc</th><th>issue_long_desc</th><th>column</th><th>issue_count</th><th>issue_idx</th><th>grouping</th></tr>
        <tr><th>0</th><td>1</td><td>None</td><td>None</td><td>df_checks_issues.pkl</td><td>NaN</td><td>0</td><td>NaN</td><td>Number should be greater than 0</td><td>NaN</td><td></td><td>1</td><td>4</td><td>2020-05-26 07:36:41.839557</td></tr>
\n", 248 | "
" 249 | ], 250 | "text/plain": [ 251 | " key_1 key_2 key_3 file sub_file step_number category \\\n", 252 | "0 1 None None df_checks_issues.pkl NaN 0 NaN \n", 253 | "\n", 254 | " issue_short_desc issue_long_desc column issue_count \\\n", 255 | "0 Number should be greater than 0 NaN 1 \n", 256 | "\n", 257 | " issue_idx grouping \n", 258 | "0 4 2020-05-26 07:36:41.839557 " 259 | ] 260 | }, 261 | "execution_count": 5, 262 | "metadata": {}, 263 | "output_type": "execute_result" 264 | } 265 | ], 266 | "source": [ 267 | "var_start_time = datetime.now()\n", 268 | "ch_checks = Checks(var_start_time, '1')\n", 269 | "\n", 270 | "dict_data = {\n", 271 | " 'df_checks_issues.pkl': pickle.load(open('data/df_checks_issues.pkl', 'rb'))\n", 272 | "}\n", 273 | "\n", 274 | "dict_checks = dict()\n", 275 | "dict_checks['Number should be greater than 0'] = {\n", 276 | " 'calc_condition': lambda df, col, **kwargs: df['number'] <= 0\n", 277 | "}\n", 278 | "\n", 279 | "ch_checks.apply_checks(dict_data, dictionary=dict_checks)\n", 280 | "\n", 281 | "ch_checks.df_issues" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "Using `write_to_db` creates a temporary table in the background which the data is written to, if that has written with no issues then it moves all that data to the main table" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 6, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "cnxs.write_to_db('df_issues', ch_checks.df_issues)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "And then check it wrote to the table" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 7, 310 | "metadata": {}, 311 | "outputs": [ 312 | { 313 | "data": { 314 | "text/html": [ 315 | "
\n", 316 | "\n", 329 | "\n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | "
        <tr><th></th><th>key_1</th><th>key_2</th><th>key_3</th><th>file</th><th>sub_file</th><th>step_number</th><th>category</th><th>issue_short_desc</th><th>issue_long_desc</th><th>column</th><th>issue_count</th><th>issue_idx</th><th>grouping</th></tr>
        <tr><th>0</th><td>1</td><td>None</td><td>None</td><td>df_checks_issues.pkl</td><td>None</td><td>0</td><td>None</td><td>Number should be greater than 0</td><td>None</td><td></td><td>1</td><td>4</td><td>2020-05-26 07:36:41.839557</td></tr>
\n", 367 | "
" 368 | ], 369 | "text/plain": [ 370 | " key_1 key_2 key_3 file sub_file step_number category \\\n", 371 | "0 1 None None df_checks_issues.pkl None 0 None \n", 372 | "\n", 373 | " issue_short_desc issue_long_desc column issue_count \\\n", 374 | "0 Number should be greater than 0 None 1 \n", 375 | "\n", 376 | " issue_idx grouping \n", 377 | "0 4 2020-05-26 07:36:41.839557 " 378 | ] 379 | }, 380 | "execution_count": 7, 381 | "metadata": {}, 382 | "output_type": "execute_result" 383 | } 384 | ], 385 | "source": [ 386 | "cnxs.read_from_db('df_issues', 'SELECT * FROM df_issues')" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "---\n", 394 | "**GigiSR**" 395 | ] 396 | } 397 | ], 398 | "metadata": { 399 | "kernelspec": { 400 | "display_name": "Python 3", 401 | "language": "python", 402 | "name": "python3" 403 | }, 404 | "language_info": { 405 | "codemirror_mode": { 406 | "name": "ipython", 407 | "version": 3 408 | }, 409 | "file_extension": ".py", 410 | "mimetype": "text/x-python", 411 | "name": "python", 412 | "nbconvert_exporter": "python", 413 | "pygments_lexer": "ipython3", 414 | "version": "3.6.10" 415 | } 416 | }, 417 | "nbformat": 4, 418 | "nbformat_minor": 2 419 | } 420 | -------------------------------------------------------------------------------- /examples/04_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Example notebook 04\n", 8 | "\n", 9 | "Using the data generated from notebook `00_create_data.ipynb` this notebook takes you through some of the basic functionality using the `general_functions` module:\n", 10 | "\n", 11 | "+ [Initialise logging](#Initialise-logging)\n", 12 | "+ [Import attribute](#Import-attribute)\n", 13 | "+ [Check for issues](#Check-for-issues)\n", 14 | "\n", 15 | "\n", 16 | "## Setup\n", 17 | "
" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Imports and setting options" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "from datetime import datetime\n", 34 | "import pickle\n", 35 | "\n", 36 | "from data_etl import Checks, Connections, general_functions" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Initialise logging\n", 44 | "
" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "When running interlocking scripts it can be useful to have logging so that if a problem is encountered there's hopefully enough information provided to debug\n", 52 | "\n", 53 | "This function helps to set up a logging file" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 2, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "general_functions.func_initialise_logging(\n", 63 | " 'example_04', 'logs/', '1', None, None, datetime.now()\n", 64 | ")" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "### Import attribute\n", 72 | "
" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "Quite often it is more useful to define the large dictionaries that go into the checks in a separate script so that it is in a collection but doesn't clutter up the main script where the flow of processing is defined\n", 80 | "\n", 81 | "This function is also used in the classes as reading in from other scripts is a frequent action for clarity of the code" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 3, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | "{'Number should be greater than 0': {'calc_condition': (df, col, **kwargs)>},\n", 93 | " 'Number should be greater than 2': {'columns': ['number'],\n", 94 | " 'calc_condition': (df, col, **kwargs)>,\n", 95 | " 'category': 'severe'},\n", 96 | " 'check values in list': {'columns': ['category_1'],\n", 97 | " 'calc_condition': (df, col, **kwargs)>,\n", 98 | " 'long_description': (df, col, condition, **kwargs)>},\n", 99 | " 'The category_1 column can only map to certain values': {'calc_condition': (df, col, **kwargs)>,\n", 100 | " 'check_condition': (df, col, condition, **kwargs)>,\n", 101 | " 'count_condition': (df, col, condition, **kwargs)>,\n", 102 | " 'index_position': (df, col, condition, **kwargs)>,\n", 103 | " 'relevant_columns': (df, col, condition, **kwargs)>,\n", 104 | " 'long_description': (df, col, condition, **kwargs)>}}" 105 | ] 106 | }, 107 | "execution_count": 3, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "dict_checks = general_functions.import_attr('.', '04_example', 'dict_checks')\n", 114 | "dict_checks" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "And this can then be used or modified and used in the `DataCuration` and `Checks` classes" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "### Check for issues\n", 129 | "
" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "The aim of this function is to have a way to create a break in the code if there is are issues, and to store the issues before erroring out of the script\n", 137 | "\n", 138 | "To use this function we need a class instance with issue entries and a connections class instance to write the issues out to" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 4, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/html": [ 149 | "
\n", 150 | "\n", 163 | "\n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | "
        <tr><th></th><th>key_1</th><th>key_2</th><th>key_3</th><th>file</th><th>sub_file</th><th>step_number</th><th>category</th><th>issue_short_desc</th><th>issue_long_desc</th><th>column</th><th>issue_count</th><th>issue_idx</th><th>grouping</th></tr>
        <tr><th>0</th><td>1</td><td>None</td><td>None</td><td>df_checks_issues.pkl</td><td>NaN</td><td>0</td><td>NaN</td><td>Number should be greater than 0</td><td>NaN</td><td></td><td>1</td><td>4</td><td>2020-05-26 07:43:04.328680</td></tr>
        <tr><th>1</th><td>1</td><td>None</td><td>None</td><td>df_checks_issues.pkl</td><td>NaN</td><td>1</td><td>NaN</td><td>Number should be greater than 0</td><td>NaN</td><td></td><td>1</td><td>4</td><td>2020-05-26 07:43:04.328680</td></tr>
        <tr><th>2</th><td>1</td><td>None</td><td>None</td><td>df_checks_issues.pkl</td><td>NaN</td><td>2</td><td>NaN</td><td>Number should be greater than 0</td><td>NaN</td><td></td><td>1</td><td>4</td><td>2020-05-26 07:43:04.328680</td></tr>
        <tr><th>3</th><td>1</td><td>None</td><td>None</td><td>df_checks_issues.pkl</td><td>NaN</td><td>3</td><td>NaN</td><td>Number should be greater than 0</td><td>NaN</td><td></td><td>1</td><td>4</td><td>2020-05-26 07:43:04.328680</td></tr>
        <tr><th>4</th><td>1</td><td>None</td><td>None</td><td>df_checks_issues.pkl</td><td>NaN</td><td>4</td><td>NaN</td><td>Number should be greater than 0</td><td>NaN</td><td></td><td>1</td><td>4</td><td>2020-05-26 07:43:04.328680</td></tr>
\n", 265 | "
" 266 | ], 267 | "text/plain": [ 268 | " key_1 key_2 key_3 file sub_file step_number category \\\n", 269 | "0 1 None None df_checks_issues.pkl NaN 0 NaN \n", 270 | "1 1 None None df_checks_issues.pkl NaN 1 NaN \n", 271 | "2 1 None None df_checks_issues.pkl NaN 2 NaN \n", 272 | "3 1 None None df_checks_issues.pkl NaN 3 NaN \n", 273 | "4 1 None None df_checks_issues.pkl NaN 4 NaN \n", 274 | "\n", 275 | " issue_short_desc issue_long_desc column issue_count \\\n", 276 | "0 Number should be greater than 0 NaN 1 \n", 277 | "1 Number should be greater than 0 NaN 1 \n", 278 | "2 Number should be greater than 0 NaN 1 \n", 279 | "3 Number should be greater than 0 NaN 1 \n", 280 | "4 Number should be greater than 0 NaN 1 \n", 281 | "\n", 282 | " issue_idx grouping \n", 283 | "0 4 2020-05-26 07:43:04.328680 \n", 284 | "1 4 2020-05-26 07:43:04.328680 \n", 285 | "2 4 2020-05-26 07:43:04.328680 \n", 286 | "3 4 2020-05-26 07:43:04.328680 \n", 287 | "4 4 2020-05-26 07:43:04.328680 " 288 | ] 289 | }, 290 | "execution_count": 4, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "var_start_time = datetime.now()\n", 297 | "ch_checks = Checks(var_start_time, '1')\n", 298 | "\n", 299 | "dict_data = {\n", 300 | " 'df_checks_issues.pkl': pickle.load(open('data/df_checks_issues.pkl', 'rb'))\n", 301 | "}\n", 302 | "\n", 303 | "dict_checks = dict()\n", 304 | "dict_checks['Number should be greater than 0'] = {\n", 305 | " 'calc_condition': lambda df, col, **kwargs: df['number'] <= 0\n", 306 | "}\n", 307 | "\n", 308 | "for step_no in range(5):\n", 309 | " ch_checks.set_step_no(step_no)\n", 310 | " ch_checks.apply_checks(dict_data, dictionary=dict_checks)\n", 311 | "\n", 312 | "ch_checks.df_issues" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 5, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "cnxs = Connections()\n", 322 | "cnxs.add_cnx(\n", 323 | " cnx_key='df_issues', \n", 324 | " cnx_type='sqlite3',\n", 325 | " table_name='df_issues',\n", 326 | " file_path='data/00_db.db'\n", 327 | ")" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "Now use the issues table in the function" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 6, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "general_functions.func_check_for_issues(\n", 344 | " ch_checks.get_issue_count(), \n", 345 | " cnxs, \n", 346 | " 'df_issues', \n", 347 | " ch_checks.df_issues, \n", 348 | " ch_checks.get_step_no(),\n", 349 | " override=True\n", 350 | ")" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "The above has `override=True`, this means even if problems are found it will not error out, the below doesn't have `override=True` and intentionally errors" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 7, 363 | "metadata": {}, 364 | "outputs": [ 365 | { 366 | "ename": "ValueError", 367 | "evalue": "There were 5 issues found at step 4", 368 | "output_type": "error", 369 | "traceback": [ 370 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 371 | "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", 372 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;34m'df_issues'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m 
\u001b[0mch_checks\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdf_issues\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mch_checks\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_step_no\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m )\n", 373 | "\u001b[1;32mc:\\users\\georg\\documents\\workspace\\modules\\data_etl\\data_etl\\general_functions.py\u001b[0m in \u001b[0;36mfunc_check_for_issues\u001b[1;34m(issue_count, cnx, cnx_key, table, step_no, override, start_time)\u001b[0m\n\u001b[0;32m 38\u001b[0m module_logger.info(\"Script time taken: {}\".format(\n\u001b[0;32m 39\u001b[0m str(datetime.now() - start_time)))\n\u001b[1;32m---> 40\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvar_msg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 41\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 374 | "\u001b[1;31mValueError\u001b[0m: There were 5 issues found at step 4" 375 | ] 376 | } 377 | ], 378 | "source": [ 379 | "general_functions.func_check_for_issues(\n", 380 | " ch_checks.get_issue_count(), \n", 381 | " cnxs, \n", 382 | " 'df_issues', \n", 383 | " ch_checks.df_issues, \n", 384 | " ch_checks.get_step_no()\n", 385 | ")" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "The benefit of the `override` argument is that you may have a mixture of issues you want definitely resolving and those you can live with, this allows you to have errors but to carry on regardless" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "---\n", 400 | "**GigiSR**" 401 | ] 402 | } 403 | ], 404 | "metadata": { 405 | "kernelspec": { 406 | "display_name": "Python 3", 407 | "language": "python", 408 | "name": "python3" 409 | }, 410 | "language_info": { 411 | "codemirror_mode": { 412 | "name": "ipython", 413 | "version": 3 414 | }, 415 | "file_extension": ".py", 416 | "mimetype": "text/x-python", 417 | "name": "python", 418 | "nbconvert_exporter": "python", 419 | "pygments_lexer": "ipython3", 420 | "version": "3.6.10" 421 | } 422 | }, 423 | "nbformat": 4, 424 | "nbformat_minor": 2 425 | } 426 | -------------------------------------------------------------------------------- /examples/04_example.py: -------------------------------------------------------------------------------- 1 | # This script is used in the `02_examples.ipynb` file to highlight how using 2 | # externally defined information works 3 | 4 | import pandas as pd 5 | 6 | dict_cat_1_map = { 7 | 'A': ['a', 'z'], 8 | 'B': ['b'], 9 | 'C': ['c'], 10 | 'D': ['d'], 11 | 'Y': ['y'], 12 | 'Z': ['z'] 13 | } 14 | 15 | dict_checks = { 16 | 'Number should be greater than 0': { 17 | 'calc_condition': lambda df, col, **kwargs: df['number'] <= 0 18 | }, 19 | 'Number should be greater than 2': { 20 | "columns": ['number'], 21 | 'calc_condition': lambda df, col, **kwargs: df[col] <= 2, 22 | 'category': 'severe' 23 | }, 24 | 'check values in list': { 25 | 'columns': ['category_1'], 26 | 'calc_condition': lambda df, col, **kwargs: ~df[col].isin(['A', 'B', 'C', 'D']), 27 | 'long_description': lambda df, col, condition, **kwargs: 28 | f"The invalid values are: {df.loc[~df[col].isin(['A', 'B', 'C', 'D'])][col].unique().tolist()}" 29 | }, 30 | 'The category_1 column 
can only map to certain values': { 31 |         'calc_condition': lambda df, col, **kwargs: [ 32 |             item[1] not in dict_cat_1_map[item[0]] for item in 33 |             df[['category_1', 'category_2']].values.tolist() 34 |         ], 35 |         'check_condition': lambda df, col, condition, **kwargs: sum(condition) > 0, 36 |         'count_condition': lambda df, col, condition, **kwargs: sum(condition), 37 |         'index_position': lambda df, col, condition, **kwargs: pd.Series(condition), 38 |         'relevant_columns': lambda df, col, condition, **kwargs: 'category_1, category_2', 39 |         'long_description': lambda df, col, condition, **kwargs: ( 40 |             f"The values that have no mapping are: " 41 |             f"{df.loc[pd.Series(condition)]['category_1'].unique().tolist()}" 42 |         ) 43 |     } 44 | } 45 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | A collection of examples for potential uses of my package! 4 | 5 | A lot of the functionality is easy to code yourself and is dependent on the data set in use. But I have found it useful to be able to apply all the conversions at once and then check for errors afterwards, rather than stopping each time there is an error. For example, knowing exactly which columns failed to convert to integer means you can investigate all of them at once. And having the flexibility to define a function to find out which data rows specifically failed is even more powerful. 6 | 7 | The main use I have for this package at work is feeding back to the data creators where there are errors in their manually entered or system-extracted data sets so they can make corrections before I use the data. And if there are values that break my assumptions but are actually valid, I get feedback from the domain experts that helps me modify my assumptions, or I keep the check as-is because it's a highly unlikely occurrence and it's good to know when it crops up. So, although the problems are labelled as being in an `issues log`, they could just be flags for unusual or specific values of particular interest, or they could be genuine errors that need resolving. 8 | 9 | # The structure 10 | 11 | + `data/` will contain any generated data we need; some of the tables may be pre-existing, hard-coded ones 12 | + `test_scripts/` contains an example in scripts rather than notebooks; from this form, which runs well locally, you can easily convert it into an Airflow-compatible form, and the `main.py` script accesses all the other scripts so you only need to run one 13 | + `00_create_data.ipynb` creates the data and dbs that are used in the examples 14 | + `01_example.ipynb` a look at some basic functionality: finding files, reading in the data, setting new headers, asserting nulls, then converting to the correct dtypes 15 | + `02_example.ipynb` a concentrated look at individual bits of functionality available and a look at the issue output produced when there are problems 16 | + `02_example.py` some externally defined information to use in the `02_example.ipynb` notebook for one of the sections 17 | 18 | # Run order 19 | 20 | 1. Run `00_create_data.ipynb` first to create the data files for the examples 21 | 22 | You can then run either the notebooks or the `test_scripts/` files.
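
As a very rough sketch of the "apply all the conversions at once, then review the issues" flow described above — the grouping/key values, file path and conversion dictionary here are purely illustrative, and it assumes `set_table` is used to load a DataFrame directly instead of `find_files`/`reading_in`:

```python
from datetime import datetime

import pandas as pd

from data_etl import DataCuration

# Grouping and key values are illustrative
dc = DataCuration(datetime.now(), 'example_key')

# One of the files written out by 00_create_data.ipynb
dc.set_table(pd.read_csv('data/df_convert_issues.tsv', sep='\t'))

# Attempt every conversion in one pass; a column that fails is recorded in
# the issues table rather than raising immediately
dc.convert_columns(dictionary={
    'int': {
        'columns': ['int'],
        'dtypes': ['int'],
        'functions': {1: lambda df, col, **kwargs: df[col].astype(int)}
    }
})

# Review everything that failed in one go
dc.df_issues
```

From there `get_issue_count` and `func_check_for_issues` (see `test_scripts/main.py`) can be used to decide whether a run should stop or carry on.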
23 | -------------------------------------------------------------------------------- /examples/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gigisr/data_etl/ddd2bf742615d659f96bfd6543a657ab195b67c7/examples/data/.gitkeep -------------------------------------------------------------------------------- /examples/logs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gigisr/data_etl/ddd2bf742615d659f96bfd6543a657ab195b67c7/examples/logs/.gitkeep -------------------------------------------------------------------------------- /examples/test_scripts/.config: -------------------------------------------------------------------------------- 1 | [TEST] 2 | DRIVER = {SQLite3 ODBC Driver} 3 | SERVER = localhost 4 | DATABASE = test.db 5 | Trusted_connection = yes 6 | -------------------------------------------------------------------------------- /examples/test_scripts/alter_cols.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | dict_alter = dict() 4 | 5 | dict_alter['01'] = { 6 | 'type': 'new_col', 7 | 'col_name': 'number_2', 8 | 'function': lambda df, keys, **kwargs: df['a_number'] * 2 9 | } 10 | dict_alter['02'] = { 11 | 'type': 'new_col', 12 | 'col_name': 'key_1', 13 | 'function': lambda df, keys, **kwargs: keys[0] 14 | } 15 | dict_alter['03'] = { 16 | 'type': 'new_col', 17 | 'col_name': 'key_2', 18 | 'function': lambda df, keys, **kwargs: keys[1] 19 | } 20 | dict_alter['04'] = { 21 | 'type': 'map_df', 22 | 'function': lambda df, keys, **kwargs: df, 23 | 'idx_function': lambda df, keys, **kwargs: pd.Series(True, index=df.index) 24 | } 25 | -------------------------------------------------------------------------------- /examples/test_scripts/checks_1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | dict_checks = dict() 4 | 5 | dict_checks["This check is for numbers being greater than 6"] = { 6 | "columns": ["a_number", "number_2"], 7 | "calc_condition": lambda df, col, **kwargs: df[col] <= 6, 8 | "long_description": lambda df, col, condition, **kwargs: 9 | "There are numbers less than or equal to 6", 10 | "index_position": lambda df, col, condition, **kwargs: 11 | pd.Series(False, df.index) 12 | } 13 | 14 | dict_checks["This check is for the column to be not null"] = { 15 | "columns": ['string'], 16 | "calc_condition": lambda df, col, **kwargs: df[col].isnull(), 17 | "long_description": lambda df, col, condition, **kwargs: 18 | f"The column `{col}` should not be null", 19 | "category": 'must be resolved' 20 | } 21 | -------------------------------------------------------------------------------- /examples/test_scripts/convert_columns.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pandas as pd 4 | 5 | dict_convert = dict() 6 | 7 | 8 | def func_string_to_int(df, col): 9 | s = df[col].copy() 10 | s = s.str.replace(',', '') # thousand separators 11 | s = s.str.replace('%', '') # percentage sign 12 | s = s.str.replace('£', '') # pound stirling sign 13 | s = s.str.replace('$', '') # dollar sign 14 | s = s.str.replace('€', '') # euro sign 15 | s = s.str.replace('¥', '') # yen sign 16 | s = s.astype(int) 17 | return s 18 | 19 | 20 | def func_string_to_float(df, col): 21 | s = df[col].copy() 22 | s = s.str.replace(',', '') # thousand separators 23 | s 
= s.str.replace('%', '') # percentage sign 24 | s = s.str.replace('£', '') # pound sterling sign 25 | s = s.str.replace('$', '') # dollar sign 26 | s = s.str.replace('€', '') # euro sign 27 | s = s.str.replace('¥', '') # yen sign 28 | s = s.astype(float) 29 | return s 30 | 31 | 32 | dict_convert['int'] = { 33 | 'columns': lambda df, **kwargs: ['a_number'], 34 | 'dtypes': ['int', 'float'], 35 | 'functions': { 36 | 1: lambda df, col, **kwargs: df[col].astype(int), 37 | 2: lambda df, col, **kwargs: func_string_to_int(df, col), 38 | 3: lambda df, col, **kwargs: df[col].astype(float), 39 | 4: lambda df, col, **kwargs: func_string_to_float(df, col) 40 | } 41 | } 42 | dict_convert['float'] = { 43 | 'columns': ['lat', 'lng'], 44 | 'dtypes': ['float'], 45 | 'functions': { 46 | 1: lambda df, col, **kwargs: df[col].astype(float), 47 | 2: lambda df, col, **kwargs: func_string_to_float(df, col) 48 | } 49 | } 50 | # TODO have a mash-up function that also takes care of Excel dates? 51 | dict_convert['date'] = { 52 | 'columns': ['date_1', 'date_2'], 53 | 'dtypes': ['datetime'], 54 | 'functions': { 55 | 1: lambda df, col, **kwargs: 56 | pd.to_datetime(df[col], format='%Y-%m-%d %H:%M:%S') 57 | } 58 | } 59 | 60 | 61 | def func_string_format(df, col): 62 | s = df[col].copy() 63 | s_null = s.isnull() 64 | s = s.astype(str) 65 | s = s.str.strip() 66 | reg_ex = re.compile(' +') 67 | s = s.map(lambda x: re.sub(reg_ex, ' ', x)) 68 | s.loc[s_null] = pd.np.nan 69 | return s 70 | 71 | 72 | dict_convert['string'] = { 73 | 'columns': ['string'], 74 | 'dtypes': [], 75 | 'functions': { 76 | 1: lambda df, col, **kwargs: func_string_format(df, col) 77 | }, 78 | 'idx_function': lambda df, col, **kwargs: pd.Series(True, index=df.index) 79 | } 80 | -------------------------------------------------------------------------------- /examples/test_scripts/main.py: -------------------------------------------------------------------------------- 1 | # This is the section where we put all the classes together in combinations 2 | # that are required for specific data sets 3 | import logging 4 | from datetime import datetime 5 | import pickle 6 | # This is only used to create a table, usually this would already be done 7 | import sqlite3 8 | 9 | from data_etl import DataCuration, Checks, Connections, Reporting, \ 10 | func_check_for_issues, func_initialise_logging 11 | 12 | if __name__ == "__main__": 13 | var_key_1 = "A" 14 | var_key_2 = "1" 15 | var_key_3 = "1" 16 | var_start_time = datetime.now() 17 | 18 | var_checks_1_pass = True 19 | var_write_out = True 20 | 21 | func_initialise_logging('pipeline_test_1', '../logs/', var_key_1, 22 | var_key_2, var_key_3, var_start_time) 23 | 24 | # Initialise objects required 25 | cnxs = Connections() 26 | data = DataCuration(var_start_time, "A") 27 | check = Checks(var_start_time, "A") 28 | reporting = Reporting(var_start_time, "A") 29 | 30 | # Set up connections 31 | cnxs.add_cnx( 32 | cnx_key='df_issues', cnx_type='sqlite3', table_name='df_issues', 33 | file_path='../data/00_db.db', sqlite_df_issues_create=True) 34 | 35 | # This is only needed to create the structure 36 | cnx = sqlite3.connect('../data/00_db.db') 37 | var_create_table = """CREATE TABLE IF NOT EXISTS data ( 38 | a_number INTEGER, date_1 TEXT, date_2 TEXT, string TEXT, 39 | testing REAL, a REAL, b REAL, lat REAL, lng REAL, number_2 INTEGER, 40 | key_1 TEXT, key_2 TEXT, level_0 TEXT 41 | );""" 42 | cnx.execute(var_create_table) 43 | cnx.commit() 44 | cnx.close() 45 | 46 | cnxs.add_cnx(cnx_key='data_out', cnx_type='sqlite3',
table_name='data', 47 | file_path='../data/00_db.db') 48 | 49 | # Data etl testing 50 | 51 | # Read the files in 52 | data.find_files(files_path="../data", 53 | script_name="test_reading_in", path='.') 54 | data.reading_in(path=".", script_name="test_reading_in") 55 | 56 | # Set the step number 57 | data.set_step_no(1) 58 | 59 | # Read in the headers 60 | data.set_comparison_headers( 61 | path=".", 62 | script_name="test_reading_in", 63 | filepath="../data/headers.xlsx") 64 | data.link_headers() 65 | data.assert_linked_headers(remove_header_rows=True, reset_index=True) 66 | 67 | data.set_step_no(2) 68 | data.assert_nulls([""]) 69 | data.convert_columns(".", "convert_columns") 70 | func_check_for_issues( 71 | data.get_issue_count(2, 2), cnxs, 'df_issues', data.df_issues, 72 | data.get_step_no(), start_time=var_start_time) 73 | 74 | data.set_step_no(3) 75 | data.alter_tables(".", "alter_cols") 76 | func_check_for_issues( 77 | data.get_issue_count(3, 3), cnxs, 'df_issues', data.df_issues, 78 | data.get_step_no(), start_time=var_start_time) 79 | 80 | data.set_step_no(4) 81 | data.concatenate_tables() 82 | 83 | check.set_step_no(5) 84 | check.set_defaults(idx_flag=True) 85 | check.apply_checks(data.tables, ".", "checks_1") 86 | func_check_for_issues( 87 | check.get_issue_count(5, 5), cnxs, 'df_issues', check.df_issues, 88 | check.get_step_no(), var_checks_1_pass, var_start_time) 89 | 90 | # Now the data is cleansed do the reporting, this could be 91 | # post writing to DB 92 | data.set_step_no(6) 93 | data.form_summary_tables(path='.', script_name='reporting_1') 94 | 95 | # Temporary snapshot for testing 96 | pickle.dump( 97 | {'data': data, 'checks': check, 'report': reporting, 'cnx': cnxs}, 98 | open("../data/dict_dc.pkl", "wb")) 99 | 100 | # Log issues found 101 | cnxs.write_to_db('df_issues', data.df_issues) 102 | cnxs.write_to_db('df_issues', check.df_issues) 103 | 104 | # Write the data out 105 | if var_write_out: 106 | cnxs.write_to_db('data_out', data.tables) 107 | 108 | logging.info("Script time taken: {}".format( 109 | str(datetime.now() - var_start_time))) 110 | -------------------------------------------------------------------------------- /examples/test_scripts/reporting_1.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import matplotlib.pyplot as plt 4 | import folium 5 | 6 | 7 | def form_tables(tables, formed_tables, grouping, key_1, key_2, key_3, 8 | key_separator, **kwargs): 9 | dict_data = dict() 10 | dict_data['main_data'] = tables.copy() 11 | return dict_data 12 | 13 | 14 | dict_reporting = dict() 15 | 16 | 17 | def func_chart_1(tables, file_path, file_name): 18 | df = tables['main_data'] 19 | plt.figure() 20 | g = df['number_2'].hist(bins=50) 21 | plt.title('Histogram') 22 | plt.savefig(os.path.join(file_path, file_name)) 23 | return None 24 | 25 | 26 | dict_reporting['Histogram 1'] = { 27 | 'file_name': lambda tables, file_path, grouping, key_1, key_2, key_3, 28 | **kwargs: 'chart_1.png', 29 | 'function': lambda tables, file_path, file_name, grouping, key_1, key_2, 30 | key_3,**kwargs: 31 | func_chart_1(tables, file_path, file_name) 32 | } 33 | dict_reporting['Histogram 2'] = { 34 | 'file_name': lambda tables, file_path, grouping, key_1, key_2, key_3, 35 | **kwargs: 'sub_folder_test/chart_1.png', 36 | 'function': lambda tables, file_path, file_name, grouping, key_1, key_2, 37 | key_3, **kwargs: 38 | func_chart_1(tables, file_path, file_name) 39 | } 40 | 41 | 42 | def func_map_1(tables, file_path, file_name): 43 | 
df = tables['main_data'] 44 | m = folium.Map([51.5074, 0.1278], zoom_start=12) 45 | for idx in df.index.tolist(): 46 | folium.Marker([df.loc[idx, 'lat'], df.loc[idx, 'lng']]).add_to(m) 47 | m.save(os.path.join(file_path, file_name)) 48 | return df 49 | 50 | 51 | dict_reporting['Map 1'] = { 52 | 'file_name': lambda tables, file_path, grouping, key_1, key_2, 53 | key_3, **kwargs: 'map_1.html', 54 | 'function': lambda tables, file_path, file_name, grouping, key_1, key_2, 55 | key_3, **kwargs: func_map_1(tables, file_path, file_name) 56 | } 57 | -------------------------------------------------------------------------------- /examples/test_scripts/test_reading_in.py: -------------------------------------------------------------------------------- 1 | # This file contains the information required for listing files and reading in 2 | # tables of data 3 | import os 4 | 5 | import pandas as pd 6 | 7 | 8 | def list_the_files(path): 9 | list_files = os.listdir(path) 10 | list_files = [os.path.abspath(os.path.join(path, x)) for x in list_files] 11 | list_files = [x for x in list_files if '.xlsx' in x.lower()] 12 | list_files = [x for x in list_files if '~' not in x.lower()] 13 | list_files = [x for x in list_files if 'header' not in x.lower()] 14 | return list_files 15 | 16 | 17 | def read_files(list_files): 18 | dict_files = dict() 19 | for file in list_files: 20 | xl = pd.ExcelFile(file) 21 | for sheet in xl.sheet_names: 22 | df = xl.parse( 23 | sheet_name=sheet, dtype=str, keep_default_na=False, header=None) 24 | key = '{} -:- {}'.format( 25 | file.split('\\')[-1].lower().replace('.xlsx', ''), sheet) 26 | dict_files[key] = df.copy() 27 | return dict_files 28 | 29 | 30 | def read_headers(filepath): 31 | if not os.path.exists(filepath): 32 | raise ValueError( 33 | 'The passed file path does not exist: {}'.format(filepath)) 34 | dict_headers = dict() 35 | file = pd.ExcelFile(filepath) 36 | dict_headers['ideal_headers'] = file.parse( 37 | 'IdealHeaders', header=None).values.tolist()[0] 38 | for sheet in [sheet for sheet in 39 | file.sheet_names if sheet != 'IdealHeaders']: 40 | df_header = file.parse(sheet, header=None) 41 | dict_headers[sheet] = { 42 | 'expected_headers': df_header[ 43 | df_header[0] == 'Header'].iloc[:, 1:].values.tolist()[0], 44 | 'new_headers': df_header[ 45 | df_header[0] == 'New name'].iloc[:, 1:].values.tolist()[0], 46 | 'remove': df_header[ 47 | df_header[0] == 'Remove'].iloc[:, 1:].values.tolist()[0], 48 | 'notes': df_header[ 49 | df_header[0] == 'Notes'].iloc[:, 1:].values.tolist()[0] 50 | } 51 | return dict_headers 52 | 53 | 54 | def link_headers(dfs, df_headers): 55 | dict_link = dict() 56 | for key_df in dfs.keys(): 57 | for key_header in df_headers.keys(): 58 | check_shape = ( 59 | # + 1 because the headers have an index to explain the 60 | # row purposes 61 | dfs[key_df].shape[1] + 1 == df_headers[key_header].shape[1]) 62 | if check_shape is True: 63 | dict_link[key_df] = str(key_header) 64 | break 65 | return dict_link 66 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name='data_etl', 5 | version='0.1.0dev', 6 | packages=['data_etl',], 7 | license='MIT', 8 | url="https://github.com/gigisr/data_etl", 9 | 10 | author='GigiSR', requires=['pandas', 'numpy', 'pyodbc'] 11 | ) 12 | -------------------------------------------------------------------------------- /tests/00_pytest.py: 
-------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import pickle 3 | 4 | import pandas as pd 5 | import numpy as np 6 | 7 | from data_etl import DataCuration, Checks 8 | 9 | 10 | var_cnv_1_start_time = datetime.now() 11 | data_cnv_1 = DataCuration(var_cnv_1_start_time, 'test') 12 | df_convert_issues = pd.DataFrame( 13 | [ 14 | ('A', '1', '0.6', '2019-02-29'), 15 | ('B', '4.5', 'A', '2019-22-05'), 16 | ('C', '1', '5.6', '2018-12-17'), 17 | ('D', 'b', '15.9', '2019-09-31'), 18 | (5, '-8', '4.7', '2018-03-09') 19 | ], 20 | columns=['object', 'int', 'float', 'date'] 21 | ) 22 | data_cnv_1.set_table({'df_convert_issues.tsv': df_convert_issues}) 23 | 24 | 25 | def func_try_float_cnv(x): 26 | try: 27 | var = float(x) 28 | except: 29 | return True 30 | return False 31 | 32 | 33 | def func_try_int_cnv(x): 34 | try: 35 | var = int(x) 36 | except: 37 | return True 38 | return False 39 | 40 | 41 | def func_str_cnv(s): 42 | var_is_null_pre = s.isnull().sum() 43 | s_cnv = s.map(func_to_int).str.strip() 44 | var_is_null_post = s_cnv.isnull().sum() 45 | if var_is_null_post != var_is_null_pre: 46 | raise ValueError 47 | return s_cnv 48 | 49 | 50 | def func_to_int(x): 51 | try: 52 | return int(x) 53 | except: 54 | return x 55 | 56 | 57 | def func_try_str_cnv(s): 58 | var_is_null_pre = s.isnull().sum() 59 | s_cnv = s.map(func_to_int).str.strip() 60 | var_is_null_post = s_cnv.isnull().sum() 61 | return s != s_cnv 62 | 63 | 64 | def func_try_date_cnv(x): 65 | if pd.isnull(x): 66 | return False 67 | if pd.isnull(pd.to_datetime(x, format='%Y-%m-%d', errors='coerce')): 68 | return True 69 | return False 70 | 71 | 72 | dict_cnv_1 = { 73 | 'float': { 74 | 'columns': ['float'], 75 | 'dtypes': ['float'], 76 | 'functions': { 77 | 1: lambda df, col, **kwargs: df[col].astype(float) 78 | }, 79 | 'idx_function': lambda df, col, **kwargs: df[col].map(func_try_float_cnv) 80 | }, 81 | 'int': { 82 | 'columns': ['int'], 83 | 'dtypes': ['int'], 84 | 'functions': { 85 | 1: lambda df, col, **kwargs: df[col].astype(int) 86 | }, 87 | 'idx_function': lambda df, col, **kwargs: df[col].map(func_try_int_cnv) 88 | }, 89 | 'object': { 90 | 'columns': ['object'], 91 | 'dtypes': [], 92 | 'functions': { 93 | 1: lambda df, col, **kwargs: func_str_cnv(df[col]) 94 | }, 95 | 'idx_function': lambda df, col, **kwargs: func_try_str_cnv(df[col]) 96 | }, 97 | 'date': { 98 | 'columns': ['date'], 99 | 'dtypes': ['date', '[ns]'], 100 | 'functions': { 101 | 1: lambda df, col, **kwargs: pd.to_datetime( 102 | df[col], format='%Y-%m-%d') 103 | }, 104 | 'idx_function': lambda df, col, **kwargs: df[col].map(func_try_date_cnv) 105 | } 106 | } 107 | 108 | df_cnv_1_expected_df_issues = pd.DataFrame( 109 | [ 110 | ('test', 'None', 'None', 'df_convert_issues.tsv', np.nan, 0, np.nan, '', 111 | 'The conversion failed to format float', 'float', 1, '1', 112 | var_cnv_1_start_time), 113 | ('test', 'None', 'None', 'df_convert_issues.tsv', np.nan, 0, np.nan, '', 114 | 'The conversion failed to format int', 'int', 2, '1, 3', 115 | var_cnv_1_start_time), 116 | ('test', 'None', 'None', 'df_convert_issues.tsv', np.nan, 0, np.nan, '', 117 | 'The conversion failed to format object', 'object', 1, '4', 118 | var_cnv_1_start_time), 119 | ('test', 'None', 'None', 'df_convert_issues.tsv', np.nan, 0, np.nan, '', 120 | 'The conversion failed to format date', 'date', 3, '0, 1, 3', 121 | var_cnv_1_start_time) 122 | ], 123 | columns=['key_1', 'key_2', 'key_3', 'file', 'sub_file', 'step_number', 124 | 'category',
'issue_short_desc', 'issue_long_desc', 'column', 125 | 'issue_count', 'issue_idx', 'grouping'] 126 | ) 127 | 128 | 129 | def test_cnv_1(): 130 | data_cnv_1.convert_columns(dictionary=dict_cnv_1) 131 | assert data_cnv_1.df_issues.fillna('').equals( 132 | df_cnv_1_expected_df_issues.fillna('')) 133 | 134 | 135 | var_alter_1_start_time = datetime.now() 136 | data_alter_1 = DataCuration(var_alter_1_start_time, 'test') 137 | 138 | data_alter_1.set_table( 139 | { 140 | 'df_alterations.tsv': pd.DataFrame( 141 | [ 142 | ('A', 2, 'key_1'), 143 | ('B', 199, 'key_2'), 144 | ('C', -1, 'key_1'), 145 | ('D', 20, 'key_3'), 146 | ('E', 6, 'key_2') 147 | ], 148 | columns=['to_map', 'add_1', 'merge_key'] 149 | ), 150 | 'df_alterations_issues.tsv': pd.DataFrame( 151 | [ 152 | ('A', 2, 'key_1'), 153 | ('B', 199, 2), 154 | ('C', -1, 'key_1'), 155 | (['D'], 'a', 'key_3'), 156 | ('E', 6, 'key_2') 157 | ], 158 | columns=['to_map', 'add_1', 'merge_key'] 159 | ) 160 | } 161 | ) 162 | 163 | 164 | df_mapping = pd.DataFrame( 165 | [ 166 | ('key_1', 1), 167 | ('key_2', 2), 168 | ('key_3', 3) 169 | ], 170 | columns=['merge_key', 'out_value'] 171 | ) 172 | 173 | 174 | def func_alter_merge(df, df_mapping): 175 | df_mapped = pd.merge( 176 | df, 177 | df_mapping, 178 | on='merge_key', 179 | how='left' 180 | ) 181 | if ( 182 | df_mapped['out_value'].isnull().sum() != 183 | df['merge_key'].isnull().sum() 184 | ): 185 | raise ValueError 186 | return df_mapped 187 | 188 | 189 | dict_alter_1 = { 190 | '01': { 191 | 'type': 'new_col', 192 | 'col_name': 'key', 193 | 'function': lambda df, keys, **kwargs: keys[0] 194 | }, 195 | '02': { 196 | 'type': 'new_col', 197 | 'col_name': 'done_add_1', 198 | 'function': lambda df, keys, **kwargs: df['add_1'] + 1, 199 | 'idx_function': lambda df, keys, **kwargs: 200 | df['add_1'].map( 201 | lambda x: type(x).__name__).map( 202 | lambda x: ('int' in x) | ('float' in x)).map( 203 | {True: False, False: True}) 204 | }, 205 | '03': { 206 | 'type': 'new_col', 207 | 'col_name': 'mapped', 208 | 'function': lambda df, keys, **kwargs: df['to_map'].map({ 209 | 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5}), 210 | 'idx_function': lambda df, keys, **kwargs: 211 | ~df['to_map'].astype(str).isin(['A', 'B', 'C', 'D', 'E']) 212 | }, 213 | '04': { 214 | 'type': 'map_df', 215 | 'function': lambda df, keys, **kwargs: 216 | func_alter_merge(df, kwargs['df_mapping']), 217 | 'idx_function': lambda df, keys, **kwargs: 218 | ~df['merge_key'].isin(['key_1', 'key_2', 'key_3', np.nan]) 219 | } 220 | } 221 | 222 | df_alter_1_expected_df_issues = pd.DataFrame( 223 | [ 224 | ('test', 'None', 'None', 'df_alterations_issues.tsv', np.nan, 0, np.nan, 225 | '', 'For type new_col the function for alter_key 02 has not worked', 226 | 'done_add_1', 1, '3', var_alter_1_start_time), 227 | ('test', 'None', 'None', 'df_alterations_issues.tsv', np.nan, 0, np.nan, 228 | '', 'For type new_col the function for alter_key 03 has not worked', 229 | 'mapped', 1, '3', var_alter_1_start_time), 230 | ('test', 'None', 'None', 'df_alterations_issues.tsv', np.nan, 0, np.nan, 231 | '', 'For type map_df the function for alter_key 04 has not worked', 232 | np.nan, 1, '1', var_alter_1_start_time) 233 | ], 234 | columns=['key_1', 'key_2', 'key_3', 'file', 'sub_file', 'step_number', 235 | 'category', 'issue_short_desc', 'issue_long_desc', 'column', 236 | 'issue_count', 'issue_idx', 'grouping'] 237 | ) 238 | 239 | 240 | def test_alter_1(): 241 | data_alter_1.alter_tables(dictionary=dict_alter_1, df_mapping=df_mapping) 242 | assert 
data_alter_1.df_issues.fillna('').equals( 243 | df_alter_1_expected_df_issues.fillna('')) 244 | --------------------------------------------------------------------------------