├── .gitignore ├── README.md ├── condaenv.yml ├── data_etl ├── __init__.py ├── checks.py ├── connections.py ├── data_files.py └── general_functions.py ├── examples ├── 00_create_data.ipynb ├── 01_example.ipynb ├── 02_example.ipynb ├── 03_example.ipynb ├── 04_example.ipynb ├── 04_example.py ├── README.md ├── data │ └── .gitkeep ├── logs │ └── .gitkeep └── test_scripts │ ├── .config │ ├── alter_cols.py │ ├── checks_1.py │ ├── convert_columns.py │ ├── main.py │ ├── reporting_1.py │ └── test_reading_in.py ├── setup.py └── tests └── 00_pytest.py /.gitignore: -------------------------------------------------------------------------------- 1 | logs/*.log 2 | .idea/* 3 | pickles/* 4 | *~* 5 | data/processed/* 6 | data/deliverables/* 7 | *.pkl 8 | *.tsv 9 | *.db 10 | *.csv 11 | *.xlsx 12 | *.log 13 | .ipynb_checkpoints/* 14 | */.ipynb_checkpoints/* 15 | *.pyc 16 | docs/* 17 | data_etl.egg-info/* 18 | logs/* 19 | !logs/README_logs.md 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data ETL 2 | 3 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 4 | 5 | A package for data curation, transformation and checks. 6 | 7 | This can mean reading data in, converting it to the correct dtypes and making suitable alterations to bring it into a uniform format, or simply taking an existing data set and running checks against it. 8 | 9 | The aim is to help with data that is supplied regularly by other people or by systems. Such data may arrive as flat files, it may not be logically consistent, it may have missing values; there can be any number of problems. An issue report that states clearly what is wrong, and where, gives you something concrete to hand back to the data creators. Checks can therefore be run in bulk and quickly, and the issue reports put the responsibility for corrections back on the data creator. 10 | 11 | The checks are not limited to single columns or single values; they can consider the whole data set, or even use it in conjunction with extra data sets, because that is how data often behaves. 12 | 13 | If a downstream model relies on certain assumptions about the data, those assumptions can be written as checks and tested. 14 | 15 | Running all the checks in bulk, even when some of them raise issues, also avoids a stop-start process of fixing one problem only to discover the next. 16 | 17 | To use this package you should already have a good understanding of how the `pandas` package works. 18 | 19 | ## How to use this repository 20 | 21 | ### Setup environment 22 | 23 | There is a YML file, `condaenv.yml`, for the main requirements. 24 | 25 | ``` 26 | conda env create --file condaenv.yml 27 | ``` 28 | 29 | Then use `pip` to install the `data_etl` module: navigate to the directory containing the `setup.py` file and run: 30 | 31 | ``` 32 | pip install -e . 33 | ``` 34 | 35 | You can now import `data_etl` from within the environment. 36 | 37 | ## Examples 38 | 39 | There are multiple examples in the repository under the `examples` directory. 40 | 41 | Use the `00_create_data.ipynb` notebook first to create the data the examples run on, along with the SQLite database file that will hold any recorded issues or written-out data. 42 | 43 | The other files, both `*.ipynb` and `*.py`, are the example files themselves.
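The example notebooks walk through the wider flow: curating tables with `DataCuration`, applying `Checks`, and persisting any recorded issues with `Connections`. A condensed sketch of that flow is below; the input file, the `value` column it checks, and the database path are placeholders rather than files shipped with the repository, see the example notebooks for working values.

```python
import pandas as pd

from data_etl import Checks, Connections, DataCuration

# Curate: register the file(s) and read them in with a user supplied function
dc = DataCuration('grouping_label', 'key_1')
dc.set_file_list('data/example.csv')  # placeholder file name
dc.reading_in(
    function=lambda list_files, **kwargs: {f: pd.read_csv(f) for f in list_files})

# Check: apply a dictionary of checks to the curated tables
ch = Checks('grouping_label', 'key_1')
ch.apply_checks(
    dc.tables,
    dictionary={
        'Value should not be null': {
            'calc_condition': lambda df, col, **kwargs: df['value'].isnull()
        }
    })

# Connect: write any recorded issues out to a SQLite file
cnx = Connections()
cnx.add_cnx(
    cnx_key='issues', cnx_type='sqlite3', table_name='df_issues',
    file_path='data/issues.db', sqlite_df_issues_create=True)
if ch.get_issue_count() > 0:
    cnx.write_to_db('issues', ch.df_issues)
```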
44 | 45 | A brief code example of how to use: 46 | 47 | ```python 48 | from data_etl import Checks 49 | import pandas as pd 50 | 51 | data = pd.DataFrame([1, -3, 2], columns=['number']) 52 | 53 | # Initialise the Checks class 54 | ch_simple = Checks('grouping_label', 'key_1', 'key_2', 'key_3') 55 | 56 | # Define a simple check 57 | dict_checks = { 58 | 'Number should be greater than 0': { 59 | 'calc_condition': lambda df, col, **kwargs: df['number'] <= 0 60 | } 61 | } 62 | # Apply the checks to the tables 63 | ch_simple.apply_checks(data, dictionary=dict_checks) 64 | 65 | # If any issues are found then they are stored internal to the class as a Pandas DataFrame 66 | ch_simple.df_issues 67 | ``` 68 | -------------------------------------------------------------------------------- /condaenv.yml: -------------------------------------------------------------------------------- 1 | name: data_etl 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.6 6 | - pandas=0.24.0 7 | - pytest=5.0.1 8 | - jupyter=1.0.0 9 | - matplotlib=3.0.3 10 | - xlrd=1.2.0 11 | - pyodbc=4.0.27 12 | - openpyxl=3.0.3 13 | -------------------------------------------------------------------------------- /data_etl/__init__.py: -------------------------------------------------------------------------------- 1 | from data_etl.data_files import DataCuration 2 | from data_etl.checks import Checks 3 | from data_etl.connections import Connections 4 | from data_etl.general_functions import func_check_for_issues, \ 5 | func_initialise_logging, import_attr 6 | 7 | __all__ = [ 8 | DataCuration, Checks, Connections, func_check_for_issues, 9 | func_initialise_logging, import_attr 10 | ] 11 | __version__ = '0.1.0dev' -------------------------------------------------------------------------------- /data_etl/checks.py: -------------------------------------------------------------------------------- 1 | # Here we are defining a class that will deal with checking data sets 2 | import logging 3 | from inspect import getfullargspec 4 | from copy import deepcopy 5 | from inspect import getsourcelines 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | from data_etl.general_functions import import_attr 11 | 12 | module_logger = logging.getLogger(__name__) 13 | 14 | dict_checks_defaults = { 15 | 'columns': [np.nan], 16 | 'check_condition': 17 | lambda df, col, condition, **kwargs: condition.sum() > 0, 18 | 'count_condition': lambda df, col, condition, **kwargs: condition.sum(), 19 | 'index_position': lambda df, col, condition, **kwargs: condition, 20 | 'relevant_columns': lambda df, col, condition, **kwargs: col, 21 | 'long_description': lambda df, col, condition, **kwargs: "", 22 | 'idx_flag': True, 23 | 'category': np.nan 24 | } 25 | 26 | 27 | class Checks: 28 | __step_no = 0 29 | __key_1 = None 30 | __key_2 = None 31 | __key_3 = None 32 | __grouping = None 33 | df_issues = None 34 | __key_separator = " -:- " 35 | __checks_defaults = None 36 | 37 | def __init__(self, grouping, key_1, key_2=None, key_3=None): 38 | module_logger.info("Initialising `Checks` object") 39 | # Three keys, all good things come in threes 40 | self.__key_1 = str(key_1) 41 | self.__key_2 = str(key_2) 42 | self.__key_3 = str(key_3) 43 | self.__grouping = grouping 44 | self.__checks_defaults = dict(dict_checks_defaults) 45 | # Initialise the `df_issues` table 46 | df_issues = pd.DataFrame( 47 | columns=[ 48 | "key_1", "key_2", "key_3", "file", "sub_file", "step_number", 49 | "category", "issue_short_desc", "issue_long_desc", "column", 50 | "issue_count", 
"issue_idx", "grouping" 51 | ] 52 | ) 53 | df_issues["step_number"] = df_issues["step_number"].astype(int) 54 | self.df_issues = df_issues 55 | module_logger.info("Initialising `Checks` object complete") 56 | 57 | def error_handling(self, file, subfile, issue_short_desc, issue_long_desc, 58 | column, issue_count, issue_idx, category=np.nan): 59 | """ 60 | If an error is handled, as they all should be, we need to specify what 61 | happens with the error. By putting it into a single function it will 62 | hopefully make the code briefer. 63 | """ 64 | # TODO work out how to add in `file` and `subfile` where data is a 65 | # dictionary 66 | module_logger.info("Logging an error with `error_handling`") 67 | df = self.df_issues.copy() 68 | list_vals = [ 69 | self.__key_1, self.__key_2, self.__key_3, file, subfile, 70 | self.__step_no, category, issue_short_desc, issue_long_desc, column, 71 | issue_count, issue_idx, self.__grouping 72 | ] 73 | try: 74 | df.loc[df.shape[0]] = list_vals 75 | self.df_issues = df.copy() 76 | except: 77 | var_msg = f"Logging the issue failed, values: {list_vals}" 78 | module_logger.error(var_msg) 79 | raise ValueError(var_msg) 80 | module_logger.info(f"Error logged: {list_vals}") 81 | 82 | def set_defaults( 83 | self, columns=None, check_condition=None, count_condition=None, 84 | index_position=None, relevant_columns=None, long_description=None, 85 | idx_flag=None): 86 | module_logger.info("Starting `set_defaults`") 87 | if columns is not None: 88 | if type(columns).__name__ != 'list': 89 | var_msg = 'The `columns` argument is not a list as required' 90 | module_logger.error(var_msg) 91 | raise ValueError(var_msg) 92 | if len(columns) == 0: 93 | var_msg = ('The `columns` argument is empty, it needs to be ' 94 | 'at least length 1, this can be a null') 95 | module_logger.error(var_msg) 96 | raise ValueError(var_msg) 97 | self.__checks_defaults['columns'] = columns 98 | if check_condition is not None: 99 | self.__set_defaults_check(check_condition, 'check_condition') 100 | self.__checks_defaults['check_condition'] = check_condition 101 | if count_condition is not None: 102 | self.__set_defaults_check(count_condition, 'count_condition') 103 | self.__checks_defaults['count_condition'] = count_condition 104 | if index_position is not None: 105 | self.__set_defaults_check(index_position, 'index_position') 106 | self.__checks_defaults['index_position'] = index_position 107 | if relevant_columns is not None: 108 | self.__set_defaults_check(relevant_columns, 'relevant_columns') 109 | self.__checks_defaults['relevant_columns'] = relevant_columns 110 | if long_description is not None: 111 | self.__set_defaults_check(long_description, 'long_descriptions') 112 | self.__checks_defaults['long_description'] = long_description 113 | if idx_flag is not None: 114 | if idx_flag not in [True, False]: 115 | var_msg = 'The value of `idx_flag` need to be True or False' 116 | module_logger.error(var_msg) 117 | raise ValueError(var_msg) 118 | self.__checks_defaults['idx_flag'] = idx_flag 119 | module_logger.info("Completed `set_defaults`") 120 | 121 | @staticmethod 122 | def __set_defaults_check(function, label): 123 | module_logger.info("Starting `__set_defaults_check`") 124 | if type(function).__name__ != 'function': 125 | var_msg = f'The passed value for `{label}` is not a function' 126 | module_logger.error(var_msg) 127 | raise ValueError(var_msg) 128 | arg_spec = getfullargspec(function) 129 | if arg_spec.args != ['df', 'col', 'condition']: 130 | var_msg = ( 131 | f'The arguments passed in 
for the function `{label}` does not ' 132 | f'match with the required args: df, col, condition') 133 | module_logger.error(var_msg) 134 | raise ValueError(var_msg) 135 | if arg_spec.varkw != 'kwargs': 136 | var_msg = (f'The **kwargs argument has not been provided for ' 137 | f'`{label}` and is required') 138 | module_logger.error(var_msg) 139 | raise ValueError(var_msg) 140 | module_logger.info("Completed `__set_defaults_check`") 141 | 142 | def set_key_separator(self, separator): 143 | module_logger.info("Starting `set_key_separator`") 144 | if (type(separator).__name__ != "str") | (len(separator) == 0): 145 | var_msg = ("The argument `separator` for function " 146 | "`set_key_separator` should be a string of length " 147 | "greater than 0") 148 | module_logger.error(var_msg) 149 | raise ValueError(var_msg) 150 | self.__key_separator = separator 151 | module_logger.info(f"Completed `set_key_separator`, the key separator " 152 | f"is: {self.__key_separator}") 153 | 154 | def apply_checks( 155 | self, tables, path=None, script_name=None, 156 | object_name="dict_checks", dictionary=None, **kwargs): 157 | module_logger.info("Starting `apply_checks`") 158 | if (script_name is not None) & (object_name is not None): 159 | dict_checks = import_attr(path, script_name, object_name) 160 | elif dictionary is not None: 161 | if type(dictionary).__name__ != "dict": 162 | var_msg = "The `dictionary` argument is not a dictionary" 163 | module_logger.error(var_msg) 164 | raise ValueError(var_msg) 165 | dict_checks = dictionary 166 | else: 167 | var_msg = ("Either `dictionary` or both of `script_name` and " 168 | "`path` need to be none null") 169 | module_logger.error(var_msg) 170 | raise ValueError(var_msg) 171 | 172 | if type(tables).__name__ == "dict": 173 | for table_key in tables.keys(): 174 | for check_key in dict_checks.keys(): 175 | self.__apply_the_check( 176 | tables[table_key], dict_checks[check_key], check_key, 177 | table_key, **kwargs) 178 | elif type(tables).__name__ == "DataFrame": 179 | for check_key in dict_checks.keys(): 180 | self.__apply_the_check(tables, dict_checks[check_key], 181 | check_key, np.nan, **kwargs) 182 | 183 | module_logger.info("Completed `apply_checks`") 184 | 185 | def __apply_the_check( 186 | self, df, dict_check_info, check_key, table_key, **kwargs): 187 | module_logger.info(f"Starting check `{check_key}`") 188 | if "calc_condition" not in dict_check_info: 189 | var_msg = "The check requires a value for key `calc_condition`" 190 | module_logger.error(var_msg) 191 | raise AttributeError(var_msg) 192 | func_calc_condition = dict_check_info["calc_condition"] 193 | func_long_description = ( 194 | self.__checks_defaults['long_description'] if 195 | "long_description" not in dict_check_info else 196 | dict_check_info["long_description"]) 197 | func_check_condition = ( 198 | self.__checks_defaults['check_condition'] if 199 | "check_condition" not in dict_check_info else 200 | dict_check_info["check_condition"]) 201 | list_columns = ( 202 | self.__checks_defaults['columns'] if 203 | "columns" not in dict_check_info else 204 | dict_check_info["columns"]) 205 | if type(list_columns).__name__ == 'str': 206 | list_columns = [list_columns] 207 | func_count_condition = ( 208 | self.__checks_defaults['count_condition'] if 209 | "count_condition" not in dict_check_info else 210 | dict_check_info["count_condition"]) 211 | func_index_position = ( 212 | self.__checks_defaults['index_position'] if 213 | "index_position" not in dict_check_info else 214 | 
dict_check_info["index_position"]) 215 | func_relevant_columns = ( 216 | self.__checks_defaults['relevant_columns'] if 217 | "relevant_columns" not in dict_check_info else 218 | dict_check_info["relevant_columns"]) 219 | var_idx_flag = ( 220 | self.__checks_defaults['idx_flag'] if 221 | "idx_flag" not in dict_check_info else 222 | dict_check_info['idx_flag']) 223 | var_category = ( 224 | self.__checks_defaults['category'] if 225 | "category" not in dict_check_info else 226 | dict_check_info['category']) 227 | if len(list_columns) == 0: 228 | var_msg = ('The `list_columns` value somehow has length 0, needs ' 229 | 'to have at least one element, which can be `np.nan`') 230 | module_logger.error(var_msg) 231 | raise ValueError(var_msg) 232 | for col in list_columns: 233 | self.__evaluate_check( 234 | check_key, df, col, func_calc_condition, 235 | func_check_condition, func_count_condition, func_index_position, 236 | func_relevant_columns, func_long_description, var_idx_flag, 237 | var_category, table_key, **kwargs) 238 | 239 | module_logger.info(f"Completed check `{check_key}`") 240 | 241 | def __evaluate_check( 242 | self, check_key, df, col, func_calc_condition, func_check_condition, 243 | func_count_condition, func_index_position, func_relevant_columns, 244 | func_long_description, var_idx_flag, var_category, table_key, 245 | **kwargs): 246 | module_logger.info( 247 | f"Starting evaluating check `{check_key}` for column {col}") 248 | s_calc_condition = func_calc_condition(df, col, **kwargs) 249 | var_check_condition = func_check_condition( 250 | df, col, s_calc_condition, **kwargs) 251 | var_count_condition = func_count_condition( 252 | df, col, s_calc_condition, **kwargs) 253 | s_index_conditions = func_index_position( 254 | df, col, s_calc_condition, **kwargs) 255 | if var_idx_flag is False: 256 | s_index_conditions = s_index_conditions.map( 257 | {True: False, False: True}) 258 | var_relevant_columns = func_relevant_columns( 259 | df, col, s_calc_condition, **kwargs) 260 | var_long_description = func_long_description( 261 | df, col, s_calc_condition, **kwargs) 262 | if type(var_long_description).__name__ != "str": 263 | var_msg = ( 264 | f"The variable `var_long_description` is not a string! It is a" 265 | f" {type(var_long_description).__name__}") 266 | module_logger.warning(var_msg) 267 | if ( 268 | (type(var_relevant_columns).__name__ != "str") & 269 | (pd.isnull(var_relevant_columns) is False) 270 | ): 271 | var_msg = ( 272 | f"The variable `var_relevant_columns` is not a string or null! " 273 | f"It is a {type(var_relevant_columns).__name__}") 274 | module_logger.warning(var_msg) 275 | if "int" not in type(var_count_condition).__name__: 276 | var_msg = ( 277 | f"The variable `var_count_condition` is not an integer! It is a" 278 | f" {type(var_count_condition).__name__}") 279 | module_logger.warning(var_msg) 280 | if type(s_calc_condition).__name__ != "Series": 281 | var_msg = ( 282 | f"The variable `s_calc_condition` is not a Series! It is a " 283 | f"{type(s_calc_condition).__name__}") 284 | module_logger.warning(var_msg) 285 | if type(s_index_conditions).__name__ != "Series": 286 | var_msg = ( 287 | f"The variable `s_index_conditions` is not a Series! It is a " 288 | f"{type(s_index_conditions).__name__}") 289 | module_logger.warning(var_msg) 290 | if ( 291 | (type(var_category).__name__ != 'str') & 292 | (pd.isnull(var_category) is False) 293 | ): 294 | var_msg = (f'The variable `category` is not a string or null! 
It ' 295 | f'is a {type(var_category).__name__}') 296 | module_logger.warning(var_msg) 297 | if var_check_condition: 298 | if pd.isnull(table_key): 299 | var_file = np.nan 300 | var_subfile = np.nan 301 | else: 302 | var_file = table_key.split(self.__key_separator)[0] 303 | var_subfile = (table_key.split(self.__key_separator)[1] if 304 | self.__key_separator in table_key else np.nan) 305 | self.error_handling( 306 | var_file, var_subfile, check_key, var_long_description, 307 | var_relevant_columns, var_count_condition, 308 | ", ".join( 309 | [ 310 | str(item) for item in 311 | s_index_conditions.loc[ 312 | s_index_conditions].index.tolist() 313 | ] 314 | ), 315 | var_category 316 | ) 317 | module_logger.info( 318 | f"Completed evaluating check `{check_key}` for column {col}") 319 | 320 | def get_issue_count(self, issue_number_min=None, issue_number_max=None): 321 | module_logger.info("Starting `get_issue_count`") 322 | df = self.df_issues.copy() 323 | if issue_number_min is not None: 324 | df = df.loc[df["step_number"] >= issue_number_min].copy() 325 | if issue_number_max is not None: 326 | df = df.loc[df["step_number"] <= issue_number_max].copy() 327 | var_count = df.shape[0] 328 | module_logger.info("Completed `get_issue_count`") 329 | return var_count 330 | 331 | def table_look(self, table, issue_idx): 332 | module_logger.info("Starting `table_look`") 333 | if issue_idx not in self.df_issues.index.tolist(): 334 | var_msg = (f"The requested issue index, {issue_idx}, is not " 335 | f"present in the `df_issues` table") 336 | module_logger.error(var_msg) 337 | raise AttributeError(var_msg) 338 | if type(table).__name__ != 'DataFrame': 339 | var_msg = 'The `table` argument is not a DataFrame as required' 340 | module_logger.error(var_msg) 341 | raise ValueError(var_msg) 342 | df_check = table.loc[ 343 | [ 344 | int(item) for item in 345 | self.df_issues.loc[issue_idx, "issue_idx"].split(", ") 346 | ] 347 | ] 348 | module_logger.info("Completed `table_look`") 349 | return self.df_issues.loc[[issue_idx]], df_check 350 | 351 | @staticmethod 352 | def __func_summary_(key_value): 353 | if type(key_value).__name__ == 'function': 354 | var_out = ''.join([ 355 | x.strip().strip("['\\n']") for x in 356 | getsourcelines(key_value)[0] 357 | ]) 358 | if (var_out.strip()[-1] == ':') | (var_out.strip()[-1] == '('): 359 | return ('raise Exception("The definition does not allow for' 360 | ' this info to be retrieved")') 361 | var_out = var_out.split(':')[-1].strip() 362 | if var_out[-1] == ',': 363 | var_out = var_out[:-1] 364 | return var_out 365 | else: 366 | return key_value 367 | 368 | def summary(self, path=None, script_name=None, 369 | object_name="dict_checks", dictionary=None): 370 | if (script_name is not None) & (object_name is not None): 371 | dict_checks = import_attr(path, script_name, object_name) 372 | elif dictionary is not None: 373 | if type(dictionary).__name__ != "dict": 374 | var_msg = "The `dictionary` argument is not a dictionary" 375 | module_logger.error(var_msg) 376 | raise ValueError(var_msg) 377 | dict_checks = dictionary 378 | else: 379 | var_msg = ("Either `dictionary` or both of `script_name` and " 380 | "`path` need to be none null") 381 | module_logger.error(var_msg) 382 | raise ValueError(var_msg) 383 | 384 | list_keys = [ 385 | 'calc_condition', 'long_description', 'check_condition', 'columns', 386 | 'count_condition', 'index_position', 'relevant_columns', 'idx_flag', 387 | 'category' 388 | ] 389 | 390 | dict_checks_values = deepcopy(dict_checks) 391 | for check in [key 
for key in dict_checks_values.keys()]: 392 | for key in [key for key in list_keys if 393 | key not in dict_checks_values[check].keys()]: 394 | dict_checks_values[check][key] = self.__checks_defaults[key] 395 | 396 | for check in [key for key in dict_checks_values.keys()]: 397 | for key in [key for key in dict_checks_values[check].keys()]: 398 | dict_checks_values[check][key] = self.__func_summary_( 399 | dict_checks_values[check][key]) 400 | 401 | df_summary = pd.DataFrame( 402 | dict_checks_values 403 | ).T.reset_index().rename(columns={'index': 'check'}) 404 | 405 | return {'df': df_summary, 'dict': dict_checks} 406 | 407 | def set_step_no(self, step_no): 408 | """ 409 | Set the step number, this allows errors to be recorded against a 410 | specific step which in turn can help with issue tracking and checking 411 | once issues are recorded. 412 | 413 | The argument step_no needs to be convertible to integer format. 414 | """ 415 | module_logger.info("Starting `set_step_no`") 416 | try: 417 | self.__step_no = int(step_no) 418 | except ValueError: 419 | var_msg = (f"Function set_step_no: The value {step_no} can not be " 420 | f"converted to int.") 421 | module_logger.error(var_msg) 422 | raise ValueError(var_msg) 423 | module_logger.info( 424 | f"Completed `set_step_no`, the step number is {self.__step_no}") 425 | 426 | def get_step_no(self): 427 | module_logger.info("Starting `get_step_no`") 428 | module_logger.info("Completed `get_step_no`") 429 | return self.__step_no 430 | -------------------------------------------------------------------------------- /data_etl/connections.py: -------------------------------------------------------------------------------- 1 | # Here we are defining a class that will deal with the various connections 2 | # required by the pipeline 3 | import logging 4 | import sqlite3 5 | import os 6 | import configparser 7 | 8 | import pandas as pd 9 | import pyodbc 10 | 11 | from data_etl.general_functions import func_to_sql 12 | 13 | module_logger = logging.getLogger(__name__) 14 | # TODO account for tables not existing and existing when writing to the cnx, 15 | # ideally any tables used should have been pre-emptively setup in the required 16 | # databases 17 | # TODO add MSSQL connection handling 18 | 19 | 20 | class Connections: 21 | __step_no = 0 22 | __df_issues = None 23 | __dict_cnx = None 24 | 25 | def __init__(self, step_no=None): 26 | module_logger.info("Initialising `Connections` object") 27 | if step_no is not None: 28 | self.set_step_no(step_no) 29 | self.__dict_cnx = { 30 | 'blank': {'cnx_type': 'blank'} 31 | } 32 | module_logger.info("Initialising `Connections` object complete") 33 | 34 | def set_step_no(self, step_no): 35 | module_logger.info(f"Starting `set_step_no`") 36 | self.__step_no = step_no 37 | module_logger.info(f"Completed `set_step_no`") 38 | 39 | def get_step_no(self): 40 | module_logger.info("Starting `get_step_no`") 41 | module_logger.info("Completed `get_step_no`") 42 | return self.__step_no 43 | 44 | def add_cnx(self, cnx_key, cnx_type, table_name, cnx_string=None, 45 | file_path=None, config_section=None, overwrite=False, 46 | timestamp_format='%Y-%m-%d', **kwargs): 47 | module_logger.info(f"Starting `add_cnx` for cnx key `{cnx_key}`") 48 | # TODO query is the file existing, if not then error out 49 | if (cnx_key in self.__dict_cnx) & (overwrite is False): 50 | var_msg = ('This connection string is already set, use the ' 51 | 'argument `overwrite=True` to overwrite') 52 | module_logger.error(var_msg) 53 | raise ValueError(var_msg) 
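        # Validate the connection type and its supporting arguments before anything is stored:
        # only 'sqlite3' and 'db' (pyodbc) connections are handled at present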
54 | if cnx_type not in ['sqlite3', 'db']: 55 | var_msg = ( 56 | 'The `cnx_type` argument only takes values `sqlite3`, `db`') 57 | module_logger.error(var_msg) 58 | raise AttributeError(var_msg) 59 | if (table_name is None) & (cnx_type in ['sqlite3', 'db']): 60 | var_msg = 'The argument `table_name` is required' 61 | module_logger.error(var_msg) 62 | raise AttributeError(var_msg) 63 | if (file_path is None) & (cnx_type in ['sqlite3', 'db']): 64 | var_msg = 'The argument `file_path` is required' 65 | module_logger.error(var_msg) 66 | raise AttributeError(var_msg) 67 | if ( 68 | (not os.path.exists(file_path)) & 69 | (cnx_string is None) & 70 | (cnx_type in ['db']) 71 | ): 72 | var_msg = ( 73 | f'The `file_path` to the config file {file_path} is not valid, ' 74 | f'the `file_path` is expected since the `cnx_string` is None' 75 | ) 76 | module_logger.error(var_msg) 77 | raise AttributeError(var_msg) 78 | if ( 79 | (not os.path.exists(os.path.dirname(file_path))) & 80 | (cnx_type in ['sqlite3']) 81 | ): 82 | var_msg = ( 83 | f'The folder path {os.path.dirname(file_path)} is not valid') 84 | module_logger.error(var_msg) 85 | raise AttributeError(var_msg) 86 | if (not os.path.exists(file_path)) & (cnx_type in ['sqlite3']): 87 | var_msg = (f'The `file_path` {file_path} is not valid so this ' 88 | f'file will be created') 89 | module_logger.warning(var_msg) 90 | if cnx_type == 'sqlite3': 91 | module_logger.info( 92 | f'The information is: {cnx_type}, {file_path}, {table_name}') 93 | self.__dict_cnx[cnx_key] = { 94 | 'cnx_type': cnx_type, 95 | 'file_path': file_path, 96 | 'table_name': table_name 97 | } 98 | elif cnx_type == 'db': 99 | if (config_section is None) & (cnx_string is None): 100 | var_msg = ('The argument `config_section` or `cnx_string` is ' 101 | 'required for `cnx_type=db`') 102 | module_logger.error(var_msg) 103 | raise AttributeError(var_msg) 104 | if config_section is not None: 105 | dict_config = configparser.ConfigParser() 106 | dict_config.read(file_path) 107 | var_cnx_string = ''.join( 108 | [ 109 | f"{key}={dict_config[config_section][key]};" for 110 | key in dict_config[config_section] 111 | ] 112 | ) 113 | self.__dict_cnx[cnx_key] = { 114 | 'cnx_type': cnx_type, 115 | 'file_path': file_path, 116 | 'cnx_string': var_cnx_string , 117 | 'table_name': table_name, 118 | 'timestamp_format': timestamp_format 119 | } 120 | elif cnx_string is not None: 121 | self.__dict_cnx[cnx_key] = { 122 | 'cnx_type': cnx_type, 123 | 'file_path': file_path, 124 | 'cnx_string': cnx_string, 125 | 'table_name': table_name, 126 | 'timestamp_format': timestamp_format 127 | } 128 | self.test_cnx(cnx_key, **kwargs) 129 | module_logger.info("Completed `add_cnx`") 130 | 131 | def test_cnx(self, cnx_key, **kwargs): 132 | module_logger.info(f"Starting `test_cnx` for cnx key `{cnx_key}`") 133 | if cnx_key not in self.__dict_cnx: 134 | var_msg = f'The key {cnx_key} is not present' 135 | module_logger.error(var_msg) 136 | raise AttributeError(var_msg) 137 | dict_cnx = self.__dict_cnx[cnx_key] 138 | var_cnx_type = dict_cnx['cnx_type'] 139 | if var_cnx_type == 'sqlite3': 140 | cnx = sqlite3.connect(dict_cnx['file_path']) 141 | if kwargs.get('sqlite_df_issues_create') is True: 142 | var_create_table_sql = """ 143 | CREATE TABLE IF NOT EXISTS {} ( 144 | key_1 text, 145 | key_2 text, 146 | key_3 text, 147 | file text, 148 | sub_file text, 149 | step_number integer, 150 | category text, 151 | issue_short_desc text, 152 | issue_long_desc text, 153 | column text, 154 | issue_count integer, 155 | issue_idx text, 156 | 
grouping text 157 | ); 158 | """.format(dict_cnx['table_name']) 159 | cnx.execute(var_create_table_sql) 160 | try: 161 | pd.read_sql( 162 | f"SELECT * FROM {dict_cnx['table_name']} LIMIT 0;", 163 | cnx 164 | ) 165 | cnx.close() 166 | except: 167 | cnx.close() 168 | var_msg = 'Reading in from the table has not worked' 169 | module_logger.error(var_msg) 170 | raise AttributeError(var_msg) 171 | elif var_cnx_type == 'db': 172 | cnx = pyodbc.connect(dict_cnx['cnx_string']) 173 | try: 174 | pd.read_sql( 175 | f"SELECT TOP (0) * FROM {dict_cnx['table_name']};", 176 | cnx 177 | ) 178 | cnx.close() 179 | except: 180 | cnx.close() 181 | module_logger.info("Completed `test_cnx`") 182 | 183 | def read_from_db(self, cnx_key, sql_stmt): 184 | module_logger.info("Starting `read_from_db`") 185 | module_logger.info(f'Sql statement: {sql_stmt}') 186 | dict_cnx = self.__dict_cnx[cnx_key] 187 | var_cnx_type = dict_cnx['cnx_type'] 188 | df = pd.DataFrame() 189 | if var_cnx_type == 'blank': 190 | var_msg = 'Trying to use `read_from_db` using a blank connection' 191 | module_logger.error(var_msg) 192 | raise ValueError(var_msg) 193 | elif var_cnx_type == 'sqlite3': 194 | cnx = sqlite3.connect(dict_cnx['file_path']) 195 | try: 196 | df = pd.read_sql(sql_stmt, cnx) 197 | cnx.close() 198 | except: 199 | cnx.close() 200 | var_msg = 'Reading in using a `sqlite3` connection has failed' 201 | module_logger.error(var_msg) 202 | raise ValueError(var_msg) 203 | elif var_cnx_type == 'db': 204 | cnx = pyodbc.connect(dict_cnx['cnx_string']) 205 | try: 206 | df = pd.read_sql(sql_stmt, cnx) 207 | cnx.close() 208 | except: 209 | cnx.close() 210 | var_msg = 'Reading in using a `db` connection has failed' 211 | module_logger.error(var_msg) 212 | raise ValueError(var_msg) 213 | module_logger.info("Completed `read_from_db`") 214 | return df 215 | 216 | def write_to_db(self, cnx_key, table, batch_size=None, 217 | flag_sql_logging=False): 218 | module_logger.info("Starting `write_to_db`") 219 | dict_cnx = self.__dict_cnx[cnx_key] 220 | var_cnx_type = dict_cnx['cnx_type'] 221 | # Temp table first 222 | var_write_works = 0 223 | if var_cnx_type == 'blank': 224 | var_write_works += 1 225 | elif var_cnx_type == 'sqlite3': 226 | cnx = sqlite3.connect(dict_cnx['file_path']) 227 | cursor = cnx.cursor() 228 | var_sql = (f"CREATE TEMP TABLE temp.{dict_cnx['table_name']} AS " 229 | f"SELECT * FROM {dict_cnx['table_name']} LIMIT 0;") 230 | module_logger.info(var_sql) 231 | cursor.execute(var_sql) 232 | cnx.commit() 233 | for idx in table.index.tolist(): 234 | var_sql = "INSERT INTO temp.{} VALUES ({});".format( 235 | dict_cnx['table_name'], 236 | ', '.join( 237 | table.loc[idx].map( 238 | lambda value: 'NULL' if pd.isnull(value) else 239 | f"'{str(value)}'" 240 | ).astype(str).values.tolist() 241 | ) 242 | ) 243 | if flag_sql_logging: 244 | module_logger.info(var_sql) 245 | cursor.execute(var_sql) 246 | cnx.commit() 247 | 248 | df_test = pd.read_sql( 249 | f"SELECT * FROM temp.{dict_cnx['table_name']}", cnx) 250 | 251 | if df_test.shape[0] == table.shape[0]: 252 | var_write_works += 1 253 | 254 | cnx.close() 255 | elif var_cnx_type == 'db': 256 | cnx = pyodbc.connect(dict_cnx['cnx_string']) 257 | cursor = cnx.cursor() 258 | 259 | var_sql = (f"DROP TABLE IF EXISTS #Temp " 260 | f"SELECT TOP(0) * INTO #Temp " 261 | f"FROM {dict_cnx['table_name']}") 262 | module_logger.info(var_sql) 263 | cursor.execute(var_sql) 264 | cnx.commit() 265 | 266 | var_sql_template = "INSERT INTO #Temp ([{}]) VALUES {}".format( 267 | "], 
[".join(table.columns.tolist()), 268 | '{}' 269 | ) 270 | module_logger.info(var_sql_template) 271 | s_sql_values = table.apply( 272 | lambda s: s.map( 273 | lambda x: func_to_sql(x, dict_cnx['timestamp_format'])) 274 | ).apply( 275 | lambda r: f"({', '.join(r)})", axis=1) 276 | var_iloc_min = 0 277 | for i in range(1, int(s_sql_values.shape[0] / batch_size) + 2): 278 | s_filtered = s_sql_values.iloc[ 279 | var_iloc_min:(i * batch_size)] 280 | var_sql = var_sql_template.format( 281 | ", ".join(s_filtered.values.tolist())) 282 | if flag_sql_logging: 283 | module_logger.info(var_sql) 284 | cursor.execute(var_sql) 285 | cnx.commit() 286 | var_iloc_min = i * batch_size 287 | 288 | df_test = pd.read_sql("SELECT * FROM #Temp", cnx) 289 | 290 | if df_test.shape[0] == table.shape[0]: 291 | var_write_works += 1 292 | 293 | cnx.close() 294 | 295 | if var_write_works == 0: 296 | var_msg = ('The writing to a temporary table has not worked, ' 297 | 'will not try writing to main table') 298 | module_logger.error(var_msg) 299 | raise ValueError(var_msg) 300 | if var_write_works > 1: 301 | var_msg = ('The writing to a temporary table has happened ' 302 | 'multiple times, will not try writing to main table') 303 | module_logger.error(var_msg) 304 | raise ValueError(var_msg) 305 | # Then move to the main table only if the temporary table worked 306 | if var_cnx_type == 'blank': 307 | pass 308 | elif var_cnx_type == 'sqlite3': 309 | cnx = sqlite3.connect(dict_cnx['file_path']) 310 | try: 311 | table.to_sql(dict_cnx['table_name'], cnx, 312 | index=False, if_exists='append') 313 | cnx.close() 314 | except: 315 | cnx.close() 316 | var_msg = 'Writing to the table has not worked' 317 | module_logger.error(var_msg) 318 | raise ValueError(var_msg) 319 | elif var_cnx_type == 'db': 320 | cnx = pyodbc.connect(dict_cnx['cnx_string']) 321 | cursor = cnx.cursor() 322 | try: 323 | var_sql_template = "INSERT INTO {} ([{}]) VALUES {}".format( 324 | dict_cnx['table_name'], 325 | "], [".join(table.columns.tolist()), 326 | '{}' 327 | ) 328 | s_sql_values = table.apply( 329 | lambda s: s.map( 330 | lambda x: func_to_sql(x, dict_cnx['timestamp_format'])) 331 | ).apply( 332 | lambda r: f"({', '.join(r)})", axis=1) 333 | var_iloc_min = 0 334 | for i in range(1, int(s_sql_values.shape[0] / batch_size) + 2): 335 | s_filtered = s_sql_values.iloc[ 336 | var_iloc_min:(i * batch_size)] 337 | var_sql = var_sql_template.format( 338 | ", ".join(s_filtered.values.tolist())) 339 | if flag_sql_logging: 340 | module_logger.info(var_sql) 341 | cursor.execute(var_sql) 342 | cnx.commit() 343 | var_iloc_min = i * batch_size 344 | cnx.close() 345 | except: 346 | cnx.close() 347 | var_msg = 'Writing to the table has not worked' 348 | module_logger.error(var_msg) 349 | raise ValueError(var_msg) 350 | 351 | module_logger.info("Completed `write_to_db`") 352 | 353 | def get_cnx_keys(self): 354 | module_logger.info("Starting `get_cnx_keys`") 355 | module_logger.info("Completed `get_cnx_keys`") 356 | return [x for x in self.__dict_cnx.keys()] 357 | -------------------------------------------------------------------------------- /data_etl/data_files.py: -------------------------------------------------------------------------------- 1 | # Here we are defining a class that will deal with all the data storage and 2 | # manipulations 3 | import logging 4 | 5 | import pandas as pd 6 | import numpy as np 7 | 8 | from data_etl.general_functions import import_attr 9 | 10 | module_logger = logging.getLogger(__name__) 11 | 12 | 13 | class DataCuration: 14 | 
__step_no = 0 15 | df_issues = None 16 | headers = None 17 | __key_1 = None 18 | __key_2 = None 19 | __key_3 = None 20 | __grouping = None 21 | tables = None 22 | formed_tables = None 23 | list_files = None 24 | __key_separator = " -:- " 25 | __link_headers = None 26 | 27 | def __init__(self, grouping, key_1, key_2=None, key_3=None): 28 | """ 29 | All data actions are taken on all tables, the aim is to process data to 30 | end up with a uniform data set that can be utilised and is consistent. 31 | 32 | The three arguments are individual identifiers for the data. 33 | 34 | The end form would be a pipeline that has regular data ingests. 35 | """ 36 | module_logger.info("Initialising `DataCuration` object") 37 | # Three keys, all good things come in threes 38 | self.__key_1 = str(key_1) 39 | self.__key_2 = str(key_2) 40 | self.__key_3 = str(key_3) 41 | self.__grouping = grouping 42 | # sub_file, e.g. sheet for a spreadsheet, may not always be applicable 43 | df_issues = pd.DataFrame( 44 | columns=[ 45 | "key_1", "key_2", "key_3", "file", "sub_file", "step_number", 46 | "category", "issue_short_desc", "issue_long_desc", "column", 47 | "issue_count", "issue_idx", "grouping" 48 | ] 49 | ) 50 | df_issues["step_number"] = df_issues["step_number"].astype(int) 51 | self.df_issues = df_issues 52 | self.tables = dict() 53 | self.formed_tables = dict() 54 | self.list_files = list() 55 | self.__link_headers = dict() 56 | module_logger.info("Initialising `DataCuration` object complete") 57 | 58 | def error_handling(self, file, subfile, issue_short_desc, issue_long_desc, 59 | column, issue_count, issue_idx, category=np.nan): 60 | """ 61 | If an error is handled, as they all should be, we need to specify what 62 | happens with the error. By putting it into a single function it will 63 | hopefully make the code briefer. 64 | """ 65 | module_logger.info("Logging an error with `error_handling`") 66 | df = self.df_issues.copy() 67 | list_vals = [ 68 | self.__key_1, self.__key_2, self.__key_3, file, subfile, 69 | self.__step_no, category, issue_short_desc, issue_long_desc, column, 70 | issue_count, issue_idx, self.__grouping 71 | ] 72 | try: 73 | df.loc[df.shape[0]] = list_vals 74 | self.df_issues = df.copy() 75 | except: 76 | var_msg = f"Logging the issue failed for values: {list_vals}" 77 | module_logger.error(var_msg) 78 | raise ValueError(var_msg) 79 | module_logger.info(f"Error logged: {list_vals}") 80 | 81 | def set_step_no(self, step_no): 82 | """ 83 | Set the step number, this allows errors to be recorded against a 84 | specific step which in turn can help with issue tracking and checking 85 | once issues are recorded. 86 | 87 | The argument step_no needs to be convertible to integer format. 88 | """ 89 | module_logger.info("Starting `set_step_no`") 90 | try: 91 | self.__step_no = int(step_no) 92 | except ValueError: 93 | var_msg = (f"Function set_step_no: The value {step_no} can not be " 94 | f"converted to int.") 95 | module_logger.error(var_msg) 96 | raise ValueError(var_msg) 97 | module_logger.info( 98 | f"Completed `set_step_no`, the step number is {self.__step_no}") 99 | 100 | def set_key_separator(self, separator): 101 | """ 102 | The key separator is used in the error handling section to split out the 103 | file and sub file portions of the dictionary keys of the files read in. 104 | 105 | So if you have a key of 'file name -:- sheet name', for tables read in 106 | from an Excel file, and an issue is found. 
The associated issues log 107 | entry will then have a file value of 'file name' and a sub file value of 108 | 'sheet name'. 109 | """ 110 | module_logger.info("Starting `set_key_separator`") 111 | if (type(separator).__name__ != "str") | (len(separator) == 0): 112 | var_msg = ("The argument `separator` for function " 113 | "`set_key_separator` should be a string of length " 114 | "greater than 0") 115 | module_logger.error(var_msg) 116 | raise ValueError(var_msg) 117 | self.__key_separator = separator 118 | module_logger.info(f"Completed `set_key_separator`, the key separator " 119 | f"is: {self.__key_separator}") 120 | 121 | def set_file_list(self, list_files, append=False): 122 | """ 123 | If there is a know list of files then define them here rather than 124 | setting a function to find the files. 125 | """ 126 | module_logger.info("Starting `set_file_list`") 127 | var_type = type(list_files).__name__ 128 | if (var_type != "list") & (var_type != "str"): 129 | var_msg = ("The type of the `list_files` argument is not a list or " 130 | "a string.") 131 | module_logger.error(var_msg) 132 | raise ValueError(var_msg) 133 | elif var_type == "str": 134 | if len(list_files) == 0: 135 | var_msg = ("The length of the `list_files` argument is 0, it " 136 | "needs to be a valid value.") 137 | module_logger.error(var_msg) 138 | raise ValueError(var_msg) 139 | list_files = [list_files] 140 | elif var_type == 'list': 141 | if len(list_files) == 0: 142 | var_msg = ("The length of the `list_files` argument is 0, it " 143 | "needs to be a valid value.") 144 | module_logger.error(var_msg) 145 | raise ValueError(var_msg) 146 | list_files = list_files 147 | else: 148 | var_msg = (f"Unhandled type for function `set_file_list`: " 149 | f"{var_type}") 150 | module_logger.error(var_msg) 151 | raise ValueError(var_msg) 152 | 153 | if append: 154 | self.list_files += list_files 155 | else: 156 | self.list_files = list_files 157 | module_logger.info(f"Completed `set_file_list`, the list of files is: " 158 | f"{self.list_files}") 159 | 160 | def find_files(self, path=None, script_name=None, 161 | func_name="list_the_files", function=None, files_path='.', 162 | append=False, **kwargs): 163 | """ 164 | Using an externally defined function, as specified in the module 165 | argument script, acquire a list of files to be read in. 166 | 167 | In the case that we want to accumulate a list of files from different 168 | main paths there is an append option. 169 | """ 170 | module_logger.info("Starting `find_files`") 171 | # TODO move this to an internal function as it's used so often! 
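        # Resolve the file-listing function: import it from the named script when `script_name`
        # is given, otherwise fall back to the callable passed in via `function`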
172 | if script_name is not None: 173 | function = import_attr(path, script_name, func_name) 174 | elif function is not None: 175 | if type(function).__name__ != "function": 176 | var_msg = "The `function` argument needs to be a function" 177 | module_logger.error(var_msg) 178 | raise ValueError(var_msg) 179 | else: 180 | var_msg = ("One of `script_name` or `function` needs to be not " 181 | "None in the function `find_files`") 182 | module_logger.error(var_msg) 183 | raise ValueError(var_msg) 184 | list_files = function(files_path, **kwargs) 185 | # TODO move these to be calls on the self.set_file_list function instead 186 | # of setting the value here 187 | if append: 188 | self.list_files += list_files 189 | else: 190 | self.list_files = list_files 191 | module_logger.info( 192 | f"Completed `find_files`, the list of files is: {self.list_files}") 193 | 194 | def reading_in(self, path=None, script_name=None, func_name="read_files", 195 | function=None, overwrite=True, **kwargs): 196 | """ 197 | Using an externally defined reading in function, and the internally 198 | defined list of files, read in each of the tables required. 199 | 200 | `path` being the relative script file path 201 | """ 202 | module_logger.info("Starting `reading_in`") 203 | if type(self.tables).__name__ != "dict": 204 | var_msg = ("The tables need to be in dictionary format for this " 205 | "`self.reading_in` step") 206 | module_logger.error(var_msg) 207 | raise ValueError(var_msg) 208 | if function is not None: 209 | if type(function).__name__ != "function": 210 | var_msg = ("The function passed to `self.reading_in` is not a " 211 | "function.") 212 | module_logger.error(var_msg) 213 | raise ValueError(var_msg) 214 | elif script_name is not None: 215 | function = import_attr(path, script_name, func_name) 216 | else: 217 | var_msg = ("One of the `function` or `script_name` arguments needs " 218 | "to be completed. 
And if `script name is then `path` " 219 | "needs to be too.") 220 | module_logger.error(var_msg) 221 | raise ValueError(var_msg) 222 | 223 | try: 224 | dfs = function(self.list_files, **kwargs) 225 | except AttributeError: 226 | if len([x for x in kwargs.keys()]) > 0: 227 | var_msg = (f"Function reading_in, kwargs may have been passed " 228 | f"when the function {func_name} in the script " 229 | f"{script_name} does not take kwargs") 230 | else: 231 | var_msg = (f"Function reading in: The {func_name} function " 232 | f"does not exist in the {script_name} script.") 233 | module_logger.error(var_msg) 234 | raise AttributeError(var_msg) 235 | if overwrite is False: 236 | df_org = self.tables.copy() 237 | df_org.update(dfs) 238 | elif overwrite is True: 239 | pass 240 | else: 241 | var_msg = ("The attribute `overwrite` in the function " 242 | "`reading_in` needs to be `True` or `False`") 243 | module_logger.error(var_msg) 244 | raise ValueError(var_msg) 245 | self.set_table(dfs, overwrite=overwrite) 246 | if type(dfs).__name__ == "DataFrame": 247 | module_logger.info(f"The table has shape '{dfs.shape}'") 248 | else: 249 | for key in dfs: 250 | module_logger.info( 251 | f"The table with key '{key}' has shape '{dfs[key].shape}'") 252 | 253 | module_logger.info("Completed `reading_in`") 254 | 255 | def set_table(self, tables, dict_key=None, overwrite=True): 256 | """ 257 | If self.tables is a dictionary set df to key else overwrite existing 258 | table if argument is True 259 | """ 260 | module_logger.info("Starting `set_table`") 261 | if (overwrite is True) & (dict_key is None): 262 | self.tables = tables 263 | elif ( 264 | (overwrite is True) & 265 | (dict_key is not None) & 266 | (type(self.tables).__name__ == 'dict') & 267 | (type(tables).__name__ == 'DataFrame') 268 | ): 269 | self.tables[dict_key] = tables 270 | elif ( 271 | (overwrite is False) & 272 | (dict_key is not None) & 273 | (type(self.tables).__name__ == 'dict') & 274 | (type(tables).__name__ == 'DataFrame') 275 | ): 276 | if dict_key not in [key for key in self.tables.keys()]: 277 | self.tables[dict_key] = tables 278 | else: 279 | var_msg = ( 280 | f'The combination of attributes has resulted in no change: ' 281 | f'`self.tables` type - {type(self.tables).__name__}, ' 282 | f'`tables` type - {type(tables).__name__}, `dict_key` - ' 283 | f'{dict_key}, `overwrite` - {overwrite}') 284 | module_logger.error(var_msg) 285 | raise AttributeError(var_msg) 286 | else: 287 | var_msg = ( 288 | f'The combination of attributes has resulted in no change: ' 289 | f'`self.tables` type - {type(self.tables).__name__}, `tables` ' 290 | f'type - {type(tables).__name__}, `dict_key` - {dict_key}, ' 291 | f'`overwrite` - {overwrite}') 292 | module_logger.error(var_msg) 293 | raise AttributeError(var_msg) 294 | module_logger.info("Completed `set_table`") 295 | 296 | def concatenate_tables(self): 297 | """ 298 | Where the tables are in a dictionary format put them into a DataFrame 299 | """ 300 | module_logger.info("Starting `concatenate_tables`") 301 | if type(self.tables).__name__ != "dict": 302 | var_msg = ("For the function `concatenate_tables` the `tables` " 303 | "should be in dictionary format") 304 | module_logger.error(var_msg) 305 | raise ValueError(var_msg) 306 | if len([key for key in self.tables.keys()]) > 1: 307 | df = pd.concat(self.tables, axis=1) 308 | elif len([key for key in self.tables.keys()]) == 1: 309 | dict_df = self.tables.copy() 310 | dict_key = [key for key in dict_df.keys()][0] 311 | df = dict_df[dict_key].copy() 312 | 
df['level_0'] = dict_key 313 | else: 314 | var_msg = "The dictionary `self.tables` is empty" 315 | module_logger.error(var_msg) 316 | raise AttributeError(var_msg) 317 | self.set_table(df, overwrite=True) 318 | module_logger.info("Completed `concatenate_tables`") 319 | 320 | def dictionary_tables(self, key=None): 321 | """ 322 | Where the tables are in a DataFrame format put them in a dictionary, 323 | using the values in the key column as the new dictionary keys 324 | """ 325 | module_logger.info("Starting `dictionary_tables`") 326 | if type(self.tables).__name__ != "DataFrame": 327 | var_msg = ("For the function `dictionary_tables` the `tables` " 328 | "should be in DataFrame format.") 329 | module_logger.error(var_msg) 330 | raise ValueError(var_msg) 331 | df = self.tables 332 | dict_dfs = dict() 333 | 334 | if key is not None: 335 | var_cycle = key 336 | else: 337 | var_cycle = "level_0" 338 | if var_cycle not in self.tables.columns.tolist(): 339 | var_msg = f"There is no {var_cycle} column present in the table" 340 | module_logger.error(var_msg) 341 | raise ValueError(var_msg) 342 | for val in df[var_cycle].unique().tolist(): 343 | dict_dfs[val] = df.loc[df[var_cycle] == val].copy() 344 | self.set_table(dict_dfs) 345 | 346 | module_logger.info("Completed `dictionary_tables`") 347 | 348 | def set_comparison_headers( 349 | self, path=None, script_name=None, func_name="read_headers", 350 | function=None, dictionary=None, **kwargs): 351 | # TODO Need to see if we can isolate just a set of new tables? Maybe 352 | # have a list of dictionary keys that have had their headers done 353 | # already? 354 | module_logger.info("Starting `set_comparison_headers`") 355 | 356 | if function is not None: 357 | if type(function).__name__ != "function": 358 | var_msg = ("The function passed to " 359 | "`self.set_comparison_headers` is not a function.") 360 | module_logger.error(var_msg) 361 | raise ValueError(var_msg) 362 | elif script_name is not None: 363 | function = import_attr(path, script_name, func_name) 364 | elif dictionary is not None: 365 | def function(**kwargs): return dictionary 366 | else: 367 | var_msg = ("One of the `function` or `script_name` arguments needs " 368 | "to be completed. 
And if `script name is then `path` " 369 | "needs to be too.") 370 | module_logger.error(var_msg) 371 | raise ValueError(var_msg) 372 | 373 | try: 374 | dict_headers = function(**kwargs) 375 | except AttributeError: 376 | if len([x for x in kwargs.keys()]) > 0: 377 | var_msg = ( 378 | f"Function set_comparison_headers, kwargs may have been " 379 | f"passed when the function {func_name} in the script " 380 | f"{script_name} does not take kwargs") 381 | else: 382 | var_msg = ( 383 | f"Function set_comparison_headers: The {func_name} function" 384 | f" does not exist in the {script_name} script.") 385 | module_logger.error(var_msg) 386 | raise AttributeError(var_msg) 387 | 388 | if type(dict_headers).__name__ != 'dict': 389 | var_msg = 'The headers output should be a dictionary' 390 | module_logger.error(var_msg) 391 | raise Exception(var_msg) 392 | list_keys = [ 393 | key for key in dict_headers.keys() if key != 'ideal_headers'] 394 | list_keys = [ 395 | key for key in list_keys if 396 | (dict_headers[key].get('expected_headers') is None) | 397 | (dict_headers[key].get('new_headers') is None) | 398 | (dict_headers[key].get('remove') is None) 399 | ] 400 | if len(list_keys) > 0: 401 | var_msg = ( 402 | f'There are dictionary keys that do not have all the required ' 403 | f'values: {", ".join([str(key) for key in list_keys])}') 404 | module_logger.error(var_msg) 405 | raise Exception(var_msg) 406 | if dict_headers.get('ideal_headers') is None: 407 | var_msg = ('There needs to be a key to the headers dictionary that' 408 | ' is "ideal_headers"') 409 | module_logger.error(var_msg) 410 | raise Exception(var_msg) 411 | if type(dict_headers.get('ideal_headers')).__name__ != 'list': 412 | var_msg = 'The value of key "ideal_headers" needs to be a list' 413 | module_logger.error(var_msg) 414 | raise Exception(var_msg) 415 | 416 | self.headers = dict(dict_headers) 417 | 418 | module_logger.info( 419 | f"There are {len(dict_headers)} header keys and they are: " 420 | f"{', '.join([key for key in dict_headers.keys()])}") 421 | 422 | module_logger.info("Completed `set_comparison_headers`") 423 | 424 | @staticmethod 425 | def _link_headers(tables, headers, **kwargs): 426 | dict_link = dict() 427 | list_headers_keys = [ 428 | key for key in headers.keys() if key != 'ideal_headers'] 429 | if type(tables).__name__ == 'dict': 430 | for df_key in [key for key in tables.keys()]: 431 | for header_set in list_headers_keys: 432 | list_expected = headers[header_set]['expected_headers'] 433 | if list_expected == tables[ 434 | df_key].iloc[:len(list_expected)].values.tolist()[0]: 435 | dict_link[df_key] = header_set 436 | break 437 | else: 438 | for header_set in list_headers_keys: 439 | list_expected = headers[header_set]['expected_headers'] 440 | if list_expected == tables.iloc[ 441 | :len(list_expected)].values.tolist()[0]: 442 | dict_link['combined'] = header_set 443 | break 444 | return dict_link 445 | 446 | def link_headers(self, path=None, script_name=None, 447 | func_name="link_headers", function=None, **kwargs): 448 | # TODO Need to see if we can isolate just a set of new tables? Maybe 449 | # have a list of dictionary keys that have had their headers 450 | # done already? 
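        # Decide how to match header sets to tables: use the supplied `function`, or import one
        # from `script_name`, otherwise fall back to the default matcher `self._link_headers`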
451 | module_logger.info("Starting `link_headers`") 452 | 453 | if function is not None: 454 | if type(function).__name__ != "function": 455 | var_msg = ("The function passed to `self.link_headers` is " 456 | "not a function.") 457 | module_logger.error(var_msg) 458 | raise ValueError(var_msg) 459 | elif script_name is not None: 460 | function = import_attr(path, script_name, func_name) 461 | else: 462 | function = self._link_headers 463 | 464 | try: 465 | dict_link = function(self.tables, self.headers, **kwargs) 466 | except AttributeError: 467 | if len([x for x in kwargs.keys()]) > 0: 468 | var_msg = ( 469 | f"Function link_headers, kwargs may have been passed when " 470 | f"the function {func_name} in the script {script_name} does" 471 | f" not take kwargs") 472 | else: 473 | var_msg = (f"Function link_headers: The {func_name} function " 474 | f"does not exist in the {script_name} script.") 475 | module_logger.error(var_msg) 476 | raise AttributeError(var_msg) 477 | 478 | list_unallocated_keys = set(self.tables.keys()) - set(dict_link.keys()) 479 | if len(list_unallocated_keys) != 0: 480 | var_msg = (f"Not all the headers are linked, the unlinked tables " 481 | f"are: {list_unallocated_keys}") 482 | module_logger.error(var_msg) 483 | raise ValueError(var_msg) 484 | 485 | self.__link_headers = dict(dict_link) 486 | 487 | module_logger.info("Completed `link_headers`") 488 | 489 | @staticmethod 490 | def __assert_linked_headers( 491 | list_ideal_headers, dict_header, df, remove_header_rows, reset_index): 492 | list_expected_headers = dict_header['expected_headers'] 493 | list_new_names = dict_header['new_headers'] 494 | list_remove = [ 495 | dict_header['new_headers'][i] for i in range(len(dict_header['remove'])) 496 | if dict_header['remove'][i] == 'remove' 497 | ] 498 | 499 | # Remove the expected headers rows 500 | if remove_header_rows: 501 | df.drop( 502 | [i for i in range(len(list_expected_headers))], 503 | axis=0, 504 | inplace=True) 505 | if reset_index: 506 | df.reset_index(drop=True, inplace=True) 507 | 508 | # Set the new headers 509 | df.columns = list_new_names 510 | 511 | # Remove the columns to remove 512 | if len(list_remove) > 0: 513 | df.drop(list_remove, axis=1, inplace=True) 514 | 515 | # Fill in missing columns and reorder columns 516 | list_df_cols = df.columns.tolist() 517 | list_cols = [ 518 | col for col in list_ideal_headers if col not in list_df_cols] 519 | for col in list_cols: 520 | df[col] = np.nan 521 | 522 | df = df[list_ideal_headers].copy() 523 | 524 | return df 525 | 526 | def assert_linked_headers( 527 | self, remove_header_rows=False, reset_index=False): 528 | module_logger.info("Starting `assert_linked_headers`") 529 | 530 | if type(self.tables).__name__ == 'dict': 531 | dict_dfs = dict(self.tables) 532 | for key in [key for key in self.__link_headers.keys()]: 533 | dict_dfs[key] = self.__assert_linked_headers( 534 | self.headers['ideal_headers'], 535 | self.headers[self.__link_headers[key]], 536 | dict_dfs[key], 537 | remove_header_rows, 538 | reset_index 539 | ) 540 | self.set_table(dict(dict_dfs)) 541 | else: 542 | key = [key for key in self.__link_headers.keys()][0] 543 | df = self.__assert_linked_headers( 544 | self.headers['ideal_headers'], 545 | self.headers[self.__link_headers[key]], 546 | self.tables, 547 | remove_header_rows, 548 | reset_index 549 | ) 550 | self.set_table(df.copy()) 551 | 552 | module_logger.info("Completed `assert_linked_headers`") 553 | 554 | def set_headers( 555 | self, path=None, script_name=None, func_name=None, 
list_cols=None, 556 | function=None, ideal_headers=None, required_headers=None): 557 | module_logger.info("Starting `set_headers`") 558 | if list_cols is not None: 559 | if type(list_cols).__name__ != "list": 560 | var_msg = ("The argument `list_cols` of function `set_headers` " 561 | "needs to be a list") 562 | module_logger.error(var_msg) 563 | raise ValueError(var_msg) 564 | elif function is not None: 565 | if type(function).__name__ != "function": 566 | var_msg = ("The argument `function` of function `set_headers` " 567 | "needs to be a function") 568 | module_logger.error(var_msg) 569 | raise ValueError(var_msg) 570 | elif script_name is not None: 571 | function = import_attr(path, script_name, func_name) 572 | elif ideal_headers is not None: 573 | if type(ideal_headers).__name__ != 'list': 574 | var_msg = ("The argument `ideal_headers` of function " 575 | "`set_headers` needs to be a list") 576 | module_logger.error(var_msg) 577 | raise ValueError(var_msg) 578 | elif required_headers is not None: 579 | if type(required_headers).__name__ != 'list': 580 | var_msg = ("The argument `required_headers` of function " 581 | "`set_headers` needs to be a list") 582 | module_logger.error(var_msg) 583 | raise ValueError(var_msg) 584 | var_type = type(self.tables).__name__ 585 | if var_type == "dict": 586 | dict_dfs = self.tables.copy() 587 | var_cond = len( 588 | set([dict_dfs[key].shape[1] for key in dict_dfs.keys()])) 589 | var_cond = var_cond != 1 590 | if var_cond: 591 | var_msg = ("There are an inconsistent number of columns " 592 | "present in the dictionary of tables") 593 | module_logger.error(var_msg) 594 | raise ValueError(var_msg) 595 | if list_cols is not None: 596 | if (len(list_cols) != 597 | dict_dfs[[x for x in dict_dfs.keys()][0]].shape[1]): 598 | var_msg = ("The length of `list_cols` is different to the " 599 | "number of columns present in the table") 600 | module_logger.error(var_msg) 601 | raise ValueError(var_msg) 602 | elif function is not None: 603 | list_cols_org = dict_dfs[ 604 | [x for x in dict_dfs.keys()][0] 605 | ].columns.tolist() 606 | list_cols = [function(x) for x in list_cols_org] 607 | for key in dict_dfs.keys(): 608 | if list_cols is not None: 609 | dict_dfs[key].columns = list_cols 610 | elif function is not None: 611 | dict_dfs[key].columns = list_cols 612 | elif ideal_headers is not None: 613 | for col in [ 614 | col for col in ideal_headers if 615 | col not in dict_dfs[key].columns.tolist() 616 | ]: 617 | dict_dfs[key][col] = np.nan 618 | dict_dfs[key] = dict_dfs[key][ideal_headers].copy() 619 | elif required_headers is not None: 620 | for col in [ 621 | col for col in required_headers if 622 | col not in dict_dfs[key].columns.tolist() 623 | ]: 624 | dict_dfs[key][col] = np.nan 625 | self.set_table(dict_dfs, overwrite=True) 626 | elif var_type == "DataFrame": 627 | if len(list_cols) != self.tables.shape[1]: 628 | var_msg = ("The length of `list_cols` is different to the " 629 | "number of columns present in the table") 630 | module_logger.error(var_msg) 631 | raise ValueError(var_msg) 632 | df = self.tables.copy() 633 | if list_cols is not None: 634 | df.columns = list_cols 635 | elif function is not None: 636 | df.columns = [function(x) for x in df.columns.tolist()] 637 | elif ideal_headers is not None: 638 | for col in [ 639 | col for col in ideal_headers if 640 | col not in df.columns.tolist() 641 | ]: 642 | df[col] = np.nan 643 | df = df[ideal_headers].copy() 644 | elif required_headers is not None: 645 | for col in [ 646 | col for col in 
required_headers if 647 | col not in df.columns.tolist() 648 | ]: 649 | df[col] = np.nan 650 | self.set_table(df, overwrite=True) 651 | else: 652 | var_msg = ("Somehow the tables are not a dictionary or a DataFrame " 653 | "for function `set_headers`") 654 | module_logger.error(var_msg) 655 | raise ValueError(var_msg) 656 | 657 | module_logger.info("Completed `set_headers`") 658 | 659 | def alter_tables(self, path=None, script_name=None, 660 | object_name="dict_alter", dictionary=None, **kwargs): 661 | """ 662 | Use this functionality to make alterations to the table(s) 663 | """ 664 | module_logger.info("Starting `alter_tables`") 665 | # TODO move this check to own function (applies to convert_columns too) 666 | if (script_name is not None) & (object_name is not None): 667 | dict_alter = import_attr(path, script_name, object_name) 668 | elif dictionary is not None: 669 | if type(dictionary).__name__ != "dict": 670 | var_msg = "The `dictionary` argument is not a dictionary" 671 | module_logger.error(var_msg) 672 | raise ValueError(var_msg) 673 | dict_alter = dictionary 674 | else: 675 | var_msg = ("Either `dictionary` or both of `script_name` and " 676 | "`path` need to be none null") 677 | module_logger.error(var_msg) 678 | raise ValueError(var_msg) 679 | 680 | if type(self.tables).__name__ == "DataFrame": 681 | df = self.tables.copy() 682 | df_new = self.__alter_cols( 683 | df, dict_alter, [self.__key_1, self.__key_2, self.__key_3], 684 | np.nan, **kwargs) 685 | self.set_table(df_new) 686 | elif type(self.tables).__name__ == "dict": 687 | dfs = self.tables 688 | for key in self.tables.keys(): 689 | df = dfs[key].copy() 690 | df_new = self.__alter_cols( 691 | df, dict_alter, [self.__key_1, self.__key_2, self.__key_3], 692 | key, **kwargs) 693 | self.set_table(df_new, key) 694 | else: 695 | var_msg = ("The tables are in neither a DataFrame or dictionary " 696 | "format, which means something is seriously wrong...") 697 | module_logger.error(var_msg) 698 | raise ValueError(var_msg) 699 | 700 | module_logger.info("Completed `alter_tables`") 701 | 702 | def __alter_cols(self, df, dict_alter, keys, dict_key, **kwargs): 703 | module_logger.info("Starting `__alter_cols`") 704 | if pd.isnull(dict_key): 705 | var_file = np.nan 706 | var_subfile = np.nan 707 | else: 708 | var_file = dict_key.split(self.__key_separator)[0] 709 | var_subfile = (dict_key.split(self.__key_separator)[1] if 710 | self.__key_separator in dict_key else np.nan) 711 | for alter_key in dict_alter.keys(): 712 | var_type = dict_alter[alter_key]["type"] 713 | function = dict_alter[alter_key]["function"] 714 | if var_type == "new_col": 715 | var_col_name = dict_alter[alter_key]["col_name"] 716 | if var_col_name in df.columns.tolist(): 717 | var_msg = ( 718 | f"The column {var_col_name} is present in the " 719 | f"table so should not be overwritten") 720 | module_logger.error(var_msg) 721 | self.error_handling(var_file, var_subfile, "", var_msg, 722 | var_col_name, np.nan, np.nan) 723 | continue 724 | try: 725 | s = function(df, keys, **kwargs) 726 | df[var_col_name] = s 727 | except KeyError: 728 | var_msg = ( 729 | f"For type new_col the function for alter_key " 730 | f"{alter_key} has not worked with a KeyError") 731 | module_logger.error(var_msg) 732 | self.error_handling(var_file, var_subfile, "", var_msg, 733 | var_col_name, np.nan, np.nan) 734 | continue 735 | except: 736 | var_msg = (f"For type new_col the function for " 737 | f"alter_key {alter_key} has not worked") 738 | module_logger.error(var_msg) 739 | 740 | var_idx = 
np.nan 741 | var_issue_count = np.nan 742 | if "idx_function" in dict_alter[alter_key]: 743 | func_idx = dict_alter[alter_key]['idx_function'] 744 | if type(func_idx).__name__ != 'function': 745 | var_msg = '' 746 | module_logger.error(var_msg) 747 | s_idx = func_idx(df, keys, **kwargs) 748 | var_idx = ', '.join( 749 | [ 750 | str(item) for item in 751 | s_idx.loc[s_idx].index.tolist() 752 | ] 753 | ) 754 | var_issue_count = s_idx.sum() 755 | self.error_handling(var_file, var_subfile, "", var_msg, 756 | var_col_name, var_issue_count, var_idx) 757 | continue 758 | elif var_type == "map_df": 759 | try: 760 | df = function(df, keys, **kwargs) 761 | except: 762 | var_msg = (f"For type map_df the function for " 763 | f"alter_key {alter_key} has not worked") 764 | module_logger.error(var_msg) 765 | 766 | var_idx = np.nan 767 | var_issue_count = np.nan 768 | if "idx_function" in dict_alter[alter_key]: 769 | func_idx = dict_alter[alter_key]['idx_function'] 770 | if type(func_idx).__name__ != 'function': 771 | var_msg = '' 772 | module_logger.error(var_msg) 773 | s_idx = func_idx(df, keys, **kwargs) 774 | var_idx = ', '.join( 775 | [ 776 | str(item) for item in 777 | s_idx.loc[s_idx].index.tolist() 778 | ] 779 | ) 780 | var_issue_count = s_idx.sum() 781 | self.error_handling(var_file, var_subfile, "", var_msg, 782 | np.nan, var_issue_count, var_idx) 783 | continue 784 | 785 | module_logger.info("Completed `__alter_cols`") 786 | return df 787 | 788 | def convert_columns(self, path=None, script_name=None, 789 | object_name="dict_convert", dictionary=None, **kwargs): 790 | module_logger.info("Starting `convert_columns`") 791 | if (script_name is not None) & (object_name is not None): 792 | dict_convert = import_attr(path, script_name, object_name) 793 | elif dictionary is not None: 794 | if type(dictionary).__name__ != "dict": 795 | var_msg = "The `dictionary` argument is not a dictionary" 796 | module_logger.error(var_msg) 797 | raise ValueError(var_msg) 798 | dict_convert = dictionary 799 | else: 800 | var_msg = ("Either `dictionary` or both of `script_name` and " 801 | "`path` need to be none null") 802 | module_logger.error(var_msg) 803 | raise ValueError(var_msg) 804 | 805 | if type(self.tables).__name__ == "DataFrame": 806 | df = self.tables.copy() 807 | df_new = self.__convert_col(df, dict_convert, "", **kwargs) 808 | self.set_table(df_new, overwrite=True) 809 | elif type(self.tables).__name__ == "dict": 810 | dfs = self.tables 811 | for key in self.tables.keys(): 812 | df = dfs[key].copy() 813 | df_new = self.__convert_col(df, dict_convert, key, **kwargs) 814 | dfs[key] = df_new.copy() 815 | self.set_table(dfs, overwrite=True) 816 | else: 817 | var_msg = ("The tables are in neither a DataFrame or dictionary " 818 | "format, which means something is seriously wrong...") 819 | module_logger.error(var_msg) 820 | raise ValueError(var_msg) 821 | 822 | module_logger.info("Completed `convert_columns`") 823 | 824 | def __convert_col(self, df, dict_convert, dict_key, **kwargs): 825 | module_logger.info("Starting `__convert_col`") 826 | for convert_key in dict_convert.keys(): 827 | cols = dict_convert[convert_key]["columns"] 828 | if type(cols).__name__ == 'function': 829 | cols = cols(df, **kwargs) 830 | list_cols = list(cols) 831 | list_stops = dict_convert[convert_key]["dtypes"] 832 | dict_functions = dict_convert[convert_key]["functions"] 833 | for col in list_cols: 834 | if col not in df.columns.tolist(): 835 | var_msg = f"The column {col} is not present" 836 | module_logger.error(var_msg) 837 | 
raise ValueError(var_msg) 838 | dtype_flag = 0 839 | var_dtype = df[col].dtype.name 840 | for dtype in list_stops: 841 | if dtype in var_dtype: 842 | dtype_flag = 1 843 | break 844 | if dtype_flag == 1: 845 | continue 846 | converted_flag = 0 847 | for key in dict_functions.keys(): 848 | func_use = dict_functions[key] 849 | if type(func_use).__name__ != "function": 850 | var_msg = (f"The function for converting is not a " 851 | f"function! For keys {convert_key}, {key}") 852 | module_logger.error(var_msg) 853 | raise ValueError(var_msg) 854 | try: 855 | s = func_use(df, col, **kwargs) 856 | df[col] = s.copy() 857 | converted_flag = 1 858 | break 859 | except: 860 | var_msg = (f"The conversion failed for keys " 861 | f"{convert_key}, {key}, trying next") 862 | module_logger.warning(var_msg) 863 | continue 864 | if converted_flag == 0: 865 | var_idx = np.nan 866 | var_issue_count = np.nan 867 | if "idx_function" in dict_convert[convert_key]: 868 | func_idx = dict_convert[convert_key]['idx_function'] 869 | if type(func_idx).__name__ != 'function': 870 | var_msg = ( 871 | f'The `idx_function` argument is not a function' 872 | f' it is a {type(func_idx).__name__}') 873 | module_logger.error(var_msg) 874 | raise ValueError(var_msg) 875 | s_idx = func_idx(df, col, **kwargs) 876 | var_idx = ', '.join( 877 | [ 878 | str(item) for item in 879 | s_idx.loc[s_idx].index.tolist() 880 | ] 881 | ) 882 | var_issue_count = s_idx.sum() 883 | var_msg = (f"The conversion for column {col} for " 884 | f"convert_key {convert_key} failed.") 885 | module_logger.error(var_msg) 886 | self.error_handling( 887 | dict_key.split(self.__key_separator)[0], 888 | (dict_key.split(self.__key_separator)[1] if 889 | self.__key_separator in dict_key else np.nan), 890 | "", 891 | f"The conversion failed to format {convert_key}", 892 | col, 893 | var_issue_count, 894 | var_idx 895 | ) 896 | 897 | module_logger.info("Completed `__convert_col`") 898 | return df 899 | 900 | def assert_nulls(self, list_nulls=None, list_exclude_cols=None): 901 | module_logger.info("Starting `assert_nulls`") 902 | if list_nulls is None: 903 | list_nulls_use = ["nan", ""] 904 | else: 905 | list_nulls_use = list_nulls 906 | if list_exclude_cols is None: 907 | list_exclude_cols_use = [] 908 | else: 909 | list_exclude_cols_use = list_exclude_cols 910 | module_logger.info(f"The nulls being used are: {list_nulls_use}") 911 | module_logger.info( 912 | f"The columns being excluded are: {list_exclude_cols_use}") 913 | df = self.tables.copy() 914 | if type(df).__name__ == "dict": 915 | list_keys = [x for x in df.keys()] 916 | for key in list_keys: 917 | for null in list_nulls_use: 918 | if len(list_exclude_cols_use) == 0: 919 | df[key] = df[key].replace(null, np.nan) 920 | else: 921 | for col in [ 922 | col for col in df[key].columns.tolist() if 923 | col not in list_exclude_cols_use 924 | ]: 925 | df[key][col] = df[key][col].replace(null, np.nan) 926 | else: 927 | for null in list_nulls_use: 928 | if len(list_exclude_cols_use) == 0: 929 | df = df.replace(null, np.nan) 930 | else: 931 | for col in [ 932 | col for col in df.columns.tolist() if 933 | col not in list_exclude_cols_use 934 | ]: 935 | df[col] = df[col].replace(null, np.nan) 936 | self.set_table(df, overwrite=True) 937 | module_logger.info("Completed `assert_nulls`") 938 | 939 | def get_issue_count(self, issue_number_min=None, issue_number_max=None): 940 | module_logger.info("Starting `get_issue_count`") 941 | df = self.df_issues.copy() 942 | if issue_number_min is not None: 943 | df = 
df.loc[df["step_number"] >= issue_number_min].copy() 944 | if issue_number_max is not None: 945 | df = df.loc[df["step_number"] <= issue_number_max].copy() 946 | var_count = df.shape[0] 947 | module_logger.info("Completed `get_issue_count`") 948 | return var_count 949 | 950 | def form_summary_tables(self, path=None, script_name=None, 951 | func_name="form_tables", function=None, **kwargs): 952 | """ 953 | Use a function to create summaries off the main table set. 954 | 955 | The function is passed the arguments: 956 | self.tables, self.formed_tables, self.__grouping, self.__key_1, 957 | self.__key_2, self.__key_3, self.__key_separator, **kwargs 958 | """ 959 | module_logger.info("Starting `form_summary_tables`") 960 | 961 | if function is not None: 962 | if type(function).__name__ != "function": 963 | var_msg = ("The function passed to `self.form_summary_tables` " 964 | "is not a function.") 965 | module_logger.error(var_msg) 966 | raise ValueError(var_msg) 967 | elif script_name is not None: 968 | function = import_attr(path, script_name, func_name) 969 | else: 970 | var_msg = ("One of the `function` or `script_name` arguments needs " 971 | "to be completed. And if `script name is then `path` " 972 | "needs to be too.") 973 | module_logger.error(var_msg) 974 | raise ValueError(var_msg) 975 | 976 | dict_formed_tables = function( 977 | self.tables, self.formed_tables, self.__grouping, self.__key_1, 978 | self.__key_2, self.__key_3, self.__key_separator, **kwargs) 979 | if type(dict_formed_tables).__name__ != 'dict': 980 | var_msg = ('The output of the function for `form_summary_table` ' 981 | 'is not a dictionary and it needs to be') 982 | module_logger.error(var_msg) 983 | raise ValueError(var_msg) 984 | self.formed_tables = dict_formed_tables 985 | 986 | module_logger.info("Completed `form_summary_tables`") 987 | 988 | def get_step_no(self): 989 | module_logger.info("Starting `get_step_no`") 990 | module_logger.info("Completed `get_step_no`") 991 | return self.__step_no 992 | 993 | def _repr_html_(self): 994 | module_logger.info("Starting `_repr__html_`") 995 | var_key_3 = "" if self.__key_3 == "None" else self.__key_3 996 | var_out_keys = f""" 997 | 998 | 999 | 1000 | 1001 | 1002 | 1003 | 1004 | 1005 | 1006 | 1007 | 1008 | 1009 | 1010 | 1011 | 1012 | 1013 | 1014 |
                <tr><td>Grouping</td><td>{self.__grouping}</td></tr>
                <tr><td>Key 1</td><td>{self.__key_1}</td></tr>
                <tr><td>Key 2</td><td>{self.__key_2}</td></tr>
                <tr><td>Key 3</td><td>{var_key_3}</td></tr>
1015 | """ 1016 | if type(self.tables).__name__ == 'dict': 1017 | var_out_tbl_info = """ 1018 | 1019 | 1020 | 1021 | 1022 | 1023 | 1024 | 1025 | 1026 | {} 1027 |
                <tr><th>Dictionary key</th><th>Dataframe shape</th><th>Count numeric columns</th><th>Count date columns</th><th>Count object columns</th></tr>
1028 | """ 1029 | for key in [key for key in self.tables.keys()]: 1030 | var_out_tbl_info = var_out_tbl_info.replace( 1031 | '{}', 1032 | f""" 1033 | 1034 | {key} 1035 | {self.tables[key].shape} 1036 | {self.tables[key].select_dtypes(include=[np.number]).shape[1]} 1037 | {self.tables[key].select_dtypes(include=[np.datetime64, np.timedelta64]).shape[1]} 1038 | {self.tables[key].select_dtypes(exclude=[np.number, np.datetime64, np.timedelta64]).shape[1]} 1039 | 1040 | {{}} 1041 | """ 1042 | ) 1043 | var_out_tbl_info = var_out_tbl_info.replace('{}', '') 1044 | else: 1045 | var_out_tbl_info = f""" 1046 | 1047 | 1048 | 1049 | 1050 | 1051 | 1052 | 1053 | 1054 | 1055 | 1056 | 1057 | 1058 | 1059 |
                <tr><th>Dataframe shape</th><th>Count numeric columns</th><th>Count date columns</th><th>Count object columns</th></tr>
                <tr><td>{self.tables.shape}</td><td>{self.tables.select_dtypes(include=[np.number]).shape[1]}</td><td>{self.tables.select_dtypes(include=[np.datetime64, np.timedelta64]).shape[1]}</td><td>{self.tables.select_dtypes(exclude=[np.number, np.datetime64, np.timedelta64]).shape[1]}</td></tr>
1060 | """ 1061 | var_out_issues = """ 1062 | """ 1063 | var_out = f"{var_out_keys}

{var_out_tbl_info}

{var_out_issues}" 1064 | module_logger.info("Completed `_repr_html_`") 1065 | return var_out 1066 | -------------------------------------------------------------------------------- /data_etl/general_functions.py: -------------------------------------------------------------------------------- 1 | # Here functions that are typically used when using these scripts or writing 2 | # these data curation scripts are predefined here 3 | import logging 4 | import os 5 | from datetime import datetime 6 | import importlib 7 | 8 | import pandas as pd 9 | 10 | module_logger = logging.getLogger(__name__) 11 | 12 | 13 | def func_initialise_logging( 14 | script_name, log_folder_path, key_1, key_2, key_3, start_time): 15 | var_log_name = os.path.abspath( 16 | os.path.join( 17 | log_folder_path, 18 | (f"{script_name}_{key_1}_{key_2}_{key_3}_" 19 | f"{start_time.strftime('%Y%m%d_%H%M%S')}.log") 20 | ) 21 | ) 22 | logging.basicConfig( 23 | filename=var_log_name, filemode="a", datefmt="%H:%M:%S", 24 | level=logging.DEBUG, 25 | format="%(asctime)s|%(name)s|%(levelname)s|%(message)s") 26 | 27 | logging.info(f"Starting the process at " 28 | f"{start_time.strftime('%Y-%m-%d %H:%M:%S')}") 29 | 30 | 31 | def func_check_for_issues(issue_count, cnx, cnx_key, table, step_no, 32 | override=False, start_time=None): 33 | if (issue_count > 0) & (override is not True): 34 | cnx.write_to_db(cnx_key, table) 35 | var_msg = f'There were {issue_count} issues found at step {step_no}' 36 | module_logger.error(var_msg) 37 | if start_time is not None: 38 | module_logger.info("Script time taken: {}".format( 39 | str(datetime.now() - start_time))) 40 | raise ValueError(var_msg) 41 | 42 | 43 | def func_to_sql(x, datetime_format='%Y-%m-%d'): 44 | if pd.isnull(x): 45 | return "NULL" 46 | elif type(x).__name__ == 'Timestamp': 47 | return f"'{x.strftime(datetime_format)}'" 48 | else: 49 | return f"'{str(x)}'" 50 | 51 | 52 | def import_attr(path, script_name, attr_name): 53 | if (path is None) | (path == '.'): 54 | mod = importlib.import_module(script_name) 55 | else: 56 | var_script_path = os.path.join(path, f"{script_name}.py") 57 | if not os.path.exists(var_script_path): 58 | var_msg = f"The script does not exist: {script_name}.py" 59 | module_logger.error(var_msg) 60 | raise ValueError(var_msg) 61 | spec = importlib.util.spec_from_file_location( 62 | script_name, var_script_path) 63 | mod = importlib.util.module_from_spec(spec) 64 | spec.loader.exec_module(mod) 65 | attr = getattr(mod, attr_name) 66 | 67 | return attr 68 | -------------------------------------------------------------------------------- /examples/00_create_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Create data\n", 8 | "\n", 9 | "This notebook creates the data that is used in the examples\n", 10 | "\n", 11 | "There is a data set that will process without problems in the examples and one that will have issues to see the difference. 
There are also some excel outputs for the scripts example.\n", 12 | "\n", 13 | "The specific sections for creating tables are: \n", 14 | "+ [Conversions](#Conversions), converting column dtypes\n", 15 | "+ [Altering](#Altering), changing the values in the DataFrame, adding new columns, dropping rows or columns etc\n", 16 | "+ [Checks](#Checks), looking for outliers or rows that data does not follow the prescribed rules\n", 17 | "+ [For summary tables](#For-summary-tables), there is one table here and it's for a summary output\n", 18 | "\n", 19 | "## Setup\n", 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Import and settings options" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import sqlite3\n", 37 | "import pickle\n", 38 | "import datetime\n", 39 | "\n", 40 | "import pandas as pd\n", 41 | "import numpy as np" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "pd.set_option('display.max_rows', 10)\n", 51 | "pd.set_option('display.max_columns', 10)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Create tables\n", 59 | "
\n", 60 | "\n", 61 | "There are lots of different but small tables used in the examples\n", 62 | "\n", 63 | "### Conversions\n", 64 | "
" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "df_convert = pd.DataFrame(\n", 74 | " [\n", 75 | " ('A', '1', '0.6', '2019-01-01'),\n", 76 | " ('B', '4', '5.2', '2019-02-05'),\n", 77 | " ('C', '1', '5.6', '2018-12-17'),\n", 78 | " ('D', '10', '15.9', '2019-07-18'),\n", 79 | " ('E', '-8', '4.7', '2018-03-09')\n", 80 | " ],\n", 81 | " columns=['object', 'int', 'float', 'date']\n", 82 | ")" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "df_convert_issues = pd.DataFrame(\n", 92 | " [\n", 93 | " ('A', '1', '0.6', '2019-02-29'),\n", 94 | " ('B', '4.5', 'A', '2019-22-05'),\n", 95 | " ('C', '1', '5.6', '2018-12-17'),\n", 96 | " ('D', 'b', '15.9', '2019-09-31'),\n", 97 | " (5, '-8', '4.7', '2018-03-09')\n", 98 | " ],\n", 99 | " columns=['object', 'int', 'float', 'date']\n", 100 | ")" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "### Altering\n", 108 | "
" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 5, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "df_alterations = pd.DataFrame(\n", 118 | " [\n", 119 | " ('A', 2, 'key_1'),\n", 120 | " ('B', 199, 'key_2'),\n", 121 | " ('C', -1, 'key_1'),\n", 122 | " ('D', 20, 'key_3'),\n", 123 | " ('E', 6, 'key_2')\n", 124 | " ],\n", 125 | " columns=['to_map', 'add_1', 'merge_key']\n", 126 | ")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 6, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "df_alterations_issues = pd.DataFrame(\n", 136 | " [\n", 137 | " ('A', 2, 'key_1'),\n", 138 | " ('B', 199, 2),\n", 139 | " ('C', -1, 'key_1'),\n", 140 | " (['D'], 'a', 'key_3'),\n", 141 | " ('E', 6, 'key_2')\n", 142 | " ],\n", 143 | " columns=['to_map', 'add_1', 'merge_key']\n", 144 | ")" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "### Checks\n", 152 | "
" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 7, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "df_checks = pd.DataFrame(\n", 162 | " [\n", 163 | " (3, 'A', 'a'),\n", 164 | " (10, 'A', 'z'),\n", 165 | " (9, 'B', 'b'),\n", 166 | " (4, 'D', 'd'),\n", 167 | " (7, 'C', 'c')\n", 168 | " ],\n", 169 | " columns=['number', 'category_1', 'category_2']\n", 170 | ")" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 8, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "df_checks_issues = pd.DataFrame(\n", 180 | " [\n", 181 | " (1, 'Z', 'y'),\n", 182 | " (10, 'A', 'a'),\n", 183 | " (9, 'Y', 'b'),\n", 184 | " (4, 'B', 'b'),\n", 185 | " (-1, 'C', 'c')\n", 186 | " ],\n", 187 | " columns=['number', 'category_1', 'category_2']\n", 188 | ")" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "### For summary tables\n", 196 | "
" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 9, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "df_summary = pd.DataFrame(\n", 206 | " [\n", 207 | " ('b', 'c', 1, 6),\n", 208 | " ('d', 'b', 1, 9),\n", 209 | " ('c', 'b', 1, 0),\n", 210 | " ('d', 'd', 1, 9),\n", 211 | " ('c', 'b', 1, 1),\n", 212 | " ('a', 'd', 1, 3),\n", 213 | " ('c', 'c', 1, 0),\n", 214 | " ('c', 'd', 1, 0),\n", 215 | " ('c', 'c', 1, 0),\n", 216 | " ('a', 'e', 1, 4),\n", 217 | " ('b', 'e', 1, 7),\n", 218 | " ('a', 'd', 1, 4),\n", 219 | " ('b', 'e', 1, 6),\n", 220 | " ('b', 'c', 1, 8),\n", 221 | " ('b', 'c', 1, 7),\n", 222 | " ('d', 'e', 1, 9),\n", 223 | " ('a', 'b', 1, 5),\n", 224 | " ('a', 'd', 1, 5),\n", 225 | " ('a', 'b', 1, 4),\n", 226 | " ('d', 'b', 1, 10),\n", 227 | " ('b', 'c', 1, 6),\n", 228 | " ('b', 'e', 1, 7),\n", 229 | " ('a', 'e', 1, 4),\n", 230 | " ('a', 'c', 1, 3),\n", 231 | " ('c', 'c', 1, 0),\n", 232 | " ('c', 'd', 1, 2),\n", 233 | " ('a', 'b', 1, 3),\n", 234 | " ('a', 'e', 1, 5),\n", 235 | " ('a', 'c', 1, 3),\n", 236 | " ('a', 'e', 1, 4),\n", 237 | " ('b', 'd', 1, 6),\n", 238 | " ('c', 'e', 1, 1),\n", 239 | " ('b', 'e', 1, 7),\n", 240 | " ('c', 'c', 1, 0),\n", 241 | " ('a', 'c', 1, 5),\n", 242 | " ('c', 'b', 1, 0),\n", 243 | " ('d', 'b', 1, 8),\n", 244 | " ('d', 'e', 1, 10),\n", 245 | " ('d', 'c', 1, 8),\n", 246 | " ('a', 'd', 1, 3),\n", 247 | " ('d', 'e', 1, 10),\n", 248 | " ('d', 'c', 1, 8),\n", 249 | " ('d', 'e', 1, 10),\n", 250 | " ('a', 'c', 1, 4),\n", 251 | " ('d', 'b', 1, 8),\n", 252 | " ('d', 'b', 1, 10),\n", 253 | " ('d', 'e', 1, 10),\n", 254 | " ('a', 'c', 1, 5),\n", 255 | " ('a', 'd', 1, 5),\n", 256 | " ('d', 'c', 1, 10)\n", 257 | " ],\n", 258 | " columns=['str', 'str_2', 'count', 'int_max']\n", 259 | ")" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "### For scripts\n", 267 | "
" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 10, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "df_data = pd.DataFrame(\n", 277 | " [\n", 278 | " (1, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2018, 7, 7, 0, 0), \n", 279 | " 'A string this is', 51.5074, 0.1278),\n", 280 | " (1, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2018, 4, 9, 0, 0), \n", 281 | " 'Test', 51.5084, 0.1268),\n", 282 | " (1, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2018, 1, 10, 0, 0), \n", 283 | " 'testing', 51.5094, 0.1258),\n", 284 | " (3, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2017, 10, 13, 0, 0),\n", 285 | " 'test test test', 51.5104, 0.1248),\n", 286 | " (4, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2017, 7, 16, 0, 0),\n", 287 | " np.nan, 51.5114, 0.1238),\n", 288 | " (5, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2017, 4, 18, 0, 0), \n", 289 | " np.nan, 51.5124, 0.1228),\n", 290 | " (6, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2017, 1, 19, 0, 0),\n", 291 | " 'Blah', 51.5134, 0.1218),\n", 292 | " (7, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2016, 10, 22, 0, 0),\n", 293 | " 'Dah', 51.5144, 0.1208),\n", 294 | " (1234, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2016, 7, 25, 0, 0), \n", 295 | " 'Doh', 51.5154, 0.1198),\n", 296 | " (3, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2016, 4, 27, 0, 0),\n", 297 | " 'Boh', 51.5164, 0.1188),\n", 298 | " (2341243, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2016, 1, 29, 0, 0),\n", 299 | " 'Pho', 51.5174, 0.1178)\n", 300 | " ],\n", 301 | " columns=['Number', 'A date', 'Another date£', ' StringStringString ', 'lat', 'lng']\n", 302 | ")" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 11, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "df_headers_1 = pd.DataFrame(\n", 312 | " [\n", 313 | " ('Header', 'Number', 'A date', 'Another date£', ' StringStringString ', 'lat', 'lng'), \n", 314 | " ('New name', 'a_number', 'date_1', 'date_2', 'string', 'lat', 'lng'),\n", 315 | " ('Remove', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan),\n", 316 | " ('Notes', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)\n", 317 | " ]\n", 318 | ")" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 12, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "df_ideal_headers = pd.DataFrame(\n", 328 | " [\n", 329 | " ('a_number', 'date_1', 'date_2', 'string', 'testing', 'a', 'b', 'lat', 'lng')\n", 330 | " ]\n", 331 | ")" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "## Write out data\n", 339 | "
" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 13, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "df_convert.to_csv('data/df_convert.tsv', sep='\\t', index=False)\n", 349 | "df_convert_issues.to_csv('data/df_convert_issues.tsv', sep='\\t', index=False)\n", 350 | "\n", 351 | "df_alterations.to_csv('data/df_alterations.tsv', sep='\\t', index=False)\n", 352 | "df_alterations_issues.to_csv('data/df_alterations_issues.tsv', sep='\\t', index=False)\n", 353 | "\n", 354 | "pickle.dump(df_checks, open('data/df_checks.pkl', 'wb'))\n", 355 | "pickle.dump(df_checks_issues, open('data/df_checks_issues.pkl', 'wb'))\n", 356 | "\n", 357 | "pickle.dump(df_summary, open('data/df_summary.pkl', 'wb'))\n", 358 | "\n", 359 | "df_data.to_excel('data/A.xlsx', index=False)\n", 360 | "xl_writer = pd.ExcelWriter('data/headers.xlsx')\n", 361 | "df_headers_1.to_excel(xl_writer, index=False, sheet_name='A 1', header=None)\n", 362 | "df_ideal_headers.to_excel(xl_writer, index=False, sheet_name='IdealHeaders', header=None)\n", 363 | "xl_writer.save()\n", 364 | "xl_writer.close()" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "---\n", 372 | "\n", 373 | "**GigiSR**" 374 | ] 375 | } 376 | ], 377 | "metadata": { 378 | "kernelspec": { 379 | "display_name": "Python 3", 380 | "language": "python", 381 | "name": "python3" 382 | }, 383 | "language_info": { 384 | "codemirror_mode": { 385 | "name": "ipython", 386 | "version": 3 387 | }, 388 | "file_extension": ".py", 389 | "mimetype": "text/x-python", 390 | "name": "python", 391 | "nbconvert_exporter": "python", 392 | "pygments_lexer": "ipython3", 393 | "version": "3.6.10" 394 | } 395 | }, 396 | "nbformat": 4, 397 | "nbformat_minor": 2 398 | } 399 | -------------------------------------------------------------------------------- /examples/03_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Example notebook 03\n", 8 | "\n", 9 | "Using the data generated from notebook `00_create_data.ipynb` this notebook takes you through some of the basic functionality using the `Connections` class:\n", 10 | "\n", 11 | "+ [Initialise a SqliteDB connection](#Initialise-a-SqliteDB-connection)\n", 12 | "+ [Read from cnx](#Read-from-cnx)\n", 13 | "+ [Write to a table](#Write-to-a-table)\n", 14 | "\n", 15 | "## Setup\n", 16 | "
\n", 17 | "\n", 18 | "Imports and setting options" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from datetime import datetime\n", 28 | "import pickle\n", 29 | "\n", 30 | "from data_etl import Connections, Checks" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Examples\n", 38 | "
" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Initialise the class" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "cnxs = Connections()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "### Initialise a SqliteDB connection\n", 62 | "
" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Initialise the SqliteDB, it doesn't already exist so a warning message is output that a file is being created\n", 70 | "\n", 71 | "The optional kwarg `sqlite_df_issues_create` creates a table structure to match the issues tables present in `DataCuration` and `Checks` objects" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stderr", 81 | "output_type": "stream", 82 | "text": [ 83 | "The `file_path` data/00_db.db is not valid so this file will be created\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "cnxs.add_cnx(\n", 89 | " cnx_key='df_issues', \n", 90 | " cnx_type='sqlite3',\n", 91 | " table_name='df_issues',\n", 92 | " file_path='data/00_db.db',\n", 93 | " sqlite_df_issues_create=True\n", 94 | ")" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "### Read from cnx\n", 102 | "
" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "Using `read_from_db` you can read data out from a table, or from a database on the same connection" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 4, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/html": [ 120 | "
\n", 121 | "\n", 134 | "\n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | "
        <tr><th></th><th>key_1</th><th>key_2</th><th>key_3</th><th>file</th><th>sub_file</th><th>step_number</th><th>category</th><th>issue_short_desc</th><th>issue_long_desc</th><th>column</th><th>issue_count</th><th>issue_idx</th><th>grouping</th></tr>
\n", 156 | "
" 157 | ], 158 | "text/plain": [ 159 | "Empty DataFrame\n", 160 | "Columns: [key_1, key_2, key_3, file, sub_file, step_number, category, issue_short_desc, issue_long_desc, column, issue_count, issue_idx, grouping]\n", 161 | "Index: []" 162 | ] 163 | }, 164 | "execution_count": 4, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "cnxs.read_from_db('df_issues', 'SELECT * FROM df_issues')" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "### Write to a table\n", 178 | "
" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "We needs some issues to write to the table" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 5, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/html": [ 196 | "
\n", 197 | "\n", 210 | "\n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | "
        <tr><th></th><th>key_1</th><th>key_2</th><th>key_3</th><th>file</th><th>sub_file</th><th>step_number</th><th>category</th><th>issue_short_desc</th><th>issue_long_desc</th><th>column</th><th>issue_count</th><th>issue_idx</th><th>grouping</th></tr>
        <tr><th>0</th><td>1</td><td>None</td><td>None</td><td>df_checks_issues.pkl</td><td>NaN</td><td>0</td><td>NaN</td><td>Number should be greater than 0</td><td>NaN</td><td></td><td>1</td><td>4</td><td>2020-05-26 07:36:41.839557</td></tr>
\n", 248 | "
" 249 | ], 250 | "text/plain": [ 251 | " key_1 key_2 key_3 file sub_file step_number category \\\n", 252 | "0 1 None None df_checks_issues.pkl NaN 0 NaN \n", 253 | "\n", 254 | " issue_short_desc issue_long_desc column issue_count \\\n", 255 | "0 Number should be greater than 0 NaN 1 \n", 256 | "\n", 257 | " issue_idx grouping \n", 258 | "0 4 2020-05-26 07:36:41.839557 " 259 | ] 260 | }, 261 | "execution_count": 5, 262 | "metadata": {}, 263 | "output_type": "execute_result" 264 | } 265 | ], 266 | "source": [ 267 | "var_start_time = datetime.now()\n", 268 | "ch_checks = Checks(var_start_time, '1')\n", 269 | "\n", 270 | "dict_data = {\n", 271 | " 'df_checks_issues.pkl': pickle.load(open('data/df_checks_issues.pkl', 'rb'))\n", 272 | "}\n", 273 | "\n", 274 | "dict_checks = dict()\n", 275 | "dict_checks['Number should be greater than 0'] = {\n", 276 | " 'calc_condition': lambda df, col, **kwargs: df['number'] <= 0\n", 277 | "}\n", 278 | "\n", 279 | "ch_checks.apply_checks(dict_data, dictionary=dict_checks)\n", 280 | "\n", 281 | "ch_checks.df_issues" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "Using `write_to_db` creates a temporary table in the background which the data is written to, if that has written with no issues then it moves all that data to the main table" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 6, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "cnxs.write_to_db('df_issues', ch_checks.df_issues)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "And then check it wrote to the table" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 7, 310 | "metadata": {}, 311 | "outputs": [ 312 | { 313 | "data": { 314 | "text/html": [ 315 | "
\n", 316 | "\n", 329 | "\n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | "
        <tr><th></th><th>key_1</th><th>key_2</th><th>key_3</th><th>file</th><th>sub_file</th><th>step_number</th><th>category</th><th>issue_short_desc</th><th>issue_long_desc</th><th>column</th><th>issue_count</th><th>issue_idx</th><th>grouping</th></tr>
        <tr><th>0</th><td>1</td><td>None</td><td>None</td><td>df_checks_issues.pkl</td><td>None</td><td>0</td><td>None</td><td>Number should be greater than 0</td><td>None</td><td></td><td>1</td><td>4</td><td>2020-05-26 07:36:41.839557</td></tr>
\n", 367 | "
" 368 | ], 369 | "text/plain": [ 370 | " key_1 key_2 key_3 file sub_file step_number category \\\n", 371 | "0 1 None None df_checks_issues.pkl None 0 None \n", 372 | "\n", 373 | " issue_short_desc issue_long_desc column issue_count \\\n", 374 | "0 Number should be greater than 0 None 1 \n", 375 | "\n", 376 | " issue_idx grouping \n", 377 | "0 4 2020-05-26 07:36:41.839557 " 378 | ] 379 | }, 380 | "execution_count": 7, 381 | "metadata": {}, 382 | "output_type": "execute_result" 383 | } 384 | ], 385 | "source": [ 386 | "cnxs.read_from_db('df_issues', 'SELECT * FROM df_issues')" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "---\n", 394 | "**GigiSR**" 395 | ] 396 | } 397 | ], 398 | "metadata": { 399 | "kernelspec": { 400 | "display_name": "Python 3", 401 | "language": "python", 402 | "name": "python3" 403 | }, 404 | "language_info": { 405 | "codemirror_mode": { 406 | "name": "ipython", 407 | "version": 3 408 | }, 409 | "file_extension": ".py", 410 | "mimetype": "text/x-python", 411 | "name": "python", 412 | "nbconvert_exporter": "python", 413 | "pygments_lexer": "ipython3", 414 | "version": "3.6.10" 415 | } 416 | }, 417 | "nbformat": 4, 418 | "nbformat_minor": 2 419 | } 420 | -------------------------------------------------------------------------------- /examples/04_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Example notebook 04\n", 8 | "\n", 9 | "Using the data generated from notebook `00_create_data.ipynb` this notebook takes you through some of the basic functionality using the `general_functions` module:\n", 10 | "\n", 11 | "+ [Initialise logging](#Initialise-logging)\n", 12 | "+ [Import attribute](#Import-attribute)\n", 13 | "+ [Check for issues](#Check-for-issues)\n", 14 | "\n", 15 | "\n", 16 | "## Setup\n", 17 | "
" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Imports and setting options" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "from datetime import datetime\n", 34 | "import pickle\n", 35 | "\n", 36 | "from data_etl import Checks, Connections, general_functions" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Initialise logging\n", 44 | "
" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "When running interlocking scripts it can be useful to have logging so that if a problem is encountered there's hopefully enough information provided to debug\n", 52 | "\n", 53 | "This function helps to set up a logging file" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 2, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "general_functions.func_initialise_logging(\n", 63 | " 'example_04', 'logs/', '1', None, None, datetime.now()\n", 64 | ")" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "### Import attribute\n", 72 | "
" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "Quite often it is more useful to define the large dictionaries that go into the checks in a separate script so that it is in a collection but doesn't clutter up the main script where the flow of processing is defined\n", 80 | "\n", 81 | "This function is also used in the classes as reading in from other scripts is a frequent action for clarity of the code" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 3, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | "{'Number should be greater than 0': {'calc_condition': (df, col, **kwargs)>},\n", 93 | " 'Number should be greater than 2': {'columns': ['number'],\n", 94 | " 'calc_condition': (df, col, **kwargs)>,\n", 95 | " 'category': 'severe'},\n", 96 | " 'check values in list': {'columns': ['category_1'],\n", 97 | " 'calc_condition': (df, col, **kwargs)>,\n", 98 | " 'long_description': (df, col, condition, **kwargs)>},\n", 99 | " 'The category_1 column can only map to certain values': {'calc_condition': (df, col, **kwargs)>,\n", 100 | " 'check_condition': (df, col, condition, **kwargs)>,\n", 101 | " 'count_condition': (df, col, condition, **kwargs)>,\n", 102 | " 'index_position': (df, col, condition, **kwargs)>,\n", 103 | " 'relevant_columns': (df, col, condition, **kwargs)>,\n", 104 | " 'long_description': (df, col, condition, **kwargs)>}}" 105 | ] 106 | }, 107 | "execution_count": 3, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "dict_checks = general_functions.import_attr('.', '04_example', 'dict_checks')\n", 114 | "dict_checks" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "And this can then be used or modified and used in the `DataCuration` and `Checks` classes" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "### Check for issues\n", 129 | "
" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "The aim of this function is to have a way to create a break in the code if there is are issues, and to store the issues before erroring out of the script\n", 137 | "\n", 138 | "To use this function we need a class instance with issue entries and a connections class instance to write the issues out to" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 4, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/html": [ 149 | "
\n", 150 | "\n", 163 | "\n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | "
        <tr><th></th><th>key_1</th><th>key_2</th><th>key_3</th><th>file</th><th>sub_file</th><th>step_number</th><th>category</th><th>issue_short_desc</th><th>issue_long_desc</th><th>column</th><th>issue_count</th><th>issue_idx</th><th>grouping</th></tr>
        <tr><th>0</th><td>1</td><td>None</td><td>None</td><td>df_checks_issues.pkl</td><td>NaN</td><td>0</td><td>NaN</td><td>Number should be greater than 0</td><td>NaN</td><td></td><td>1</td><td>4</td><td>2020-05-26 07:43:04.328680</td></tr>
        <tr><th>1</th><td>1</td><td>None</td><td>None</td><td>df_checks_issues.pkl</td><td>NaN</td><td>1</td><td>NaN</td><td>Number should be greater than 0</td><td>NaN</td><td></td><td>1</td><td>4</td><td>2020-05-26 07:43:04.328680</td></tr>
        <tr><th>2</th><td>1</td><td>None</td><td>None</td><td>df_checks_issues.pkl</td><td>NaN</td><td>2</td><td>NaN</td><td>Number should be greater than 0</td><td>NaN</td><td></td><td>1</td><td>4</td><td>2020-05-26 07:43:04.328680</td></tr>
        <tr><th>3</th><td>1</td><td>None</td><td>None</td><td>df_checks_issues.pkl</td><td>NaN</td><td>3</td><td>NaN</td><td>Number should be greater than 0</td><td>NaN</td><td></td><td>1</td><td>4</td><td>2020-05-26 07:43:04.328680</td></tr>
        <tr><th>4</th><td>1</td><td>None</td><td>None</td><td>df_checks_issues.pkl</td><td>NaN</td><td>4</td><td>NaN</td><td>Number should be greater than 0</td><td>NaN</td><td></td><td>1</td><td>4</td><td>2020-05-26 07:43:04.328680</td></tr>
\n", 265 | "
" 266 | ], 267 | "text/plain": [ 268 | " key_1 key_2 key_3 file sub_file step_number category \\\n", 269 | "0 1 None None df_checks_issues.pkl NaN 0 NaN \n", 270 | "1 1 None None df_checks_issues.pkl NaN 1 NaN \n", 271 | "2 1 None None df_checks_issues.pkl NaN 2 NaN \n", 272 | "3 1 None None df_checks_issues.pkl NaN 3 NaN \n", 273 | "4 1 None None df_checks_issues.pkl NaN 4 NaN \n", 274 | "\n", 275 | " issue_short_desc issue_long_desc column issue_count \\\n", 276 | "0 Number should be greater than 0 NaN 1 \n", 277 | "1 Number should be greater than 0 NaN 1 \n", 278 | "2 Number should be greater than 0 NaN 1 \n", 279 | "3 Number should be greater than 0 NaN 1 \n", 280 | "4 Number should be greater than 0 NaN 1 \n", 281 | "\n", 282 | " issue_idx grouping \n", 283 | "0 4 2020-05-26 07:43:04.328680 \n", 284 | "1 4 2020-05-26 07:43:04.328680 \n", 285 | "2 4 2020-05-26 07:43:04.328680 \n", 286 | "3 4 2020-05-26 07:43:04.328680 \n", 287 | "4 4 2020-05-26 07:43:04.328680 " 288 | ] 289 | }, 290 | "execution_count": 4, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "var_start_time = datetime.now()\n", 297 | "ch_checks = Checks(var_start_time, '1')\n", 298 | "\n", 299 | "dict_data = {\n", 300 | " 'df_checks_issues.pkl': pickle.load(open('data/df_checks_issues.pkl', 'rb'))\n", 301 | "}\n", 302 | "\n", 303 | "dict_checks = dict()\n", 304 | "dict_checks['Number should be greater than 0'] = {\n", 305 | " 'calc_condition': lambda df, col, **kwargs: df['number'] <= 0\n", 306 | "}\n", 307 | "\n", 308 | "for step_no in range(5):\n", 309 | " ch_checks.set_step_no(step_no)\n", 310 | " ch_checks.apply_checks(dict_data, dictionary=dict_checks)\n", 311 | "\n", 312 | "ch_checks.df_issues" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 5, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "cnxs = Connections()\n", 322 | "cnxs.add_cnx(\n", 323 | " cnx_key='df_issues', \n", 324 | " cnx_type='sqlite3',\n", 325 | " table_name='df_issues',\n", 326 | " file_path='data/00_db.db'\n", 327 | ")" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "Now use the issues table in the function" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 6, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "general_functions.func_check_for_issues(\n", 344 | " ch_checks.get_issue_count(), \n", 345 | " cnxs, \n", 346 | " 'df_issues', \n", 347 | " ch_checks.df_issues, \n", 348 | " ch_checks.get_step_no(),\n", 349 | " override=True\n", 350 | ")" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "The above has `override=True`, this means even if problems are found it will not error out, the below doesn't have `override=True` and intentionally errors" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 7, 363 | "metadata": {}, 364 | "outputs": [ 365 | { 366 | "ename": "ValueError", 367 | "evalue": "There were 5 issues found at step 4", 368 | "output_type": "error", 369 | "traceback": [ 370 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 371 | "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", 372 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;34m'df_issues'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m 
\u001b[0mch_checks\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdf_issues\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mch_checks\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_step_no\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m )\n", 373 | "\u001b[1;32mc:\\users\\georg\\documents\\workspace\\modules\\data_etl\\data_etl\\general_functions.py\u001b[0m in \u001b[0;36mfunc_check_for_issues\u001b[1;34m(issue_count, cnx, cnx_key, table, step_no, override, start_time)\u001b[0m\n\u001b[0;32m 38\u001b[0m module_logger.info(\"Script time taken: {}\".format(\n\u001b[0;32m 39\u001b[0m str(datetime.now() - start_time)))\n\u001b[1;32m---> 40\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvar_msg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 41\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 374 | "\u001b[1;31mValueError\u001b[0m: There were 5 issues found at step 4" 375 | ] 376 | } 377 | ], 378 | "source": [ 379 | "general_functions.func_check_for_issues(\n", 380 | " ch_checks.get_issue_count(), \n", 381 | " cnxs, \n", 382 | " 'df_issues', \n", 383 | " ch_checks.df_issues, \n", 384 | " ch_checks.get_step_no()\n", 385 | ")" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "The benefit of the `override` argument is that you may have a mixture of issues you want definitely resolving and those you can live with, this allows you to have errors but to carry on regardless" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "---\n", 400 | "**GigiSR**" 401 | ] 402 | } 403 | ], 404 | "metadata": { 405 | "kernelspec": { 406 | "display_name": "Python 3", 407 | "language": "python", 408 | "name": "python3" 409 | }, 410 | "language_info": { 411 | "codemirror_mode": { 412 | "name": "ipython", 413 | "version": 3 414 | }, 415 | "file_extension": ".py", 416 | "mimetype": "text/x-python", 417 | "name": "python", 418 | "nbconvert_exporter": "python", 419 | "pygments_lexer": "ipython3", 420 | "version": "3.6.10" 421 | } 422 | }, 423 | "nbformat": 4, 424 | "nbformat_minor": 2 425 | } 426 | -------------------------------------------------------------------------------- /examples/04_example.py: -------------------------------------------------------------------------------- 1 | # This script is used in the `02_examples.ipynb` file to highlight how using 2 | # externally defined information works 3 | 4 | import pandas as pd 5 | 6 | dict_cat_1_map = { 7 | 'A': ['a', 'z'], 8 | 'B': ['b'], 9 | 'C': ['c'], 10 | 'D': ['d'], 11 | 'Y': ['y'], 12 | 'Z': ['z'] 13 | } 14 | 15 | dict_checks = { 16 | 'Number should be greater than 0': { 17 | 'calc_condition': lambda df, col, **kwargs: df['number'] <= 0 18 | }, 19 | 'Number should be greater than 2': { 20 | "columns": ['number'], 21 | 'calc_condition': lambda df, col, **kwargs: df[col] <= 2, 22 | 'category': 'severe' 23 | }, 24 | 'check values in list': { 25 | 'columns': ['category_1'], 26 | 'calc_condition': lambda df, col, **kwargs: ~df[col].isin(['A', 'B', 'C', 'D']), 27 | 'long_description': lambda df, col, condition, **kwargs: 28 | f"The invalid values are: {df.loc[~df[col].isin(['A', 'B', 'C', 'D'])][col].unique().tolist()}" 29 | }, 30 | 'The category_1 column 
can only map to certain values': { 31 |         'calc_condition': lambda df, col, **kwargs: [ 32 |             item[1] not in dict_cat_1_map[item[0]] for item in 33 |             df[['category_1', 'category_2']].values.tolist() 34 |         ], 35 |         'check_condition': lambda df, col, condition, **kwargs: sum(condition) > 0, 36 |         'count_condition': lambda df, col, condition, **kwargs: sum(condition), 37 |         'index_position': lambda df, col, condition, **kwargs: pd.Series(condition), 38 |         'relevant_columns': lambda df, col, condition, **kwargs: 'category_1, category_2', 39 |         'long_description': lambda df, col, condition, **kwargs: ( 40 |             f"The values that have no mapping are: " 41 |             f"{df.loc[pd.Series(condition)]['category_1'].unique().tolist()}" 42 |         ) 43 |     } 44 | } 45 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | A collection of examples for potential uses of my package! 4 | 5 | A lot of the functionality is easy to code yourself and is dependent on the data set in use. But I have found it useful to be able to apply all the conversions at once and then check for errors afterwards, rather than stopping each time there is an error. For example, knowing exactly which columns failed to convert to integer means you can investigate all of them at once. And having the flexibility to define a function to find out which data rows specifically failed is even more powerful. 6 | 7 | The main use I have for this package at work is feeding back to the data creators where there are errors in their manually entered or system-extracted data sets so they can make corrections before I use the data. And if there are values that break my assumptions but are actually valid, I get feedback from the domain experts that helps me modify my assumptions, or I keep the check as-is because it's a highly unlikely occurrence and it's good to know when it crops up. So, although the problems are labelled as being in an `issues log`, they could just be flags for unusual or specific values of particular interest, or they could be genuine errors that need resolving. 8 | 9 | # The structure 10 | 11 | + `data/` will contain any generated data we need; some of the tables may be pre-existing, hard-coded ones 12 | + `test_scripts/` contains an example in scripts rather than notebooks; from this form, which runs well locally, you can easily convert it into an Airflow-compatible form, and the `main.py` script accesses all the other scripts so you only need to run one 13 | + `00_create_data.ipynb` creates the data and dbs that are used in the examples 14 | + `01_example.ipynb` a look at some basic functionality: finding files, reading in the data, setting new headers, asserting nulls, then converting to the correct dtypes 15 | + `02_example.ipynb` a concentrated look at individual bits of functionality available and a look at the issue output produced when there are problems 16 | + `02_example.py` some externally defined information to use in the `02_example.ipynb` notebook for one of the sections 17 | 18 | # Run order 19 | 20 | 1. Run `00_create_data.ipynb` first to create the data files for the examples 21 | 22 | You can then run either the notebooks or the `test_scripts/` files.
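
As a very rough sketch of the "apply all the conversions at once, then review the issues" flow described above — the grouping/key values, file path and conversion dictionary here are purely illustrative, and it assumes `set_table` is used to load a DataFrame directly instead of `find_files`/`reading_in`:

```python
from datetime import datetime

import pandas as pd

from data_etl import DataCuration

# Grouping and key values are illustrative
dc = DataCuration(datetime.now(), 'example_key')

# One of the files written out by 00_create_data.ipynb
dc.set_table(pd.read_csv('data/df_convert_issues.tsv', sep='\t'))

# Attempt every conversion in one pass; a column that fails is recorded in
# the issues table rather than raising immediately
dc.convert_columns(dictionary={
    'int': {
        'columns': ['int'],
        'dtypes': ['int'],
        'functions': {1: lambda df, col, **kwargs: df[col].astype(int)}
    }
})

# Review everything that failed in one go
dc.df_issues
```

From there `get_issue_count` and `func_check_for_issues` (see `test_scripts/main.py`) can be used to decide whether a run should stop or carry on.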
23 | -------------------------------------------------------------------------------- /examples/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gigisr/data_etl/ddd2bf742615d659f96bfd6543a657ab195b67c7/examples/data/.gitkeep -------------------------------------------------------------------------------- /examples/logs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gigisr/data_etl/ddd2bf742615d659f96bfd6543a657ab195b67c7/examples/logs/.gitkeep -------------------------------------------------------------------------------- /examples/test_scripts/.config: -------------------------------------------------------------------------------- 1 | [TEST] 2 | DRIVER = {SQLite3 ODBC Driver} 3 | SERVER = localhost 4 | DATABASE = test.db 5 | Trusted_connection = yes 6 | -------------------------------------------------------------------------------- /examples/test_scripts/alter_cols.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | dict_alter = dict() 4 | 5 | dict_alter['01'] = { 6 | 'type': 'new_col', 7 | 'col_name': 'number_2', 8 | 'function': lambda df, keys, **kwargs: df['a_number'] * 2 9 | } 10 | dict_alter['02'] = { 11 | 'type': 'new_col', 12 | 'col_name': 'key_1', 13 | 'function': lambda df, keys, **kwargs: keys[0] 14 | } 15 | dict_alter['03'] = { 16 | 'type': 'new_col', 17 | 'col_name': 'key_2', 18 | 'function': lambda df, keys, **kwargs: keys[1] 19 | } 20 | dict_alter['04'] = { 21 | 'type': 'map_df', 22 | 'function': lambda df, keys, **kwargs: df, 23 | 'idx_function': lambda df, keys, **kwargs: pd.Series(True, index=df.index) 24 | } 25 | -------------------------------------------------------------------------------- /examples/test_scripts/checks_1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | dict_checks = dict() 4 | 5 | dict_checks["This check is for numbers being greater than 6"] = { 6 | "columns": ["a_number", "number_2"], 7 | "calc_condition": lambda df, col, **kwargs: df[col] <= 6, 8 | "long_description": lambda df, col, condition, **kwargs: 9 | "There are numbers less than or equal to 6", 10 | "index_position": lambda df, col, condition, **kwargs: 11 | pd.Series(False, df.index) 12 | } 13 | 14 | dict_checks["This check is for the column to be not null"] = { 15 | "columns": ['string'], 16 | "calc_condition": lambda df, col, **kwargs: df[col].isnull(), 17 | "long_description": lambda df, col, condition, **kwargs: 18 | f"The column `{col}` should not be null", 19 | "category": 'must be resolved' 20 | } 21 | -------------------------------------------------------------------------------- /examples/test_scripts/convert_columns.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pandas as pd 4 | 5 | dict_convert = dict() 6 | 7 | 8 | def func_string_to_int(df, col): 9 | s = df[col].copy() 10 | s = s.str.replace(',', '') # thousand separators 11 | s = s.str.replace('%', '') # percentage sign 12 | s = s.str.replace('£', '') # pound stirling sign 13 | s = s.str.replace('$', '') # dollar sign 14 | s = s.str.replace('€', '') # euro sign 15 | s = s.str.replace('¥', '') # yen sign 16 | s = s.astype(int) 17 | return s 18 | 19 | 20 | def func_string_to_float(df, col): 21 | s = df[col].copy() 22 | s = s.str.replace(',', '') # thousand separators 23 | s 
= s.str.replace('%', '') # percentage sign 24 | s = s.str.replace('£', '') # pound sterling sign 25 | s = s.str.replace('$', '') # dollar sign 26 | s = s.str.replace('€', '') # euro sign 27 | s = s.str.replace('¥', '') # yen sign 28 | s = s.astype(float) 29 | return s 30 | 31 | 32 | dict_convert['int'] = { 33 | 'columns': lambda df, **kwargs: ['a_number'], 34 | 'dtypes': ['int', 'float'], 35 | 'functions': { 36 | 1: lambda df, col, **kwargs: df[col].astype(int), 37 | 2: lambda df, col, **kwargs: func_string_to_int(df, col), 38 | 3: lambda df, col, **kwargs: df[col].astype(float), 39 | 4: lambda df, col, **kwargs: func_string_to_float(df, col) 40 | } 41 | } 42 | dict_convert['float'] = { 43 | 'columns': ['lat', 'lng'], 44 | 'dtypes': ['float'], 45 | 'functions': { 46 | 1: lambda df, col, **kwargs: df[col].astype(float), 47 | 2: lambda df, col, **kwargs: func_string_to_float(df, col) 48 | } 49 | } 50 | # TODO have a mash-up function that also takes care of Excel dates? 51 | dict_convert['date'] = { 52 | 'columns': ['date_1', 'date_2'], 53 | 'dtypes': ['datetime'], 54 | 'functions': { 55 | 1: lambda df, col, **kwargs: 56 | pd.to_datetime(df[col], format='%Y-%m-%d %H:%M:%S') 57 | } 58 | } 59 | 60 | 61 | def func_string_format(df, col): 62 | s = df[col].copy() 63 | s_null = s.isnull() 64 | s = s.astype(str) 65 | s = s.str.strip() 66 | reg_ex = re.compile(' +') 67 | s = s.map(lambda x: re.sub(reg_ex, ' ', x)) 68 | s.loc[s_null] = pd.np.nan 69 | return s 70 | 71 | 72 | dict_convert['string'] = { 73 | 'columns': ['string'], 74 | 'dtypes': [], 75 | 'functions': { 76 | 1: lambda df, col, **kwargs: func_string_format(df, col) 77 | }, 78 | 'idx_function': lambda df, col, **kwargs: pd.Series(True, index=df.index) 79 | } 80 | -------------------------------------------------------------------------------- /examples/test_scripts/main.py: -------------------------------------------------------------------------------- 1 | # This is the section where we put all the classes together in combinations 2 | # that are required for specific data sets 3 | import logging 4 | from datetime import datetime 5 | import pickle 6 | # This is only used to create a table, usually this would already be done 7 | import sqlite3 8 | 9 | from data_etl import DataCuration, Checks, Connections, Reporting, \ 10 | func_check_for_issues, func_initialise_logging 11 | 12 | if __name__ == "__main__": 13 | var_key_1 = "A" 14 | var_key_2 = "1" 15 | var_key_3 = "1" 16 | var_start_time = datetime.now() 17 | 18 | var_checks_1_pass = True 19 | var_write_out = True 20 | 21 | func_initialise_logging('pipeline_test_1', '../logs/', var_key_1, 22 | var_key_2, var_key_3, var_start_time) 23 | 24 | # Initialise objects required 25 | cnxs = Connections() 26 | data = DataCuration(var_start_time, "A") 27 | check = Checks(var_start_time, "A") 28 | reporting = Reporting(var_start_time, "A") 29 | 30 | # Set up connections 31 | cnxs.add_cnx( 32 | cnx_key='df_issues', cnx_type='sqlite3', table_name='df_issues', 33 | file_path='../data/00_db.db', sqlite_df_issues_create=True) 34 | 35 | # This is only needed to create the structure 36 | cnx = sqlite3.connect('../data/00_db.db') 37 | var_create_table = """CREATE TABLE IF NOT EXISTS data ( 38 | a_number INTEGER, date_1 TEXT, date_2 TEXT, string TEXT, 39 | testing REAL, a REAL, b REAL, lat REAL, lng REAL, number_2 INTEGER, 40 | key_1 TEXT, key_2 TEXT, level_0 TEXT 41 | );""" 42 | cnx.execute(var_create_table) 43 | cnx.commit() 44 | cnx.close() 45 | 46 | cnxs.add_cnx(cnx_key='data_out', cnx_type='sqlite3',
table_name='data', 47 | file_path='../data/00_db.db') 48 | 49 | # Data etl testing 50 | 51 | # Read the files in 52 | data.find_files(files_path="../data", 53 | script_name="test_reading_in", path='.') 54 | data.reading_in(path=".", script_name="test_reading_in") 55 | 56 | # Set the step number 57 | data.set_step_no(1) 58 | 59 | # Read in the headers 60 | data.set_comparison_headers( 61 | path=".", 62 | script_name="test_reading_in", 63 | filepath="../data/headers.xlsx") 64 | data.link_headers() 65 | data.assert_linked_headers(remove_header_rows=True, reset_index=True) 66 | 67 | data.set_step_no(2) 68 | data.assert_nulls([""]) 69 | data.convert_columns(".", "convert_columns") 70 | func_check_for_issues( 71 | data.get_issue_count(2, 2), cnxs, 'df_issues', data.df_issues, 72 | data.get_step_no(), start_time=var_start_time) 73 | 74 | data.set_step_no(3) 75 | data.alter_tables(".", "alter_cols") 76 | func_check_for_issues( 77 | data.get_issue_count(3, 3), cnxs, 'df_issues', data.df_issues, 78 | data.get_step_no(), start_time=var_start_time) 79 | 80 | data.set_step_no(4) 81 | data.concatenate_tables() 82 | 83 | check.set_step_no(5) 84 | check.set_defaults(idx_flag=True) 85 | check.apply_checks(data.tables, ".", "checks_1") 86 | func_check_for_issues( 87 | check.get_issue_count(5, 5), cnxs, 'df_issues', check.df_issues, 88 | check.get_step_no(), var_checks_1_pass, var_start_time) 89 | 90 | # Now the data is cleansed do the reporting, this could be 91 | # post writing to DB 92 | data.set_step_no(6) 93 | data.form_summary_tables(path='.', script_name='reporting_1') 94 | 95 | # Temporary snapshot for testing 96 | pickle.dump( 97 | {'data': data, 'checks': check, 'report': reporting, 'cnx': cnxs}, 98 | open("../data/dict_dc.pkl", "wb")) 99 | 100 | # Log issues found 101 | cnxs.write_to_db('df_issues', data.df_issues) 102 | cnxs.write_to_db('df_issues', check.df_issues) 103 | 104 | # Write the data out 105 | if var_write_out: 106 | cnxs.write_to_db('data_out', data.tables) 107 | 108 | logging.info("Script time taken: {}".format( 109 | str(datetime.now() - var_start_time))) 110 | -------------------------------------------------------------------------------- /examples/test_scripts/reporting_1.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import matplotlib.pyplot as plt 4 | import folium 5 | 6 | 7 | def form_tables(tables, formed_tables, grouping, key_1, key_2, key_3, 8 | key_separator, **kwargs): 9 | dict_data = dict() 10 | dict_data['main_data'] = tables.copy() 11 | return dict_data 12 | 13 | 14 | dict_reporting = dict() 15 | 16 | 17 | def func_chart_1(tables, file_path, file_name): 18 | df = tables['main_data'] 19 | plt.figure() 20 | g = df['number_2'].hist(bins=50) 21 | plt.title('Histogram') 22 | plt.savefig(os.path.join(file_path, file_name)) 23 | return None 24 | 25 | 26 | dict_reporting['Histogram 1'] = { 27 | 'file_name': lambda tables, file_path, grouping, key_1, key_2, key_3, 28 | **kwargs: 'chart_1.png', 29 | 'function': lambda tables, file_path, file_name, grouping, key_1, key_2, 30 | key_3,**kwargs: 31 | func_chart_1(tables, file_path, file_name) 32 | } 33 | dict_reporting['Histogram 2'] = { 34 | 'file_name': lambda tables, file_path, grouping, key_1, key_2, key_3, 35 | **kwargs: 'sub_folder_test/chart_1.png', 36 | 'function': lambda tables, file_path, file_name, grouping, key_1, key_2, 37 | key_3, **kwargs: 38 | func_chart_1(tables, file_path, file_name) 39 | } 40 | 41 | 42 | def func_map_1(tables, file_path, file_name): 43 | 
df = tables['main_data'] 44 | m = folium.Map([51.5074, 0.1278], zoom_start=12) 45 | for idx in df.index.tolist(): 46 | folium.Marker([df.loc[idx, 'lat'], df.loc[idx, 'lng']]).add_to(m) 47 | m.save(os.path.join(file_path, file_name)) 48 | return df 49 | 50 | 51 | dict_reporting['Map 1'] = { 52 | 'file_name': lambda tables, file_path, grouping, key_1, key_2, 53 | key_3, **kwargs: 'map_1.html', 54 | 'function': lambda tables, file_path, file_name, grouping, key_1, key_2, 55 | key_3, **kwargs: func_map_1(tables, file_path, file_name) 56 | } 57 | -------------------------------------------------------------------------------- /examples/test_scripts/test_reading_in.py: -------------------------------------------------------------------------------- 1 | # This file contains the information required for listing files and reading in 2 | # tables of data 3 | import os 4 | 5 | import pandas as pd 6 | 7 | 8 | def list_the_files(path): 9 | list_files = os.listdir(path) 10 | list_files = [os.path.abspath(os.path.join(path, x)) for x in list_files] 11 | list_files = [x for x in list_files if '.xlsx' in x.lower()] 12 | list_files = [x for x in list_files if '~' not in x.lower()] 13 | list_files = [x for x in list_files if 'header' not in x.lower()] 14 | return list_files 15 | 16 | 17 | def read_files(list_files): 18 | dict_files = dict() 19 | for file in list_files: 20 | xl = pd.ExcelFile(file) 21 | for sheet in xl.sheet_names: 22 | df = xl.parse( 23 | sheet_name=sheet, dtype=str, keep_default_na=False, header=None) 24 | key = '{} -:- {}'.format( 25 | file.split('\\')[-1].lower().replace('.xlsx', ''), sheet) 26 | dict_files[key] = df.copy() 27 | return dict_files 28 | 29 | 30 | def read_headers(filepath): 31 | if not os.path.exists(filepath): 32 | raise ValueError( 33 | 'The passed file path does not exist: {}'.format(filepath)) 34 | dict_headers = dict() 35 | file = pd.ExcelFile(filepath) 36 | dict_headers['ideal_headers'] = file.parse( 37 | 'IdealHeaders', header=None).values.tolist()[0] 38 | for sheet in [sheet for sheet in 39 | file.sheet_names if sheet != 'IdealHeaders']: 40 | df_header = file.parse(sheet, header=None) 41 | dict_headers[sheet] = { 42 | 'expected_headers': df_header[ 43 | df_header[0] == 'Header'].iloc[:, 1:].values.tolist()[0], 44 | 'new_headers': df_header[ 45 | df_header[0] == 'New name'].iloc[:, 1:].values.tolist()[0], 46 | 'remove': df_header[ 47 | df_header[0] == 'Remove'].iloc[:, 1:].values.tolist()[0], 48 | 'notes': df_header[ 49 | df_header[0] == 'Notes'].iloc[:, 1:].values.tolist()[0] 50 | } 51 | return dict_headers 52 | 53 | 54 | def link_headers(dfs, df_headers): 55 | dict_link = dict() 56 | for key_df in dfs.keys(): 57 | for key_header in df_headers.keys(): 58 | check_shape = ( 59 | # + 1 because the headers have an index to explain the 60 | # row purposes 61 | dfs[key_df].shape[1] + 1 == df_headers[key_header].shape[1]) 62 | if check_shape is True: 63 | dict_link[key_df] = str(key_header) 64 | break 65 | return dict_link 66 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name='data_etl', 5 | version='0.1.0dev', 6 | packages=['data_etl',], 7 | license='MIT', 8 | url="https://github.com/gigisr/data_etl", 9 | 10 | author='GigiSR', requires=['pandas', 'numpy', 'pyodbc'] 11 | ) 12 | -------------------------------------------------------------------------------- /tests/00_pytest.py: 
-------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import pickle 3 | 4 | import pandas as pd 5 | import numpy as np 6 | 7 | from data_etl import DataCuration, Checks 8 | 9 | 10 | var_cnv_1_start_time = datetime.now() 11 | data_cnv_1 = DataCuration(var_cnv_1_start_time, 'test') 12 | df_convert_issues = pd.DataFrame( 13 | [ 14 | ('A', '1', '0.6', '2019-02-29'), 15 | ('B', '4.5', 'A', '2019-22-05'), 16 | ('C', '1', '5.6', '2018-12-17'), 17 | ('D', 'b', '15.9', '2019-09-31'), 18 | (5, '-8', '4.7', '2018-03-09') 19 | ], 20 | columns=['object', 'int', 'float', 'date'] 21 | ) 22 | data_cnv_1.set_table({'df_convert_issues.tsv': df_convert_issues}) 23 | 24 | 25 | def func_try_float_cnv(x): 26 | try: 27 | var = float(x) 28 | except: 29 | return True 30 | return False 31 | 32 | 33 | def func_try_int_cnv(x): 34 | try: 35 | var = int(x) 36 | except: 37 | return True 38 | return False 39 | 40 | 41 | def func_str_cnv(s): 42 | var_is_null_pre = s.isnull().sum() 43 | s_cnv = s.map(func_to_int).str.strip() 44 | var_is_null_post = s_cnv.isnull().sum() 45 | if var_is_null_post != var_is_null_pre: 46 | raise ValueError 47 | return s_cnv 48 | 49 | 50 | def func_to_int(x): 51 | try: 52 | return int(x) 53 | except: 54 | return x 55 | 56 | 57 | def func_try_str_cnv(s): 58 | var_is_null_pre = s.isnull().sum() 59 | s_cnv = s.map(func_to_int).str.strip() 60 | var_is_null_post = s_cnv.isnull().sum() 61 | return s != s_cnv 62 | 63 | 64 | def func_try_date_cnv(x): 65 | if pd.isnull(x): 66 | return False 67 | if pd.isnull(pd.to_datetime(x, format='%Y-%m-%d', errors='coerce')): 68 | return True 69 | return False 70 | 71 | 72 | dict_cnv_1 = { 73 | 'float': { 74 | 'columns': ['float'], 75 | 'dtypes': ['float'], 76 | 'functions': { 77 | 1: lambda df, col, **kwargs: df[col].astype(float) 78 | }, 79 | 'idx_function': lambda df, col, **kwargs: df[col].map(func_try_float_cnv) 80 | }, 81 | 'int': { 82 | 'columns': ['int'], 83 | 'dtypes': ['int'], 84 | 'functions': { 85 | 1: lambda df, col, **kwargs: df[col].astype(int) 86 | }, 87 | 'idx_function': lambda df, col, **kwargs: df[col].map(func_try_int_cnv) 88 | }, 89 | 'object': { 90 | 'columns': ['object'], 91 | 'dtypes': [], 92 | 'functions': { 93 | 1: lambda df, col, **kwargs: func_str_cnv(df[col]) 94 | }, 95 | 'idx_function': lambda df, col, **kwargs: func_try_str_cnv(df[col]) 96 | }, 97 | 'date': { 98 | 'columns': ['date'], 99 | 'dtypes': ['date', '[ns]'], 100 | 'functions': { 101 | 1: lambda df, col, **kwargs: pd.to_datetime( 102 | df[col], format='%Y-%m-%d') 103 | }, 104 | 'idx_function': lambda df, col, **kwargs: df[col].map(func_try_date_cnv) 105 | } 106 | } 107 | 108 | df_cnv_1_expected_df_issues = pd.DataFrame( 109 | [ 110 | ('test', 'None', 'None', 'df_convert_issues.tsv', np.nan, 0, np.nan, '', 111 | 'The conversion failed to format float', 'float', 1, '1', 112 | var_cnv_1_start_time), 113 | ('test', 'None', 'None', 'df_convert_issues.tsv', np.nan, 0, np.nan, '', 114 | 'The conversion failed to format int', 'int', 2, '1, 3', 115 | var_cnv_1_start_time), 116 | ('test', 'None', 'None', 'df_convert_issues.tsv', np.nan, 0, np.nan, '', 117 | 'The conversion failed to format object', 'object', 1, '4', 118 | var_cnv_1_start_time), 119 | ('test', 'None', 'None', 'df_convert_issues.tsv', np.nan, 0, np.nan, '', 120 | 'The conversion failed to format date', 'date', 3, '0, 1, 3', 121 | var_cnv_1_start_time) 122 | ], 123 | columns=['key_1', 'key_2', 'key_3', 'file', 'sub_file', 'step_number', 124 | 'category',
'issue_short_desc', 'issue_long_desc', 'column', 125 | 'issue_count', 'issue_idx', 'grouping'] 126 | ) 127 | 128 | 129 | def test_cnv_1(): 130 | data_cnv_1.convert_columns(dictionary=dict_cnv_1) 131 | assert data_cnv_1.df_issues.fillna('').equals( 132 | df_cnv_1_expected_df_issues.fillna('')) 133 | 134 | 135 | var_alter_1_start_time = datetime.now() 136 | data_alter_1 = DataCuration(var_alter_1_start_time, 'test') 137 | 138 | data_alter_1.set_table( 139 | { 140 | 'df_alterations.tsv': pd.DataFrame( 141 | [ 142 | ('A', 2, 'key_1'), 143 | ('B', 199, 'key_2'), 144 | ('C', -1, 'key_1'), 145 | ('D', 20, 'key_3'), 146 | ('E', 6, 'key_2') 147 | ], 148 | columns=['to_map', 'add_1', 'merge_key'] 149 | ), 150 | 'df_alterations_issues.tsv': pd.DataFrame( 151 | [ 152 | ('A', 2, 'key_1'), 153 | ('B', 199, 2), 154 | ('C', -1, 'key_1'), 155 | (['D'], 'a', 'key_3'), 156 | ('E', 6, 'key_2') 157 | ], 158 | columns=['to_map', 'add_1', 'merge_key'] 159 | ) 160 | } 161 | ) 162 | 163 | 164 | df_mapping = pd.DataFrame( 165 | [ 166 | ('key_1', 1), 167 | ('key_2', 2), 168 | ('key_3', 3) 169 | ], 170 | columns=['merge_key', 'out_value'] 171 | ) 172 | 173 | 174 | def func_alter_merge(df, df_mapping): 175 | df_mapped = pd.merge( 176 | df, 177 | df_mapping, 178 | on='merge_key', 179 | how='left' 180 | ) 181 | if ( 182 | df_mapped['out_value'].isnull().sum() != 183 | df['merge_key'].isnull().sum() 184 | ): 185 | raise ValueError 186 | return df_mapped 187 | 188 | 189 | dict_alter_1 = { 190 | '01': { 191 | 'type': 'new_col', 192 | 'col_name': 'key', 193 | 'function': lambda df, keys, **kwargs: keys[0] 194 | }, 195 | '02': { 196 | 'type': 'new_col', 197 | 'col_name': 'done_add_1', 198 | 'function': lambda df, keys, **kwargs: df['add_1'] + 1, 199 | 'idx_function': lambda df, keys, **kwargs: 200 | df['add_1'].map( 201 | lambda x: type(x).__name__).map( 202 | lambda x: ('int' in x) | ('float' in x)).map( 203 | {True: False, False: True}) 204 | }, 205 | '03': { 206 | 'type': 'new_col', 207 | 'col_name': 'mapped', 208 | 'function': lambda df, keys, **kwargs: df['to_map'].map({ 209 | 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5}), 210 | 'idx_function': lambda df, keys, **kwargs: 211 | ~df['to_map'].astype(str).isin(['A', 'B', 'C', 'D', 'E']) 212 | }, 213 | '04': { 214 | 'type': 'map_df', 215 | 'function': lambda df, keys, **kwargs: 216 | func_alter_merge(df, kwargs['df_mapping']), 217 | 'idx_function': lambda df, keys, **kwargs: 218 | ~df['merge_key'].isin(['key_1', 'key_2', 'key_3', np.nan]) 219 | } 220 | } 221 | 222 | df_alter_1_expected_df_issues = pd.DataFrame( 223 | [ 224 | ('test', 'None', 'None', 'df_alterations_issues.tsv', np.nan, 0, np.nan, 225 | '', 'For type new_col the function for alter_key 02 has not worked', 226 | 'done_add_1', 1, '3', var_alter_1_start_time), 227 | ('test', 'None', 'None', 'df_alterations_issues.tsv', np.nan, 0, np.nan, 228 | '', 'For type new_col the function for alter_key 03 has not worked', 229 | 'mapped', 1, '3', var_alter_1_start_time), 230 | ('test', 'None', 'None', 'df_alterations_issues.tsv', np.nan, 0, np.nan, 231 | '', 'For type map_df the function for alter_key 04 has not worked', 232 | np.nan, 1, '1', var_alter_1_start_time) 233 | ], 234 | columns=['key_1', 'key_2', 'key_3', 'file', 'sub_file', 'step_number', 235 | 'category', 'issue_short_desc', 'issue_long_desc', 'column', 236 | 'issue_count', 'issue_idx', 'grouping'] 237 | ) 238 | 239 | 240 | def test_alter_1(): 241 | data_alter_1.alter_tables(dictionary=dict_alter_1, df_mapping=df_mapping) 242 | assert 
data_alter_1.df_issues.fillna('').equals( 243 | df_alter_1_expected_df_issues.fillna('')) 244 | --------------------------------------------------------------------------------