├── .gitignore
├── README.md
├── condaenv.yml
├── data_etl
│   ├── __init__.py
│   ├── checks.py
│   ├── connections.py
│   ├── data_files.py
│   └── general_functions.py
├── examples
│   ├── 00_create_data.ipynb
│   ├── 01_example.ipynb
│   ├── 02_example.ipynb
│   ├── 03_example.ipynb
│   ├── 04_example.ipynb
│   ├── 04_example.py
│   ├── README.md
│   ├── data
│   │   └── .gitkeep
│   ├── logs
│   │   └── .gitkeep
│   └── test_scripts
│       ├── .config
│       ├── alter_cols.py
│       ├── checks_1.py
│       ├── convert_columns.py
│       ├── main.py
│       ├── reporting_1.py
│       └── test_reading_in.py
├── setup.py
└── tests
    └── 00_pytest.py
/.gitignore:
--------------------------------------------------------------------------------
1 | logs/*.log
2 | .idea/*
3 | pickles/*
4 | *~*
5 | data/processed/*
6 | data/deliverables/*
7 | *.pkl
8 | *.tsv
9 | *.db
10 | *.csv
11 | *.xlsx
12 | *.log
13 | .ipynb_checkpoints/*
14 | */.ipynb_checkpoints/*
15 | *.pyc
16 | docs/*
17 | data_etl.egg-info/*
18 | logs/*
19 | !logs/README_logs.md
20 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data ETL
2 |
3 | [MIT License](https://opensource.org/licenses/MIT)
4 |
5 | A package for dealing with data curation, transformation and checks.
6 |
7 | This can mean reading data in and converting it to the correct dtypes and making suitable alterations to bring it into a uniform format, or just taking an existing data set and performing some checks on it.
8 |
9 | The aim is to help with regular data sources provided by other people or by systems. The data could arrive in a flat file format, it could be logically incorrect, it could have missing values, or it could have any number of other problems. Having an issue report with a good amount of information on what is wrong, and where, gives the capacity to feed this back to the data creators. The checks can therefore be run in bulk and quickly, and the issue reports put the responsibility on the data creators to make the corrections.
10 |
11 | The checks are not limited to single columns or single values: they can consider the whole data set, or even work in conjunction with extra data sets, because that's how data often behaves. A sketch of such a cross-column check is given at the end of this README.
12 |
13 | If a model makes certain assumptions about the data then those assumptions can be tested as checks too.
14 |
15 | There is also benefit in performing the checks in bulk, even if they produce issues, as it avoids a stop-start correction process.
16 |
17 | To use this package you should already have a good understanding of how the `pandas` package works.
18 |
19 | ## How to use this repository
20 |
21 | ### Setup environment
22 |
23 | There is a YML file for the main requirements.
24 |
25 | ```
26 | conda env create --file condaenv.yml
27 | ```
28 |
29 | Then you can use `pip` to install the `data_etl` module: navigate to the directory that contains the `setup.py` file and run:
30 |
31 | ```
32 | pip install -e .
33 | ```
34 |
35 | This now means you can import `data_etl` from the environment.
36 |
37 | ## Examples
38 |
39 | There are multiple examples present in the repository, in the `examples` directory.
40 |
41 | Use the `00_create_data.ipynb` notebook to create the data to run the examples on, as well as the SQLite database file that will contain any logged issues or written out data.
42 |
43 | The other files, both `*.ipynb` and `*.py`, are the example files.
44 |
45 | A brief code example of how to use:
46 |
47 | ```python
48 | from data_etl import Checks
49 | import pandas as pd
50 |
51 | data = pd.DataFrame([1, -3, 2], columns=['number'])
52 |
53 | # Initialise the Checks class
54 | ch_simple = Checks('grouping_label', 'key_1', 'key_2', 'key_3')
55 |
56 | # Define a simple check
57 | dict_checks = {
58 | 'Number should be greater than 0': {
59 | 'calc_condition': lambda df, col, **kwargs: df['number'] <= 0
60 | }
61 | }
62 | # Apply the checks to the tables
63 | ch_simple.apply_checks(data, dictionary=dict_checks)
64 |
65 | # Any issues found are stored internally in the class as a pandas DataFrame
66 | ch_simple.df_issues
67 | ```
68 |
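69 | The checks do not have to look at a single column in isolation: the `calc_condition` function receives the whole DataFrame, so a check can compare columns against each other. A minimal sketch, assuming a second, purely illustrative column called `limit`:
70 |
71 | ```python
72 | import pandas as pd
73 |
74 | from data_etl import Checks
75 |
76 | data = pd.DataFrame({'number': [1, -3, 2], 'limit': [2, 5, 1]})
77 |
78 | ch_table = Checks('grouping_label', 'key_1')
79 |
80 | # The condition can use any columns of the table, not just the `col` argument
81 | dict_checks_table = {
82 |     'Number should not exceed its limit': {
83 |         'calc_condition': lambda df, col, **kwargs: df['number'] > df['limit']
84 |     }
85 | }
86 | ch_table.apply_checks(data, dictionary=dict_checks_table)
87 |
88 | # Any rows where `number` exceeds `limit` are recorded in the issues table
89 | ch_table.df_issues
90 | ```
91 |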
--------------------------------------------------------------------------------
/condaenv.yml:
--------------------------------------------------------------------------------
1 | name: data_etl
2 | channels:
3 | - defaults
4 | dependencies:
5 | - python=3.6
6 | - pandas=0.24.0
7 | - pytest=5.0.1
8 | - jupyter=1.0.0
9 | - matplotlib=3.0.3
10 | - xlrd=1.2.0
11 | - pyodbc=4.0.27
12 | - openpyxl=3.0.3
13 |
--------------------------------------------------------------------------------
/data_etl/__init__.py:
--------------------------------------------------------------------------------
1 | from data_etl.data_files import DataCuration
2 | from data_etl.checks import Checks
3 | from data_etl.connections import Connections
4 | from data_etl.general_functions import func_check_for_issues, \
5 | func_initialise_logging, import_attr
6 |
7 | __all__ = [
8 | 'DataCuration', 'Checks', 'Connections', 'func_check_for_issues',
9 | 'func_initialise_logging', 'import_attr'
10 | ]
11 | __version__ = '0.1.0dev'
--------------------------------------------------------------------------------
/data_etl/checks.py:
--------------------------------------------------------------------------------
1 | # Here we are defining a class that will deal with checking data sets
2 | import logging
3 | from inspect import getfullargspec
4 | from copy import deepcopy
5 | from inspect import getsourcelines
6 |
7 | import pandas as pd
8 | import numpy as np
9 |
10 | from data_etl.general_functions import import_attr
11 |
12 | module_logger = logging.getLogger(__name__)
13 |
14 | dict_checks_defaults = {
15 | 'columns': [np.nan],
16 | 'check_condition':
17 | lambda df, col, condition, **kwargs: condition.sum() > 0,
18 | 'count_condition': lambda df, col, condition, **kwargs: condition.sum(),
19 | 'index_position': lambda df, col, condition, **kwargs: condition,
20 | 'relevant_columns': lambda df, col, condition, **kwargs: col,
21 | 'long_description': lambda df, col, condition, **kwargs: "",
22 | 'idx_flag': True,
23 | 'category': np.nan
24 | }
25 |
26 |
27 | class Checks:
28 | __step_no = 0
29 | __key_1 = None
30 | __key_2 = None
31 | __key_3 = None
32 | __grouping = None
33 | df_issues = None
34 | __key_separator = " -:- "
35 | __checks_defaults = None
36 |
37 | def __init__(self, grouping, key_1, key_2=None, key_3=None):
38 | module_logger.info("Initialising `Checks` object")
39 | # Three keys, all good things come in threes
40 | self.__key_1 = str(key_1)
41 | self.__key_2 = str(key_2)
42 | self.__key_3 = str(key_3)
43 | self.__grouping = grouping
44 | self.__checks_defaults = dict(dict_checks_defaults)
45 | # Initialise the `df_issues` table
46 | df_issues = pd.DataFrame(
47 | columns=[
48 | "key_1", "key_2", "key_3", "file", "sub_file", "step_number",
49 | "category", "issue_short_desc", "issue_long_desc", "column",
50 | "issue_count", "issue_idx", "grouping"
51 | ]
52 | )
53 | df_issues["step_number"] = df_issues["step_number"].astype(int)
54 | self.df_issues = df_issues
55 | module_logger.info("Initialising `Checks` object complete")
56 |
57 | def error_handling(self, file, subfile, issue_short_desc, issue_long_desc,
58 | column, issue_count, issue_idx, category=np.nan):
59 | """
60 | If an error is handled, as they all should be, we need to specify what
61 | happens with the error. By putting it into a single function it will
62 | hopefully make the code briefer.
63 | """
64 | # TODO work out how to add in `file` and `subfile` where data is a
65 | # dictionary
66 | module_logger.info("Logging an error with `error_handling`")
67 | df = self.df_issues.copy()
68 | list_vals = [
69 | self.__key_1, self.__key_2, self.__key_3, file, subfile,
70 | self.__step_no, category, issue_short_desc, issue_long_desc, column,
71 | issue_count, issue_idx, self.__grouping
72 | ]
73 | try:
74 | df.loc[df.shape[0]] = list_vals
75 | self.df_issues = df.copy()
76 | except:
77 | var_msg = f"Logging the issue failed, values: {list_vals}"
78 | module_logger.error(var_msg)
79 | raise ValueError(var_msg)
80 | module_logger.info(f"Error logged: {list_vals}")
81 |
82 | def set_defaults(
83 | self, columns=None, check_condition=None, count_condition=None,
84 | index_position=None, relevant_columns=None, long_description=None,
85 | idx_flag=None):
86 | module_logger.info("Starting `set_defaults`")
87 | if columns is not None:
88 | if type(columns).__name__ != 'list':
89 | var_msg = 'The `columns` argument is not a list as required'
90 | module_logger.error(var_msg)
91 | raise ValueError(var_msg)
92 | if len(columns) == 0:
93 | var_msg = ('The `columns` argument is empty, it needs to be '
94 | 'at least length 1, this can be a null')
95 | module_logger.error(var_msg)
96 | raise ValueError(var_msg)
97 | self.__checks_defaults['columns'] = columns
98 | if check_condition is not None:
99 | self.__set_defaults_check(check_condition, 'check_condition')
100 | self.__checks_defaults['check_condition'] = check_condition
101 | if count_condition is not None:
102 | self.__set_defaults_check(count_condition, 'count_condition')
103 | self.__checks_defaults['count_condition'] = count_condition
104 | if index_position is not None:
105 | self.__set_defaults_check(index_position, 'index_position')
106 | self.__checks_defaults['index_position'] = index_position
107 | if relevant_columns is not None:
108 | self.__set_defaults_check(relevant_columns, 'relevant_columns')
109 | self.__checks_defaults['relevant_columns'] = relevant_columns
110 | if long_description is not None:
111 | self.__set_defaults_check(long_description, 'long_description')
112 | self.__checks_defaults['long_description'] = long_description
113 | if idx_flag is not None:
114 | if idx_flag not in [True, False]:
115 | var_msg = 'The value of `idx_flag` needs to be True or False'
116 | module_logger.error(var_msg)
117 | raise ValueError(var_msg)
118 | self.__checks_defaults['idx_flag'] = idx_flag
119 | module_logger.info("Completed `set_defaults`")
120 |
121 | @staticmethod
122 | def __set_defaults_check(function, label):
123 | module_logger.info("Starting `__set_defaults_check`")
124 | if type(function).__name__ != 'function':
125 | var_msg = f'The passed value for `{label}` is not a function'
126 | module_logger.error(var_msg)
127 | raise ValueError(var_msg)
128 | arg_spec = getfullargspec(function)
129 | if arg_spec.args != ['df', 'col', 'condition']:
130 | var_msg = (
131 | f'The arguments passed in for the function `{label}` do not '
132 | f'match the required args: df, col, condition')
133 | module_logger.error(var_msg)
134 | raise ValueError(var_msg)
135 | if arg_spec.varkw != 'kwargs':
136 | var_msg = (f'The **kwargs argument has not been provided for '
137 | f'`{label}` and is required')
138 | module_logger.error(var_msg)
139 | raise ValueError(var_msg)
140 | module_logger.info("Completed `__set_defaults_check`")
141 |
142 | def set_key_separator(self, separator):
143 | module_logger.info("Starting `set_key_separator`")
144 | if (type(separator).__name__ != "str") | (len(separator) == 0):
145 | var_msg = ("The argument `separator` for function "
146 | "`set_key_separator` should be a string of length "
147 | "greater than 0")
148 | module_logger.error(var_msg)
149 | raise ValueError(var_msg)
150 | self.__key_separator = separator
151 | module_logger.info(f"Completed `set_key_separator`, the key separator "
152 | f"is: {self.__key_separator}")
153 |
154 | def apply_checks(
155 | self, tables, path=None, script_name=None,
156 | object_name="dict_checks", dictionary=None, **kwargs):
157 | module_logger.info("Starting `apply_checks`")
158 | if (script_name is not None) & (object_name is not None):
159 | dict_checks = import_attr(path, script_name, object_name)
160 | elif dictionary is not None:
161 | if type(dictionary).__name__ != "dict":
162 | var_msg = "The `dictionary` argument is not a dictionary"
163 | module_logger.error(var_msg)
164 | raise ValueError(var_msg)
165 | dict_checks = dictionary
166 | else:
167 | var_msg = ("Either `dictionary` or both of `script_name` and "
168 | "`path` need to be none null")
169 | module_logger.error(var_msg)
170 | raise ValueError(var_msg)
171 |
172 | if type(tables).__name__ == "dict":
173 | for table_key in tables.keys():
174 | for check_key in dict_checks.keys():
175 | self.__apply_the_check(
176 | tables[table_key], dict_checks[check_key], check_key,
177 | table_key, **kwargs)
178 | elif type(tables).__name__ == "DataFrame":
179 | for check_key in dict_checks.keys():
180 | self.__apply_the_check(tables, dict_checks[check_key],
181 | check_key, np.nan, **kwargs)
182 |
183 | module_logger.info("Completed `apply_checks`")
184 |
185 | def __apply_the_check(
186 | self, df, dict_check_info, check_key, table_key, **kwargs):
187 | module_logger.info(f"Starting check `{check_key}`")
188 | if "calc_condition" not in dict_check_info:
189 | var_msg = "The check requires a value for key `calc_condition`"
190 | module_logger.error(var_msg)
191 | raise AttributeError(var_msg)
192 | func_calc_condition = dict_check_info["calc_condition"]
193 | func_long_description = (
194 | self.__checks_defaults['long_description'] if
195 | "long_description" not in dict_check_info else
196 | dict_check_info["long_description"])
197 | func_check_condition = (
198 | self.__checks_defaults['check_condition'] if
199 | "check_condition" not in dict_check_info else
200 | dict_check_info["check_condition"])
201 | list_columns = (
202 | self.__checks_defaults['columns'] if
203 | "columns" not in dict_check_info else
204 | dict_check_info["columns"])
205 | if type(list_columns).__name__ == 'str':
206 | list_columns = [list_columns]
207 | func_count_condition = (
208 | self.__checks_defaults['count_condition'] if
209 | "count_condition" not in dict_check_info else
210 | dict_check_info["count_condition"])
211 | func_index_position = (
212 | self.__checks_defaults['index_position'] if
213 | "index_position" not in dict_check_info else
214 | dict_check_info["index_position"])
215 | func_relevant_columns = (
216 | self.__checks_defaults['relevant_columns'] if
217 | "relevant_columns" not in dict_check_info else
218 | dict_check_info["relevant_columns"])
219 | var_idx_flag = (
220 | self.__checks_defaults['idx_flag'] if
221 | "idx_flag" not in dict_check_info else
222 | dict_check_info['idx_flag'])
223 | var_category = (
224 | self.__checks_defaults['category'] if
225 | "category" not in dict_check_info else
226 | dict_check_info['category'])
227 | if len(list_columns) == 0:
228 | var_msg = ('The `list_columns` value somehow has length 0, needs '
229 | 'to have at least one element, which can be `np.nan`')
230 | module_logger.error(var_msg)
231 | raise ValueError(var_msg)
232 | for col in list_columns:
233 | self.__evaluate_check(
234 | check_key, df, col, func_calc_condition,
235 | func_check_condition, func_count_condition, func_index_position,
236 | func_relevant_columns, func_long_description, var_idx_flag,
237 | var_category, table_key, **kwargs)
238 |
239 | module_logger.info(f"Completed check `{check_key}`")
240 |
241 | def __evaluate_check(
242 | self, check_key, df, col, func_calc_condition, func_check_condition,
243 | func_count_condition, func_index_position, func_relevant_columns,
244 | func_long_description, var_idx_flag, var_category, table_key,
245 | **kwargs):
246 | module_logger.info(
247 | f"Starting evaluating check `{check_key}` for column {col}")
248 | s_calc_condition = func_calc_condition(df, col, **kwargs)
249 | var_check_condition = func_check_condition(
250 | df, col, s_calc_condition, **kwargs)
251 | var_count_condition = func_count_condition(
252 | df, col, s_calc_condition, **kwargs)
253 | s_index_conditions = func_index_position(
254 | df, col, s_calc_condition, **kwargs)
255 | if var_idx_flag is False:
256 | s_index_conditions = s_index_conditions.map(
257 | {True: False, False: True})
258 | var_relevant_columns = func_relevant_columns(
259 | df, col, s_calc_condition, **kwargs)
260 | var_long_description = func_long_description(
261 | df, col, s_calc_condition, **kwargs)
262 | if type(var_long_description).__name__ != "str":
263 | var_msg = (
264 | f"The variable `var_long_description` is not a string! It is a"
265 | f" {type(var_long_description).__name__}")
266 | module_logger.warning(var_msg)
267 | if (
268 | (type(var_relevant_columns).__name__ != "str") &
269 | (pd.isnull(var_relevant_columns) is False)
270 | ):
271 | var_msg = (
272 | f"The variable `var_relevant_columns` is not a string or null! "
273 | f"It is a {type(var_relevant_columns).__name__}")
274 | module_logger.warning(var_msg)
275 | if "int" not in type(var_count_condition).__name__:
276 | var_msg = (
277 | f"The variable `var_count_condition` is not an integer! It is a"
278 | f" {type(var_count_condition).__name__}")
279 | module_logger.warning(var_msg)
280 | if type(s_calc_condition).__name__ != "Series":
281 | var_msg = (
282 | f"The variable `s_calc_condition` is not a Series! It is a "
283 | f"{type(s_calc_condition).__name__}")
284 | module_logger.warning(var_msg)
285 | if type(s_index_conditions).__name__ != "Series":
286 | var_msg = (
287 | f"The variable `s_index_conditions` is not a Series! It is a "
288 | f"{type(s_index_conditions).__name__}")
289 | module_logger.warning(var_msg)
290 | if (
291 | (type(var_category).__name__ != 'str') &
292 | (pd.isnull(var_category) is False)
293 | ):
294 | var_msg = (f'The variable `category` is not a string or null! It '
295 | f'is a {type(var_category).__name__}')
296 | module_logger.warning(var_msg)
297 | if var_check_condition:
298 | if pd.isnull(table_key):
299 | var_file = np.nan
300 | var_subfile = np.nan
301 | else:
302 | var_file = table_key.split(self.__key_separator)[0]
303 | var_subfile = (table_key.split(self.__key_separator)[1] if
304 | self.__key_separator in table_key else np.nan)
305 | self.error_handling(
306 | var_file, var_subfile, check_key, var_long_description,
307 | var_relevant_columns, var_count_condition,
308 | ", ".join(
309 | [
310 | str(item) for item in
311 | s_index_conditions.loc[
312 | s_index_conditions].index.tolist()
313 | ]
314 | ),
315 | var_category
316 | )
317 | module_logger.info(
318 | f"Completed evaluating check `{check_key}` for column {col}")
319 |
320 | def get_issue_count(self, issue_number_min=None, issue_number_max=None):
321 | module_logger.info("Starting `get_issue_count`")
322 | df = self.df_issues.copy()
323 | if issue_number_min is not None:
324 | df = df.loc[df["step_number"] >= issue_number_min].copy()
325 | if issue_number_max is not None:
326 | df = df.loc[df["step_number"] <= issue_number_max].copy()
327 | var_count = df.shape[0]
328 | module_logger.info("Completed `get_issue_count`")
329 | return var_count
330 |
331 | def table_look(self, table, issue_idx):
332 | module_logger.info("Starting `table_look`")
333 | if issue_idx not in self.df_issues.index.tolist():
334 | var_msg = (f"The requested issue index, {issue_idx}, is not "
335 | f"present in the `df_issues` table")
336 | module_logger.error(var_msg)
337 | raise AttributeError(var_msg)
338 | if type(table).__name__ != 'DataFrame':
339 | var_msg = 'The `table` argument is not a DataFrame as required'
340 | module_logger.error(var_msg)
341 | raise ValueError(var_msg)
342 | df_check = table.loc[
343 | [
344 | int(item) for item in
345 | self.df_issues.loc[issue_idx, "issue_idx"].split(", ")
346 | ]
347 | ]
348 | module_logger.info("Completed `table_look`")
349 | return self.df_issues.loc[[issue_idx]], df_check
350 |
351 | @staticmethod
352 | def __func_summary_(key_value):
353 | if type(key_value).__name__ == 'function':
354 | var_out = ''.join([
355 | x.strip().strip("['\\n']") for x in
356 | getsourcelines(key_value)[0]
357 | ])
358 | if (var_out.strip()[-1] == ':') | (var_out.strip()[-1] == '('):
359 | return ('raise Exception("The definition does not allow for'
360 | ' this info to be retrieved")')
361 | var_out = var_out.split(':')[-1].strip()
362 | if var_out[-1] == ',':
363 | var_out = var_out[:-1]
364 | return var_out
365 | else:
366 | return key_value
367 |
368 | def summary(self, path=None, script_name=None,
369 | object_name="dict_checks", dictionary=None):
370 | if (script_name is not None) & (object_name is not None):
371 | dict_checks = import_attr(path, script_name, object_name)
372 | elif dictionary is not None:
373 | if type(dictionary).__name__ != "dict":
374 | var_msg = "The `dictionary` argument is not a dictionary"
375 | module_logger.error(var_msg)
376 | raise ValueError(var_msg)
377 | dict_checks = dictionary
378 | else:
379 | var_msg = ("Either `dictionary` or both of `script_name` and "
380 | "`path` need to be none null")
381 | module_logger.error(var_msg)
382 | raise ValueError(var_msg)
383 |
384 | list_keys = [
385 | 'calc_condition', 'long_description', 'check_condition', 'columns',
386 | 'count_condition', 'index_position', 'relevant_columns', 'idx_flag',
387 | 'category'
388 | ]
389 |
390 | dict_checks_values = deepcopy(dict_checks)
391 | for check in [key for key in dict_checks_values.keys()]:
392 | for key in [key for key in list_keys if
393 | key not in dict_checks_values[check].keys()]:
394 | dict_checks_values[check][key] = self.__checks_defaults[key]
395 |
396 | for check in [key for key in dict_checks_values.keys()]:
397 | for key in [key for key in dict_checks_values[check].keys()]:
398 | dict_checks_values[check][key] = self.__func_summary_(
399 | dict_checks_values[check][key])
400 |
401 | df_summary = pd.DataFrame(
402 | dict_checks_values
403 | ).T.reset_index().rename(columns={'index': 'check'})
404 |
405 | return {'df': df_summary, 'dict': dict_checks}
406 |
407 | def set_step_no(self, step_no):
408 | """
409 | Set the step number, this allows errors to be recorded against a
410 | specific step which in turn can help with issue tracking and checking
411 | once issues are recorded.
412 |
413 | The argument step_no needs to be convertible to integer format.
414 | """
415 | module_logger.info("Starting `set_step_no`")
416 | try:
417 | self.__step_no = int(step_no)
418 | except ValueError:
419 | var_msg = (f"Function set_step_no: The value {step_no} can not be "
420 | f"converted to int.")
421 | module_logger.error(var_msg)
422 | raise ValueError(var_msg)
423 | module_logger.info(
424 | f"Completed `set_step_no`, the step number is {self.__step_no}")
425 |
426 | def get_step_no(self):
427 | module_logger.info("Starting `get_step_no`")
428 | module_logger.info("Completed `get_step_no`")
429 | return self.__step_no
430 |
--------------------------------------------------------------------------------
/data_etl/connections.py:
--------------------------------------------------------------------------------
1 | # Here we are defining a class that will deal with the various connections
2 | # required by the pipeline
3 | import logging
4 | import sqlite3
5 | import os
6 | import configparser
7 |
8 | import pandas as pd
9 | import pyodbc
10 |
11 | from data_etl.general_functions import func_to_sql
12 |
13 | module_logger = logging.getLogger(__name__)
14 | # TODO account for tables not existing and existing when writing to the cnx,
15 | # ideally any tables used should have been pre-emptively setup in the required
16 | # databases
17 | # TODO add MSSQL connection handling
18 |
19 |
20 | class Connections:
21 | __step_no = 0
22 | __df_issues = None
23 | __dict_cnx = None
24 |
25 | def __init__(self, step_no=None):
26 | module_logger.info("Initialising `Connections` object")
27 | if step_no is not None:
28 | self.set_step_no(step_no)
29 | self.__dict_cnx = {
30 | 'blank': {'cnx_type': 'blank'}
31 | }
32 | module_logger.info("Initialising `Connections` object complete")
33 |
34 | def set_step_no(self, step_no):
35 | module_logger.info(f"Starting `set_step_no`")
36 | self.__step_no = step_no
37 | module_logger.info(f"Completed `set_step_no`")
38 |
39 | def get_step_no(self):
40 | module_logger.info("Starting `get_step_no`")
41 | module_logger.info("Completed `get_step_no`")
42 | return self.__step_no
43 |
44 | def add_cnx(self, cnx_key, cnx_type, table_name, cnx_string=None,
45 | file_path=None, config_section=None, overwrite=False,
46 | timestamp_format='%Y-%m-%d', **kwargs):
47 | module_logger.info(f"Starting `add_cnx` for cnx key `{cnx_key}`")
48 | # TODO check whether the file exists, if not then error out
49 | if (cnx_key in self.__dict_cnx) & (overwrite is False):
50 | var_msg = ('This connection string is already set, use the '
51 | 'argument `overwrite=True` to overwrite')
52 | module_logger.error(var_msg)
53 | raise ValueError(var_msg)
54 | if cnx_type not in ['sqlite3', 'db']:
55 | var_msg = (
56 | 'The `cnx_type` argument only takes values `sqlite3`, `db`')
57 | module_logger.error(var_msg)
58 | raise AttributeError(var_msg)
59 | if (table_name is None) & (cnx_type in ['sqlite3', 'db']):
60 | var_msg = 'The argument `table_name` is required'
61 | module_logger.error(var_msg)
62 | raise AttributeError(var_msg)
63 | if (file_path is None) & (cnx_type in ['sqlite3', 'db']):
64 | var_msg = 'The argument `file_path` is required'
65 | module_logger.error(var_msg)
66 | raise AttributeError(var_msg)
67 | if (
68 | (not os.path.exists(file_path)) &
69 | (cnx_string is None) &
70 | (cnx_type in ['db'])
71 | ):
72 | var_msg = (
73 | f'The `file_path` to the config file {file_path} is not valid, '
74 | f'the `file_path` is expected since the `cnx_string` is None'
75 | )
76 | module_logger.error(var_msg)
77 | raise AttributeError(var_msg)
78 | if (
79 | (not os.path.exists(os.path.dirname(file_path))) &
80 | (cnx_type in ['sqlite3'])
81 | ):
82 | var_msg = (
83 | f'The folder path {os.path.dirname(file_path)} is not valid')
84 | module_logger.error(var_msg)
85 | raise AttributeError(var_msg)
86 | if (not os.path.exists(file_path)) & (cnx_type in ['sqlite3']):
87 | var_msg = (f'The `file_path` {file_path} is not valid so this '
88 | f'file will be created')
89 | module_logger.warning(var_msg)
90 | if cnx_type == 'sqlite3':
91 | module_logger.info(
92 | f'The information is: {cnx_type}, {file_path}, {table_name}')
93 | self.__dict_cnx[cnx_key] = {
94 | 'cnx_type': cnx_type,
95 | 'file_path': file_path,
96 | 'table_name': table_name
97 | }
98 | elif cnx_type == 'db':
99 | if (config_section is None) & (cnx_string is None):
100 | var_msg = ('The argument `config_section` or `cnx_string` is '
101 | 'required for `cnx_type=db`')
102 | module_logger.error(var_msg)
103 | raise AttributeError(var_msg)
104 | if config_section is not None:
105 | dict_config = configparser.ConfigParser()
106 | dict_config.read(file_path)
107 | var_cnx_string = ''.join(
108 | [
109 | f"{key}={dict_config[config_section][key]};" for
110 | key in dict_config[config_section]
111 | ]
112 | )
113 | self.__dict_cnx[cnx_key] = {
114 | 'cnx_type': cnx_type,
115 | 'file_path': file_path,
116 | 'cnx_string': var_cnx_string,
117 | 'table_name': table_name,
118 | 'timestamp_format': timestamp_format
119 | }
120 | elif cnx_string is not None:
121 | self.__dict_cnx[cnx_key] = {
122 | 'cnx_type': cnx_type,
123 | 'file_path': file_path,
124 | 'cnx_string': cnx_string,
125 | 'table_name': table_name,
126 | 'timestamp_format': timestamp_format
127 | }
128 | self.test_cnx(cnx_key, **kwargs)
129 | module_logger.info("Completed `add_cnx`")
130 |
131 | def test_cnx(self, cnx_key, **kwargs):
132 | module_logger.info(f"Starting `test_cnx` for cnx key `{cnx_key}`")
133 | if cnx_key not in self.__dict_cnx:
134 | var_msg = f'The key {cnx_key} is not present'
135 | module_logger.error(var_msg)
136 | raise AttributeError(var_msg)
137 | dict_cnx = self.__dict_cnx[cnx_key]
138 | var_cnx_type = dict_cnx['cnx_type']
139 | if var_cnx_type == 'sqlite3':
140 | cnx = sqlite3.connect(dict_cnx['file_path'])
141 | if kwargs.get('sqlite_df_issues_create') is True:
142 | var_create_table_sql = """
143 | CREATE TABLE IF NOT EXISTS {} (
144 | key_1 text,
145 | key_2 text,
146 | key_3 text,
147 | file text,
148 | sub_file text,
149 | step_number integer,
150 | category text,
151 | issue_short_desc text,
152 | issue_long_desc text,
153 | column text,
154 | issue_count integer,
155 | issue_idx text,
156 | grouping text
157 | );
158 | """.format(dict_cnx['table_name'])
159 | cnx.execute(var_create_table_sql)
160 | try:
161 | pd.read_sql(
162 | f"SELECT * FROM {dict_cnx['table_name']} LIMIT 0;",
163 | cnx
164 | )
165 | cnx.close()
166 | except:
167 | cnx.close()
168 | var_msg = 'Reading in from the table has not worked'
169 | module_logger.error(var_msg)
170 | raise AttributeError(var_msg)
171 | elif var_cnx_type == 'db':
172 | cnx = pyodbc.connect(dict_cnx['cnx_string'])
173 | try:
174 | pd.read_sql(
175 | f"SELECT TOP (0) * FROM {dict_cnx['table_name']};",
176 | cnx
177 | )
178 | cnx.close()
179 | except:
180 | cnx.close()
181 | module_logger.info("Completed `test_cnx`")
182 |
183 | def read_from_db(self, cnx_key, sql_stmt):
184 | module_logger.info("Starting `read_from_db`")
185 | module_logger.info(f'Sql statement: {sql_stmt}')
186 | dict_cnx = self.__dict_cnx[cnx_key]
187 | var_cnx_type = dict_cnx['cnx_type']
188 | df = pd.DataFrame()
189 | if var_cnx_type == 'blank':
190 | var_msg = 'Trying to use `read_from_db` using a blank connection'
191 | module_logger.error(var_msg)
192 | raise ValueError(var_msg)
193 | elif var_cnx_type == 'sqlite3':
194 | cnx = sqlite3.connect(dict_cnx['file_path'])
195 | try:
196 | df = pd.read_sql(sql_stmt, cnx)
197 | cnx.close()
198 | except:
199 | cnx.close()
200 | var_msg = 'Reading in using a `sqlite3` connection has failed'
201 | module_logger.error(var_msg)
202 | raise ValueError(var_msg)
203 | elif var_cnx_type == 'db':
204 | cnx = pyodbc.connect(dict_cnx['cnx_string'])
205 | try:
206 | df = pd.read_sql(sql_stmt, cnx)
207 | cnx.close()
208 | except:
209 | cnx.close()
210 | var_msg = 'Reading in using a `db` connection has failed'
211 | module_logger.error(var_msg)
212 | raise ValueError(var_msg)
213 | module_logger.info("Completed `read_from_db`")
214 | return df
215 |
216 | def write_to_db(self, cnx_key, table, batch_size=None,
217 | flag_sql_logging=False):
218 | module_logger.info("Starting `write_to_db`")
219 | dict_cnx = self.__dict_cnx[cnx_key]
220 | var_cnx_type = dict_cnx['cnx_type']
221 | # Temp table first
222 | var_write_works = 0
223 | if var_cnx_type == 'blank':
224 | var_write_works += 1
225 | elif var_cnx_type == 'sqlite3':
226 | cnx = sqlite3.connect(dict_cnx['file_path'])
227 | cursor = cnx.cursor()
228 | var_sql = (f"CREATE TEMP TABLE temp.{dict_cnx['table_name']} AS "
229 | f"SELECT * FROM {dict_cnx['table_name']} LIMIT 0;")
230 | module_logger.info(var_sql)
231 | cursor.execute(var_sql)
232 | cnx.commit()
233 | for idx in table.index.tolist():
234 | var_sql = "INSERT INTO temp.{} VALUES ({});".format(
235 | dict_cnx['table_name'],
236 | ', '.join(
237 | table.loc[idx].map(
238 | lambda value: 'NULL' if pd.isnull(value) else
239 | f"'{str(value)}'"
240 | ).astype(str).values.tolist()
241 | )
242 | )
243 | if flag_sql_logging:
244 | module_logger.info(var_sql)
245 | cursor.execute(var_sql)
246 | cnx.commit()
247 |
248 | df_test = pd.read_sql(
249 | f"SELECT * FROM temp.{dict_cnx['table_name']}", cnx)
250 |
251 | if df_test.shape[0] == table.shape[0]:
252 | var_write_works += 1
253 |
254 | cnx.close()
255 | elif var_cnx_type == 'db':
256 | cnx = pyodbc.connect(dict_cnx['cnx_string'])
257 | cursor = cnx.cursor()
258 |
259 | var_sql = (f"DROP TABLE IF EXISTS #Temp "
260 | f"SELECT TOP(0) * INTO #Temp "
261 | f"FROM {dict_cnx['table_name']}")
262 | module_logger.info(var_sql)
263 | cursor.execute(var_sql)
264 | cnx.commit()
265 |
266 | var_sql_template = "INSERT INTO #Temp ([{}]) VALUES {}".format(
267 | "], [".join(table.columns.tolist()),
268 | '{}'
269 | )
270 | module_logger.info(var_sql_template)
271 | s_sql_values = table.apply(
272 | lambda s: s.map(
273 | lambda x: func_to_sql(x, dict_cnx['timestamp_format']))
274 | ).apply(
275 | lambda r: f"({', '.join(r)})", axis=1)
276 | var_iloc_min = 0
277 | for i in range(1, int(s_sql_values.shape[0] / batch_size) + 2):
278 | s_filtered = s_sql_values.iloc[
279 | var_iloc_min:(i * batch_size)]
280 | var_sql = var_sql_template.format(
281 | ", ".join(s_filtered.values.tolist()))
282 | if flag_sql_logging:
283 | module_logger.info(var_sql)
284 | cursor.execute(var_sql)
285 | cnx.commit()
286 | var_iloc_min = i * batch_size
287 |
288 | df_test = pd.read_sql("SELECT * FROM #Temp", cnx)
289 |
290 | if df_test.shape[0] == table.shape[0]:
291 | var_write_works += 1
292 |
293 | cnx.close()
294 |
295 | if var_write_works == 0:
296 | var_msg = ('The writing to a temporary table has not worked, '
297 | 'will not try writing to main table')
298 | module_logger.error(var_msg)
299 | raise ValueError(var_msg)
300 | if var_write_works > 1:
301 | var_msg = ('The writing to a temporary table has happened '
302 | 'multiple times, will not try writing to main table')
303 | module_logger.error(var_msg)
304 | raise ValueError(var_msg)
305 | # Then move to the main table only if the temporary table worked
306 | if var_cnx_type == 'blank':
307 | pass
308 | elif var_cnx_type == 'sqlite3':
309 | cnx = sqlite3.connect(dict_cnx['file_path'])
310 | try:
311 | table.to_sql(dict_cnx['table_name'], cnx,
312 | index=False, if_exists='append')
313 | cnx.close()
314 | except:
315 | cnx.close()
316 | var_msg = 'Writing to the table has not worked'
317 | module_logger.error(var_msg)
318 | raise ValueError(var_msg)
319 | elif var_cnx_type == 'db':
320 | cnx = pyodbc.connect(dict_cnx['cnx_string'])
321 | cursor = cnx.cursor()
322 | try:
323 | var_sql_template = "INSERT INTO {} ([{}]) VALUES {}".format(
324 | dict_cnx['table_name'],
325 | "], [".join(table.columns.tolist()),
326 | '{}'
327 | )
328 | s_sql_values = table.apply(
329 | lambda s: s.map(
330 | lambda x: func_to_sql(x, dict_cnx['timestamp_format']))
331 | ).apply(
332 | lambda r: f"({', '.join(r)})", axis=1)
333 | var_iloc_min = 0
334 | for i in range(1, int(s_sql_values.shape[0] / batch_size) + 2):
335 | s_filtered = s_sql_values.iloc[
336 | var_iloc_min:(i * batch_size)]
337 | var_sql = var_sql_template.format(
338 | ", ".join(s_filtered.values.tolist()))
339 | if flag_sql_logging:
340 | module_logger.info(var_sql)
341 | cursor.execute(var_sql)
342 | cnx.commit()
343 | var_iloc_min = i * batch_size
344 | cnx.close()
345 | except:
346 | cnx.close()
347 | var_msg = 'Writing to the table has not worked'
348 | module_logger.error(var_msg)
349 | raise ValueError(var_msg)
350 |
351 | module_logger.info("Completed `write_to_db`")
352 |
353 | def get_cnx_keys(self):
354 | module_logger.info("Starting `get_cnx_keys`")
355 | module_logger.info("Completed `get_cnx_keys`")
356 | return [x for x in self.__dict_cnx.keys()]
357 |
--------------------------------------------------------------------------------
/data_etl/data_files.py:
--------------------------------------------------------------------------------
1 | # Here we are defining a class that will deal with all the data storage and
2 | # manipulations
3 | import logging
4 |
5 | import pandas as pd
6 | import numpy as np
7 |
8 | from data_etl.general_functions import import_attr
9 |
10 | module_logger = logging.getLogger(__name__)
11 |
12 |
13 | class DataCuration:
14 | __step_no = 0
15 | df_issues = None
16 | headers = None
17 | __key_1 = None
18 | __key_2 = None
19 | __key_3 = None
20 | __grouping = None
21 | tables = None
22 | formed_tables = None
23 | list_files = None
24 | __key_separator = " -:- "
25 | __link_headers = None
26 |
27 | def __init__(self, grouping, key_1, key_2=None, key_3=None):
28 | """
29 | All data actions are taken on all tables, the aim is to process data to
30 | end up with a uniform data set that can be utilised and is consistent.
31 |
32 | The three arguments are individual identifiers for the data.
33 |
34 | The end form would be a pipeline that has regular data ingests.
35 | """
36 | module_logger.info("Initialising `DataCuration` object")
37 | # Three keys, all good things come in threes
38 | self.__key_1 = str(key_1)
39 | self.__key_2 = str(key_2)
40 | self.__key_3 = str(key_3)
41 | self.__grouping = grouping
42 | # sub_file, e.g. sheet for a spreadsheet, may not always be applicable
43 | df_issues = pd.DataFrame(
44 | columns=[
45 | "key_1", "key_2", "key_3", "file", "sub_file", "step_number",
46 | "category", "issue_short_desc", "issue_long_desc", "column",
47 | "issue_count", "issue_idx", "grouping"
48 | ]
49 | )
50 | df_issues["step_number"] = df_issues["step_number"].astype(int)
51 | self.df_issues = df_issues
52 | self.tables = dict()
53 | self.formed_tables = dict()
54 | self.list_files = list()
55 | self.__link_headers = dict()
56 | module_logger.info("Initialising `DataCuration` object complete")
57 |
58 | def error_handling(self, file, subfile, issue_short_desc, issue_long_desc,
59 | column, issue_count, issue_idx, category=np.nan):
60 | """
61 | If an error is handled, as they all should be, we need to specify what
62 | happens with the error. By putting it into a single function it will
63 | hopefully make the code briefer.
64 | """
65 | module_logger.info("Logging an error with `error_handling`")
66 | df = self.df_issues.copy()
67 | list_vals = [
68 | self.__key_1, self.__key_2, self.__key_3, file, subfile,
69 | self.__step_no, category, issue_short_desc, issue_long_desc, column,
70 | issue_count, issue_idx, self.__grouping
71 | ]
72 | try:
73 | df.loc[df.shape[0]] = list_vals
74 | self.df_issues = df.copy()
75 | except:
76 | var_msg = f"Logging the issue failed for values: {list_vals}"
77 | module_logger.error(var_msg)
78 | raise ValueError(var_msg)
79 | module_logger.info(f"Error logged: {list_vals}")
80 |
81 | def set_step_no(self, step_no):
82 | """
83 | Set the step number, this allows errors to be recorded against a
84 | specific step which in turn can help with issue tracking and checking
85 | once issues are recorded.
86 |
87 | The argument step_no needs to be convertible to integer format.
88 | """
89 | module_logger.info("Starting `set_step_no`")
90 | try:
91 | self.__step_no = int(step_no)
92 | except ValueError:
93 | var_msg = (f"Function set_step_no: The value {step_no} can not be "
94 | f"converted to int.")
95 | module_logger.error(var_msg)
96 | raise ValueError(var_msg)
97 | module_logger.info(
98 | f"Completed `set_step_no`, the step number is {self.__step_no}")
99 |
100 | def set_key_separator(self, separator):
101 | """
102 | The key separator is used in the error handling section to split out the
103 | file and sub file portions of the dictionary keys of the files read in.
104 |
105 | So if you have a key of 'file name -:- sheet name', for tables read in
106 | from an Excel file, and an issue is found, the associated issues log
107 | entry will then have a file value of 'file name' and a sub file value of
108 | 'sheet name'.
109 | """
110 | module_logger.info("Starting `set_key_separator`")
111 | if (type(separator).__name__ != "str") | (len(separator) == 0):
112 | var_msg = ("The argument `separator` for function "
113 | "`set_key_separator` should be a string of length "
114 | "greater than 0")
115 | module_logger.error(var_msg)
116 | raise ValueError(var_msg)
117 | self.__key_separator = separator
118 | module_logger.info(f"Completed `set_key_separator`, the key separator "
119 | f"is: {self.__key_separator}")
120 |
121 | def set_file_list(self, list_files, append=False):
122 | """
123 | If there is a known list of files then define them here rather than
124 | setting a function to find the files.
125 | """
126 | module_logger.info("Starting `set_file_list`")
127 | var_type = type(list_files).__name__
128 | if (var_type != "list") & (var_type != "str"):
129 | var_msg = ("The type of the `list_files` argument is not a list or "
130 | "a string.")
131 | module_logger.error(var_msg)
132 | raise ValueError(var_msg)
133 | elif var_type == "str":
134 | if len(list_files) == 0:
135 | var_msg = ("The length of the `list_files` argument is 0, it "
136 | "needs to be a valid value.")
137 | module_logger.error(var_msg)
138 | raise ValueError(var_msg)
139 | list_files = [list_files]
140 | elif var_type == 'list':
141 | if len(list_files) == 0:
142 | var_msg = ("The length of the `list_files` argument is 0, it "
143 | "needs to be a valid value.")
144 | module_logger.error(var_msg)
145 | raise ValueError(var_msg)
146 | list_files = list_files
147 | else:
148 | var_msg = (f"Unhandled type for function `set_file_list`: "
149 | f"{var_type}")
150 | module_logger.error(var_msg)
151 | raise ValueError(var_msg)
152 |
153 | if append:
154 | self.list_files += list_files
155 | else:
156 | self.list_files = list_files
157 | module_logger.info(f"Completed `set_file_list`, the list of files is: "
158 | f"{self.list_files}")
159 |
160 | def find_files(self, path=None, script_name=None,
161 | func_name="list_the_files", function=None, files_path='.',
162 | append=False, **kwargs):
163 | """
164 | Using an externally defined function, as specified via the `script_name`
165 | argument, acquire a list of files to be read in.
166 |
167 | In the case that we want to accumulate a list of files from different
168 | main paths there is an append option.
169 | """
170 | module_logger.info("Starting `find_files`")
171 | # TODO move this to an internal function as it's used so often!
172 | if script_name is not None:
173 | function = import_attr(path, script_name, func_name)
174 | elif function is not None:
175 | if type(function).__name__ != "function":
176 | var_msg = "The `function` argument needs to be a function"
177 | module_logger.error(var_msg)
178 | raise ValueError(var_msg)
179 | else:
180 | var_msg = ("One of `script_name` or `function` needs to be not "
181 | "None in the function `find_files`")
182 | module_logger.error(var_msg)
183 | raise ValueError(var_msg)
184 | list_files = function(files_path, **kwargs)
185 | # TODO move these to be calls on the self.set_file_list function instead
186 | # of setting the value here
187 | if append:
188 | self.list_files += list_files
189 | else:
190 | self.list_files = list_files
191 | module_logger.info(
192 | f"Completed `find_files`, the list of files is: {self.list_files}")
193 |
194 | def reading_in(self, path=None, script_name=None, func_name="read_files",
195 | function=None, overwrite=True, **kwargs):
196 | """
197 | Using an externally defined reading in function, and the internally
198 | defined list of files, read in each of the tables required.
199 |
200 | `path` being the relative script file path
201 | """
202 | module_logger.info("Starting `reading_in`")
203 | if type(self.tables).__name__ != "dict":
204 | var_msg = ("The tables need to be in dictionary format for this "
205 | "`self.reading_in` step")
206 | module_logger.error(var_msg)
207 | raise ValueError(var_msg)
208 | if function is not None:
209 | if type(function).__name__ != "function":
210 | var_msg = ("The function passed to `self.reading_in` is not a "
211 | "function.")
212 | module_logger.error(var_msg)
213 | raise ValueError(var_msg)
214 | elif script_name is not None:
215 | function = import_attr(path, script_name, func_name)
216 | else:
217 | var_msg = ("One of the `function` or `script_name` arguments needs "
218 | "to be completed. And if `script name is then `path` "
219 | "needs to be too.")
220 | module_logger.error(var_msg)
221 | raise ValueError(var_msg)
222 |
223 | try:
224 | dfs = function(self.list_files, **kwargs)
225 | except AttributeError:
226 | if len([x for x in kwargs.keys()]) > 0:
227 | var_msg = (f"Function reading_in, kwargs may have been passed "
228 | f"when the function {func_name} in the script "
229 | f"{script_name} does not take kwargs")
230 | else:
231 | var_msg = (f"Function reading in: The {func_name} function "
232 | f"does not exist in the {script_name} script.")
233 | module_logger.error(var_msg)
234 | raise AttributeError(var_msg)
235 | if overwrite is False:
236 | df_org = self.tables.copy()
237 | df_org.update(dfs)
238 | elif overwrite is True:
239 | pass
240 | else:
241 | var_msg = ("The attribute `overwrite` in the function "
242 | "`reading_in` needs to be `True` or `False`")
243 | module_logger.error(var_msg)
244 | raise ValueError(var_msg)
245 | self.set_table(dfs, overwrite=overwrite)
246 | if type(dfs).__name__ == "DataFrame":
247 | module_logger.info(f"The table has shape '{dfs.shape}'")
248 | else:
249 | for key in dfs:
250 | module_logger.info(
251 | f"The table with key '{key}' has shape '{dfs[key].shape}'")
252 |
253 | module_logger.info("Completed `reading_in`")
254 |
255 | def set_table(self, tables, dict_key=None, overwrite=True):
256 | """
257 | If self.tables is a dictionary set df to key else overwrite existing
258 | table if argument is True
259 | """
260 | module_logger.info("Starting `set_table`")
261 | if (overwrite is True) & (dict_key is None):
262 | self.tables = tables
263 | elif (
264 | (overwrite is True) &
265 | (dict_key is not None) &
266 | (type(self.tables).__name__ == 'dict') &
267 | (type(tables).__name__ == 'DataFrame')
268 | ):
269 | self.tables[dict_key] = tables
270 | elif (
271 | (overwrite is False) &
272 | (dict_key is not None) &
273 | (type(self.tables).__name__ == 'dict') &
274 | (type(tables).__name__ == 'DataFrame')
275 | ):
276 | if dict_key not in [key for key in self.tables.keys()]:
277 | self.tables[dict_key] = tables
278 | else:
279 | var_msg = (
280 | f'The combination of attributes has resulted in no change: '
281 | f'`self.tables` type - {type(self.tables).__name__}, '
282 | f'`tables` type - {type(tables).__name__}, `dict_key` - '
283 | f'{dict_key}, `overwrite` - {overwrite}')
284 | module_logger.error(var_msg)
285 | raise AttributeError(var_msg)
286 | else:
287 | var_msg = (
288 | f'The combination of attributes has resulted in no change: '
289 | f'`self.tables` type - {type(self.tables).__name__}, `tables` '
290 | f'type - {type(tables).__name__}, `dict_key` - {dict_key}, '
291 | f'`overwrite` - {overwrite}')
292 | module_logger.error(var_msg)
293 | raise AttributeError(var_msg)
294 | module_logger.info("Completed `set_table`")
295 |
296 | def concatenate_tables(self):
297 | """
298 | Where the tables are in a dictionary format put them into a DataFrame
299 | """
300 | module_logger.info("Starting `concatenate_tables`")
301 | if type(self.tables).__name__ != "dict":
302 | var_msg = ("For the function `concatenate_tables` the `tables` "
303 | "should be in dictionary format")
304 | module_logger.error(var_msg)
305 | raise ValueError(var_msg)
306 | if len([key for key in self.tables.keys()]) > 1:
307 | df = pd.concat(self.tables, axis=1)
308 | elif len([key for key in self.tables.keys()]) == 1:
309 | dict_df = self.tables.copy()
310 | dict_key = [key for key in dict_df.keys()][0]
311 | df = dict_df[dict_key].copy()
312 | df['level_0'] = dict_key
313 | else:
314 | var_msg = "The dictionary `self.tables` is empty"
315 | module_logger.error(var_msg)
316 | raise AttributeError(var_msg)
317 | self.set_table(df, overwrite=True)
318 | module_logger.info("Completed `concatenate_tables`")
319 |
320 | def dictionary_tables(self, key=None):
321 | """
322 | Where the tables are in a DataFrame format put them in a dictionary,
323 | using the values in the key column as the new dictionary keys
324 | """
325 | module_logger.info("Starting `dictionary_tables`")
326 | if type(self.tables).__name__ != "DataFrame":
327 | var_msg = ("For the function `dictionary_tables` the `tables` "
328 | "should be in DataFrame format.")
329 | module_logger.error(var_msg)
330 | raise ValueError(var_msg)
331 | df = self.tables
332 | dict_dfs = dict()
333 |
334 | if key is not None:
335 | var_cycle = key
336 | else:
337 | var_cycle = "level_0"
338 | if var_cycle not in self.tables.columns.tolist():
339 | var_msg = f"There is no {var_cycle} column present in the table"
340 | module_logger.error(var_msg)
341 | raise ValueError(var_msg)
342 | for val in df[var_cycle].unique().tolist():
343 | dict_dfs[val] = df.loc[df[var_cycle] == val].copy()
344 | self.set_table(dict_dfs)
345 |
346 | module_logger.info("Completed `dictionary_tables`")
347 |
348 | def set_comparison_headers(
349 | self, path=None, script_name=None, func_name="read_headers",
350 | function=None, dictionary=None, **kwargs):
351 | # TODO Need to see if we can isolate just a set of new tables? Maybe
352 | # have a list of dictionary keys that have had their headers done
353 | # already?
354 | module_logger.info("Starting `set_comparison_headers`")
355 |
356 | if function is not None:
357 | if type(function).__name__ != "function":
358 | var_msg = ("The function passed to "
359 | "`self.set_comparison_headers` is not a function.")
360 | module_logger.error(var_msg)
361 | raise ValueError(var_msg)
362 | elif script_name is not None:
363 | function = import_attr(path, script_name, func_name)
364 | elif dictionary is not None:
365 | def function(**kwargs): return dictionary
366 | else:
367 | var_msg = ("One of the `function` or `script_name` arguments needs "
368 | "to be completed. And if `script name is then `path` "
369 | "needs to be too.")
370 | module_logger.error(var_msg)
371 | raise ValueError(var_msg)
372 |
373 | try:
374 | dict_headers = function(**kwargs)
375 | except AttributeError:
376 | if len([x for x in kwargs.keys()]) > 0:
377 | var_msg = (
378 | f"Function set_comparison_headers, kwargs may have been "
379 | f"passed when the function {func_name} in the script "
380 | f"{script_name} does not take kwargs")
381 | else:
382 | var_msg = (
383 | f"Function set_comparison_headers: The {func_name} function"
384 | f" does not exist in the {script_name} script.")
385 | module_logger.error(var_msg)
386 | raise AttributeError(var_msg)
387 |
388 | if type(dict_headers).__name__ != 'dict':
389 | var_msg = 'The headers output should be a dictionary'
390 | module_logger.error(var_msg)
391 | raise Exception(var_msg)
392 | list_keys = [
393 | key for key in dict_headers.keys() if key != 'ideal_headers']
394 | list_keys = [
395 | key for key in list_keys if
396 | (dict_headers[key].get('expected_headers') is None) |
397 | (dict_headers[key].get('new_headers') is None) |
398 | (dict_headers[key].get('remove') is None)
399 | ]
400 | if len(list_keys) > 0:
401 | var_msg = (
402 | f'There are dictionary keys that do not have all the required '
403 | f'values: {", ".join([str(key) for key in list_keys])}')
404 | module_logger.error(var_msg)
405 | raise Exception(var_msg)
406 | if dict_headers.get('ideal_headers') is None:
407 | var_msg = ('There needs to be a key to the headers dictionary that'
408 | ' is "ideal_headers"')
409 | module_logger.error(var_msg)
410 | raise Exception(var_msg)
411 | if type(dict_headers.get('ideal_headers')).__name__ != 'list':
412 | var_msg = 'The value of key "ideal_headers" needs to be a list'
413 | module_logger.error(var_msg)
414 | raise Exception(var_msg)
415 |
416 | self.headers = dict(dict_headers)
417 |
418 | module_logger.info(
419 | f"There are {len(dict_headers)} header keys and they are: "
420 | f"{', '.join([key for key in dict_headers.keys()])}")
421 |
422 | module_logger.info("Completed `set_comparison_headers`")
423 |
424 | @staticmethod
425 | def _link_headers(tables, headers, **kwargs):
426 | dict_link = dict()
427 | list_headers_keys = [
428 | key for key in headers.keys() if key != 'ideal_headers']
429 | if type(tables).__name__ == 'dict':
430 | for df_key in [key for key in tables.keys()]:
431 | for header_set in list_headers_keys:
432 | list_expected = headers[header_set]['expected_headers']
433 | if list_expected == tables[
434 | df_key].iloc[:len(list_expected)].values.tolist()[0]:
435 | dict_link[df_key] = header_set
436 | break
437 | else:
438 | for header_set in list_headers_keys:
439 | list_expected = headers[header_set]['expected_headers']
440 | if list_expected == tables.iloc[
441 | :len(list_expected)].values.tolist()[0]:
442 | dict_link['combined'] = header_set
443 | break
444 | return dict_link
445 |
446 | def link_headers(self, path=None, script_name=None,
447 | func_name="link_headers", function=None, **kwargs):
448 | # TODO Need to see if we can isolate just a set of new tables? Maybe
449 | # have a list of dictionary keys that have had their headers
450 | # done already?
451 | module_logger.info("Starting `link_headers`")
452 |
453 | if function is not None:
454 | if type(function).__name__ != "function":
455 | var_msg = ("The function passed to `self.link_headers` is "
456 | "not a function.")
457 | module_logger.error(var_msg)
458 | raise ValueError(var_msg)
459 | elif script_name is not None:
460 | function = import_attr(path, script_name, func_name)
461 | else:
462 | function = self._link_headers
463 |
464 | try:
465 | dict_link = function(self.tables, self.headers, **kwargs)
466 | except AttributeError:
467 | if len([x for x in kwargs.keys()]) > 0:
468 | var_msg = (
469 | f"Function link_headers, kwargs may have been passed when "
470 | f"the function {func_name} in the script {script_name} does"
471 | f" not take kwargs")
472 | else:
473 | var_msg = (f"Function link_headers: The {func_name} function "
474 | f"does not exist in the {script_name} script.")
475 | module_logger.error(var_msg)
476 | raise AttributeError(var_msg)
477 |
478 | list_unallocated_keys = set(self.tables.keys()) - set(dict_link.keys())
479 | if len(list_unallocated_keys) != 0:
480 | var_msg = (f"Not all the headers are linked, the unlinked tables "
481 | f"are: {list_unallocated_keys}")
482 | module_logger.error(var_msg)
483 | raise ValueError(var_msg)
484 |
485 | self.__link_headers = dict(dict_link)
486 |
487 | module_logger.info("Completed `link_headers`")
488 |
489 | @staticmethod
490 | def __assert_linked_headers(
491 | list_ideal_headers, dict_header, df, remove_header_rows, reset_index):
492 | list_expected_headers = dict_header['expected_headers']
493 | list_new_names = dict_header['new_headers']
494 | list_remove = [
495 | dict_header['new_headers'][i] for i in range(len(dict_header['remove']))
496 | if dict_header['remove'][i] == 'remove'
497 | ]
498 |
499 | # Remove the expected headers rows
500 | if remove_header_rows:
501 | df.drop(
502 | [i for i in range(len(list_expected_headers))],
503 | axis=0,
504 | inplace=True)
505 | if reset_index:
506 | df.reset_index(drop=True, inplace=True)
507 |
508 | # Set the new headers
509 | df.columns = list_new_names
510 |
511 | # Remove the columns to remove
512 | if len(list_remove) > 0:
513 | df.drop(list_remove, axis=1, inplace=True)
514 |
515 | # Fill in missing columns and reorder columns
516 | list_df_cols = df.columns.tolist()
517 | list_cols = [
518 | col for col in list_ideal_headers if col not in list_df_cols]
519 | for col in list_cols:
520 | df[col] = np.nan
521 |
522 | df = df[list_ideal_headers].copy()
523 |
524 | return df
525 |
526 | def assert_linked_headers(
527 | self, remove_header_rows=False, reset_index=False):
528 | module_logger.info("Starting `assert_linked_headers`")
529 |
530 | if type(self.tables).__name__ == 'dict':
531 | dict_dfs = dict(self.tables)
532 | for key in [key for key in self.__link_headers.keys()]:
533 | dict_dfs[key] = self.__assert_linked_headers(
534 | self.headers['ideal_headers'],
535 | self.headers[self.__link_headers[key]],
536 | dict_dfs[key],
537 | remove_header_rows,
538 | reset_index
539 | )
540 | self.set_table(dict(dict_dfs))
541 | else:
542 | key = [key for key in self.__link_headers.keys()][0]
543 | df = self.__assert_linked_headers(
544 | self.headers['ideal_headers'],
545 | self.headers[self.__link_headers[key]],
546 | self.tables,
547 | remove_header_rows,
548 | reset_index
549 | )
550 | self.set_table(df.copy())
551 |
552 | module_logger.info("Completed `assert_linked_headers`")
553 |
554 | def set_headers(
555 | self, path=None, script_name=None, func_name=None, list_cols=None,
556 | function=None, ideal_headers=None, required_headers=None):
557 | module_logger.info("Starting `set_headers`")
558 | if list_cols is not None:
559 | if type(list_cols).__name__ != "list":
560 | var_msg = ("The argument `list_cols` of function `set_headers` "
561 | "needs to be a list")
562 | module_logger.error(var_msg)
563 | raise ValueError(var_msg)
564 | elif function is not None:
565 | if type(function).__name__ != "function":
566 | var_msg = ("The argument `function` of function `set_headers` "
567 | "needs to be a function")
568 | module_logger.error(var_msg)
569 | raise ValueError(var_msg)
570 | elif script_name is not None:
571 | function = import_attr(path, script_name, func_name)
572 | elif ideal_headers is not None:
573 | if type(ideal_headers).__name__ != 'list':
574 | var_msg = ("The argument `ideal_headers` of function "
575 | "`set_headers` needs to be a list")
576 | module_logger.error(var_msg)
577 | raise ValueError(var_msg)
578 | elif required_headers is not None:
579 | if type(required_headers).__name__ != 'list':
580 | var_msg = ("The argument `required_headers` of function "
581 | "`set_headers` needs to be a list")
582 | module_logger.error(var_msg)
583 | raise ValueError(var_msg)
584 | var_type = type(self.tables).__name__
585 | if var_type == "dict":
586 | dict_dfs = self.tables.copy()
587 | var_cond = len(
588 | set([dict_dfs[key].shape[1] for key in dict_dfs.keys()]))
589 | var_cond = var_cond != 1
590 | if var_cond:
591 |                 var_msg = ("There is an inconsistent number of columns "
592 | "present in the dictionary of tables")
593 | module_logger.error(var_msg)
594 | raise ValueError(var_msg)
595 | if list_cols is not None:
596 | if (len(list_cols) !=
597 | dict_dfs[[x for x in dict_dfs.keys()][0]].shape[1]):
598 | var_msg = ("The length of `list_cols` is different to the "
599 | "number of columns present in the table")
600 | module_logger.error(var_msg)
601 | raise ValueError(var_msg)
602 | elif function is not None:
603 | list_cols_org = dict_dfs[
604 | [x for x in dict_dfs.keys()][0]
605 | ].columns.tolist()
606 | list_cols = [function(x) for x in list_cols_org]
607 | for key in dict_dfs.keys():
608 | if list_cols is not None:
609 | dict_dfs[key].columns = list_cols
610 | elif function is not None:
611 | dict_dfs[key].columns = list_cols
612 | elif ideal_headers is not None:
613 | for col in [
614 | col for col in ideal_headers if
615 | col not in dict_dfs[key].columns.tolist()
616 | ]:
617 | dict_dfs[key][col] = np.nan
618 | dict_dfs[key] = dict_dfs[key][ideal_headers].copy()
619 | elif required_headers is not None:
620 | for col in [
621 | col for col in required_headers if
622 | col not in dict_dfs[key].columns.tolist()
623 | ]:
624 | dict_dfs[key][col] = np.nan
625 | self.set_table(dict_dfs, overwrite=True)
626 | elif var_type == "DataFrame":
627 |             if list_cols is not None and len(list_cols) != self.tables.shape[1]:
628 | var_msg = ("The length of `list_cols` is different to the "
629 | "number of columns present in the table")
630 | module_logger.error(var_msg)
631 | raise ValueError(var_msg)
632 | df = self.tables.copy()
633 | if list_cols is not None:
634 | df.columns = list_cols
635 | elif function is not None:
636 | df.columns = [function(x) for x in df.columns.tolist()]
637 | elif ideal_headers is not None:
638 | for col in [
639 | col for col in ideal_headers if
640 | col not in df.columns.tolist()
641 | ]:
642 | df[col] = np.nan
643 | df = df[ideal_headers].copy()
644 | elif required_headers is not None:
645 | for col in [
646 | col for col in required_headers if
647 | col not in df.columns.tolist()
648 | ]:
649 | df[col] = np.nan
650 | self.set_table(df, overwrite=True)
651 | else:
652 | var_msg = ("Somehow the tables are not a dictionary or a DataFrame "
653 | "for function `set_headers`")
654 | module_logger.error(var_msg)
655 | raise ValueError(var_msg)
656 |
657 | module_logger.info("Completed `set_headers`")
658 |
659 | def alter_tables(self, path=None, script_name=None,
660 | object_name="dict_alter", dictionary=None, **kwargs):
661 | """
662 | Use this functionality to make alterations to the table(s)
663 | """
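        # A sketch of the kind of dictionary `alter_tables` expects, based on
        # how `__alter_cols` reads it below; the keys '01' / '02' and the
        # lambdas are illustrative only:
        #
        #     dict_alter = {
        #         '01': {
        #             'type': 'new_col',
        #             'col_name': 'number_doubled',
        #             'function': lambda df, keys, **kwargs: df['number'] * 2
        #         },
        #         '02': {
        #             'type': 'map_df',
        #             'function': lambda df, keys, **kwargs: df.dropna()
        #         }
        #     }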
664 | module_logger.info("Starting `alter_tables`")
665 | # TODO move this check to own function (applies to convert_columns too)
666 | if (script_name is not None) & (object_name is not None):
667 | dict_alter = import_attr(path, script_name, object_name)
668 | elif dictionary is not None:
669 | if type(dictionary).__name__ != "dict":
670 | var_msg = "The `dictionary` argument is not a dictionary"
671 | module_logger.error(var_msg)
672 | raise ValueError(var_msg)
673 | dict_alter = dictionary
674 | else:
675 | var_msg = ("Either `dictionary` or both of `script_name` and "
676 |                        "`path` need to be non-null")
677 | module_logger.error(var_msg)
678 | raise ValueError(var_msg)
679 |
680 | if type(self.tables).__name__ == "DataFrame":
681 | df = self.tables.copy()
682 | df_new = self.__alter_cols(
683 | df, dict_alter, [self.__key_1, self.__key_2, self.__key_3],
684 | np.nan, **kwargs)
685 | self.set_table(df_new)
686 | elif type(self.tables).__name__ == "dict":
687 | dfs = self.tables
688 | for key in self.tables.keys():
689 | df = dfs[key].copy()
690 | df_new = self.__alter_cols(
691 | df, dict_alter, [self.__key_1, self.__key_2, self.__key_3],
692 | key, **kwargs)
693 | self.set_table(df_new, key)
694 | else:
695 |             var_msg = ("The tables are in neither a DataFrame nor a dictionary "
696 | "format, which means something is seriously wrong...")
697 | module_logger.error(var_msg)
698 | raise ValueError(var_msg)
699 |
700 | module_logger.info("Completed `alter_tables`")
701 |
702 | def __alter_cols(self, df, dict_alter, keys, dict_key, **kwargs):
703 | module_logger.info("Starting `__alter_cols`")
704 | if pd.isnull(dict_key):
705 | var_file = np.nan
706 | var_subfile = np.nan
707 | else:
708 | var_file = dict_key.split(self.__key_separator)[0]
709 | var_subfile = (dict_key.split(self.__key_separator)[1] if
710 | self.__key_separator in dict_key else np.nan)
711 | for alter_key in dict_alter.keys():
712 | var_type = dict_alter[alter_key]["type"]
713 | function = dict_alter[alter_key]["function"]
714 | if var_type == "new_col":
715 | var_col_name = dict_alter[alter_key]["col_name"]
716 | if var_col_name in df.columns.tolist():
717 | var_msg = (
718 | f"The column {var_col_name} is present in the "
719 | f"table so should not be overwritten")
720 | module_logger.error(var_msg)
721 | self.error_handling(var_file, var_subfile, "", var_msg,
722 | var_col_name, np.nan, np.nan)
723 | continue
724 | try:
725 | s = function(df, keys, **kwargs)
726 | df[var_col_name] = s
727 | except KeyError:
728 | var_msg = (
729 | f"For type new_col the function for alter_key "
730 | f"{alter_key} has not worked with a KeyError")
731 | module_logger.error(var_msg)
732 | self.error_handling(var_file, var_subfile, "", var_msg,
733 | var_col_name, np.nan, np.nan)
734 | continue
735 |                 except Exception:
736 | var_msg = (f"For type new_col the function for "
737 | f"alter_key {alter_key} has not worked")
738 | module_logger.error(var_msg)
739 |
740 | var_idx = np.nan
741 | var_issue_count = np.nan
742 | if "idx_function" in dict_alter[alter_key]:
743 | func_idx = dict_alter[alter_key]['idx_function']
744 | if type(func_idx).__name__ != 'function':
745 |                             var_msg = f"The `idx_function` for alter_key {alter_key} is not a function"
746 | module_logger.error(var_msg)
747 | s_idx = func_idx(df, keys, **kwargs)
748 | var_idx = ', '.join(
749 | [
750 | str(item) for item in
751 | s_idx.loc[s_idx].index.tolist()
752 | ]
753 | )
754 | var_issue_count = s_idx.sum()
755 | self.error_handling(var_file, var_subfile, "", var_msg,
756 | var_col_name, var_issue_count, var_idx)
757 | continue
758 | elif var_type == "map_df":
759 | try:
760 | df = function(df, keys, **kwargs)
761 |                 except Exception:
762 | var_msg = (f"For type map_df the function for "
763 | f"alter_key {alter_key} has not worked")
764 | module_logger.error(var_msg)
765 |
766 | var_idx = np.nan
767 | var_issue_count = np.nan
768 | if "idx_function" in dict_alter[alter_key]:
769 | func_idx = dict_alter[alter_key]['idx_function']
770 | if type(func_idx).__name__ != 'function':
771 |                             var_msg = f"The `idx_function` for alter_key {alter_key} is not a function"
772 | module_logger.error(var_msg)
773 | s_idx = func_idx(df, keys, **kwargs)
774 | var_idx = ', '.join(
775 | [
776 | str(item) for item in
777 | s_idx.loc[s_idx].index.tolist()
778 | ]
779 | )
780 | var_issue_count = s_idx.sum()
781 | self.error_handling(var_file, var_subfile, "", var_msg,
782 | np.nan, var_issue_count, var_idx)
783 | continue
784 |
785 | module_logger.info("Completed `__alter_cols`")
786 | return df
787 |
788 | def convert_columns(self, path=None, script_name=None,
789 | object_name="dict_convert", dictionary=None, **kwargs):
790 | module_logger.info("Starting `convert_columns`")
791 | if (script_name is not None) & (object_name is not None):
792 | dict_convert = import_attr(path, script_name, object_name)
793 | elif dictionary is not None:
794 | if type(dictionary).__name__ != "dict":
795 | var_msg = "The `dictionary` argument is not a dictionary"
796 | module_logger.error(var_msg)
797 | raise ValueError(var_msg)
798 | dict_convert = dictionary
799 | else:
800 | var_msg = ("Either `dictionary` or both of `script_name` and "
801 |                        "`path` need to be non-null")
802 | module_logger.error(var_msg)
803 | raise ValueError(var_msg)
804 |
805 | if type(self.tables).__name__ == "DataFrame":
806 | df = self.tables.copy()
807 | df_new = self.__convert_col(df, dict_convert, "", **kwargs)
808 | self.set_table(df_new, overwrite=True)
809 | elif type(self.tables).__name__ == "dict":
810 | dfs = self.tables
811 | for key in self.tables.keys():
812 | df = dfs[key].copy()
813 | df_new = self.__convert_col(df, dict_convert, key, **kwargs)
814 | dfs[key] = df_new.copy()
815 | self.set_table(dfs, overwrite=True)
816 | else:
817 |             var_msg = ("The tables are in neither a DataFrame nor a dictionary "
818 | "format, which means something is seriously wrong...")
819 | module_logger.error(var_msg)
820 | raise ValueError(var_msg)
821 |
822 | module_logger.info("Completed `convert_columns`")
823 |
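    # A sketch of the kind of dictionary `convert_columns` expects, based on
    # how `__convert_col` reads it below; the key 'to_int' and the lambda are
    # illustrative only:
    #
    #     dict_convert = {
    #         'to_int': {
    #             'columns': ['int'],
    #             'dtypes': ['int'],
    #             'functions': {
    #                 1: lambda df, col, **kwargs: df[col].astype(int)
    #             }
    #         }
    #     }
    #
    # 'columns' can also be a function of the DataFrame, 'dtypes' lists dtype
    # name fragments that mean a column is already converted, and the
    # 'functions' are tried in order until one succeeds.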
824 | def __convert_col(self, df, dict_convert, dict_key, **kwargs):
825 | module_logger.info("Starting `__convert_col`")
826 | for convert_key in dict_convert.keys():
827 | cols = dict_convert[convert_key]["columns"]
828 | if type(cols).__name__ == 'function':
829 | cols = cols(df, **kwargs)
830 | list_cols = list(cols)
831 | list_stops = dict_convert[convert_key]["dtypes"]
832 | dict_functions = dict_convert[convert_key]["functions"]
833 | for col in list_cols:
834 | if col not in df.columns.tolist():
835 | var_msg = f"The column {col} is not present"
836 | module_logger.error(var_msg)
837 | raise ValueError(var_msg)
838 | dtype_flag = 0
839 | var_dtype = df[col].dtype.name
840 | for dtype in list_stops:
841 | if dtype in var_dtype:
842 | dtype_flag = 1
843 | break
844 | if dtype_flag == 1:
845 | continue
846 | converted_flag = 0
847 | for key in dict_functions.keys():
848 | func_use = dict_functions[key]
849 | if type(func_use).__name__ != "function":
850 | var_msg = (f"The function for converting is not a "
851 | f"function! For keys {convert_key}, {key}")
852 | module_logger.error(var_msg)
853 | raise ValueError(var_msg)
854 | try:
855 | s = func_use(df, col, **kwargs)
856 | df[col] = s.copy()
857 | converted_flag = 1
858 | break
859 |                     except Exception:
860 | var_msg = (f"The conversion failed for keys "
861 | f"{convert_key}, {key}, trying next")
862 | module_logger.warning(var_msg)
863 | continue
864 | if converted_flag == 0:
865 | var_idx = np.nan
866 | var_issue_count = np.nan
867 | if "idx_function" in dict_convert[convert_key]:
868 | func_idx = dict_convert[convert_key]['idx_function']
869 | if type(func_idx).__name__ != 'function':
870 | var_msg = (
871 | f'The `idx_function` argument is not a function'
872 | f' it is a {type(func_idx).__name__}')
873 | module_logger.error(var_msg)
874 | raise ValueError(var_msg)
875 | s_idx = func_idx(df, col, **kwargs)
876 | var_idx = ', '.join(
877 | [
878 | str(item) for item in
879 | s_idx.loc[s_idx].index.tolist()
880 | ]
881 | )
882 | var_issue_count = s_idx.sum()
883 | var_msg = (f"The conversion for column {col} for "
884 | f"convert_key {convert_key} failed.")
885 | module_logger.error(var_msg)
886 | self.error_handling(
887 | dict_key.split(self.__key_separator)[0],
888 | (dict_key.split(self.__key_separator)[1] if
889 | self.__key_separator in dict_key else np.nan),
890 | "",
891 | f"The conversion failed to format {convert_key}",
892 | col,
893 | var_issue_count,
894 | var_idx
895 | )
896 |
897 | module_logger.info("Completed `__convert_col`")
898 | return df
899 |
900 | def assert_nulls(self, list_nulls=None, list_exclude_cols=None):
901 | module_logger.info("Starting `assert_nulls`")
902 | if list_nulls is None:
903 | list_nulls_use = ["nan", ""]
904 | else:
905 | list_nulls_use = list_nulls
906 | if list_exclude_cols is None:
907 | list_exclude_cols_use = []
908 | else:
909 | list_exclude_cols_use = list_exclude_cols
910 | module_logger.info(f"The nulls being used are: {list_nulls_use}")
911 | module_logger.info(
912 | f"The columns being excluded are: {list_exclude_cols_use}")
913 | df = self.tables.copy()
914 | if type(df).__name__ == "dict":
915 | list_keys = [x for x in df.keys()]
916 | for key in list_keys:
917 | for null in list_nulls_use:
918 | if len(list_exclude_cols_use) == 0:
919 | df[key] = df[key].replace(null, np.nan)
920 | else:
921 | for col in [
922 | col for col in df[key].columns.tolist() if
923 | col not in list_exclude_cols_use
924 | ]:
925 | df[key][col] = df[key][col].replace(null, np.nan)
926 | else:
927 | for null in list_nulls_use:
928 | if len(list_exclude_cols_use) == 0:
929 | df = df.replace(null, np.nan)
930 | else:
931 | for col in [
932 | col for col in df.columns.tolist() if
933 | col not in list_exclude_cols_use
934 | ]:
935 | df[col] = df[col].replace(null, np.nan)
936 | self.set_table(df, overwrite=True)
937 | module_logger.info("Completed `assert_nulls`")
938 |
939 | def get_issue_count(self, issue_number_min=None, issue_number_max=None):
940 | module_logger.info("Starting `get_issue_count`")
941 | df = self.df_issues.copy()
942 | if issue_number_min is not None:
943 | df = df.loc[df["step_number"] >= issue_number_min].copy()
944 | if issue_number_max is not None:
945 | df = df.loc[df["step_number"] <= issue_number_max].copy()
946 | var_count = df.shape[0]
947 | module_logger.info("Completed `get_issue_count`")
948 | return var_count
949 |
950 | def form_summary_tables(self, path=None, script_name=None,
951 | func_name="form_tables", function=None, **kwargs):
952 | """
953 | Use a function to create summaries off the main table set.
954 |
955 | The function is passed the arguments:
956 | self.tables, self.formed_tables, self.__grouping, self.__key_1,
957 | self.__key_2, self.__key_3, self.__key_separator, **kwargs
958 | """
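        # A minimal sketch of a compatible function, assuming `self.tables` is
        # a dictionary of DataFrames; the function name and the summary built
        # here are illustrative only, the one requirement is that a dictionary
        # is returned:
        #
        #     def form_tables(tables, formed_tables, grouping, key_1, key_2,
        #                     key_3, key_separator, **kwargs):
        #         return {
        #             'row_counts': {key: df.shape[0]
        #                            for key, df in tables.items()}
        #         }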
959 | module_logger.info("Starting `form_summary_tables`")
960 |
961 | if function is not None:
962 | if type(function).__name__ != "function":
963 | var_msg = ("The function passed to `self.form_summary_tables` "
964 | "is not a function.")
965 | module_logger.error(var_msg)
966 | raise ValueError(var_msg)
967 | elif script_name is not None:
968 | function = import_attr(path, script_name, func_name)
969 | else:
970 | var_msg = ("One of the `function` or `script_name` arguments needs "
971 |                        "to be completed. And if `script_name` is used then `path` "
972 | "needs to be too.")
973 | module_logger.error(var_msg)
974 | raise ValueError(var_msg)
975 |
976 | dict_formed_tables = function(
977 | self.tables, self.formed_tables, self.__grouping, self.__key_1,
978 | self.__key_2, self.__key_3, self.__key_separator, **kwargs)
979 | if type(dict_formed_tables).__name__ != 'dict':
980 | var_msg = ('The output of the function for `form_summary_table` '
981 | 'is not a dictionary and it needs to be')
982 | module_logger.error(var_msg)
983 | raise ValueError(var_msg)
984 | self.formed_tables = dict_formed_tables
985 |
986 | module_logger.info("Completed `form_summary_tables`")
987 |
988 | def get_step_no(self):
989 | module_logger.info("Starting `get_step_no`")
990 | module_logger.info("Completed `get_step_no`")
991 | return self.__step_no
992 |
993 | def _repr_html_(self):
994 |         module_logger.info("Starting `_repr_html_`")
995 | var_key_3 = "" if self.__key_3 == "None" else self.__key_3
996 |         var_out_keys = f"""
997 |             <table>
998 |                 <tr>
999 |                     <td>Grouping</td>
1000 |                     <td>{self.__grouping}</td>
1001 |                 </tr>
1002 |                 <tr>
1003 |                     <td>Key 1</td>
1004 |                     <td>{self.__key_1}</td>
1005 |                 </tr>
1006 |                 <tr>
1007 |                     <td>Key 2</td>
1008 |                     <td>{self.__key_2}</td>
1009 |                 </tr>
1010 |                 <tr>
1011 |                     <td>Key 3</td>
1012 |                     <td>{var_key_3}</td>
1013 |                 </tr>
1014 |             </table>
1015 |         """
1016 |         if type(self.tables).__name__ == 'dict':
1017 |             var_out_tbl_info = """
1018 |                 <table>
1019 |                     <tr>
1020 |                         <th>Dictionary key</th>
1021 |                         <th>Dataframe shape</th>
1022 |                         <th>Count numeric columns</th>
1023 |                         <th>Count date columns</th>
1024 |                         <th>Count object columns</th>
1025 |                     </tr>
1026 |                     {}
1027 |                 </table>
1028 |             """
1029 |             for key in [key for key in self.tables.keys()]:
1030 |                 var_out_tbl_info = var_out_tbl_info.replace(
1031 |                     '{}',
1032 |                     f"""
1033 |                     <tr>
1034 |                         <td>{key}</td>
1035 |                         <td>{self.tables[key].shape}</td>
1036 |                         <td>{self.tables[key].select_dtypes(include=[np.number]).shape[1]}</td>
1037 |                         <td>{self.tables[key].select_dtypes(include=[np.datetime64, np.timedelta64]).shape[1]}</td>
1038 |                         <td>{self.tables[key].select_dtypes(exclude=[np.number, np.datetime64, np.timedelta64]).shape[1]}</td>
1039 |                     </tr>
1040 |                     {{}}
1041 |                     """
1042 |                 )
1043 |             var_out_tbl_info = var_out_tbl_info.replace('{}', '')
1044 |         else:
1045 |             var_out_tbl_info = f"""
1046 |                 <table>
1047 |                     <tr>
1048 |                         <th>Dataframe shape</th>
1049 |                         <th>Count numeric columns</th>
1050 |                         <th>Count date columns</th>
1051 |                         <th>Count object columns</th>
1052 |                     </tr>
1053 |                     <tr>
1054 |                         <td>{self.tables.shape}</td>
1055 |                         <td>{self.tables.select_dtypes(include=[np.number]).shape[1]}</td>
1056 |                         <td>{self.tables.select_dtypes(include=[np.datetime64, np.timedelta64]).shape[1]}</td>
1057 |                         <td>{self.tables.select_dtypes(exclude=[np.number, np.datetime64, np.timedelta64]).shape[1]}</td>
1058 |                     </tr>
1059 |                 </table>
1060 |             """
1061 |         var_out_issues = """
1062 |         """
1063 |         var_out = f"{var_out_keys}{var_out_tbl_info}{var_out_issues}"
1064 | module_logger.info("Completed `_repr_html_`")
1065 | return var_out
1066 |
--------------------------------------------------------------------------------
/data_etl/general_functions.py:
--------------------------------------------------------------------------------
1 | # Functions that are typically used when running or writing these data
2 | # curation scripts are predefined here
3 | import logging
4 | import os
5 | from datetime import datetime
6 | import importlib
7 |
8 | import pandas as pd
9 |
10 | module_logger = logging.getLogger(__name__)
11 |
12 |
13 | def func_initialise_logging(
14 | script_name, log_folder_path, key_1, key_2, key_3, start_time):
15 | var_log_name = os.path.abspath(
16 | os.path.join(
17 | log_folder_path,
18 | (f"{script_name}_{key_1}_{key_2}_{key_3}_"
19 | f"{start_time.strftime('%Y%m%d_%H%M%S')}.log")
20 | )
21 | )
22 | logging.basicConfig(
23 | filename=var_log_name, filemode="a", datefmt="%H:%M:%S",
24 | level=logging.DEBUG,
25 | format="%(asctime)s|%(name)s|%(levelname)s|%(message)s")
26 |
27 | logging.info(f"Starting the process at "
28 | f"{start_time.strftime('%Y-%m-%d %H:%M:%S')}")
29 |
30 |
31 | def func_check_for_issues(issue_count, cnx, cnx_key, table, step_no,
32 | override=False, start_time=None):
33 | if (issue_count > 0) & (override is not True):
34 | cnx.write_to_db(cnx_key, table)
35 | var_msg = f'There were {issue_count} issues found at step {step_no}'
36 | module_logger.error(var_msg)
37 | if start_time is not None:
38 | module_logger.info("Script time taken: {}".format(
39 | str(datetime.now() - start_time)))
40 | raise ValueError(var_msg)
41 |
42 |
43 | def func_to_sql(x, datetime_format='%Y-%m-%d'):
44 | if pd.isnull(x):
45 | return "NULL"
46 | elif type(x).__name__ == 'Timestamp':
47 | return f"'{x.strftime(datetime_format)}'"
48 | else:
49 | return f"'{str(x)}'"
50 |
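# For illustration (based purely on the branches above): `func_to_sql(None)`
# returns "NULL", `func_to_sql(pd.Timestamp('2019-01-01'))` returns
# "'2019-01-01'" and `func_to_sql(5)` returns "'5'", so a row of values can be
# joined up when building SQL statements by hand.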
51 |
52 | def import_attr(path, script_name, attr_name):
53 | if (path is None) | (path == '.'):
54 | mod = importlib.import_module(script_name)
55 | else:
56 | var_script_path = os.path.join(path, f"{script_name}.py")
57 | if not os.path.exists(var_script_path):
58 | var_msg = f"The script does not exist: {script_name}.py"
59 | module_logger.error(var_msg)
60 | raise ValueError(var_msg)
61 | spec = importlib.util.spec_from_file_location(
62 | script_name, var_script_path)
63 | mod = importlib.util.module_from_spec(spec)
64 | spec.loader.exec_module(mod)
65 | attr = getattr(mod, attr_name)
66 |
67 | return attr
68 |
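# An illustrative use of `import_attr` (a sketch, assuming a script that
# defines a `dict_checks` object, such as examples/test_scripts/checks_1.py,
# can be found on the import path):
#
#     dict_checks = import_attr('.', 'checks_1', 'dict_checks')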
--------------------------------------------------------------------------------
/examples/00_create_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Create data\n",
8 | "\n",
9 | "This notebook creates the data that is used in the examples\n",
10 | "\n",
11 | "There is a data set that will process without problems in the examples and one that will have issues, to see the difference. There are also some Excel outputs for the scripts example.\n",
12 | "\n",
13 | "The specific sections for creating tables are: \n",
14 | "+ [Conversions](#Conversions), converting column dtypes\n",
15 | "+ [Altering](#Altering), changing the values in the DataFrame, adding new columns, dropping rows or columns etc\n",
16 | "+ [Checks](#Checks), looking for outliers or rows that data does not follow the prescribed rules\n",
17 | "+ [For summary tables](#For-summary-tables), there is one table here and it's for a summary output\n",
18 | "\n",
19 | "## Setup\n",
20 | ""
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "Import and settings options"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 1,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "import sqlite3\n",
37 | "import pickle\n",
38 | "import datetime\n",
39 | "\n",
40 | "import pandas as pd\n",
41 | "import numpy as np"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 2,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "pd.set_option('display.max_rows', 10)\n",
51 | "pd.set_option('display.max_columns', 10)"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "## Create tables\n",
59 | "\n",
60 | "\n",
61 | "There are lots of different but small tables used in the examples\n",
62 | "\n",
63 | "### Conversions\n",
64 | ""
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 3,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "df_convert = pd.DataFrame(\n",
74 | " [\n",
75 | " ('A', '1', '0.6', '2019-01-01'),\n",
76 | " ('B', '4', '5.2', '2019-02-05'),\n",
77 | " ('C', '1', '5.6', '2018-12-17'),\n",
78 | " ('D', '10', '15.9', '2019-07-18'),\n",
79 | " ('E', '-8', '4.7', '2018-03-09')\n",
80 | " ],\n",
81 | " columns=['object', 'int', 'float', 'date']\n",
82 | ")"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 4,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "df_convert_issues = pd.DataFrame(\n",
92 | " [\n",
93 | " ('A', '1', '0.6', '2019-02-29'),\n",
94 | " ('B', '4.5', 'A', '2019-22-05'),\n",
95 | " ('C', '1', '5.6', '2018-12-17'),\n",
96 | " ('D', 'b', '15.9', '2019-09-31'),\n",
97 | " (5, '-8', '4.7', '2018-03-09')\n",
98 | " ],\n",
99 | " columns=['object', 'int', 'float', 'date']\n",
100 | ")"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "### Altering\n",
108 | ""
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 5,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "df_alterations = pd.DataFrame(\n",
118 | " [\n",
119 | " ('A', 2, 'key_1'),\n",
120 | " ('B', 199, 'key_2'),\n",
121 | " ('C', -1, 'key_1'),\n",
122 | " ('D', 20, 'key_3'),\n",
123 | " ('E', 6, 'key_2')\n",
124 | " ],\n",
125 | " columns=['to_map', 'add_1', 'merge_key']\n",
126 | ")"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 6,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "df_alterations_issues = pd.DataFrame(\n",
136 | " [\n",
137 | " ('A', 2, 'key_1'),\n",
138 | " ('B', 199, 2),\n",
139 | " ('C', -1, 'key_1'),\n",
140 | " (['D'], 'a', 'key_3'),\n",
141 | " ('E', 6, 'key_2')\n",
142 | " ],\n",
143 | " columns=['to_map', 'add_1', 'merge_key']\n",
144 | ")"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "### Checks\n",
152 | ""
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 7,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "df_checks = pd.DataFrame(\n",
162 | " [\n",
163 | " (3, 'A', 'a'),\n",
164 | " (10, 'A', 'z'),\n",
165 | " (9, 'B', 'b'),\n",
166 | " (4, 'D', 'd'),\n",
167 | " (7, 'C', 'c')\n",
168 | " ],\n",
169 | " columns=['number', 'category_1', 'category_2']\n",
170 | ")"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 8,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "df_checks_issues = pd.DataFrame(\n",
180 | " [\n",
181 | " (1, 'Z', 'y'),\n",
182 | " (10, 'A', 'a'),\n",
183 | " (9, 'Y', 'b'),\n",
184 | " (4, 'B', 'b'),\n",
185 | " (-1, 'C', 'c')\n",
186 | " ],\n",
187 | " columns=['number', 'category_1', 'category_2']\n",
188 | ")"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "### For summary tables\n",
196 | ""
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 9,
202 | "metadata": {},
203 | "outputs": [],
204 | "source": [
205 | "df_summary = pd.DataFrame(\n",
206 | " [\n",
207 | " ('b', 'c', 1, 6),\n",
208 | " ('d', 'b', 1, 9),\n",
209 | " ('c', 'b', 1, 0),\n",
210 | " ('d', 'd', 1, 9),\n",
211 | " ('c', 'b', 1, 1),\n",
212 | " ('a', 'd', 1, 3),\n",
213 | " ('c', 'c', 1, 0),\n",
214 | " ('c', 'd', 1, 0),\n",
215 | " ('c', 'c', 1, 0),\n",
216 | " ('a', 'e', 1, 4),\n",
217 | " ('b', 'e', 1, 7),\n",
218 | " ('a', 'd', 1, 4),\n",
219 | " ('b', 'e', 1, 6),\n",
220 | " ('b', 'c', 1, 8),\n",
221 | " ('b', 'c', 1, 7),\n",
222 | " ('d', 'e', 1, 9),\n",
223 | " ('a', 'b', 1, 5),\n",
224 | " ('a', 'd', 1, 5),\n",
225 | " ('a', 'b', 1, 4),\n",
226 | " ('d', 'b', 1, 10),\n",
227 | " ('b', 'c', 1, 6),\n",
228 | " ('b', 'e', 1, 7),\n",
229 | " ('a', 'e', 1, 4),\n",
230 | " ('a', 'c', 1, 3),\n",
231 | " ('c', 'c', 1, 0),\n",
232 | " ('c', 'd', 1, 2),\n",
233 | " ('a', 'b', 1, 3),\n",
234 | " ('a', 'e', 1, 5),\n",
235 | " ('a', 'c', 1, 3),\n",
236 | " ('a', 'e', 1, 4),\n",
237 | " ('b', 'd', 1, 6),\n",
238 | " ('c', 'e', 1, 1),\n",
239 | " ('b', 'e', 1, 7),\n",
240 | " ('c', 'c', 1, 0),\n",
241 | " ('a', 'c', 1, 5),\n",
242 | " ('c', 'b', 1, 0),\n",
243 | " ('d', 'b', 1, 8),\n",
244 | " ('d', 'e', 1, 10),\n",
245 | " ('d', 'c', 1, 8),\n",
246 | " ('a', 'd', 1, 3),\n",
247 | " ('d', 'e', 1, 10),\n",
248 | " ('d', 'c', 1, 8),\n",
249 | " ('d', 'e', 1, 10),\n",
250 | " ('a', 'c', 1, 4),\n",
251 | " ('d', 'b', 1, 8),\n",
252 | " ('d', 'b', 1, 10),\n",
253 | " ('d', 'e', 1, 10),\n",
254 | " ('a', 'c', 1, 5),\n",
255 | " ('a', 'd', 1, 5),\n",
256 | " ('d', 'c', 1, 10)\n",
257 | " ],\n",
258 | " columns=['str', 'str_2', 'count', 'int_max']\n",
259 | ")"
260 | ]
261 | },
262 | {
263 | "cell_type": "markdown",
264 | "metadata": {},
265 | "source": [
266 | "### For scripts\n",
267 | ""
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 10,
273 | "metadata": {},
274 | "outputs": [],
275 | "source": [
276 | "df_data = pd.DataFrame(\n",
277 | " [\n",
278 | " (1, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2018, 7, 7, 0, 0), \n",
279 | " 'A string this is', 51.5074, 0.1278),\n",
280 | " (1, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2018, 4, 9, 0, 0), \n",
281 | " 'Test', 51.5084, 0.1268),\n",
282 | " (1, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2018, 1, 10, 0, 0), \n",
283 | " 'testing', 51.5094, 0.1258),\n",
284 | " (3, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2017, 10, 13, 0, 0),\n",
285 | " 'test test test', 51.5104, 0.1248),\n",
286 | " (4, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2017, 7, 16, 0, 0),\n",
287 | " np.nan, 51.5114, 0.1238),\n",
288 | " (5, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2017, 4, 18, 0, 0), \n",
289 | " np.nan, 51.5124, 0.1228),\n",
290 | " (6, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2017, 1, 19, 0, 0),\n",
291 | " 'Blah', 51.5134, 0.1218),\n",
292 | " (7, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2016, 10, 22, 0, 0),\n",
293 | " 'Dah', 51.5144, 0.1208),\n",
294 | " (1234, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2016, 7, 25, 0, 0), \n",
295 | " 'Doh', 51.5154, 0.1198),\n",
296 | " (3, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2016, 4, 27, 0, 0),\n",
297 | " 'Boh', 51.5164, 0.1188),\n",
298 | " (2341243, datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2016, 1, 29, 0, 0),\n",
299 | " 'Pho', 51.5174, 0.1178)\n",
300 | " ],\n",
301 | " columns=['Number', 'A date', 'Another date£', ' StringStringString ', 'lat', 'lng']\n",
302 | ")"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 11,
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "df_headers_1 = pd.DataFrame(\n",
312 | " [\n",
313 | " ('Header', 'Number', 'A date', 'Another date£', ' StringStringString ', 'lat', 'lng'), \n",
314 | " ('New name', 'a_number', 'date_1', 'date_2', 'string', 'lat', 'lng'),\n",
315 | " ('Remove', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan),\n",
316 | " ('Notes', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)\n",
317 | " ]\n",
318 | ")"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 12,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "df_ideal_headers = pd.DataFrame(\n",
328 | " [\n",
329 | " ('a_number', 'date_1', 'date_2', 'string', 'testing', 'a', 'b', 'lat', 'lng')\n",
330 | " ]\n",
331 | ")"
332 | ]
333 | },
334 | {
335 | "cell_type": "markdown",
336 | "metadata": {},
337 | "source": [
338 | "## Write out data\n",
339 | ""
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 13,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "df_convert.to_csv('data/df_convert.tsv', sep='\\t', index=False)\n",
349 | "df_convert_issues.to_csv('data/df_convert_issues.tsv', sep='\\t', index=False)\n",
350 | "\n",
351 | "df_alterations.to_csv('data/df_alterations.tsv', sep='\\t', index=False)\n",
352 | "df_alterations_issues.to_csv('data/df_alterations_issues.tsv', sep='\\t', index=False)\n",
353 | "\n",
354 | "pickle.dump(df_checks, open('data/df_checks.pkl', 'wb'))\n",
355 | "pickle.dump(df_checks_issues, open('data/df_checks_issues.pkl', 'wb'))\n",
356 | "\n",
357 | "pickle.dump(df_summary, open('data/df_summary.pkl', 'wb'))\n",
358 | "\n",
359 | "df_data.to_excel('data/A.xlsx', index=False)\n",
360 | "xl_writer = pd.ExcelWriter('data/headers.xlsx')\n",
361 | "df_headers_1.to_excel(xl_writer, index=False, sheet_name='A 1', header=None)\n",
362 | "df_ideal_headers.to_excel(xl_writer, index=False, sheet_name='IdealHeaders', header=None)\n",
363 | "xl_writer.save()\n",
364 | "xl_writer.close()"
365 | ]
366 | },
367 | {
368 | "cell_type": "markdown",
369 | "metadata": {},
370 | "source": [
371 | "---\n",
372 | "\n",
373 | "**GigiSR**"
374 | ]
375 | }
376 | ],
377 | "metadata": {
378 | "kernelspec": {
379 | "display_name": "Python 3",
380 | "language": "python",
381 | "name": "python3"
382 | },
383 | "language_info": {
384 | "codemirror_mode": {
385 | "name": "ipython",
386 | "version": 3
387 | },
388 | "file_extension": ".py",
389 | "mimetype": "text/x-python",
390 | "name": "python",
391 | "nbconvert_exporter": "python",
392 | "pygments_lexer": "ipython3",
393 | "version": "3.6.10"
394 | }
395 | },
396 | "nbformat": 4,
397 | "nbformat_minor": 2
398 | }
399 |
--------------------------------------------------------------------------------
/examples/03_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Example notebook 03\n",
8 | "\n",
9 | "Using the data generated from notebook `00_create_data.ipynb` this notebook takes you through some of the basic functionality using the `Connections` class:\n",
10 | "\n",
11 | "+ [Initialise a SqliteDB connection](#Initialise-a-SqliteDB-connection)\n",
12 | "+ [Read from cnx](#Read-from-cnx)\n",
13 | "+ [Write to a table](#Write-to-a-table)\n",
14 | "\n",
15 | "## Setup\n",
16 | "\n",
17 | "\n",
18 | "Imports and setting options"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "from datetime import datetime\n",
28 | "import pickle\n",
29 | "\n",
30 | "from data_etl import Connections, Checks"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "## Examples\n",
38 | ""
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "Initialise the class"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 2,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "cnxs = Connections()"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "### Initialise a SqliteDB connection\n",
62 | ""
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "Initialise the SqliteDB; it doesn't exist yet, so a warning message is output saying that a file is being created\n",
70 | "\n",
71 | "The optional kwarg `sqlite_df_issues_create` creates a table structure to match the issues tables present in `DataCuration` and `Checks` objects"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 3,
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "name": "stderr",
81 | "output_type": "stream",
82 | "text": [
83 | "The `file_path` data/00_db.db is not valid so this file will be created\n"
84 | ]
85 | }
86 | ],
87 | "source": [
88 | "cnxs.add_cnx(\n",
89 | " cnx_key='df_issues', \n",
90 | " cnx_type='sqlite3',\n",
91 | " table_name='df_issues',\n",
92 | " file_path='data/00_db.db',\n",
93 | " sqlite_df_issues_create=True\n",
94 | ")"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "### Read from cnx\n",
102 | ""
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {},
108 | "source": [
109 | "Using `read_from_db` you can read data out from a table, or from a database on the same connection"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 4,
115 | "metadata": {},
116 | "outputs": [
117 | {
118 | "data": {
119 | "text/html": [
120 | "\n",
121 | "\n",
134 | "
\n",
135 | " \n",
136 | " \n",
137 | " | \n",
138 | " key_1 | \n",
139 | " key_2 | \n",
140 | " key_3 | \n",
141 | " file | \n",
142 | " sub_file | \n",
143 | " step_number | \n",
144 | " category | \n",
145 | " issue_short_desc | \n",
146 | " issue_long_desc | \n",
147 | " column | \n",
148 | " issue_count | \n",
149 | " issue_idx | \n",
150 | " grouping | \n",
151 | "
\n",
152 | " \n",
153 | " \n",
154 | " \n",
155 | "
\n",
156 | "
"
157 | ],
158 | "text/plain": [
159 | "Empty DataFrame\n",
160 | "Columns: [key_1, key_2, key_3, file, sub_file, step_number, category, issue_short_desc, issue_long_desc, column, issue_count, issue_idx, grouping]\n",
161 | "Index: []"
162 | ]
163 | },
164 | "execution_count": 4,
165 | "metadata": {},
166 | "output_type": "execute_result"
167 | }
168 | ],
169 | "source": [
170 | "cnxs.read_from_db('df_issues', 'SELECT * FROM df_issues')"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "### Write to a table\n",
178 | ""
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "We need some issues to write to the table"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 5,
191 | "metadata": {},
192 | "outputs": [
193 | {
194 | "data": {
195 | "text/html": [
196 | "\n",
197 | "\n",
210 | "
\n",
211 | " \n",
212 | " \n",
213 | " | \n",
214 | " key_1 | \n",
215 | " key_2 | \n",
216 | " key_3 | \n",
217 | " file | \n",
218 | " sub_file | \n",
219 | " step_number | \n",
220 | " category | \n",
221 | " issue_short_desc | \n",
222 | " issue_long_desc | \n",
223 | " column | \n",
224 | " issue_count | \n",
225 | " issue_idx | \n",
226 | " grouping | \n",
227 | "
\n",
228 | " \n",
229 | " \n",
230 | " \n",
231 | " 0 | \n",
232 | " 1 | \n",
233 | " None | \n",
234 | " None | \n",
235 | " df_checks_issues.pkl | \n",
236 | " NaN | \n",
237 | " 0 | \n",
238 | " NaN | \n",
239 | " Number should be greater than 0 | \n",
240 | " | \n",
241 | " NaN | \n",
242 | " 1 | \n",
243 | " 4 | \n",
244 | " 2020-05-26 07:36:41.839557 | \n",
245 | "
\n",
246 | " \n",
247 | "
\n",
248 | "
"
249 | ],
250 | "text/plain": [
251 | " key_1 key_2 key_3 file sub_file step_number category \\\n",
252 | "0 1 None None df_checks_issues.pkl NaN 0 NaN \n",
253 | "\n",
254 | " issue_short_desc issue_long_desc column issue_count \\\n",
255 | "0 Number should be greater than 0 NaN 1 \n",
256 | "\n",
257 | " issue_idx grouping \n",
258 | "0 4 2020-05-26 07:36:41.839557 "
259 | ]
260 | },
261 | "execution_count": 5,
262 | "metadata": {},
263 | "output_type": "execute_result"
264 | }
265 | ],
266 | "source": [
267 | "var_start_time = datetime.now()\n",
268 | "ch_checks = Checks(var_start_time, '1')\n",
269 | "\n",
270 | "dict_data = {\n",
271 | " 'df_checks_issues.pkl': pickle.load(open('data/df_checks_issues.pkl', 'rb'))\n",
272 | "}\n",
273 | "\n",
274 | "dict_checks = dict()\n",
275 | "dict_checks['Number should be greater than 0'] = {\n",
276 | " 'calc_condition': lambda df, col, **kwargs: df['number'] <= 0\n",
277 | "}\n",
278 | "\n",
279 | "ch_checks.apply_checks(dict_data, dictionary=dict_checks)\n",
280 | "\n",
281 | "ch_checks.df_issues"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "Using `write_to_db` creates a temporary table in the background which the data is written to; if that write completes with no issues then all the data is moved to the main table"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 6,
294 | "metadata": {},
295 | "outputs": [],
296 | "source": [
297 | "cnxs.write_to_db('df_issues', ch_checks.df_issues)"
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "metadata": {},
303 | "source": [
304 | "And then check it wrote to the table"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 7,
310 | "metadata": {},
311 | "outputs": [
312 | {
313 | "data": {
314 | "text/html": [
315 | "\n",
316 | "\n",
329 | "
\n",
330 | " \n",
331 | " \n",
332 | " | \n",
333 | " key_1 | \n",
334 | " key_2 | \n",
335 | " key_3 | \n",
336 | " file | \n",
337 | " sub_file | \n",
338 | " step_number | \n",
339 | " category | \n",
340 | " issue_short_desc | \n",
341 | " issue_long_desc | \n",
342 | " column | \n",
343 | " issue_count | \n",
344 | " issue_idx | \n",
345 | " grouping | \n",
346 | "
\n",
347 | " \n",
348 | " \n",
349 | " \n",
350 | " 0 | \n",
351 | " 1 | \n",
352 | " None | \n",
353 | " None | \n",
354 | " df_checks_issues.pkl | \n",
355 | " None | \n",
356 | " 0 | \n",
357 | " None | \n",
358 | " Number should be greater than 0 | \n",
359 | " | \n",
360 | " None | \n",
361 | " 1 | \n",
362 | " 4 | \n",
363 | " 2020-05-26 07:36:41.839557 | \n",
364 | "
\n",
365 | " \n",
366 | "
\n",
367 | "
"
368 | ],
369 | "text/plain": [
370 | " key_1 key_2 key_3 file sub_file step_number category \\\n",
371 | "0 1 None None df_checks_issues.pkl None 0 None \n",
372 | "\n",
373 | " issue_short_desc issue_long_desc column issue_count \\\n",
374 | "0 Number should be greater than 0 None 1 \n",
375 | "\n",
376 | " issue_idx grouping \n",
377 | "0 4 2020-05-26 07:36:41.839557 "
378 | ]
379 | },
380 | "execution_count": 7,
381 | "metadata": {},
382 | "output_type": "execute_result"
383 | }
384 | ],
385 | "source": [
386 | "cnxs.read_from_db('df_issues', 'SELECT * FROM df_issues')"
387 | ]
388 | },
389 | {
390 | "cell_type": "markdown",
391 | "metadata": {},
392 | "source": [
393 | "---\n",
394 | "**GigiSR**"
395 | ]
396 | }
397 | ],
398 | "metadata": {
399 | "kernelspec": {
400 | "display_name": "Python 3",
401 | "language": "python",
402 | "name": "python3"
403 | },
404 | "language_info": {
405 | "codemirror_mode": {
406 | "name": "ipython",
407 | "version": 3
408 | },
409 | "file_extension": ".py",
410 | "mimetype": "text/x-python",
411 | "name": "python",
412 | "nbconvert_exporter": "python",
413 | "pygments_lexer": "ipython3",
414 | "version": "3.6.10"
415 | }
416 | },
417 | "nbformat": 4,
418 | "nbformat_minor": 2
419 | }
420 |
--------------------------------------------------------------------------------
/examples/04_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Example notebook 04\n",
8 | "\n",
9 | "Using the data generated from notebook `00_create_data.ipynb` this notebook takes you through some of the basic functionality using the `general_functions` module:\n",
10 | "\n",
11 | "+ [Initialise logging](#Initialise-logging)\n",
12 | "+ [Import attribute](#Import-attribute)\n",
13 | "+ [Check for issues](#Check-for-issues)\n",
14 | "\n",
15 | "\n",
16 | "## Setup\n",
17 | ""
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "Imports and setting options"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 1,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "from datetime import datetime\n",
34 | "import pickle\n",
35 | "\n",
36 | "from data_etl import Checks, Connections, general_functions"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "### Initialise logging\n",
44 | ""
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "When running interlocking scripts it can be useful to have logging so that if a problem is encountered there's hopefully enough information provided to debug\n",
52 | "\n",
53 | "This function helps to set up a logging file"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 2,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "general_functions.func_initialise_logging(\n",
63 | " 'example_04', 'logs/', '1', None, None, datetime.now()\n",
64 | ")"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "### Import attribute\n",
72 | ""
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "Quite often it is more useful to define the large dictionaries that go into the checks in a separate script, so they are kept together but don't clutter up the main script where the flow of processing is defined\n",
80 | "\n",
81 | "This function is also used inside the classes, as reading definitions in from other scripts is a frequent action that keeps the code clear"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 3,
87 | "metadata": {},
88 | "outputs": [
89 | {
90 | "data": {
91 | "text/plain": [
92 | "{'Number should be greater than 0': {'calc_condition': (df, col, **kwargs)>},\n",
93 | " 'Number should be greater than 2': {'columns': ['number'],\n",
94 | " 'calc_condition': (df, col, **kwargs)>,\n",
95 | " 'category': 'severe'},\n",
96 | " 'check values in list': {'columns': ['category_1'],\n",
97 | " 'calc_condition': (df, col, **kwargs)>,\n",
98 | " 'long_description': (df, col, condition, **kwargs)>},\n",
99 | " 'The category_1 column can only map to certain values': {'calc_condition': (df, col, **kwargs)>,\n",
100 | " 'check_condition': (df, col, condition, **kwargs)>,\n",
101 | " 'count_condition': (df, col, condition, **kwargs)>,\n",
102 | " 'index_position': (df, col, condition, **kwargs)>,\n",
103 | " 'relevant_columns': (df, col, condition, **kwargs)>,\n",
104 | " 'long_description': (df, col, condition, **kwargs)>}}"
105 | ]
106 | },
107 | "execution_count": 3,
108 | "metadata": {},
109 | "output_type": "execute_result"
110 | }
111 | ],
112 | "source": [
113 | "dict_checks = general_functions.import_attr('.', '04_example', 'dict_checks')\n",
114 | "dict_checks"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "And this can then be used or modified and used in the `DataCuration` and `Checks` classes"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "### Check for issues\n",
129 | ""
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "The aim of this function is to have a way to create a break in the code if there are issues, and to store the issues before erroring out of the script\n",
137 | "\n",
138 | "To use this function we need a class instance with issue entries and a connections class instance to write the issues out to"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 4,
144 | "metadata": {},
145 | "outputs": [
146 | {
147 | "data": {
148 | "text/html": [
149 | "\n",
150 | "\n",
163 | "
\n",
164 | " \n",
165 | " \n",
166 | " | \n",
167 | " key_1 | \n",
168 | " key_2 | \n",
169 | " key_3 | \n",
170 | " file | \n",
171 | " sub_file | \n",
172 | " step_number | \n",
173 | " category | \n",
174 | " issue_short_desc | \n",
175 | " issue_long_desc | \n",
176 | " column | \n",
177 | " issue_count | \n",
178 | " issue_idx | \n",
179 | " grouping | \n",
180 | "
\n",
181 | " \n",
182 | " \n",
183 | " \n",
184 | " 0 | \n",
185 | " 1 | \n",
186 | " None | \n",
187 | " None | \n",
188 | " df_checks_issues.pkl | \n",
189 | " NaN | \n",
190 | " 0 | \n",
191 | " NaN | \n",
192 | " Number should be greater than 0 | \n",
193 | " | \n",
194 | " NaN | \n",
195 | " 1 | \n",
196 | " 4 | \n",
197 | " 2020-05-26 07:43:04.328680 | \n",
198 | "
\n",
199 | " \n",
200 | " 1 | \n",
201 | " 1 | \n",
202 | " None | \n",
203 | " None | \n",
204 | " df_checks_issues.pkl | \n",
205 | " NaN | \n",
206 | " 1 | \n",
207 | " NaN | \n",
208 | " Number should be greater than 0 | \n",
209 | " | \n",
210 | " NaN | \n",
211 | " 1 | \n",
212 | " 4 | \n",
213 | " 2020-05-26 07:43:04.328680 | \n",
214 | "
\n",
215 | " \n",
216 | " 2 | \n",
217 | " 1 | \n",
218 | " None | \n",
219 | " None | \n",
220 | " df_checks_issues.pkl | \n",
221 | " NaN | \n",
222 | " 2 | \n",
223 | " NaN | \n",
224 | " Number should be greater than 0 | \n",
225 | " | \n",
226 | " NaN | \n",
227 | " 1 | \n",
228 | " 4 | \n",
229 | " 2020-05-26 07:43:04.328680 | \n",
230 | "
\n",
231 | " \n",
232 | " 3 | \n",
233 | " 1 | \n",
234 | " None | \n",
235 | " None | \n",
236 | " df_checks_issues.pkl | \n",
237 | " NaN | \n",
238 | " 3 | \n",
239 | " NaN | \n",
240 | " Number should be greater than 0 | \n",
241 | " | \n",
242 | " NaN | \n",
243 | " 1 | \n",
244 | " 4 | \n",
245 | " 2020-05-26 07:43:04.328680 | \n",
246 | "
\n",
247 | " \n",
248 | " 4 | \n",
249 | " 1 | \n",
250 | " None | \n",
251 | " None | \n",
252 | " df_checks_issues.pkl | \n",
253 | " NaN | \n",
254 | " 4 | \n",
255 | " NaN | \n",
256 | " Number should be greater than 0 | \n",
257 | " | \n",
258 | " NaN | \n",
259 | " 1 | \n",
260 | " 4 | \n",
261 | " 2020-05-26 07:43:04.328680 | \n",
262 | "
\n",
263 | " \n",
264 | "
\n",
265 | "
"
266 | ],
267 | "text/plain": [
268 | " key_1 key_2 key_3 file sub_file step_number category \\\n",
269 | "0 1 None None df_checks_issues.pkl NaN 0 NaN \n",
270 | "1 1 None None df_checks_issues.pkl NaN 1 NaN \n",
271 | "2 1 None None df_checks_issues.pkl NaN 2 NaN \n",
272 | "3 1 None None df_checks_issues.pkl NaN 3 NaN \n",
273 | "4 1 None None df_checks_issues.pkl NaN 4 NaN \n",
274 | "\n",
275 | " issue_short_desc issue_long_desc column issue_count \\\n",
276 | "0 Number should be greater than 0 NaN 1 \n",
277 | "1 Number should be greater than 0 NaN 1 \n",
278 | "2 Number should be greater than 0 NaN 1 \n",
279 | "3 Number should be greater than 0 NaN 1 \n",
280 | "4 Number should be greater than 0 NaN 1 \n",
281 | "\n",
282 | " issue_idx grouping \n",
283 | "0 4 2020-05-26 07:43:04.328680 \n",
284 | "1 4 2020-05-26 07:43:04.328680 \n",
285 | "2 4 2020-05-26 07:43:04.328680 \n",
286 | "3 4 2020-05-26 07:43:04.328680 \n",
287 | "4 4 2020-05-26 07:43:04.328680 "
288 | ]
289 | },
290 | "execution_count": 4,
291 | "metadata": {},
292 | "output_type": "execute_result"
293 | }
294 | ],
295 | "source": [
296 | "var_start_time = datetime.now()\n",
297 | "ch_checks = Checks(var_start_time, '1')\n",
298 | "\n",
299 | "dict_data = {\n",
300 | " 'df_checks_issues.pkl': pickle.load(open('data/df_checks_issues.pkl', 'rb'))\n",
301 | "}\n",
302 | "\n",
303 | "dict_checks = dict()\n",
304 | "dict_checks['Number should be greater than 0'] = {\n",
305 | " 'calc_condition': lambda df, col, **kwargs: df['number'] <= 0\n",
306 | "}\n",
307 | "\n",
308 | "for step_no in range(5):\n",
309 | " ch_checks.set_step_no(step_no)\n",
310 | " ch_checks.apply_checks(dict_data, dictionary=dict_checks)\n",
311 | "\n",
312 | "ch_checks.df_issues"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 5,
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "cnxs = Connections()\n",
322 | "cnxs.add_cnx(\n",
323 | " cnx_key='df_issues', \n",
324 | " cnx_type='sqlite3',\n",
325 | " table_name='df_issues',\n",
326 | " file_path='data/00_db.db'\n",
327 | ")"
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "metadata": {},
333 | "source": [
334 | "Now use the issues table in the function"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 6,
340 | "metadata": {},
341 | "outputs": [],
342 | "source": [
343 | "general_functions.func_check_for_issues(\n",
344 | " ch_checks.get_issue_count(), \n",
345 | " cnxs, \n",
346 | " 'df_issues', \n",
347 | " ch_checks.df_issues, \n",
348 | " ch_checks.get_step_no(),\n",
349 | " override=True\n",
350 | ")"
351 | ]
352 | },
353 | {
354 | "cell_type": "markdown",
355 | "metadata": {},
356 | "source": [
357 | "The above has `override=True`, which means even if problems are found it will not error out; the call below doesn't have `override=True` and intentionally errors"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": 7,
363 | "metadata": {},
364 | "outputs": [
365 | {
366 | "ename": "ValueError",
367 | "evalue": "There were 5 issues found at step 4",
368 | "output_type": "error",
369 | "traceback": [
370 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
371 | "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
372 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;34m'df_issues'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mch_checks\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdf_issues\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mch_checks\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_step_no\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m )\n",
373 | "\u001b[1;32mc:\\users\\georg\\documents\\workspace\\modules\\data_etl\\data_etl\\general_functions.py\u001b[0m in \u001b[0;36mfunc_check_for_issues\u001b[1;34m(issue_count, cnx, cnx_key, table, step_no, override, start_time)\u001b[0m\n\u001b[0;32m 38\u001b[0m module_logger.info(\"Script time taken: {}\".format(\n\u001b[0;32m 39\u001b[0m str(datetime.now() - start_time)))\n\u001b[1;32m---> 40\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvar_msg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 41\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
374 | "\u001b[1;31mValueError\u001b[0m: There were 5 issues found at step 4"
375 | ]
376 | }
377 | ],
378 | "source": [
379 | "general_functions.func_check_for_issues(\n",
380 | " ch_checks.get_issue_count(), \n",
381 | " cnxs, \n",
382 | " 'df_issues', \n",
383 | " ch_checks.df_issues, \n",
384 | " ch_checks.get_step_no()\n",
385 | ")"
386 | ]
387 | },
388 | {
389 | "cell_type": "markdown",
390 | "metadata": {},
391 | "source": [
392 | "The benefit of the `override` argument is that you may have a mixture of issues you definitely want resolved and those you can live with; this allows you to record the errors but carry on regardless"
393 | ]
394 | },
395 | {
396 | "cell_type": "markdown",
397 | "metadata": {},
398 | "source": [
399 | "---\n",
400 | "**GigiSR**"
401 | ]
402 | }
403 | ],
404 | "metadata": {
405 | "kernelspec": {
406 | "display_name": "Python 3",
407 | "language": "python",
408 | "name": "python3"
409 | },
410 | "language_info": {
411 | "codemirror_mode": {
412 | "name": "ipython",
413 | "version": 3
414 | },
415 | "file_extension": ".py",
416 | "mimetype": "text/x-python",
417 | "name": "python",
418 | "nbconvert_exporter": "python",
419 | "pygments_lexer": "ipython3",
420 | "version": "3.6.10"
421 | }
422 | },
423 | "nbformat": 4,
424 | "nbformat_minor": 2
425 | }
426 |
--------------------------------------------------------------------------------
/examples/04_example.py:
--------------------------------------------------------------------------------
1 | # This script is used in the `04_example.ipynb` file to highlight how using
2 | # externally defined information works
3 |
4 | import pandas as pd
5 |
6 | dict_cat_1_map = {
7 | 'A': ['a', 'z'],
8 | 'B': ['b'],
9 | 'C': ['c'],
10 | 'D': ['d'],
11 | 'Y': ['y'],
12 | 'Z': ['z']
13 | }
14 |
15 | dict_checks = {
16 | 'Number should be greater than 0': {
17 | 'calc_condition': lambda df, col, **kwargs: df['number'] <= 0
18 | },
19 | 'Number should be greater than 2': {
20 | "columns": ['number'],
21 | 'calc_condition': lambda df, col, **kwargs: df[col] <= 2,
22 | 'category': 'severe'
23 | },
24 | 'check values in list': {
25 | 'columns': ['category_1'],
26 | 'calc_condition': lambda df, col, **kwargs: ~df[col].isin(['A', 'B', 'C', 'D']),
27 | 'long_description': lambda df, col, condition, **kwargs:
28 | f"The invalid values are: {df.loc[~df[col].isin(['A', 'B', 'C', 'D'])][col].unique().tolist()}"
29 | },
30 | 'The category_1 column can only map to certain values': {
31 | 'calc_condition': lambda df, col, **kwargs: [
32 | item[1] not in dict_cat_1_map[item[0]] for item in
33 | df[['category_1', 'category_2']].values.tolist()
34 | ],
35 | 'check_condition': lambda df, col, condition, **kwargs: sum(condition) > 0,
36 | 'count_condition': lambda df, col, condition, **kwargs: sum(condition),
37 | 'index_position': lambda df, col, condition, **kwargs: pd.Series(condition),
38 | 'relevant_columns': lambda df, col, condition, **kwargs: 'category_1, category_2',
39 | 'long_description': lambda df, col, condition, **kwargs: (
40 | f"The values that have no mapping are: "
41 | f"{df.loc[pd.Series(condition)]['category_1'].unique().tolist()}"
42 | )
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | A collection of examples for potential uses of my package!
4 |
5 | A lot of the functionality is easy to code up yourself and is dependent on the data set in use. But I have found it useful to be able to apply all conversions at once and then check that there were no errors, rather than stopping each time there is an error. For example, knowing exactly which columns failed to convert to integer means you can investigate all of them at once. And then having the flexibility to define a function to find out which data rows specifically failed is even more powerful.
6 |
7 | The main use I have for this package at work is to feed back to the data creators where there are errors in their manually entered or system-extracted data sets, so they can make corrections before I use the data. And if there are values that break my assumptions but are actually valid, I get feedback from the domain experts that helps me modify my assumptions, or I keep the check as-is because it's a highly unlikely occurrence and it's good to know when it's cropped up. So, although the problems are labelled as being in an `issues log`, they could just be flags for unusual or specific values of particular interest, or they could be genuine errors that need resolving.
8 |
9 | # The structure
10 |
11 | + `data/` will contain any generated data we need, some of the tables may be pre-existing hard coded ones
12 | + `test_scripts/` contains an example in scripts rather than notebooks; from this form, which runs well locally, you can easily convert it into an Airflow-compatible form. The `main.py` script accesses all the other scripts so you only need to run that one
13 | + `00_create_data.ipynb` creates the data and dbs that are used in the examples
14 | + `01_example.ipynb` a look at some basic functionality: finding files, reading in the data, setting new headers, asserting nulls, then converting to the correct dtypes
15 | + `02_example.ipynb` a concentrated look at individual bits of functionality available and a look at the issue output produced when there are problems
16 | + `02_example.py` some externally defined information to use in the `02_example.ipynb` notebook for one of the sections
17 |
18 | # Run order
19 |
20 | 1. Run `00_create_data.ipynb` first to create the data files for the examples
21 |
22 | You can then run either the notebooks or the `test_scripts/` files.
23 |
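24 | # A minimal sketch
25 |
26 | As a flavour of the pattern described above, this is roughly what a conversion dictionary looks like, in the style of `test_scripts/convert_columns.py`. The column name `a_number` is just illustrative and the comments describe how the keys appear to be used in the examples, so treat this as a sketch rather than a reference:
27 |
28 | ```python
29 | import pandas as pd
30 |
31 | dict_convert = {
32 |     'int': {
33 |         # the columns to attempt the conversion on
34 |         'columns': ['a_number'],
35 |         # the dtypes the converted column is expected to end up as
36 |         'dtypes': ['int'],
37 |         # the conversion attempts, keyed by the order to try them in
38 |         'functions': {
39 |             1: lambda df, col, **kwargs: df[col].astype(int),
40 |             2: lambda df, col, **kwargs:
41 |                 df[col].str.replace(',', '', regex=False).astype(int)
42 |         },
43 |         # flag the specific rows that cannot be converted, so the issue
44 |         # report can point straight at them
45 |         'idx_function': lambda df, col, **kwargs:
46 |             pd.to_numeric(df[col], errors='coerce').isnull() & df[col].notnull()
47 |     }
48 | }
49 | ```
50 |
51 | A dictionary like this is what `DataCuration.convert_columns` consumes, either passed in directly (as in `tests/00_pytest.py`) or loaded from a script by name (as in `test_scripts/main.py`).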
--------------------------------------------------------------------------------
/examples/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gigisr/data_etl/ddd2bf742615d659f96bfd6543a657ab195b67c7/examples/data/.gitkeep
--------------------------------------------------------------------------------
/examples/logs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gigisr/data_etl/ddd2bf742615d659f96bfd6543a657ab195b67c7/examples/logs/.gitkeep
--------------------------------------------------------------------------------
/examples/test_scripts/.config:
--------------------------------------------------------------------------------
1 | [TEST]
2 | DRIVER = {SQLite3 ODBC Driver}
3 | SERVER = localhost
4 | DATABASE = test.db
5 | Trusted_connection = yes
6 |
--------------------------------------------------------------------------------
/examples/test_scripts/alter_cols.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | dict_alter = dict()
4 |
5 | dict_alter['01'] = {
6 | 'type': 'new_col',
7 | 'col_name': 'number_2',
8 | 'function': lambda df, keys, **kwargs: df['a_number'] * 2
9 | }
10 | dict_alter['02'] = {
11 | 'type': 'new_col',
12 | 'col_name': 'key_1',
13 | 'function': lambda df, keys, **kwargs: keys[0]
14 | }
15 | dict_alter['03'] = {
16 | 'type': 'new_col',
17 | 'col_name': 'key_2',
18 | 'function': lambda df, keys, **kwargs: keys[1]
19 | }
20 | dict_alter['04'] = {
21 | 'type': 'map_df',
22 | 'function': lambda df, keys, **kwargs: df,
23 | 'idx_function': lambda df, keys, **kwargs: pd.Series(True, index=df.index)
24 | }
25 |
--------------------------------------------------------------------------------
/examples/test_scripts/checks_1.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | dict_checks = dict()
4 |
5 | dict_checks["This check is for numbers being greater than 6"] = {
6 | "columns": ["a_number", "number_2"],
7 | "calc_condition": lambda df, col, **kwargs: df[col] <= 6,
8 | "long_description": lambda df, col, condition, **kwargs:
9 | "There are numbers less than or equal to 6",
10 | "index_position": lambda df, col, condition, **kwargs:
11 | pd.Series(False, df.index)
12 | }
13 |
14 | dict_checks["This check is for the column to be not null"] = {
15 | "columns": ['string'],
16 | "calc_condition": lambda df, col, **kwargs: df[col].isnull(),
17 | "long_description": lambda df, col, condition, **kwargs:
18 | f"The column `{col}` should not be null",
19 | "category": 'must be resolved'
20 | }
21 |
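22 | # A note on the keys used above, based on how they appear across the example
23 | # scripts and notebooks: 'columns' lists the columns the check is applied to,
24 | # 'calc_condition' returns a boolean marker of the problem rows,
25 | # 'long_description' builds the message recorded against the issue,
26 | # 'index_position' controls which row indexes get recorded, and 'category'
27 | # is a free-text tag such as 'must be resolved'.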
--------------------------------------------------------------------------------
/examples/test_scripts/convert_columns.py:
--------------------------------------------------------------------------------
1 | import re
2 | import numpy as np
3 | import pandas as pd
4 |
5 | dict_convert = dict()
6 |
7 |
8 | def func_string_to_int(df, col):
9 | s = df[col].copy()
10 |     s = s.str.replace(',', '', regex=False)  # thousand separators
11 |     s = s.str.replace('%', '', regex=False)  # percentage sign
12 |     s = s.str.replace('£', '', regex=False)  # pound sterling sign
13 |     s = s.str.replace('$', '', regex=False)  # dollar sign
14 |     s = s.str.replace('€', '', regex=False)  # euro sign
15 |     s = s.str.replace('¥', '', regex=False)  # yen sign
16 | s = s.astype(int)
17 | return s
18 |
19 |
20 | def func_string_to_float(df, col):
21 | s = df[col].copy()
22 |     s = s.str.replace(',', '', regex=False)  # thousand separators
23 |     s = s.str.replace('%', '', regex=False)  # percentage sign
24 |     s = s.str.replace('£', '', regex=False)  # pound sterling sign
25 |     s = s.str.replace('$', '', regex=False)  # dollar sign
26 |     s = s.str.replace('€', '', regex=False)  # euro sign
27 |     s = s.str.replace('¥', '', regex=False)  # yen sign
28 | s = s.astype(float)
29 | return s
30 |
31 |
32 | dict_convert['int'] = {
33 | 'columns': lambda df, **kwargs: ['a_number'],
34 | 'dtypes': ['int', 'float'],
35 | 'functions': {
36 | 1: lambda df, col, **kwargs: df[col].astype(int),
37 | 2: lambda df, col, **kwargs: func_string_to_int(df, col),
38 | 3: lambda df, col, **kwargs: df[col].astype(float),
39 | 4: lambda df, col, **kwargs: func_string_to_float(df, col)
40 | }
41 | }
42 | dict_convert['float'] = {
43 | 'columns': ['lat', 'lng'],
44 | 'dtypes': ['float'],
45 | 'functions': {
46 | 1: lambda df, col, **kwargs: df[col].astype(float),
47 | 2: lambda df, col, **kwargs: func_string_to_float(df, col)
48 | }
49 | }
50 | # TODO have a mash-up function that also takes care of Excel dates?
51 | dict_convert['date'] = {
52 | 'columns': ['date_1', 'date_2'],
53 | 'dtypes': ['datetime'],
54 | 'functions': {
55 |         1: lambda df, col, **kwargs:
56 | pd.to_datetime(df[col], format='%Y-%m-%d %H:%M:%S')
57 | }
58 | }
59 |
60 |
61 | def func_string_format(df, col):
62 | s = df[col].copy()
63 | s_null = s.isnull()
64 | s = s.astype(str)
65 | s = s.str.strip()
66 | reg_ex = re.compile(' +')
67 | s = s.map(lambda x: re.sub(reg_ex, ' ', x))
68 |     s.loc[s_null] = np.nan
69 | return s
70 |
71 |
72 | dict_convert['string'] = {
73 | 'columns': ['string'],
74 | 'dtypes': [],
75 | 'functions': {
76 | 1: lambda df, col, **kwargs: func_string_format(df, col)
77 | },
78 | 'idx_function': lambda df, col, **kwargs: pd.Series(True, index=df.index)
79 | }
80 |
--------------------------------------------------------------------------------
/examples/test_scripts/main.py:
--------------------------------------------------------------------------------
1 | # This is the section where we put all the classes together in combinations
2 | # that are required for specific data sets
3 | import logging
4 | from datetime import datetime
5 | import pickle
6 | # This is only used to create a table, usually this would already be done
7 | import sqlite3
8 |
9 | from data_etl import DataCuration, Checks, Connections, Reporting, \
10 | func_check_for_issues, func_initialise_logging
11 |
12 | if __name__ == "__main__":
13 | var_key_1 = "A"
14 | var_key_2 = "1"
15 | var_key_3 = "1"
16 | var_start_time = datetime.now()
17 |
18 | var_checks_1_pass = True
19 | var_write_out = True
20 |
21 | func_initialise_logging('pipeline_test_1', '../logs/', var_key_1,
22 | var_key_2, var_key_3, var_start_time)
23 |
24 | # Initialise objects required
25 | cnxs = Connections()
26 | data = DataCuration(var_start_time, "A")
27 | check = Checks(var_start_time, "A")
28 | reporting = Reporting(var_start_time, "A")
29 |
30 | # Set up connections
31 | cnxs.add_cnx(
32 | cnx_key='df_issues', cnx_type='sqlite3', table_name='df_issues',
33 | file_path='../data/00_db.db', sqlite_df_issues_create=True)
34 |
35 |     # This is only needed to create the table structure, which would usually already exist
36 | cnx = sqlite3.connect('../data/00_db.db')
37 | var_create_table = """CREATE TABLE IF NOT EXISTS data (
38 | a_number INTEGER, date_1 TEXT, date_2 TEXT, string TEXT,
39 | testing REAL, a REAL, b REAL, lat REAL, lng REAL, number_2 INTEGER,
40 | key_1 TEXT, key_2 TEXT, level_0 TEXT
41 | );"""
42 | cnx.execute(var_create_table)
43 | cnx.commit()
44 | cnx.close()
45 |
46 | cnxs.add_cnx(cnx_key='data_out', cnx_type='sqlite3', table_name='data',
47 | file_path='../data/00_db.db')
48 |
49 | # Data etl testing
50 |
51 | # Read the files in
52 | data.find_files(files_path="../data",
53 | script_name="test_reading_in", path='.')
54 | data.reading_in(path=".", script_name="test_reading_in")
55 |
56 | # Set the step number
57 | data.set_step_no(1)
58 |
59 | # Read in the headers
60 | data.set_comparison_headers(
61 | path=".",
62 | script_name="test_reading_in",
63 | filepath="../data/headers.xlsx")
64 | data.link_headers()
65 | data.assert_linked_headers(remove_header_rows=True, reset_index=True)
66 |
67 | data.set_step_no(2)
68 | data.assert_nulls([""])
69 | data.convert_columns(".", "convert_columns")
70 | func_check_for_issues(
71 | data.get_issue_count(2, 2), cnxs, 'df_issues', data.df_issues,
72 | data.get_step_no(), start_time=var_start_time)
73 |
74 | data.set_step_no(3)
75 | data.alter_tables(".", "alter_cols")
76 | func_check_for_issues(
77 | data.get_issue_count(3, 3), cnxs, 'df_issues', data.df_issues,
78 | data.get_step_no(), start_time=var_start_time)
79 |
80 | data.set_step_no(4)
81 | data.concatenate_tables()
82 |
83 | check.set_step_no(5)
84 | check.set_defaults(idx_flag=True)
85 | check.apply_checks(data.tables, ".", "checks_1")
86 | func_check_for_issues(
87 | check.get_issue_count(5, 5), cnxs, 'df_issues', check.df_issues,
88 | check.get_step_no(), var_checks_1_pass, var_start_time)
89 |
90 |     # Now the data is cleansed, do the reporting; this could also be
91 |     # done after writing to the DB
92 | data.set_step_no(6)
93 | data.form_summary_tables(path='.', script_name='reporting_1')
94 |
95 | # Temporary snapshot for testing
96 | pickle.dump(
97 | {'data': data, 'checks': check, 'report': reporting, 'cnx': cnxs},
98 | open("../data/dict_dc.pkl", "wb"))
99 |
100 | # Log issues found
101 | cnxs.write_to_db('df_issues', data.df_issues)
102 | cnxs.write_to_db('df_issues', check.df_issues)
103 |
104 | # Write the data out
105 | if var_write_out:
106 | cnxs.write_to_db('data_out', data.tables)
107 |
108 | logging.info("Script time taken: {}".format(
109 | str(datetime.now() - var_start_time)))
110 |
--------------------------------------------------------------------------------
/examples/test_scripts/reporting_1.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import matplotlib.pyplot as plt
4 | import folium
5 |
6 |
7 | def form_tables(tables, formed_tables, grouping, key_1, key_2, key_3,
8 | key_separator, **kwargs):
9 | dict_data = dict()
10 | dict_data['main_data'] = tables.copy()
11 | return dict_data
12 |
13 |
14 | dict_reporting = dict()
15 |
16 |
17 | def func_chart_1(tables, file_path, file_name):
18 | df = tables['main_data']
19 | plt.figure()
20 | g = df['number_2'].hist(bins=50)
21 | plt.title('Histogram')
22 | plt.savefig(os.path.join(file_path, file_name))
23 | return None
24 |
25 |
26 | dict_reporting['Histogram 1'] = {
27 | 'file_name': lambda tables, file_path, grouping, key_1, key_2, key_3,
28 | **kwargs: 'chart_1.png',
29 | 'function': lambda tables, file_path, file_name, grouping, key_1, key_2,
30 |         key_3, **kwargs:
31 | func_chart_1(tables, file_path, file_name)
32 | }
33 | dict_reporting['Histogram 2'] = {
34 | 'file_name': lambda tables, file_path, grouping, key_1, key_2, key_3,
35 | **kwargs: 'sub_folder_test/chart_1.png',
36 | 'function': lambda tables, file_path, file_name, grouping, key_1, key_2,
37 | key_3, **kwargs:
38 | func_chart_1(tables, file_path, file_name)
39 | }
40 |
41 |
42 | def func_map_1(tables, file_path, file_name):
43 | df = tables['main_data']
44 | m = folium.Map([51.5074, 0.1278], zoom_start=12)
45 | for idx in df.index.tolist():
46 | folium.Marker([df.loc[idx, 'lat'], df.loc[idx, 'lng']]).add_to(m)
47 | m.save(os.path.join(file_path, file_name))
48 | return df
49 |
50 |
51 | dict_reporting['Map 1'] = {
52 | 'file_name': lambda tables, file_path, grouping, key_1, key_2,
53 | key_3, **kwargs: 'map_1.html',
54 | 'function': lambda tables, file_path, file_name, grouping, key_1, key_2,
55 | key_3, **kwargs: func_map_1(tables, file_path, file_name)
56 | }
57 |
--------------------------------------------------------------------------------
/examples/test_scripts/test_reading_in.py:
--------------------------------------------------------------------------------
1 | # This file contains the information required for listing files and reading in
2 | # tables of data
3 | import os
4 |
5 | import pandas as pd
6 |
7 |
8 | def list_the_files(path):
9 | list_files = os.listdir(path)
10 | list_files = [os.path.abspath(os.path.join(path, x)) for x in list_files]
11 | list_files = [x for x in list_files if '.xlsx' in x.lower()]
12 | list_files = [x for x in list_files if '~' not in x.lower()]
13 | list_files = [x for x in list_files if 'header' not in x.lower()]
14 | return list_files
15 |
16 |
17 | def read_files(list_files):
18 | dict_files = dict()
19 | for file in list_files:
20 | xl = pd.ExcelFile(file)
21 | for sheet in xl.sheet_names:
22 | df = xl.parse(
23 | sheet_name=sheet, dtype=str, keep_default_na=False, header=None)
24 | key = '{} -:- {}'.format(
25 |                 os.path.basename(file).lower().replace('.xlsx', ''), sheet)
26 | dict_files[key] = df.copy()
27 | return dict_files
28 |
29 |
30 | def read_headers(filepath):
31 | if not os.path.exists(filepath):
32 | raise ValueError(
33 | 'The passed file path does not exist: {}'.format(filepath))
34 | dict_headers = dict()
35 | file = pd.ExcelFile(filepath)
36 | dict_headers['ideal_headers'] = file.parse(
37 | 'IdealHeaders', header=None).values.tolist()[0]
38 | for sheet in [sheet for sheet in
39 | file.sheet_names if sheet != 'IdealHeaders']:
40 | df_header = file.parse(sheet, header=None)
41 | dict_headers[sheet] = {
42 | 'expected_headers': df_header[
43 | df_header[0] == 'Header'].iloc[:, 1:].values.tolist()[0],
44 | 'new_headers': df_header[
45 | df_header[0] == 'New name'].iloc[:, 1:].values.tolist()[0],
46 | 'remove': df_header[
47 | df_header[0] == 'Remove'].iloc[:, 1:].values.tolist()[0],
48 | 'notes': df_header[
49 | df_header[0] == 'Notes'].iloc[:, 1:].values.tolist()[0]
50 | }
51 | return dict_headers
52 |
53 |
54 | def link_headers(dfs, df_headers):
55 | dict_link = dict()
56 | for key_df in dfs.keys():
57 | for key_header in df_headers.keys():
58 | check_shape = (
59 | # + 1 because the headers have an index to explain the
60 | # row purposes
61 | dfs[key_df].shape[1] + 1 == df_headers[key_header].shape[1])
62 | if check_shape is True:
63 | dict_link[key_df] = str(key_header)
64 | break
65 | return dict_link
66 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
4 | name='data_etl',
5 | version='0.1.0dev',
6 | packages=['data_etl',],
7 | license='MIT',
8 | url="https://github.com/gigisr/data_etl",
9 |
10 |     author='GigiSR', install_requires=['pandas', 'numpy', 'pyodbc']
11 | )
12 |
--------------------------------------------------------------------------------
/tests/00_pytest.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | import pickle
3 |
4 | import pandas as pd
5 | import numpy as np
6 |
7 | from data_etl import DataCuration, Checks
8 |
9 |
10 | var_cnv_1_start_time = datetime.now()
11 | data_cnv_1 = DataCuration(var_cnv_1_start_time, 'test')
12 | df_convert_issues = pd.DataFrame(
13 | [
14 | ('A', '1', '0.6', '2019-02-29'),
15 | ('B', '4.5', 'A', '2019-22-05'),
16 | ('C', '1', '5.6', '2018-12-17'),
17 | ('D', 'b', '15.9', '2019-09-31'),
18 | (5, '-8', '4.7', '2018-03-09')
19 | ],
20 | columns=['object', 'int', 'float', 'date']
21 | )
22 | data_cnv_1.set_table({'df_convert_issues.tsv': df_convert_issues})
23 |
24 |
25 | def func_try_float_cnv(x):
26 | try:
27 | var = float(x)
28 | except:
29 | return True
30 | return False
31 |
32 |
33 | def func_try_int_cnv(x):
34 | try:
35 | var = int(x)
36 | except:
37 | return True
38 | return False
39 |
40 |
41 | def func_str_cnv(s):
42 | var_is_null_pre = s.isnull().sum()
43 | s_cnv = s.map(func_to_int).str.strip()
44 | var_is_null_post = s_cnv.isnull().sum()
45 | if var_is_null_post != var_is_null_pre:
46 | raise ValueError
47 | return s_cnv
48 |
49 |
50 | def func_to_int(x):
51 | try:
52 | return int(x)
53 | except:
54 | return x
55 |
56 |
57 | def func_try_str_cnv(s):
58 | var_is_null_pre = s.isnull().sum()
59 | s_cnv = s.map(func_to_int).str.strip()
60 | var_is_null_post = s_cnv.isnull().sum()
61 | return s != s_cnv
62 |
63 |
64 | def func_try_date_cnv(x):
65 | if pd.isnull(x):
66 | return False
67 | if pd.isnull(pd.to_datetime(x, format='%Y-%m-%d', errors='coerce')):
68 | return True
69 | return False
70 |
71 |
72 | dict_cnv_1 = {
73 | 'float': {
74 | 'columns': ['float'],
75 | 'dtypes': ['float'],
76 | 'functions': {
77 | 1: lambda df, col, **kwargs: df[col].astype(float)
78 | },
79 | 'idx_function': lambda df, col, **kwargs: df[col].map(func_try_float_cnv)
80 | },
81 | 'int': {
82 | 'columns': ['int'],
83 | 'dtypes': ['int'],
84 | 'functions': {
85 | 1: lambda df, col, **kwargs: df[col].astype(int)
86 | },
87 | 'idx_function': lambda df, col, **kwargs: df[col].map(func_try_int_cnv)
88 | },
89 | 'object': {
90 | 'columns': ['object'],
91 | 'dtypes': [],
92 | 'functions': {
93 | 1: lambda df, col, **kwargs: func_str_cnv(df[col])
94 | },
95 | 'idx_function': lambda df, col, **kwargs: func_try_str_cnv(df[col])
96 | },
97 | 'date': {
98 | 'columns': ['date'],
99 | 'dtypes': ['date', '[ns]'],
100 | 'functions': {
101 | 1: lambda df, col, **kwargs: pd.to_datetime(
102 | df[col], format='%Y-%m-%d')
103 | },
104 | 'idx_function': lambda df, col, **kwargs: df[col].map(func_try_date_cnv)
105 | }
106 | }
107 |
108 | df_cnv_1_expected_df_issues = pd.DataFrame(
109 | [
110 | ('test', 'None', 'None', 'df_convert_issues.tsv', np.nan, 0, np.nan, '',
111 | 'The conversion failed to format float', 'float', 1, '1',
112 | var_cnv_1_start_time),
113 | ('test', 'None', 'None', 'df_convert_issues.tsv', np.nan, 0, np.nan, '',
114 | 'The conversion failed to format int', 'int', 2, '1, 3',
115 | var_cnv_1_start_time),
116 | ('test', 'None', 'None', 'df_convert_issues.tsv', np.nan, 0, np.nan, '',
117 | 'The conversion failed to format object', 'object', 1, '4',
118 | var_cnv_1_start_time),
119 | ('test', 'None', 'None', 'df_convert_issues.tsv', np.nan, 0, np.nan, '',
120 | 'The conversion failed to format date', 'date', 3, '0, 1, 3',
121 | var_cnv_1_start_time)
122 | ],
123 | columns=['key_1', 'key_2', 'key_3', 'file', 'sub_file', 'step_number',
124 | 'category', 'issue_short_desc', 'issue_long_desc', 'column',
125 | 'issue_count', 'issue_idx', 'grouping']
126 | )
127 |
128 |
129 | def test_cnv_1():
130 | data_cnv_1.convert_columns(dictionary=dict_cnv_1)
131 | assert data_cnv_1.df_issues.fillna('').equals(
132 | df_cnv_1_expected_df_issues.fillna(''))
133 |
134 |
135 | var_alter_1_start_time = datetime.now()
136 | data_alter_1 = DataCuration(var_alter_1_start_time, 'test')
137 |
138 | data_alter_1.set_table(
139 | {
140 | 'df_alterations.tsv': pd.DataFrame(
141 | [
142 | ('A', 2, 'key_1'),
143 | ('B', 199, 'key_2'),
144 | ('C', -1, 'key_1'),
145 | ('D', 20, 'key_3'),
146 | ('E', 6, 'key_2')
147 | ],
148 | columns=['to_map', 'add_1', 'merge_key']
149 | ),
150 | 'df_alterations_issues.tsv': pd.DataFrame(
151 | [
152 | ('A', 2, 'key_1'),
153 | ('B', 199, 2),
154 | ('C', -1, 'key_1'),
155 | (['D'], 'a', 'key_3'),
156 | ('E', 6, 'key_2')
157 | ],
158 | columns=['to_map', 'add_1', 'merge_key']
159 | )
160 | }
161 | )
162 |
163 |
164 | df_mapping = pd.DataFrame(
165 | [
166 | ('key_1', 1),
167 | ('key_2', 2),
168 | ('key_3', 3)
169 | ],
170 | columns=['merge_key', 'out_value']
171 | )
172 |
173 |
174 | def func_alter_merge(df, df_mapping):
175 | df_mapped = pd.merge(
176 | df,
177 | df_mapping,
178 | on='merge_key',
179 | how='left'
180 | )
181 | if (
182 | df_mapped['out_value'].isnull().sum() !=
183 | df['merge_key'].isnull().sum()
184 | ):
185 | raise ValueError
186 | return df_mapped
187 |
188 |
189 | dict_alter_1 = {
190 | '01': {
191 | 'type': 'new_col',
192 | 'col_name': 'key',
193 | 'function': lambda df, keys, **kwargs: keys[0]
194 | },
195 | '02': {
196 | 'type': 'new_col',
197 | 'col_name': 'done_add_1',
198 | 'function': lambda df, keys, **kwargs: df['add_1'] + 1,
199 | 'idx_function': lambda df, keys, **kwargs:
200 | df['add_1'].map(
201 | lambda x: type(x).__name__).map(
202 | lambda x: ('int' in x) | ('float' in x)).map(
203 | {True: False, False: True})
204 | },
205 | '03': {
206 | 'type': 'new_col',
207 | 'col_name': 'mapped',
208 | 'function': lambda df, keys, **kwargs: df['to_map'].map({
209 | 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5}),
210 | 'idx_function': lambda df, keys, **kwargs:
211 | ~df['to_map'].astype(str).isin(['A', 'B', 'C', 'D', 'E'])
212 | },
213 | '04': {
214 | 'type': 'map_df',
215 | 'function': lambda df, keys, **kwargs:
216 | func_alter_merge(df, kwargs['df_mapping']),
217 | 'idx_function': lambda df, keys, **kwargs:
218 | ~df['merge_key'].isin(['key_1', 'key_2', 'key_3', np.nan])
219 | }
220 | }
221 |
222 | df_alter_1_expected_df_issues = pd.DataFrame(
223 | [
224 | ('test', 'None', 'None', 'df_alterations_issues.tsv', np.nan, 0, np.nan,
225 | '', 'For type new_col the function for alter_key 02 has not worked',
226 | 'done_add_1', 1, '3', var_alter_1_start_time),
227 | ('test', 'None', 'None', 'df_alterations_issues.tsv', np.nan, 0, np.nan,
228 | '', 'For type new_col the function for alter_key 03 has not worked',
229 | 'mapped', 1, '3', var_alter_1_start_time),
230 | ('test', 'None', 'None', 'df_alterations_issues.tsv', np.nan, 0, np.nan,
231 | '', 'For type map_df the function for alter_key 04 has not worked',
232 | np.nan, 1, '1', var_alter_1_start_time)
233 | ],
234 | columns=['key_1', 'key_2', 'key_3', 'file', 'sub_file', 'step_number',
235 | 'category', 'issue_short_desc', 'issue_long_desc', 'column',
236 | 'issue_count', 'issue_idx', 'grouping']
237 | )
238 |
239 |
240 | def test_alter_1():
241 | data_alter_1.alter_tables(dictionary=dict_alter_1, df_mapping=df_mapping)
242 | assert data_alter_1.df_issues.fillna('').equals(
243 | df_alter_1_expected_df_issues.fillna(''))
244 |
--------------------------------------------------------------------------------