├── .gitignore ├── LICENSE ├── README.md └── pylad ├── __init__.py ├── advanced_sql_templates.py ├── advanced_sql_templates_example.sql ├── pandas_merging.py ├── sql_template_example.sql └── sql_templates_base.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Eclipse 107 | .project 108 | .pydevproject 109 | *.pyc 110 | 111 | .idea 112 | .DS_Store 113 | 114 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Sergei Izrailev 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # life-around-data-code 2 | Code snippets and tools published on the Life Around Data blog at www.lifearounddata.com 3 | -------------------------------------------------------------------------------- /pylad/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sizrailev/life-around-data-code/60ce15dd1ef13ae51f5c57def243ebbc31d1a34b/pylad/__init__.py -------------------------------------------------------------------------------- /pylad/advanced_sql_templates.py: -------------------------------------------------------------------------------- 1 | #=============================================================================== 2 | # MIT License 3 | # 4 | # Copyright (c) 2019-2020 Sergei Izrailev 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | #=============================================================================== 24 | # Source: https://github.com/sizrailev/life-around-data-code/blob/master/pylad/ 25 | #=============================================================================== 26 | 27 | ''' 28 | Examples for the blog on advanced SQL templates in Python using JinjaSql 29 | ''' 30 | 31 | import os 32 | 33 | from pylad.sql_templates_base import apply_sql_template 34 | 35 | # A single dimension template 36 | _BASIC_STATS_TEMPLATE = ''' 37 | select 38 | {{ dim | sqlsafe }} 39 | , count(*) as num_transactions 40 | , sum(amount) as total_amount 41 | , avg(amount) as avg_amount 42 | from 43 | transactions 44 | group by 45 | {{ dim | sqlsafe }} 46 | order by total_amount desc 47 | ''' 48 | 49 | # Apply the template 50 | params = { 51 | 'dim': 'payment_method' 52 | } 53 | print(apply_sql_template(_BASIC_STATS_TEMPLATE, params)) 54 | 55 | 56 | # Multiple columns joined outside of the template: 57 | def get_basic_stats_sql(dimensions): 58 | ''' 59 | Returns the sql computing the number of transactions, 60 | as well as the total and the average transaction amounts 61 | for the provided list of column names as dimensions. 62 | ''' 63 | params = { 64 | 'dim': '\n , '.join(dimensions) 65 | } 66 | return apply_sql_template(_BASIC_STATS_TEMPLATE, params) 67 | 68 | 69 | print(get_basic_stats_sql(['store_id', 'payment_method'])) 70 | 71 | dimension_lists = [ 72 | ['user_id'], 73 | ['store_id'], 74 | ['payment_method'], 75 | ['store_id', 'payment_method'], 76 | ] 77 | 78 | dimension_queries = [get_basic_stats_sql(dims) for dims in dimension_lists] 79 | 80 | 81 | # Set a variable inside the template 82 | _PRESET_VAR_STATS_TEMPLATE = ''' 83 | {% set dims = '\n , '.join(dimensions) %} 84 | select 85 | {{ dims | sqlsafe }} 86 | , count(*) as num_transactions 87 | , sum(amount) as total_amount 88 | , avg(amount) as avg_amount 89 | from 90 | transactions 91 | group by 92 | {{ dims | sqlsafe }} 93 | order by total_amount desc 94 | ''' 95 | 96 | 97 | def get_preset_stats_sql(dimensions): 98 | ''' 99 | Returns the sql computing the number of transactions, 100 | as well as the total and the average transaction amounts 101 | for the provided list of column names as dimensions. 102 | ''' 103 | params = { 104 | 'dimensions': dimensions 105 | } 106 | return apply_sql_template(_PRESET_VAR_STATS_TEMPLATE, params) 107 | 108 | 109 | print(get_preset_stats_sql(['store_id', 'payment_method'])) 110 | 111 | # Using loops 112 | _LOOPS_STATS_TEMPLATE = ''' 113 | select 114 | {{ dimensions[0] | sqlsafe }}\ 115 | {% for dim in dimensions[1:] %} 116 | , {{ dim | sqlsafe }}{% endfor %} 117 | , count(*) as num_transactions 118 | , sum(amount) as total_amount 119 | , avg(amount) as avg_amount 120 | from 121 | transactions 122 | group by 123 | {{ dimensions[0] | sqlsafe }}\ 124 | {% for dim in dimensions[1:] %} 125 | , {{ dim | sqlsafe }}{% endfor %} 126 | order by total_amount desc 127 | ''' 128 | 129 | 130 | def get_loops_stats_sql(dimensions): 131 | ''' 132 | Returns the sql computing the number of transactions, 133 | as well as the total and the average transaction amounts 134 | for the provided list of column names as dimensions. 135 | ''' 136 | params = { 137 | 'dimensions': dimensions 138 | } 139 | return apply_sql_template(_LOOPS_STATS_TEMPLATE, params) 140 | 141 | 142 | print(get_loops_stats_sql(['store_id', 'payment_method'])) 143 | 144 | 145 | def strip_blank_lines(text): 146 | ''' 147 | Removes blank lines from the text, including those containing only spaces. 148 | https://stackoverflow.com/questions/1140958/whats-a-quick-one-liner-to-remove-empty-lines-from-a-python-string 149 | ''' 150 | return os.linesep.join([s for s in text.splitlines() if s.strip()]) 151 | 152 | 153 | _LOOPS_STATS_TEMPLATE = ''' 154 | select 155 | {{ dimensions[0] | sqlsafe }} 156 | {% for dim in dimensions[1:] %} 157 | , {{ dim | sqlsafe }} 158 | {% endfor %} 159 | , count(*) as num_transactions 160 | , sum(amount) as total_amount 161 | , avg(amount) as avg_amount 162 | from 163 | transactions 164 | group by 165 | {{ dimensions[0] | sqlsafe }} 166 | {% for dim in dimensions[1:] %} 167 | , {{ dim | sqlsafe }} 168 | {% endfor %} 169 | order by total_amount desc 170 | ''' 171 | 172 | # Formatted without blank lines 173 | print(strip_blank_lines(get_loops_stats_sql(['store_id', 'payment_method']))) 174 | 175 | 176 | # Custom dimensions and looping over a dictionary 177 | custom_dimensions = { 178 | 'store_id': 'store_id', 179 | 'card_or_cash': "case when payment_method = 'cash' then 'cash' else 'card' end", 180 | } 181 | 182 | _CUSTOM_STATS_TEMPLATE = ''' 183 | {% set dims = '\n , '.join(dimensions.keys()) %} 184 | select 185 | sum(amount) as total_amount 186 | {% for dim, def in dimensions.items() %} 187 | , {{ def | sqlsafe }} as {{ dim | sqlsafe }} 188 | {% endfor %} 189 | , count(*) as num_transactions 190 | , avg(amount) as avg_amount 191 | from 192 | transactions 193 | group by 194 | {{ dims | sqlsafe }} 195 | order by total_amount desc 196 | ''' 197 | 198 | 199 | def get_custom_stats_sql(dimensions): 200 | ''' 201 | Returns the sql computing the number of transactions, 202 | as well as the total and the average transaction amounts 203 | for the provided list of column names as dimensions. 204 | ''' 205 | params = { 206 | 'dimensions': dimensions 207 | } 208 | return apply_sql_template(_CUSTOM_STATS_TEMPLATE, params) 209 | 210 | 211 | print(strip_blank_lines(get_custom_stats_sql(custom_dimensions))) 212 | print(strip_blank_lines( 213 | apply_sql_template(template=_CUSTOM_STATS_TEMPLATE, 214 | parameters={'dimensions': custom_dimensions}))) 215 | 216 | 217 | # Using custom functions 218 | def transform_dimensions(dimensions: dict) -> str: 219 | ''' 220 | Generate SQL for aliasing or transforming the dimension columns. 221 | ''' 222 | return '\n , '.join([ 223 | '{val} as {key}'.format(val=val, key=key) 224 | for key, val in dimensions.items() 225 | ]) 226 | 227 | 228 | print(transform_dimensions(custom_dimensions)) 229 | 230 | _FUNCTION_STATS_TEMPLATE = ''' 231 | {% set dims = '\n , '.join(dimensions.keys()) %} 232 | select 233 | {{ transform_dimensions(dimensions) | sqlsafe }} 234 | , sum(amount) as total_amount 235 | , count(*) as num_transactions 236 | , avg(amount) as avg_amount 237 | from 238 | transactions 239 | group by 240 | {{ dims | sqlsafe }} 241 | order by total_amount desc 242 | ''' 243 | 244 | print(strip_blank_lines( 245 | apply_sql_template(template=_FUNCTION_STATS_TEMPLATE, 246 | parameters={'dimensions': custom_dimensions}, 247 | func_list=[transform_dimensions]))) 248 | -------------------------------------------------------------------------------- /pylad/advanced_sql_templates_example.sql: -------------------------------------------------------------------------------- 1 | create table db_stg.analytics.transactions ( 2 | transaction_id int, 3 | user_id int, 4 | transaction_date date, 5 | store_id int, 6 | payment_method varchar(10), 7 | amount float 8 | ) 9 | ; 10 | 11 | insert into db_stg.analytics.transactions 12 | (transaction_id, user_id, transaction_date, store_id, payment_method, amount) 13 | values 14 | (1, 1234, '2019-03-02', 1, 'cash', 5.25), 15 | (1, 1234, '2019-03-01', 1, 'credit', 10.75), 16 | (1, 1234, '2019-03-02', 2, 'cash', 25.50), 17 | (1, 1234, '2019-03-03', 2, 'credit', 17.00), 18 | (1, 4321, '2019-03-01', 2, 'cash', 20.00), 19 | (1, 4321, '2019-03-02', 2, 'credit', 30.00), 20 | (1, 4321, '2019-03-03', 1, 'cash', 3.00) 21 | ; 22 | 23 | select 24 | store_id 25 | , payment_method 26 | , count(*) as num_transactions 27 | , sum(amount) as total_amount 28 | , avg(amount) as avg_amount 29 | from 30 | transactions 31 | group by 32 | store_id 33 | , payment_method 34 | order by total_amount desc 35 | ; 36 | 37 | 38 | select 39 | sum(amount) as total_amount 40 | , store_id as store_id 41 | , case when payment_method = 'cash' then 'cash' else 'card' end as card_or_cash 42 | , count(*) as num_transactions 43 | , avg(amount) as avg_amount 44 | from 45 | transactions 46 | group by 47 | store_id 48 | , card_or_cash 49 | order by total_amount desc 50 | ; 51 | 52 | with 53 | user_transactions as ( 54 | select 55 | store_id 56 | , user_id 57 | , count(*) as num_transactions 58 | , min(amount) as min_amount 59 | , max(amount) as max_amount 60 | , sum(amount) as total_amount 61 | , avg(amount) as avg_amount 62 | , case when max_amount > 15 then 1 else 0 end as is_high_spender 63 | from 64 | transactions 65 | group by 66 | store_id 67 | , user_id 68 | ) 69 | , 70 | store_transactions as ( 71 | select 72 | store_id 73 | , count(*) as user_count 74 | , sum(num_transactions) as num_transactions 75 | , sum(total_amount) as total_amount 76 | , sum(case when is_high_spender > 0 then total_amount else 0.0 end) as high_spender_amount 77 | , sum(is_high_spender) as high_spender_count 78 | from 79 | user_transactions 80 | group by store_id 81 | ) 82 | select 83 | num_transactions / user_count as avg_user_num_transactions 84 | , total_amount / user_count as avg_user_spend 85 | , high_spender_count / user_count * 100 as pct_high_spenders 86 | -- division by zero 87 | --, high_spender_amount / high_spender_count as avg_high_spender_spend 88 | from 89 | store_transactions 90 | order by total_amount desc 91 | ; 92 | -------------------------------------------------------------------------------- /pylad/pandas_merging.py: -------------------------------------------------------------------------------- 1 | # https://stackoverflow.com/questions/53645882/pandas-merging-101 2 | # https://stackoverflow.com/questions/1299871/how-to-join-merge-data-frames-inner-outer-left-right 3 | import numpy as np 4 | import pandas as pd 5 | 6 | np.random.seed(0) 7 | left = pd.DataFrame({'transaction_id': ['A', 'B', 'C', 'D'], 8 | 'user_id': ['Peter', 'John', 'John', 'Anna'], 9 | 'value': np.random.randn(4), 10 | }) 11 | 12 | right = pd.DataFrame({'user_id': ['Paul', 'Mary', 'John', 'Anna'], 13 | 'favorite_color': ['blue', 'blue', 'red', np.NaN], 14 | }) 15 | 16 | left.merge(right, on='user_id', how='left') 17 | left.merge(right, on='user_id', how='left', indicator=True) 18 | left.merge(right.rename({'user_id': 'user_id_r'}, axis=1), 19 | left_on='user_id', right_on='user_id_r', how='left') 20 | 21 | # this doesn't work 22 | left.join(right, rsuffix='_r', how='left') 23 | 24 | # set the index for user_id first 25 | left1 = left.set_index('user_id', drop=False) 26 | right1 = right.set_index('user_id', drop=False) 27 | left1.join(right1, rsuffix='_r', how='left') 28 | -------------------------------------------------------------------------------- /pylad/sql_template_example.sql: -------------------------------------------------------------------------------- 1 | create table transactions ( 2 | transaction_id int, 3 | user_id int, 4 | transaction_date date, 5 | amount float 6 | ) 7 | ; 8 | 9 | insert into transactions (transaction_id, user_id, transaction_date, amount) 10 | values (1, 1234, '2019-03-02', 5.25); 11 | insert into _transactions (transaction_id, user_id, transaction_date, amount) 12 | values (1, 1234, '2019-03-01', 15.75); 13 | insert into transactions (transaction_id, user_id, transaction_date, amount) 14 | values (1, 1234, '2019-03-02', 25.50); 15 | insert into transactions (transaction_id, user_id, transaction_date, amount) 16 | values (1, 1234, '2019-03-03', 10.00); 17 | insert into transactions (transaction_id, user_id, transaction_date, amount) 18 | values (1, 4321, '2019-03-01', 20.00); 19 | insert into transactions (transaction_id, user_id, transaction_date, amount) 20 | values (1, 4321, '2019-03-02', 30.00); 21 | 22 | select 23 | user_id 24 | , count(*) as num_transactions 25 | , sum(amount) as total_amount 26 | from 27 | transactions 28 | where 29 | user_id = 1234 30 | and transaction_date = '2019-03-02' 31 | group by 32 | user_id 33 | ; 34 | 35 | -- Result: 36 | -- 1234|2|30.75 37 | 38 | -------------------------------------------------------------------------------- /pylad/sql_templates_base.py: -------------------------------------------------------------------------------- 1 | #=============================================================================== 2 | # MIT License 3 | # 4 | # Copyright (c) 2019-2020 Sergei Izrailev 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | #=============================================================================== 24 | # Source: https://github.com/sizrailev/life-around-data-code/blob/master/pylad/ 25 | #=============================================================================== 26 | 27 | ''' 28 | Helper functions for parameterizing SQL queries in Python using JinjaSql 29 | ''' 30 | 31 | from copy import deepcopy 32 | 33 | from six import string_types 34 | 35 | from jinjasql import JinjaSql 36 | 37 | __all__ = ['quote_sql_string', 'get_sql_from_template', 'apply_sql_template', 38 | 'get_column_stats_sql'] 39 | 40 | 41 | def quote_sql_string(value): 42 | ''' 43 | If `value` is a string type, escapes single quotes in the string 44 | and returns the string enclosed in single quotes. 45 | ''' 46 | if isinstance(value, string_types): 47 | new_value = str(value) 48 | new_value = new_value.replace("'", "''") 49 | return "'{}'".format(new_value) 50 | return value 51 | 52 | 53 | def get_sql_from_template(query, bind_params): 54 | ''' 55 | Given a query and binding parameters produced by JinjaSql's prepare_query(), 56 | produce and return a complete SQL query string. 57 | ''' 58 | if not bind_params: 59 | return query 60 | params = deepcopy(bind_params) 61 | for key, val in params.items(): 62 | params[key] = quote_sql_string(val) 63 | return query % params 64 | 65 | 66 | def apply_sql_template(template, parameters, func_list=None): 67 | ''' 68 | Apply a JinjaSql template (string) substituting parameters (dict) and return 69 | the final SQL. Use the func_list to pass any functions called from the template. 70 | ''' 71 | j = JinjaSql(param_style='pyformat') 72 | if func_list: 73 | for func in func_list: 74 | j.env.globals[func.__name__] = func 75 | query, bind_params = j.prepare_query(template, parameters) 76 | return get_sql_from_template(query, bind_params) 77 | 78 | 79 | COLUMN_STATS_TEMPLATE = ''' 80 | select 81 | {{ column_name | sqlsafe }} as column_name 82 | , count(*) as num_rows 83 | , count(distinct {{ column_name | sqlsafe }}) as num_unique 84 | , sum(case when {{ column_name | sqlsafe }} is null then 1 else 0 end) as num_nulls 85 | {% if default_value %} 86 | , sum(case when {{ column_name | sqlsafe }} = {{ default_value }} 87 | then 1 else 0 end) as num_default 88 | {% else %} 89 | , 0 as num_default 90 | {% endif %} 91 | , min({{ column_name | sqlsafe }}) as min_value 92 | , max({{ column_name | sqlsafe }}) as max_value 93 | from 94 | {{ table_name | sqlsafe }} 95 | ''' 96 | 97 | 98 | def get_column_stats_sql(table_name, column_name, default_value): 99 | ''' 100 | Returns the SQL for computing column statistics. 101 | Passing None for the default_value results in zero output for the number 102 | of default values. 103 | ''' 104 | params = { 105 | 'table_name': table_name, 106 | 'column_name': column_name, 107 | 'default_value': default_value, 108 | } 109 | return apply_sql_template(COLUMN_STATS_TEMPLATE, params) 110 | --------------------------------------------------------------------------------