"""Train a linear income model and score customers via Jinja2-rendered SQL.

The training query and the scoring query are both rendered from the single
template in ``template.sql``; the ``train`` flag passed at render time picks
which of the two is produced.

Run as a script (``python model.py``) from the directory that holds
``predictors.json`` and ``template.sql`` -- both are opened relative to the
current working directory.
"""
import json
import warnings

from jinja2 import Template
from sklearn.linear_model import LinearRegression

from my_tools import query_to_pandas, query_to_table

warnings.filterwarnings("ignore")  # DON'T DO THIS IN REAL LIFE

# List of predictor descriptions; each entry is read via its 'name' key here
# and via 'name' / 'fill_na' inside the SQL template.
with open('predictors.json') as f:
    PREDICTORS = json.load(f)
# Raw Jinja2 source of the SQL template, rendered once per query below.
with open('template.sql') as f:
    QUERY_TEMPLATE = f.read()


def _render_query(**context):
    """Render the SQL template with ``predictors`` plus the given context."""
    return Template(QUERY_TEMPLATE).render(predictors=PREDICTORS, **context)


def get_model_coefs():
    """Fit the linear model on the training query and return its coefficients.

    Renders the template with ``train=True``, prints the resulting query,
    loads its result through ``query_to_pandas`` and regresses the ``income``
    column on the columns named in ``PREDICTORS``.

    Returns:
        dict: ``'intercept'`` plus one key per predictor name, each mapped to
        the corresponding fitted value.
    """
    query = _render_query(train=True)
    # print(...) with one argument behaves identically on Python 2 and 3,
    # unlike the Python 2-only print statement.
    print("-- Training query --")
    print(query)
    data = query_to_pandas(query)
    names = [p['name'] for p in PREDICTORS]
    model = LinearRegression().fit(data[names], data['income'])
    output = {'intercept': model.intercept_}
    # coef_ follows the order of the feature columns handed to fit().
    output.update(zip(names, model.coef_))
    return output


def evaluate_model(coefs):
    """Render the scoring query from *coefs* and pass it to ``query_to_table``.

    Args:
        coefs: mapping shaped like the return value of ``get_model_coefs``;
            the template looks up ``coefs['intercept']`` and one entry per
            predictor name.
    """
    query = _render_query(train=False, coefs=coefs)
    print("-- Evaluation query --")
    print(query)
    query_to_table(query)


if __name__ == "__main__":
    coefs = get_model_coefs()
    evaluate_model(coefs)
{#-
  Jinja2 template that renders either the training query or the scoring query.

  Render-time variables:
    predictors : iterable of mappings; each entry supplies 'name' (a column of
                 customer_attr) and 'fill_na' (the value substituted when that
                 column IS NULL).
    train      : truthy renders the training query, falsy renders the scoring
                 query.
    coefs      : mapping read only when train is falsy; it must provide
                 'intercept' and one entry per predictor name.

  cleaned_input CTE:
    Selects customer_id and every predictor column from customer_attr with
    NULLs replaced by the predictor's fill_na value. When train is truthy it
    also selects income and keeps only rows where customer_id is divisible by
    10 and income is not NULL.

  Final SELECT:
    train truthy : every column of cleaned_input.
    train falsy  : customer_id and predicted_income, computed as the intercept
                   plus the sum of coefficient * predictor column.

  The whitespace-control dashes on this comment keep the rendered SQL
  identical to what the template produced without the comment.
-#}
WITH cleaned_input as (
    SELECT customer_id{% for p in predictors %}
        , CASE WHEN {{ p['name'] }} IS NULL
            THEN {{ p['fill_na'] }}
            ELSE {{ p['name'] }} END
            AS {{ p['name'] }}{% endfor %}{% if train %}
        , income{% endif %}
    FROM customer_attr{% if train %}
    WHERE customer_id % 10 = 0
        AND income IS NOT NULL{% endif %}
)
{% if train %}
SELECT *
FROM cleaned_input
{% else %}
SELECT customer_id
    , {{ coefs['intercept'] }}{% for p in predictors %}
    + {{ coefs[p['name']] }}*{{ p['name'] }}{% endfor %}
    AS predicted_income
FROM cleaned_input
{% endif %}