├── .DS_Store ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── cfg.py ├── d6tflow-template-minimal.zip ├── export.py ├── reports └── plot.png ├── run.py ├── tasks.py ├── tasks_export.py ├── visualize.ipynb └── visualize.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d6t/d6tflow-template/c9f1f01f44db587312db4546cca07fac0a3798ab/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | .creds* 3 | 4 | .idea/ 5 | data/ 6 | *.pq 7 | 8 | .vscode 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | share/python-wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Databolt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 
| 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # d6tflow - Project Templates 2 | ## Clean and scalable project structure for data science projects 3 | 4 | Templates with common design patterns for https://github.com/d6t/d6tflow 5 | 6 | ## Structure 7 | 8 | `tasks.py`: workflow tasks 9 | `cfg.py`: parameter and other config 10 | `run.py`: execute workflow tasks 11 | `visualize.py`: use outputs for further analysis 12 | `visualize.ipynb`: use outputs in Jupyter notebook 13 | `.creds.yaml`: optional file with protected credentials in [yaml format](https://docs.ansible.com/ansible/latest/reference_appendices/YAMLSyntax.html), not committed to git to protect credentials 14 | 15 | ## Clean branch 16 | 17 | For repeat usage you don't need all those comments and can use the clean branch.
Clone into an existing folder using `git clone -b clean --single-branch https://github.com/d6t/d6tflow-template.git .` 18 | 19 | ## Minimal branch 20 | 21 | For frequent users with a variety of projects, this is the best starting point. Available as zip https://github.com/d6t/d6tflow-template/raw/master/d6tflow-template-minimal.zip 22 | 23 | -------------------------------------------------------------------------------- /cfg.py: -------------------------------------------------------------------------------- 1 | do_preprocess = True 2 | 3 | import datetime 4 | dt_start = datetime.date(2010,1,1) 5 | dt_end = datetime.date(2020,1,1) 6 | 7 | # load protected credentials 8 | try: 9 | import d6tpipe 10 | uri = d6tpipe.utils.loadyaml('.creds.yaml').get('uri') 11 | except: 12 | pass 13 | -------------------------------------------------------------------------------- /d6tflow-template-minimal.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d6t/d6tflow-template/c9f1f01f44db587312db4546cca07fac0a3798ab/d6tflow-template-minimal.zip -------------------------------------------------------------------------------- /export.py: -------------------------------------------------------------------------------- 1 | # Sharing Workflows and Outputs 2 | # see https://d6tflow.readthedocs.io/en/latest/collaborate.html 3 | import d6tflow 4 | import tasks 5 | 6 | flow = d6tflow.Workflow(task=tasks.TaskTrain, params=params) 7 | e = d6tflow.FlowExport(flows=flow, save=True, path_export='tasks_export.py') 8 | e.generate() 9 | 10 | # auto save d6tflow data to d6tpipe repo 11 | cfg_pipe = 'pipename' 12 | cfg_profile = 'default' 13 | d6tflow.pipes.init(cfg_pipe,profile=cfg_profile, local_pipe=True) # work in local mode first 14 | 15 | # execute flow 16 | flow = d6tflow.Workflow(task = tasks.TaskTrain) 17 | flow.run() 18 | 19 | # auto generate files for data consumer to run 20 | 
d6tflow.pipes.FlowExport(flows=flow,pipename=cfg_pipe).generate() 21 | params=dict() 22 | -------------------------------------------------------------------------------- /reports/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d6t/d6tflow-template/c9f1f01f44db587312db4546cca07fac0a3798ab/reports/plot.png -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import d6tflow 2 | import cfg, tasks, visualize 3 | 4 | # Check task dependencies and their execution status 5 | flow = d6tflow.Workflow(task=tasks.TaskTrain) 6 | flow.preview() 7 | 8 | # Execute the model training task including dependencies. See https://d6tflow.readthedocs.io/en/latest/run.html 9 | flow.run() 10 | 11 | # use output 12 | visualize.accuracy() 13 | visualize.plot_importances() 14 | 15 | # change parameter and rerun, see https://d6tflow.readthedocs.io/en/latest/advparam.html 16 | params = dict(do_preprocess=False) 17 | flow1 = d6tflow.Workflow(params=params, task=tasks.TaskTrain) 18 | flow1.run() 19 | visualize.accuracy(do_preprocess=False) # task output is parameter specific 20 | 21 | # rerun flow after code changes 22 | import importlib 23 | 24 | importlib.reload(cfg) 25 | importlib.reload(tasks) 26 | 27 | # say you changed TaskGetData, reset all tasks depending on TaskGetData 28 | flow.reset(tasks.TaskGetData) 29 | flow1.reset(tasks.TaskGetData) 30 | 31 | flow.preview() 32 | flow.run() 33 | -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- 1 | import d6tflow 2 | import sklearn, sklearn.datasets 3 | from sklearn.ensemble import RandomForestClassifier 4 | import pandas as pd 5 | 6 | 7 | import cfg 8 | 9 | # define workflow 10 | class TaskGetData(d6tflow.tasks.TaskPqPandas): # save 
dataframe as parquet, see https://d6tflow.readthedocs.io/en/latest/targets.html 11 | dt_start = d6tflow.DateParameter(default=cfg.dt_start) # workflow parameters. See https://d6tflow.readthedocs.io/en/latest/advparam.html 12 | dt_end = d6tflow.DateParameter(default=cfg.dt_end) 13 | 14 | def run(self): 15 | iris = sklearn.datasets.load_iris() 16 | df_train = pd.DataFrame(iris.data,columns=['feature{}'.format(i) for i in range(4)]) 17 | df_train['y'] = iris.target 18 | # optional: df_train[df_train['date']>=self.dt_start] 19 | self.save(df_train) # quickly save dataframe 20 | 21 | @d6tflow.requires(TaskGetData) # define dependency. See https://d6tflow.readthedocs.io/en/latest/tasks.html 22 | class TaskPreprocess(d6tflow.tasks.TaskPqPandas): 23 | do_preprocess = d6tflow.BoolParameter(default=cfg.do_preprocess) # parameter for preprocessing yes/no 24 | 25 | def run(self): 26 | df_train = self.input().load() # quickly load required data, see https://d6tflow.readthedocs.io/en/latest/tasks.html#load-input-data 27 | if self.do_preprocess: 28 | df_train.iloc[:,:-1] = sklearn.preprocessing.scale(df_train.iloc[:,:-1]) 29 | self.save(df_train) # save task output, see https://d6tflow.readthedocs.io/en/latest/tasks.html#save-output-data 30 | 31 | @d6tflow.requires(TaskPreprocess) # define dependency. 
See https://d6tflow.readthedocs.io/en/latest/tasks.html 32 | class TaskTrain(d6tflow.tasks.TaskPickle): # save output as pickle 33 | model = d6tflow.BoolParameter(default='rf') 34 | 35 | def run(self): 36 | df_train = self.input().load() 37 | if self.model=='rf': 38 | model = RandomForestClassifier(n_jobs=2, random_state=0) 39 | else: 40 | model = RandomForestClassifier(n_jobs=2, random_state=0) 41 | 42 | model.fit(df_train.iloc[:,:-1], df_train['y']) 43 | self.save(model) 44 | -------------------------------------------------------------------------------- /tasks_export.py: -------------------------------------------------------------------------------- 1 | 2 | import d6tflow 3 | import luigi 4 | import datetime 5 | 6 | class TaskPreprocess(d6tflow.tasks.TaskPqPandas): 7 | external=True 8 | persist=['data'] 9 | dt_start=luigi.parameter.DateParameter(default=datetime.date(2010, 1, 1)) 10 | dt_end=luigi.parameter.DateParameter(default=datetime.date(2020, 1, 1)) 11 | do_preprocess=luigi.parameter.BoolParameter(default=True) 12 | 13 | class TaskGetData(d6tflow.tasks.TaskPqPandas): 14 | external=True 15 | persist=['data'] 16 | dt_start=luigi.parameter.DateParameter(default=datetime.date(2010, 1, 1)) 17 | dt_end=luigi.parameter.DateParameter(default=datetime.date(2020, 1, 1)) 18 | 19 | class TaskTrain(d6tflow.tasks.TaskPickle): 20 | external=True 21 | persist=['data'] 22 | dt_start=luigi.parameter.DateParameter(default=datetime.date(2010, 1, 1)) 23 | dt_end=luigi.parameter.DateParameter(default=datetime.date(2020, 1, 1)) 24 | do_preprocess=luigi.parameter.BoolParameter(default=True) 25 | 26 | -------------------------------------------------------------------------------- /visualize.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "output_type": "stream", 10 | "name": "stdout", 11 | "text": [ 12 | "Welcome to 
d6tflow! For Q&A see https://github.com/d6t/d6tflow\n", 13 | "Welcome to d6tpipe!\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import sklearn\n", 19 | "import pandas as pd\n", 20 | "\n", 21 | "# import d6tflow tasks\n", 22 | "import tasks" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 6, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "output_type": "stream", 32 | "name": "stdout", 33 | "text": [ 34 | "\n===== Luigi Execution Summary =====\n\nScheduled 2 tasks of which:\n* 1 complete ones were encountered:\n - 1 TaskPreprocess(dt_start=2010-01-01, dt_end=2020-01-01, do_preprocess=True)\n* 1 ran successfully:\n - 1 TaskTrain(dt_start=2010-01-01, dt_end=2020-01-01, do_preprocess=True)\n\nThis progress looks :) because there were no failed tasks or missing dependencies\n\n===== Luigi Execution Summary =====\n\n" 35 | ] 36 | }, 37 | { 38 | "output_type": "execute_result", 39 | "data": { 40 | "text/plain": [ 41 | "LuigiRunResult(status=,worker=,scheduling_succeeded=True)" 42 | ] 43 | }, 44 | "metadata": {}, 45 | "execution_count": 6 46 | } 47 | ], 48 | "source": [ 49 | "import d6tflow\n", 50 | "d6tflow.run(tasks.TaskTrain(), confirm=False, forced_all=True)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "output_type": "stream", 60 | "name": "stderr", 61 | "text": [ 62 | "C:\\Anaconda3\\lib\\site-packages\\sklearn\\base.py:315: UserWarning: Trying to unpickle estimator DecisionTreeClassifier from version 0.22.1 when using version 0.24.1. This might lead to breaking code or invalid results. Use at your own risk.\n", 63 | " UserWarning)\n", 64 | "C:\\Anaconda3\\lib\\site-packages\\sklearn\\base.py:315: UserWarning: Trying to unpickle estimator RandomForestClassifier from version 0.22.1 when using version 0.24.1. This might lead to breaking code or invalid results. 
Use at your own risk.\n", 65 | " UserWarning)\n", 66 | "1.0\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "model = tasks.TaskTrain().output().load()\n", 72 | "df_train = tasks.TaskPreprocess().output().load()\n", 73 | "print(sklearn.metrics.accuracy_score(df_train['y'],model.predict(df_train.iloc[:,:-1])))" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "" 85 | ] 86 | }, 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | }, 91 | { 92 | "data": { 93 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEdCAYAAAD3ryfCAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAD/1JREFUeJzt3X+s3Xddx/Hna20KhF/G9QZxLbRAg1QgA651/sMvMdkktCaAtkSzJZCqsQKBRJtAljpjxBElBitQA2ySYBnwhxctzl/8UBDSO6nMrhTqMtjNjF5gjl9C6Xj7xz1jx3NPe7/37nt7dj59PpIm5/v9fnbOO68tr337Ped7TqoKSVJbLpv0AJKk/lnuktQgy12SGmS5S1KDLHdJapDlLkkNstwlqUGWuyQ1yHKXpAZZ7pLUoI2TeuHNmzfXtm3bJvXykjSVbrvttq9W1cxK6yZW7tu2bWN+fn5SLy9JUynJl7us63RZJsnVSU4nOZPk4Jjj1yVZTHJi8Oc1qx1YktSfFc/ck2wADgM/BywAx5PMVdUdI0s/UFUH1mFGSdIqdTlz3wWcqao7q+oscBTYs75jSZIeii7lfgVw99D2wmDfqJcn+XySDyXZ2st0kqQ16VLuGbNv9Bc+PgJsq6pnA38P3Dz2iZL9SeaTzC8uLq5uUklSZ13KfQEYPhPfAtwzvKCqvlZV3xts/hnwvHFPVFVHqmq2qmZnZlb8JI8kaY26lPtxYEeS7Uk2AXuBueEFSZ44tLkbONXfiJKk1Vrx0zJVdS7JAeBWYAPwnqo6meQGYL6q5oDXJtkNnAO+Dly3jjNLklaQSf1A9uzsbPV9E9O2g3/d6/Otl7ve8tJJjyBpSiW5rapmV1rnd8tIUoMsd0lqkOUuSQ2y3CWpQZa7JDXIcpekBlnuktQgy12SGmS5S1KDLHdJapDlLkkNstwlqUGWuyQ1yHKXpAZZ7pLUIMtdkhpkuUtSgyx3SWqQ5S5JDbLcJalBlrskNchyl6QGWe6S1CDLXZIaZLlLUoMsd0lqkOUuSQ2y3CWpQZa7JDXIcpekBlnuktQgy12SGmS5S1KDOpV7kquTnE5yJsnBC6x7RZJKMtvfiJKk1Vqx3JNsAA4D1wA7gX1Jdo5Z91jgtcBn+x5SkrQ6Xc7cdwFnqurOqjoLHAX2jFn3u8CNwHd7nE+StAZdyv0K4O6h7YXBvh9K8hxga1X9VY+zSZLWqEu5Z8y++uHB5DLgbcAbV3yiZH+S+STzi4uL3aeUJK1Kl3JfALYObW8B7hnafizwTODjSe4CrgLmxr2pWlVHqmq2qmZnZmbWPrUk6YK6lPtxYEeS7Uk2AXuBuQcOVtV9VbW5qrZV
1TbgM8Duqppfl4klSStasdyr6hxwALgVOAXcUlUnk9yQZPd6DyhJWr2NXRZV1THg2Mi+68+z9oUPfSxJ0kPhHaqS1CDLXZIaZLlLUoMsd0lqkOUuSQ2y3CWpQZa7JDXIcpekBnW6iUmXqEOPn/QE3Ry6b9ITSA87nrlLUoMsd0lqkOUuSQ2y3CWpQZa7JDXIcpekBlnuktQgy12SGmS5S1KDLHdJapDlLkkNstwlqUGWuyQ1yHKXpAZZ7pLUIMtdkhpkuUtSgyx3SWqQ5S5JDbLcJalBlrskNchyl6QGWe6S1CDLXZIa1Knck1yd5HSSM0kOjjn+a0luT3IiyT8n2dn/qJKkrlYs9yQbgMPANcBOYN+Y8n5/VT2rqq4EbgT+qPdJJUmddTlz3wWcqao7q+oscBTYM7ygqr4xtPlooPobUZK0Whs7rLkCuHtoewH46dFFSX4DeAOwCXhxL9NJktaky5l7xuxbdmZeVYer6qnAbwNvHvtEyf4k80nmFxcXVzepJKmzLuW+AGwd2t4C3HOB9UeBXxh3oKqOVNVsVc3OzMx0n1KStCpdyv04sCPJ9iSbgL3A3PCCJDuGNl8KfKm/ESVJq7XiNfeqOpfkAHArsAF4T1WdTHIDMF9Vc8CBJC8Bvg/cC1y7nkNLki6syxuqVNUx4NjIvuuHHr+u57kkSQ+Bd6hKUoMsd0lqkOUuSQ2y3CWpQZa7JDXIcpekBlnuktQgy12SGmS5S1KDLHdJapDlLkkNstwlqUGWuyQ1yHKXpAZZ7pLUIMtdkhpkuUtSgyx3SWqQ5S5JDbLcJalBlrskNchyl6QGWe6S1CDLXZIaZLlLUoMsd0lqkOUuSQ2y3CWpQZa7JDXIcpekBlnuktQgy12SGmS5S1KDLHdJalCnck9ydZLTSc4kOTjm+BuS3JHk80n+IcmT+x9VktTViuWeZANwGLgG2AnsS7JzZNnngNmqejbwIeDGvgeVJHXX5cx9F3Cmqu6sqrPAUWDP8IKq+lhVfWew+RlgS79jSpJWo0u5XwHcPbS9MNh3Pq8GPjruQJL9SeaTzC8uLnafUpK0Kl3KPWP21diFyS8Ds8Bbxx2vqiNVNVtVszMzM92nlCStysYOaxaArUPbW4B7RhcleQnwJuAFVfW9fsaTJK1FlzP348COJNuTbAL2AnPDC5I8B3gXsLuq/rv/MSVJq7FiuVfVOeAAcCtwCrilqk4muSHJ7sGytwKPAT6Y5ESSufM8nSTpIuhyWYaqOgYcG9l3/dDjl/Q8lyTpIfAOVUlqkOUuSQ2y3CWpQZa7JDXIcpekBlnuktQgy12SGmS5S1KDLHdJapDlLkkNstwlqUGWuyQ1yHKXpAZZ7pLUIMtdkhpkuUtSgyx3SWqQ5S5JDbLcJalBlrskNchyl6QGWe6S1CDLXZIaZLlLUoMsd0lqkOUuSQ2y3CWpQZa7JDXIcpekBm2c9ADSpeBZNz9r0iN0cvu1t096BPXEM3dJapDlLkkNstwlqUGdyj3J1UlOJzmT5OCY489P8q9JziV5Rf9jSpJWY8VyT7IBOAxcA+wE9iXZObLsK8B1wPv7HlCStHpdPi2zCzhTVXcCJDkK7AHueGBBVd01OPaDdZhRkrRKXS7LXAHcPbS9MNgnSXqY6lLuGbOv1vJiSfYnmU8yv7i4uJankCR10KXcF4CtQ9tbgHvW8mJVdaSqZqtqdmZmZi1PIUnqoEu5Hwd2JNmeZBOwF5hb37EkSQ/FiuVeVeeAA8CtwCnglqo6meSGJLsBkvxUkgXglcC7kpxcz6ElSRfW6btlquoYcGxk3/VDj4+zdLlGkvQw4B2qktQgy12SGmS5S1KDLHdJapDlLkkNstwlqUGWuyQ1yHKXpAZZ7pLUIMtdkhpkuUtSgyx3SWqQ5S5JDbLcJalBlrskNchyl6QGWe6S1CDLXZIaZLlLUoMsd0lqkOUuSQ2y3CWpQZa7JDXIcpekBlnuktQgy12SGmS5S1KDNk56AEla
rVM/8YxJj9DJM75wamKv7Zm7JDXIcpekBlnuktQgy12SGmS5S1KDOpV7kquTnE5yJsnBMccfkeQDg+OfTbKt70ElSd2tWO5JNgCHgWuAncC+JDtHlr0auLeqnga8DfiDvgeVJHXX5cx9F3Cmqu6sqrPAUWDPyJo9wM2Dxx8CfjZJ+htTkrQaXcr9CuDuoe2Fwb6xa6rqHHAfcHkfA0qSVq/LHarjzsBrDWtIsh/YP9j8VpLTHV5/0jYDX+3zCXNpX7TqPU9+55L9S2L//21ed8lmCevx3+b6XMB4cpdFXcp9Adg6tL0FuOc8axaSbAQeD3x99Imq6ghwpMtgDxdJ5qtqdtJztMI8+2OW/Wotzy6XZY4DO5JsT7IJ2AvMjayZA64dPH4F8I9VtezMXZJ0cax45l5V55IcAG4FNgDvqaqTSW4A5qtqDng38L4kZ1g6Y9+7nkNLki6s07dCVtUx4NjIvuuHHn8XeGW/oz1sTNVlpClgnv0xy341lWe8eiJJ7fHrBySpQZa7JDXIcpekBlnuktQgy30gyY8leUeSw0kuT3Ioye1JbknyxEnPN22SPC7J7yd5X5JXjRz700nNNa2SPD7JW5J8IcnXBn9ODfb9yKTna0WSj056hr5Y7g+6CbiDpe/I+Rjwv8BLgX8C3jm5sabWe1n6WooPA3uTfDjJIwbHrprcWFPrFuBe4IVVdXlVXQ68aLDvgxOdbMokee55/jwPuHLS8/XFj0IOJPlcVT1n8PgrVfWkoWMnqqqZf+kXw2hmSd4E/DywG/i7qnruxIabQklOV9XTV3tMyyW5H/gE478T66qqetRFHmlddLqJ6RIx/LeYP7/AMXXziCSXVdUPAKrq95IsAJ8EHjPZ0abSl5P8FnBzVf0XQJInANfx/7+1VSs7BfxqVX1p9ECSZrK0tB70l0keA1BVb35gZ5KnAV+c2FTT6yPAi4d3VNXNwBuBsxOZaLr9Ektfo/2JJPcmuRf4OPCjwC9OcrApdIjzd99vXsQ51pWXZSSpQZ65j0jyhCTvfuBd8yQ7k7x60nNNK/Psl3n2p/UsLfflbmLpGzB/fLD9ReD1E5tm+t2EefbpJsyzLzfRcJaW+3Kbq+oW4IE3As8B9092pKlmnv0yz/40naXlvty3k1zO4GcCk1zF0m/Cam3Ms1/m2Z+ms/SjkMu9gaVflnpqkk8BMyz9upTWxjz7ZZ79aTpLy31IksuARwIvAJ7O0k0Op6vq+xMdbEqZZ7/Msz+XQpZ+FHJEkn+pqp+Z9BytMM9+mWd/Ws/Sa+7L/W2SlycZd2uyVs88+2We/Wk6S8/cRyT5JvBo4BzwXZb+ulZV9biJDjalzLNf5tmf1rO03CWpQb6hOiLJ88ftr6pPXuxZWmCe/TLP/rSepWfuI5J8ZGjzkcAu4LaqevF5/hFdgHn2yzz703qWnrmPqKqXDW8n2QrcOKFxpp559ss8+9N6ln5aZmULwDMnPURDzLNf5tmfprL0zH1EkrczuB2Zpf/5XQn82+Qmmm7m2S/z7E/rWXrNfUSSa4c2zwF3VdWnJjXPtDPPfplnf1rP0jP35X6kqv54eEeS143uU2fm2S/z7E/TWXrNfblrx+y77mIP0RDz7Jd59qfpLD1zH0iyD3gVsD3J3NChxwJfm8xU08s8+2We/blUsrTcH/Rp4D+BzcAfDu3/JvD5iUw03cyzX+bZn0siS99QlaQGec19RJKrkhxP8q0kZ5Pcn+Qbk55rWplnv8yzP61nabkv9yfAPuBLwKOA1wBvn+hE0808+2We/Wk6S6+5j1FVZ5JsqKr7gfcm+fSkZ5pm5tkv8+xPy1la7st9J8km4ESSG1l64+XRE55pmplnv8yzP01n6WWZ5X6FpVwOAN8GtgIvn+hE0808+2We/Wk6Sz8tM0aSRwFPqqrTk56lBebZL/PsT8tZeuY+IsnLgBPA3wy2rxy50UGrYJ79Ms/+tJ6l5b7cIZa+tP9/AKrqBLBtgvNMu0OY
Z58OYZ59OUTDWVruy52rqvsmPURDzLNf5tmfprP00zLL/XuSVwEbkuwAXsvS7cpaG/Psl3n2p+ksPXMfSPK+wcP/AH4S+B7wF8A3gNdPaq5pZZ79Ms/+XCpZ+mmZgSR3ANcAc8CLRo9X1dcv+lBTzDz7ZZ79uVSy9LLMg97J0rvmTwHmh/aHpZ/iesokhppi5tkv8+zPJZGlZ+4jkryjqn590nO0wjz7ZZ79aT1Ly12SGuQbqpLUIMtdkhpkuUtSgyx3SWqQ5S5JDfo/TLboX7Tp56MAAAAASUVORK5CYII=\n", 94 | "text/plain": [ 95 | "" 96 | ] 97 | }, 98 | "metadata": {}, 99 | "output_type": "display_data" 100 | } 101 | ], 102 | "source": [ 103 | "df_importance = pd.Series(model.feature_importances_, index=df_train.iloc[:,:-1].columns)\n", 104 | "import matplotlib.pyplot as plt\n", 105 | "df_importance.sort_values(ascending=False).plot.bar()\n", 106 | "\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "# save figure\n", 116 | "plt.savefig('reports/plot.png')" 117 | ] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "name": "python376jvsc74a57bd097ae724bfa85b9b34df7982b8bb8c7216f435b92902d749e4263f71162bea840", 123 | "display_name": "Python 3.7.6 64-bit ('base': conda)" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | "version": "3.7.6" 136 | } 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 2 140 | } -------------------------------------------------------------------------------- /visualize.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | import tasks 3 | import pandas as pd 4 | 5 | import cfg 6 | 7 | def accuracy(do_preprocess=cfg.do_preprocess): 8 | model = tasks.TaskTrain(do_preprocess=do_preprocess).output().load() 9 | df_train = tasks.TaskPreprocess().output().load() 10 | 
print(sklearn.metrics.accuracy_score(df_train['y'],model.predict(df_train.iloc[:,:-1]))) 11 | 12 | def plot_importances(): 13 | model = tasks.TaskTrain().output().load() 14 | df_train = tasks.TaskPreprocess().output().load() 15 | df_importance = pd.Series(model.feature_importances_, index=df_train.iloc[:,:-1].columns) 16 | import matplotlib.pyplot as plt 17 | df_importance.sort_values(ascending=False).plot.bar() 18 | plt.savefig('reports/plot.png') 19 | 20 | --------------------------------------------------------------------------------