├── .gitignore ├── README.md ├── autoviz └── autoviz_client_example.ipynb ├── compliant_driverlessai ├── README.md ├── data │ ├── CreditCard-test.csv │ ├── compliance_maralutu.csv │ ├── dai_10_10_1_9cf011b9.csv │ ├── dai_6_4_6_nulafigi.csv │ └── mono_xgb_nisonote.csv └── notebooks │ ├── compliant_dia_gender.ipynb │ ├── compliant_dia_marriage.ipynb │ ├── dai_10_10_1_gender.ipynb │ ├── dai_10_10_1_marriage.ipynb │ ├── dai_6_4_6_gender.ipynb │ ├── dai_6_4_6_marriage.ipynb │ ├── mono_xgb_dia_gender.ipynb │ ├── mono_xgb_dia_marriage.ipynb │ └── requirements.txt ├── dai_installation ├── Aws │ ├── Rhel7.7.md │ └── images │ │ └── 01_compute_config.png ├── Azure │ ├── Ubuntu16.04.md │ └── images │ │ ├── 01_nvidia_driver_install.gif │ │ ├── 01_select_compute_ubuntu.png │ │ ├── 02_compute_config.png │ │ ├── 02_cuda_install.gif │ │ ├── 03_create_new_disk.png │ │ ├── 03_docker_install.gif │ │ ├── 04_dai_install_e13.gif │ │ ├── 04_new_disk_attached.png │ │ ├── 05_configure_networking.png │ │ └── 06_configure_nsg_ports_open.png └── README.md ├── dai_python_client ├── README.md ├── algorithm_family_comparison.ipynb └── common_workflow.ipynb ├── driverlessai_experiments ├── iid │ ├── credit_card_experiment │ │ ├── credit_card_default.ipynb │ │ ├── credit_card_models_by_accuracy_time_complexity.Rmd │ │ ├── credit_card_models_by_accuracy_time_complexity.html │ │ └── images │ │ │ ├── download_mojo.png │ │ │ ├── exp_running_creditcard.png │ │ │ ├── experiment_complete_creditcard.png │ │ │ ├── experiment_list_complete.png │ │ │ ├── experiment_list_running.png │ │ │ ├── import_data_sets_creditcard.png │ │ │ ├── mli_external.png │ │ │ ├── mli_list.png │ │ │ ├── model_diagnostics_complete.png │ │ │ ├── model_diagnostics_setup.png │ │ │ ├── py_client_link.png │ │ │ ├── set_columns_creditcard.png │ │ │ ├── set_parameters_creditcard.png │ │ │ ├── sign_in_home_page_0.png │ │ │ └── skip_sign_in_home_page_1.png │ ├── imbalanced │ │ ├── images │ │ │ ├── compare_weighted_experiments.png │ │ │ ├── py_client_link.png │ │ │ └── weighted_project.png │ │ └── imbalanced_experiment.ipynb │ ├── model_family_comparison │ │ └── model_family_comparison.ipynb │ └── reject_inference │ │ └── Reject_Inference_with_Fuzzy_Augment.Rmd ├── nlp │ ├── airline_sentiment_experiment │ │ ├── demo_nlp_airline_sentiment.ipynb │ │ └── nlp_airline_sentiment_mli.ipynb │ └── custom_word2vec_embeddings.ipynb ├── nlp_timeseries │ ├── imgs │ │ ├── coffee.gif │ │ ├── create_experiment.png │ │ ├── mapbox.png │ │ └── scpf_lb_progress.png │ ├── kaggle_see_click_predict_fix.ipynb │ └── predictions │ │ └── .gitkeep └── timeseries │ ├── stock_timeseries_experiment │ └── demo_stock_timeseries.ipynb │ ├── ts-full-pipeline │ ├── .gitignore │ ├── 01-generate-data.sh │ ├── 01_process_full_TS_csv.py │ ├── 02-create-experiment-data.sh │ ├── 02_extract_experiment_datasets.py │ ├── 03-default-experiment-configs.json │ ├── 03-run-experiment.sh │ ├── 03_run_experiment.py │ ├── 04-create-tta-scoring-files.sh │ ├── 04_generate_tta_files.py │ ├── 05-score-tta-files.sh │ ├── 05_score_tta_files.py │ ├── 10_plot_score_metric.py │ ├── 11_http_server2.py │ ├── README.md │ ├── environment.yml │ ├── images │ │ ├── TTA - Rolling Window.odp │ │ ├── TTA-RollWindow-duration.png │ │ ├── metrics_plot.png │ │ └── metrics_plot.svg │ └── ts-definition.json │ └── walmart_timeseries_experiment │ ├── images │ ├── import_data_sets_stock.png │ └── launching_experiment.png │ ├── timeseries_model_rollingwindow.ipynb │ └── training_timeseries_model.ipynb ├── interpretable_ml ├── DAIDIA.ipynb ├── 
FormatReasonCodes.ipynb ├── MLIDTSurrogate.ipynb ├── MLIPDPICE.ipynb ├── MLIResidualAnalysis.ipynb ├── MLISensitivityAnalysis.ipynb ├── README.md ├── TimeSeriesDiagnostics.ipynb └── data │ ├── credit_test.csv │ ├── credit_train.csv │ ├── default_of_credit_card_clients.xls │ ├── klime_frame.csv │ └── shapley.csv └── scoring-pipeline-deployment ├── R └── Shiny_Example │ ├── 1_Data_Recoding.R │ ├── 2_DAI_Interaction.R │ ├── 3_DAI_Model_Prediction.R │ ├── 4_MOJO_Predictions.R │ ├── CreditCard.csv │ ├── CreditCardRe.csv │ ├── Data_Preprocessing_for_app.R │ ├── full_app.R │ ├── simple_app.R │ └── train_preds_custom.csv ├── README.md ├── java └── README.md └── python ├── centos ├── docker │ ├── Dockerfile │ └── README.md └── vagrant │ ├── README.md │ ├── Vagrantfile │ ├── bootstrap.sh │ ├── payload.sh │ └── payload │ └── README.md ├── pyspark ├── README.md └── get_predictions.py └── ubuntu ├── README.md ├── docker ├── .gitignore ├── Dockerfile ├── Dockerfile-pip-batch ├── Dockerfile-pip-http ├── README.md ├── batch_scorer.py └── payload │ └── README.md └── vagrant ├── README.md ├── Vagrantfile ├── bootstrap.sh ├── payload.sh └── payload └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | # C extensions 6 | *.so 7 | # Distribution / packaging 8 | .Python 9 | build/ 10 | develop-eggs/ 11 | dist/ 12 | downloads/ 13 | eggs/ 14 | .eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | MANIFEST 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | # Installer logs 31 | pip-log.txt 32 | pip-delete-this-directory.txt 33 | # Unit test / coverage reports 34 | htmlcov/ 35 | .tox/ 36 | .coverage 37 | .coverage.* 38 | .cache 39 | nosetests.xml 40 | coverage.xml 41 | *.cover 42 | .hypothesis/ 43 | .pytest_cache/ 44 | # Translations 45 | *.mo 46 | *.pot 47 | # Django stuff: 48 | *.log 49 | local_settings.py 50 | db.sqlite3 51 | # Flask stuff: 52 | instance/ 53 | .webassets-cache 54 | # Scrapy stuff: 55 | .scrapy 56 | # Sphinx documentation 57 | docs/_build/ 58 | # PyBuilder 59 | target/ 60 | # Jupyter Notebook 61 | .ipynb_checkpoints 62 | # pyenv 63 | .python-version 64 | # celery beat schedule file 65 | celerybeat-schedule 66 | # SageMath parsed files 67 | *.sage.py 68 | # Environments 69 | .env 70 | .venv 71 | env/ 72 | venv/ 73 | ENV/ 74 | env.bak/ 75 | venv.bak/ 76 | # Spyder project settings 77 | .spyderproject 78 | .spyproject 79 | # Rope project settings 80 | .ropeproject 81 | # mkdocs documentation 82 | /site 83 | # mypy 84 | .mypy_cache/ 85 | # Scoring pipeline payloads 86 | scorer.zip 87 | mojo.zip 88 | # DAI license 89 | license.sig 90 | # VAgrant 91 | .vagrant 92 | .DS_Store 93 | .Rproj.user 94 | driverlessai-tutorials.Rproj 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Driverless AI Code Samples and Tutorials 2 | 3 | This repository provides code examples and tutorials demonstrating use of Driverless AI. 
4 | 5 | -------------------------------------------------------------------------------- /compliant_driverlessai/README.md: -------------------------------------------------------------------------------- 1 | # Compliant Driverless AI 2 | 3 | # Contents 4 | 5 | * Jupyter notebooks for `compliant` modelling in Driverless AI (version 1.6.2): 6 | * `Compliant` mode: 7 | * Documentation about `Compliant` mode in Driverless AI is [here](http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/expert-settings.html?highlight=compliant#pipeline-building-recipe) 8 | * Disparate Impact Analysis for different `gender` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/compliant_dia_gender.ipynb) 9 | * Disparate Impact Analysis for different `marriage` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/compliant_dia_marriage.ipynb) 10 | * Monotonic XGBoost modelling in Driverless AI: 11 | * Documentation about monotonicity constraints in Driverless AI is [here](http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/experiment-settings.html?highlight=monotonic#interpretability) 12 | * Disparate Impact Analysis for different `gender` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/mono_xgb_dia_gender.ipynb) 13 | * Disparate Impact Analysis for different `marriage` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/mono_xgb_dia_marriage.ipynb) 14 | * DAI Experiment with settings 6/4/6 (Accuracy/Time/Interpretability): 15 | * Disparate Impact Analysis for different `gender` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/dai_6_4_6_gender.ipynb) 16 | * Disparate Impact Analysis for different `marriage` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/dai_6_4_6_marriage.ipynb) 17 | * DAI Experiment with Settings 10/10/1 (Accruracy/Time/Interpretability): 18 | * Disparate Impact Analysis for different `gender` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/dai_10_10_1_gender.ipynb) 19 | * Disparate Impact Analysis for different `marriage` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/dai_10_10_1_marriage.ipynb) 20 | -------------------------------------------------------------------------------- /compliant_driverlessai/notebooks/requirements.txt: 
-------------------------------------------------------------------------------- 1 | numpy==1.14.5 2 | pandas==0.22.0 3 | matplotlib==2.2.2 4 | xlrd==1.1.0 5 | shap==0.24.0 6 | scikit-learn==0.19.2 7 | jupyter==1.0.0 8 | eli5==0.8 9 | h2o==3.20.0.8 10 | seaborn==0.9.0 11 | datatable==0.8.0 12 | -------------------------------------------------------------------------------- /dai_installation/Aws/Rhel7.7.md: -------------------------------------------------------------------------------- 1 | Install H2O Driverless AI on base RHEL 7.7 on AWS EC2 Instances 2 | ======================================================= 3 | 4 | > While this guide describes installing Driverless AI from scratch on AWS, it can also be used on a bare-metal machine or on any other cloud VM from the `Install Nvidia driver` step onwards. 5 | 6 | Create base RHEL 7.7 Server 7 | ----------------------------------- 8 | 9 | - **Select OS** 10 | - Log in to the AWS console and create a new EC2 compute instance. 11 | - Search for the RHEL 7.7 AMI. 12 | - Select the RHEL 7.7 AMI and start configuring the EC2 instance. 13 | - **Select EC2 Instance Type** 14 | - Choose the instance type. For this exercise, we choose g4dn.16xlarge, which 15 | has 1 GPU and 64 CPU cores. Consider the proper [EC2 instance type][1] based on your use case. 16 | - **Configure Instance** 17 | - Configure instance settings for network, IAM role, shutdown policy, etc., 18 | as required. 19 | - **Configure Storage** 20 | - An SSD is the recommended persistent store for Driverless AI. 21 | - For this setup, I installed DAI on the same disk as the OS. 22 | - To increase the OS disk size after the VM is running, you will first need to stop the VM. 23 | - If you are doing the RPM install, DAI will put the bulk of its data in the `/opt/h2oai/dai` directory. So if you are attaching an additional drive, ensure that you mount it at `/opt`. 24 | - If you are going with a docker based approach, you can mount the disk at any mount point, as you will be mapping host directories as volumes into the docker container. 25 | - **Configure Security Group** 26 | - Configure the security group as needed. 27 | - At a minimum, ensure that your compute instance has a public IP. 28 | - Configure the Network Security Group to allow incoming connections on port 22 (for SSH) and port 12345 (for the Driverless AI web UI). 29 | ![AWS compute configuration](images/01_compute_config.png) 30 | 31 | > H2O Driverless AI uses Tensorflow built against CUDA 10.0, hence this is the recommended CUDA version to use. Per the [Nvidia Compatibility Matrix][2], Nvidia driver version 384.XX is the minimum version needed and was the default when CUDA 9.0 shipped. Per [Nvidia Hardware Support][3], driver 384.xx does not support the latest Turing architecture cards. 32 | > The latest Nvidia driver we have tested to work with CUDA 10.0 and Driverless AI is the 440.82+ branch. We install 450.XX in the steps below. 33 | 34 | Install pciutils 35 | ---------------------- 36 | 37 | - Once the server is up, ssh to it. 38 | - Disable SELinux to avoid interference with the Nvidia driver installation 39 | ```shell 40 | sudo vi /etc/sysconfig/selinux 41 | SELINUX=disabled 42 | ``` 43 | - Ensure the GPUs are detected using pciutils.
To install pciutils: 44 | ```shell 45 | sudo yum -y install pciutils 46 | ``` 47 | - Check which GPU card is installed 48 | ```shell 49 | lspci | grep -e VGA -ie NVIDIA 50 | ``` 51 | - The output of lspci should look similar to 52 | ``` 53 | 00:02.0 VGA compatible controller: Intel Corporation 4th Gen ... 54 | 01:00.0 VGA compatible controller: Nvidia Corporation ... 55 | ``` 56 | 57 | Disable Nouveau driver 58 | ---------------------- 59 | 60 | - The `nouveau` driver is an alternative to the Nvidia driver that is generally installed on the server. It does not work with `CUDA` and needs to be disabled. 61 | - If Nouveau drivers are installed and loaded, then you need to follow the steps for your Linux version to [Disable Nouveau Drivers][4]. For RHEL, the steps are 62 | 63 | ```shell 64 | cat <<EOF | sudo tee /etc/modprobe.d/blacklist-nouveau.conf 65 | blacklist nouveau 66 | options nouveau modeset=0 67 | EOF 68 | sudo dracut --force 69 | sudo reboot now 70 | # After the reboot, verify that nouveau is no longer loaded 71 | lsmod | grep nouveau > /dev/null && echo "WARNING: nouveau still active" || echo "Success" 98 | ``` 99 | 100 | Install Nvidia driver 101 | ---------------------------------- 102 | Follow these installation steps if you do not want the EPEL repository and DKMS libraries installed on the server. These steps assume that you have already ssh'd into the server. 103 | 104 | - Add the tech preview repository. For detailed instructions, refer to [5]. 105 | ```shell 106 | sudo yum-config-manager --add-repo=http://developer.download.nvidia.com/compute/cuda/preview/repos/rhel7/x86_64/techpreview_nvidia_rh_drv.repo 107 | ``` 108 | - Install the NVIDIA yum plugin. 109 | ```shell 110 | sudo yum install yum-plugin-nvidia 111 | ``` 112 | - Verify that you have a supported kernel installed. 113 | ``` 114 | uname -r 115 | 116 | # The above cmd should show a similar output 117 | 3.10.0-957.12.2.el7.x86_64 118 | ``` 119 | 120 | - Install the required dependencies 121 | ``` 122 | sudo yum -y install kernel-devel kernel-headers gcc acpid make 123 | ``` 124 | - Upgrade the kernel and reboot 125 | ``` 126 | sudo yum upgrade kernel 127 | sudo reboot now 128 | ``` 129 | - Once the server is up, ssh to it again 130 | - Confirm that the GPU card is still detected 131 | ``` 132 | lspci | grep -e VGA -ie NVIDIA 133 | ``` 134 | 135 | - Navigate to the [Nvidia Unix driver archive][6], and select `Linux x86_64` > `Latest Long Lived Branch`. Here we choose version `450.80.02`. 136 | - Select Download; it should download a file similar to `NVIDIA-Linux-x86_64-450.80.02.run`. 137 | - Alternatively, from the ssh session, download the file to the server using `wget <download-url>`. 138 | - Install the downloaded package 139 | ```shell 140 | sudo chmod +x ./NVIDIA-Linux-$(uname -m)-*.run 141 | sudo ./NVIDIA-Linux-$(uname -m)-*.run 142 | ``` 143 | - At this point you will need to restart the machine. This ensures that the Nvidia drivers are correctly loaded into the kernel. 144 | 145 | Set Nvidia Persistence mode 146 | --------------------------- 147 | 148 | - Driverless AI requires persistence mode to be enabled on each GPU that will be used with DAI 149 | - To manually enable persistence mode on all GPUs, issue the command `sudo nvidia-smi -pm 1` 150 | - To validate, issue the command `nvidia-smi` and verify that the persistence mode setting is turned ON. 151 | 152 | > At this point your system setup tasks are complete. You can now proceed with a native RPM package install of Driverless AI, or install `Docker CE` and `nvidia-runtime` for a docker based installation of Driverless AI.
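Optionally, run a quick end-to-end check of the GPU stack before installing Driverless AI. The snippet below is only a convenience check and relies solely on tools already used in this guide (`lsmod`, `nvidia-smi`):

```shell
# nouveau must not be loaded, otherwise the Nvidia driver cannot bind to the GPUs
lsmod | grep nouveau > /dev/null && echo "WARNING: nouveau still active" || echo "nouveau disabled"

# Each GPU should report the expected driver version and persistence mode Enabled.
# Any GPU that still shows 'Disabled' can be fixed with: sudo nvidia-smi -pm 1
nvidia-smi --query-gpu=name,driver_version,persistence_mode --format=csv
```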
153 | 154 | Install Driverless AI native RPM package 155 | ---------------------------------------- 156 | 157 | - If you want a docker container based Driverless AI install, skip this section and proceed from Install Docker CE onwards. 158 | - If you want a native RPM based install, follow the steps in this section and do not follow any of the docker installation sections below. 159 | - [Download the latest Driverless AI][8] RPM package from [https://www.h2o.ai/download/#driverless-ai][8]. You can copy the URL and issue the command `wget <download-url>` to download the file. 160 | - Issue the command `sudo rpm -i <downloaded-file>.rpm` to install Driverless AI. 161 | - Proceed to the Driverless AI documentation to understand the steps to [manage Driverless AI, i.e. start, stop, uninstall, and update][9] 162 | 163 | Great, you should now be done with the native installation of Driverless AI. 164 | 165 | Continue from here if you are doing a Docker based install of H2O Driverless AI 166 | 167 | Install Docker CE 168 | ----------------- 169 | 170 | - Using the Red Hat subscription manager, check whether the `extras` repository exists. Refer to the gist [10] for detailed information 171 | ``` 172 | subscription-manager repos --list | grep -i extras 173 | ``` 174 | - Enable the repository 175 | ``` 176 | sudo subscription-manager repos --enable rhel-7-server-extras-rpms 177 | ``` 178 | - List docker packages 179 | ``` 180 | sudo yum list "*docker*" 181 | ``` 182 | - Install docker 183 | ``` 184 | sudo yum -y install docker 185 | ``` 186 | - Start docker and check the docker version 187 | ``` 188 | sudo systemctl start docker.service 189 | docker -v 190 | ``` 191 | - Stop docker 192 | ``` 193 | sudo systemctl stop docker.service 194 | ``` 195 | 196 | Install nvidia-docker 197 | ---------------------- 198 | 199 | - Ensure docker is started and enabled. 200 | ``` 201 | sudo systemctl start docker && sudo systemctl enable docker 202 | ``` 203 | 204 | - Set up the stable repository and the GPG key 205 | ```shell 206 | distribution=$(. /etc/os-release;echo $ID$VERSION_ID) 207 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | sudo tee /etc/yum.repos.d/nvidia-docker.repo 208 | ``` 209 | - Install nvidia-docker 210 | ```shell 211 | sudo yum install nvidia-docker 212 | ``` 213 | - Install the nvidia-container-toolkit package 214 | ```shell 215 | sudo yum clean expire-cache 216 | sudo yum install nvidia-container-toolkit -y 217 | ``` 218 | - Restart docker and test the setup using a base CUDA container 219 | ```shell 220 | sudo systemctl restart docker 221 | sudo docker run --rm -e NVIDIA_VISIBLE_DEVICES=all nvidia/cuda:11.0-base nvidia-smi 222 | 223 | ``` 224 | - This completes the nvidia-docker installation. 225 | [nvidia-docker install reference][11] 226 | 227 | Install H2O Driverless AI as a docker container 228 | ----------------------------------- 229 | 230 | - [Download the latest Driverless AI][12] docker image from [https://www.h2o.ai/download/#driverless-ai][12] 231 | - Load the downloaded image into docker using the command `docker load < dai_image_name.tar.gz`. Substitute the correct file name. 232 | - Proceed with [installing Driverless AI][13] following the directions from step 5 onwards on that page.
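As a rough illustration of what those directions boil down to, the sketch below shows one way to start the loaded image. It is only a sketch: the image name, tag, and the authoritative set of flags for your Driverless AI version come from the installation page linked above, so treat `<dai-image-name>:<tag>` and the directory layout as placeholders.

```shell
# Assumes the image has already been loaded with `docker load` as described above.
# Create host directories that are mapped into the container for data, logs,
# the license file, and temporary experiment artifacts.
mkdir -p "$HOME/dai/data" "$HOME/dai/log" "$HOME/dai/license" "$HOME/dai/tmp"
cd "$HOME/dai"

# --runtime=nvidia uses the nvidia-docker runtime installed earlier, and port 12345
# matches the Driverless AI web UI port opened in the security group.
sudo docker run \
    --runtime=nvidia \
    --pid=host --init --rm \
    -u "$(id -u):$(id -g)" \
    -p 12345:12345 \
    -v "${PWD}/data:/data" \
    -v "${PWD}/log:/log" \
    -v "${PWD}/license:/license" \
    -v "${PWD}/tmp:/tmp" \
    <dai-image-name>:<tag>
```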
233 | 234 | [1]: https://aws.amazon.com/emr/pricing/ 235 | [2]: https://docs.nvidia.com/deploy/cuda-compatibility/index.html#binary-compatibility__table-toolkit-driver 236 | [3]: https://docs.nvidia.com/deploy/cuda-compatibility/index.html#support-hardware 237 | [4]: https://docs.nvidia.com/cuda/archive/9.0/cuda-installation-guide-linux/index.html#runfile-nouveau 238 | [5]: http://developer.download.nvidia.com/compute/cuda/preview/repos/rhel7/x86_64/README.html 239 | [6]: https://www.nvidia.com/en-us/drivers/unix/ 240 | [7]: https://docs.nvidia.com/deploy/driver-persistence/index.html#usage 241 | [8]: https://www.h2o.ai/download/#driverless-ai 242 | [9]: http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/install/linux-rpm.html#installing-driverless-ai 243 | [10]: https://gist.github.com/WelshSean/d55289acba43d9c305fbffda2befe201 244 | [11]: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-rhel-7 245 | [12]: https://www.h2o.ai/download/#driverless-ai 246 | [13]: http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/install/rhel.html#install-on-rhel-with-gpus -------------------------------------------------------------------------------- /dai_installation/Aws/images/01_compute_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Aws/images/01_compute_config.png -------------------------------------------------------------------------------- /dai_installation/Azure/images/01_nvidia_driver_install.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/01_nvidia_driver_install.gif -------------------------------------------------------------------------------- /dai_installation/Azure/images/01_select_compute_ubuntu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/01_select_compute_ubuntu.png -------------------------------------------------------------------------------- /dai_installation/Azure/images/02_compute_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/02_compute_config.png -------------------------------------------------------------------------------- /dai_installation/Azure/images/02_cuda_install.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/02_cuda_install.gif -------------------------------------------------------------------------------- /dai_installation/Azure/images/03_create_new_disk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/03_create_new_disk.png -------------------------------------------------------------------------------- /dai_installation/Azure/images/03_docker_install.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/03_docker_install.gif -------------------------------------------------------------------------------- /dai_installation/Azure/images/04_dai_install_e13.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/04_dai_install_e13.gif -------------------------------------------------------------------------------- /dai_installation/Azure/images/04_new_disk_attached.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/04_new_disk_attached.png -------------------------------------------------------------------------------- /dai_installation/Azure/images/05_configure_networking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/05_configure_networking.png -------------------------------------------------------------------------------- /dai_installation/Azure/images/06_configure_nsg_ports_open.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/06_configure_nsg_ports_open.png -------------------------------------------------------------------------------- /dai_installation/README.md: -------------------------------------------------------------------------------- 1 | H2O.ai Driverless AI installation from scratch 2 | ============================================== 3 | 4 | This directory lists guides to manually set up [H2O.ai Driverless AI][1] on bare-metal machines and various clouds. 5 | 6 | While each guide mentions a specific cloud provider (which is where I tried the steps), the guides are not tied to that provider and should work on other clouds like AWS and GCP, and even on a bare-metal machine. 7 | 8 | **[Azure/Ubuntu16.04.md](Azure/Ubuntu16.04.md)** 9 | 10 | - Guide to set up Driverless AI from scratch on Ubuntu 16.04 LTS. 11 | - We install the following, in order: 12 | - Nvidia drivers 13 | - CUDA 9.0 14 | - docker-ce 15 | - nvidia-docker, and then configure the GPU cards for use with H2O Driverless AI 16 | - We explain the process using a VM on Azure, but the setup steps should be valid for bare metal as well as VMs in other clouds. 17 | 18 | **[Aws/Rhel7.7.md](Aws/Rhel7.7.md)** 19 | 20 | - Guide to set up Driverless AI from scratch on RHEL 7.7. 21 | - We install the following, in order: 22 | - Nvidia drivers 23 | - docker-ce 24 | - nvidia-docker, and then configure the GPU cards for use with H2O Driverless AI 25 | - We explain the process using an EC2 instance on AWS, but the setup steps should be valid for bare metal as well as VMs in other clouds.
26 | 27 | [1]: https://www.h2o.ai/products/h2o-driverless-ai/ -------------------------------------------------------------------------------- /dai_python_client/README.md: -------------------------------------------------------------------------------- 1 | # Python Client Examples: driverlessai 2 | 3 | The intuitive, static Python client for Drivierless AI. 4 | 5 | ### Install 6 | 7 | Install with `pip install driverlessai` or `conda install -c h2oai driverlessai`. 8 | 9 | Upgrade with `pip install --upgrade driverlessai` or `conda update -c h2oai driverlessai`. 10 | 11 | ### Documentation 12 | 13 | http://docs.h2o.ai/driverless-ai/pyclient/docs/html/index.html 14 | 15 | ## Available Examples 16 | 17 | * [algorithm_family_comparison](https://github.com/h2oai/driverlessai-tutorials/blob/master/dai_python_client/algorithm_family_comparison.ipynb) - Compare complexity of algorithm vs. accuracy of experiments 18 | 1. Notebook Setup 19 | 2. Connect to Driverless AI 20 | 3. Load a Dataset 21 | 4. Split Dataset 22 | 5. Run Experiments 23 | 6. View Results 24 | * [common_workflow](https://github.com/h2oai/driverlessai-tutorials/blob/master/dai_python_client/common_workflow.ipynb) - Common DAI UI flow from python 25 | 1. Connect to Driverless AI 26 | 2. Documentation 27 | 3. Data 28 | 4. Recipes 29 | 5. Modeling 30 | 6. Launching Machine Learning Interpretability 31 | 7. Disconnect 32 | 33 | Please send feedback to help improve the client and documentation to support@h2o.ai. 34 | -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/download_mojo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/download_mojo.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/exp_running_creditcard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/exp_running_creditcard.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/experiment_complete_creditcard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/experiment_complete_creditcard.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/experiment_list_complete.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/experiment_list_complete.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/experiment_list_running.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/experiment_list_running.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/import_data_sets_creditcard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/import_data_sets_creditcard.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/mli_external.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/mli_external.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/mli_list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/mli_list.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/model_diagnostics_complete.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/model_diagnostics_complete.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/model_diagnostics_setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/model_diagnostics_setup.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/py_client_link.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/py_client_link.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/set_columns_creditcard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/set_columns_creditcard.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/set_parameters_creditcard.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/set_parameters_creditcard.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/sign_in_home_page_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/sign_in_home_page_0.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/skip_sign_in_home_page_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/skip_sign_in_home_page_1.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/imbalanced/images/compare_weighted_experiments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/imbalanced/images/compare_weighted_experiments.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/imbalanced/images/py_client_link.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/imbalanced/images/py_client_link.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/imbalanced/images/weighted_project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/imbalanced/images/weighted_project.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/reject_inference/Reject_Inference_with_Fuzzy_Augment.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: html_document 3 | editor_options: 4 | chunk_output_type: console 5 | --- 6 | 7 | ```{r} 8 | library(DiagrammeR) 9 | library(data.table) 10 | library(ggplot2) 11 | library(scales) 12 | library(ggthemes) 13 | library(R.utils) 14 | ``` 15 | 16 | # Reject Inference Workflow 17 | 18 | ```{r} 19 | mermaid(" 20 | graph TB 21 | 22 | subgraph Application Data 23 | AllApplicantsDS[\"All Applicants\"] --> UnknownGoodBadDS[\"Rejects\"] 24 | AllApplicantsDS --> KnownGoodBadDS[\"Accepts\"] 25 | KnownGoodBadDS -.- KnownGoodDS[\"Loans Paid Off (Good)\"] 26 | KnownGoodBadDS -.- KnownBadDS[\"Charged Off (Bad)\"] 27 | end 28 | 29 | subgraph Accepted Applicants Model 30 | Training1((\"Training\")) --> KnownGoodBadModel(-\"Accepted Loans Model\"-) 31 | KnownGoodBadModel --> Scoring1((\"Scoring\")) 32 | end 33 | 34 | subgraph Fuzzy Augmentation 35 | ScoredUnknownDS[\"Rejects Scored\"] --> UnknownDSLabeledGoodDS[\"Label and Weight as Good\"] 36 | ScoredUnknownDS --> 
UnknownDSLabeledBadDS[\"Label and Weight as Bad\"] 37 | AllGoodBadWeightedDS[\"Accepted Applicants plus Weighted Rejects\"] 38 | UnknownDSLabeledGoodDS --> AllGoodBadWeightedDS 39 | UnknownDSLabeledBadDS --> AllGoodBadWeightedDS 40 | end 41 | 42 | FinalAllApplicantsScoredDS[\"All Scored on Augmented Model\"] 43 | 44 | subgraph Augmented Model 45 | Training2((\"Training\")) --> FinalRejectInferenceModel(-\"Final Reject Inference Model\"-) 46 | FinalRejectInferenceModel --> Scoring2((\"Scoring\")) 47 | end 48 | 49 | KnownGoodBadDS --> Training1 50 | UnknownGoodBadDS --> Scoring1 51 | Scoring1 --> ScoredUnknownDS 52 | KnownGoodBadDS --> AllGoodBadWeightedDS 53 | AllGoodBadWeightedDS --> Training2 54 | AllApplicantsDS --> Scoring2 55 | Scoring2 --> FinalAllApplicantsScoredDS 56 | 57 | ") 58 | 59 | ``` 60 | 61 | # Loading Dataset 62 | 63 | Original dataset contains loans with either paid off or charged off status. A tiny fraction of loans doesn't have status and thus treated as rejected. Since its number is insufficient to similuate reject inference use case half of loans with status will be assigned to rejected (no status). At the end, two datasets will represent rejected and accepted loans: "KnownGoodBad.csv" and "UnknownGoodBad.csv". 64 | 65 | ```{r createInitialDatasets} 66 | # make changes to where your file located 67 | data_dir = "~/Projects/Playground/data/US_Small_Business_Admin_Loans/" 68 | 69 | # Source: 70 | # https://amstat.tandfonline.com/doi/full/10.1080/10691898.2018.1434342 71 | 72 | sba_national = fread(paste0(data_dir,"SBAnational.csv")) 73 | 74 | # separate data based on labels and randomness 75 | sba_national[MIS_Status!="", Target := MIS_Status=="CHGOFF"] 76 | table(sba_national$Target, useNA = "ifany") 77 | 78 | sba_national[, Reject_Status := MIS_Status==""] 79 | table(sba_national$Reject_Status, useNA = "ifany") 80 | 81 | sba_national[Target==TRUE, Reject_Status := 82 | sample(c(TRUE, FALSE), nrow(sba_national[Target==TRUE,]), replace = TRUE, prob = c(.5,.5))] 83 | sba_national[Target==FALSE, Reject_Status := 84 | sample(c(TRUE, FALSE), nrow(sba_national[Target==FALSE,]), replace = TRUE, prob = c(.5,.5))] 85 | table(sba_national$Reject_Status, useNA = "ifany") 86 | 87 | # make loan number character-based 88 | sba_national[ , LoanNr_ChkDgt := paste0('#', as.character(LoanNr_ChkDgt))] 89 | # parse money amounts to numeric 90 | cols = c("DisbursementGross", "BalanceGross", "ChgOffPrinGr", "GrAppv", "SBA_Appv") 91 | sba_national[ , (cols) := lapply(.SD, FUN = function(x){ 92 | as.numeric(gsub(",", "", substring(x, 2))) 93 | }), .SDcols = cols] 94 | 95 | unknownGoodBad = sba_national[Reject_Status==TRUE] 96 | knownGoodBad = sba_national[Reject_Status==FALSE] 97 | 98 | fwrite(unknownGoodBad, file = paste0(data_dir, "UnknownGoodBad.csv")) 99 | gzip(paste0(data_dir,'UnknownGoodBad.csv'), destname=paste0(data_dir,'UnknownGoodBad.csv.gz'), 100 | remove=FALSE, overwrite=TRUE) 101 | fwrite(knownGoodBad, file = paste0(data_dir, "KnownGoodBad.csv")) 102 | gzip(paste0(data_dir,'KnownGoodBad.csv'), destname=paste0(data_dir,'KnownGoodBad.csv.gz'), 103 | remove=FALSE, overwrite=TRUE) 104 | ``` 105 | 106 | # Connect to Driverless AI 107 | 108 | ```{r connectDAI, include=FALSE} 109 | library(dai) 110 | 111 | dai_uri = "" 112 | usr = "h2oai" 113 | pwd = "" 114 | dai.connect(uri = dai_uri, username = usr, password = pwd, force_version = FALSE) 115 | ``` 116 | 117 | ```{r connectDAIvisible, eval=FALSE, include=TRUE} 118 | dai_uri = "http://mydai.instance.com:12345" 119 | usr = "mydaiuser" 120 
| pwd = "mydaipassword" 121 | dai.connect(uri = dai_uri, username = usr, password = pwd, force_version = FALSE) 122 | ``` 123 | 124 | # Import data into Driverless AI 125 | 126 | Import datasets for both accepted and rejected loans, then split accepted loans into training and test partitions to train 1st loan default model. 127 | 128 | ```{r findOrCreateDatasets} 129 | existing_datasets = data.table(dai.list_datasets(limit = 1000)) 130 | if(nrow(existing_datasets) > 0 && 131 | nrow(existing_datasets[name=='KnownGoodBad.csv.gz']) == 1) { 132 | known_key = existing_datasets[name=='KnownGoodBad.csv.gz','key'][[1,1]] 133 | known_data = dai.get_frame(known_key) 134 | }else { 135 | known_data = dai.upload_dataset(paste0(data_dir, "KnownGoodBad.csv.gz")) 136 | } 137 | 138 | if(nrow(existing_datasets) > 0 && 139 | nrow(existing_datasets[name=="KnownGoodBad_train"]) == 1 && 140 | nrow(existing_datasets[name=="KnownGoodBad_test"]) == 1) { 141 | known_train_key = existing_datasets[name=="KnownGoodBad_train",'key'][[1,1]] 142 | known_train_set = dai.get_frame(known_train_key) 143 | known_test_key = existing_datasets[name=="KnownGoodBad_test",'key'][[1,1]] 144 | known_test_set = dai.get_frame(known_test_key) 145 | }else { 146 | partitions = dai.split_dataset(dataset = known_data, 147 | output_name1 = "KnownGoodBad_train", output_name2 = "KnownGoodBad_test", 148 | ratio = 0.8, seed = 75252, target = "Target") 149 | known_train_set = partitions[[1]] 150 | known_test_set = partitions[[2]] 151 | } 152 | ``` 153 | 154 | # Train Primary Default Model 155 | 156 | Build classification model for loan defaults. 157 | 158 | ```{r buildKnownModel} 159 | existing_models = data.table(dai.list_models(offset = 0, limit = 1000)[,c("key","description")]) 160 | if(nrow(existing_models) > 0 && 161 | nrow(existing_models[description=="known-goodbad-445"]) == 1) { 162 | known_model_key = existing_models[description=="known-goodbad-445","key"][[1,1]] 163 | known_model = dai.get_model(known_model_key) 164 | }else { 165 | known_model = dai.train(training_frame = known_train_set, testing_frame = known_test_set, 166 | target_col = "Target", is_classification = TRUE, is_timeseries = FALSE, 167 | cols_to_drop = c("MIS_Status","ChgOffPrinGr","ChgOffDate","LoanNr_ChkDgt"), 168 | time = 4, accuracy = 4, interpretability = 5, 169 | experiment_name = "known-goodbad-445", 170 | enable_gpus = TRUE, seed = 75252, 171 | config_overrides = "make_python_scoring_pipeline = 'off'") 172 | } 173 | ``` 174 | 175 | # Scoring Test Set and Visualizing 176 | ```{r} 177 | known_model_key = existing_models[description=="2.known_goodbad_glm","key"][[1,1]] 178 | known_model_glm = dai.get_model(known_model_key) 179 | test_scored = predict(known_model_glm, newdata = known_test_set, 180 | include_columns = c("LoanNr_ChkDgt","Target"), return_df = TRUE) 181 | ggplot(test_scored) + 182 | # geom_histogram(aes(Target.1, fill=factor(Target)), alpha=0.7, bins = 100, position = "dodge") + 183 | geom_density(aes(LoanNr_ChkDgt, Target.1, color=factor(Target)), alpha=0.7) + 184 | theme_tufte(base_size = 12, base_family = 'Palatino', ticks = FALSE) 185 | ``` 186 | 187 | 188 | # Scoring Rejected Loans 189 | 190 | Imported rejected loan dataset and score on primary default loan model. 
191 | ```{r importAndScoreRejects} 192 | if(nrow(existing_datasets) > 0 && 193 | nrow(existing_datasets[name=='UnknownGoodBad.csv.gz']) == 1) { 194 | unknown_key = existing_datasets[name=='UnknownGoodBad.csv.gz','key'][[1,1]] 195 | unknown_data = dai.get_frame(known_key) 196 | }else { 197 | unknown_data = dai.upload_dataset(paste0(data_dir, "UnknownGoodBad.csv.gz")) 198 | } 199 | 200 | unknown_scored = predict(known_model, newdata = unknown_data, 201 | include_columns = "LoanNr_ChkDgt", return_df = TRUE) 202 | ``` 203 | 204 | Manufacture new weighted dataset for Reject Inference with Fuzzy Augmentation 205 | 206 | ```{r} 207 | unknownScored = data.frame(unknown_scored) 208 | N = nrow(sba_national) # total number of rejected and accepted loans 209 | 210 | unknownGoodOnly = data.table(unknownGoodBad) 211 | unknownGoodOnly[unknownScored, c("Target", "weight", "weight_btb") := 212 | list(FALSE, as.double(`Target.0`), as.double(`Target.0`)/N), on='LoanNr_ChkDgt'] 213 | 214 | unknownBadOnly = data.table(unknownGoodBad) 215 | unknownBadOnly[unknownScored, c("Target", "weight", "weight_btb") := 216 | list(TRUE, as.double(`Target.1`), as.double(`Target.1`)/N), on='LoanNr_ChkDgt'] 217 | 218 | allGoodBad = rbindlist(list(knownGoodBad[, c("weight", "weight_btb") := list(1, 1/N)], 219 | unknownGoodOnly, 220 | unknownBadOnly)) 221 | 222 | fwrite(allGoodBad, file = paste0(data_dir, "AllGoodBad.csv")) 223 | gzip(paste0(data_dir,'AllGoodBad.csv'), destname=paste0(data_dir,'AllGoodBad.csv.gz'), 224 | remove=FALSE, overwrite=TRUE) 225 | 226 | fwrite(allGoodBad[, -c("weight","weight_btb")], file = paste0(data_dir, "AllGoodBad_noweight.csv")) 227 | gzip(paste0(data_dir,'AllGoodBad_noweight.csv'), destname=paste0(data_dir,'AllGoodBad_noweight.csv.gz'), 228 | remove=FALSE, overwrite=TRUE) 229 | 230 | all_data = dai.upload_dataset(paste0(data_dir, "AllGoodBad.csv.gz")) 231 | all_data_noweight = dai.upload_dataset(paste0(data_dir, "AllGoodBad_noweight.csv.gz")) 232 | ``` 233 | 234 | 235 | ```{r buildAllGoodBadModel} 236 | if(nrow(existing_datasets) > 0 && 237 | nrow(existing_datasets[name=='AllGoodBad_train']) >= 1 && 238 | nrow(existing_datasets[name=='AllGoodBad_test']) >= 1) { 239 | alltrain_set_key = existing_datasets[name=='AllGoodBad_train','key'][[1,1]] 240 | alltrain_set = dai.get_frame(alltrain_set_key) 241 | alltest_set_key = existing_datasets[name=='AllGoodBad_test','key'][[1,1]] 242 | alltest_set = dai.get_frame(alltest_set_key) 243 | }else { 244 | partitions = dai.split_dataset(dataset = all_data, 245 | output_name1 = "AllGoodBad_train", output_name2 = "AllGoodBad_test", 246 | ratio = 0.8, seed = 75252, target = "Target") 247 | alltrain_set = partitions[[1]] 248 | alltest_set = partitions[[2]] 249 | } 250 | 251 | all_model = dai.train(training_frame = alltrain_set, testing_frame = alltest_set, 252 | target_col = "Target", weight_col = "weight", 253 | is_classification = TRUE, is_timeseries = FALSE, 254 | cols_to_drop = c("MIS_Status","ChgOffPrinGr","ChgOffDate","LoanNr_ChkDgt", 255 | "weight_btb"), 256 | time = 4, accuracy = 4, interpretability = 5, 257 | experiment_name = "all-goodbad-445", 258 | enable_gpus = TRUE, seed = 75252, 259 | config_overrides = "make_python_scoring_pipeline = 'off'") 260 | 261 | all_model_btb = dai.train(training_frame = alltrain_set, testing_frame = alltest_set, 262 | target_col = "Target", weight_col = "weight_btb", 263 | is_classification = TRUE, is_timeseries = FALSE, 264 | cols_to_drop = c("MIS_Status","ChgOffPrinGr","ChgOffDate","LoanNr_ChkDgt", 265 | "weight"), 266 | 
time = 4, accuracy = 4, interpretability = 5, 267 | experiment_name = "all-goodbad-btb-445", 268 | enable_gpus = TRUE, seed = 75252, 269 | config_overrides = "make_python_scoring_pipeline = 'off'") 270 | ``` 271 | 272 | # Make Final Model Predictions on Rejected Loans 273 | 274 | ```{r predictRejectsOnFinalModel} 275 | existing_datasets = data.table(dai.list_datasets(limit = 1000)) 276 | test_data_key = existing_datasets[name=='KnownGoodBad_test','key'][[1,1]] 277 | test_data = dai.get_frame(test_data_key) 278 | 279 | existing_models = data.table(dai.list_models(offset = 0, limit = 1000)[,c("key","description")]) 280 | all_good_bad_model_key = existing_models[description=="known-goodbad-445","key"][[1,1]] 281 | all_good_bad_model = dai.get_model(all_good_bad_model_key) 282 | 283 | alltest_final_scored = predict(all_good_bad_model, newdata = test_data, 284 | include_columns = c("LoanNr_ChkDgt","Target","UrbanRural"), return_df = TRUE) 285 | alltest_final_scored$Target = factor(alltest_final_scored$Target) 286 | alltest_final_scored$UrbanRural = factor(alltest_final_scored$UrbanRural) 287 | 288 | ggplot(alltest_final_scored, aes(x=Target.1, fill=Target)) + 289 | geom_histogram(bins=50, position="stack", color="black") + 290 | theme_tufte(ticks=TRUE) + geom_rangeframe() + 291 | theme(legend.position = "bottom") 292 | 293 | ggplot(alltest_final_scored, aes(x=Target.1)) + 294 | geom_density(alpha = .7, trim=TRUE) + 295 | theme_tufte() + geom_rangeframe() + # geom_rug() + 296 | theme(legend.position = "bottom") 297 | 298 | ggplot(alltest_final_scored, aes(x=Target.1, fill=Target)) + 299 | geom_histogram(bins=50, position="dodge", color="black") + 300 | theme_tufte(ticks=TRUE) + geom_rangeframe() + 301 | theme(legend.position = "bottom") 302 | 303 | ggplot(alltest_final_scored, aes(x=Target.1, fill=Target)) + 304 | geom_density(alpha = .7, trim=TRUE) + 305 | # facet_wrap(~UrbanRural, ncol=1, scales = "free_y") + 306 | theme_tufte(ticks=TRUE) + geom_rangeframe() + 307 | theme(legend.position = "bottom") 308 | ``` 309 | 310 | -------------------------------------------------------------------------------- /driverlessai_experiments/nlp/airline_sentiment_experiment/demo_nlp_airline_sentiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Driverless AI NLP Demo - Airline Sentiment Dataset ###\n", 8 | "\n", 9 | "In this notebook, we will see how to use Driverless AI python client to build text classification models using the Airline sentiment twitter dataset.\n", 10 | "\n", 11 | "Import the necessary python modules to get started including the Driverless AI client. If not already installed, please download and install the python client from Driverless AI GUI." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 4, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "from sklearn import model_selection\n", 22 | "from h2oai_client import Client" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "The below code downloads the twitter airline sentiment dataset and save it in the current folder. 
" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "--2019-08-21 15:42:17-- https://www.figure-eight.com/wp-content/uploads/2016/03/Airline-Sentiment-2-w-AA.csv\n", 42 | "Resolving www.figure-eight.com (www.figure-eight.com)... 54.164.48.21, 54.165.94.158\n", 43 | "Connecting to www.figure-eight.com (www.figure-eight.com)|54.164.48.21|:443... connected.\n", 44 | "HTTP request sent, awaiting response... 200 OK\n", 45 | "Length: 3704908 (3.5M) [application/octet-stream]\n", 46 | "Saving to: ‘Airline-Sentiment-2-w-AA.csv’\n", 47 | "\n", 48 | "Airline-Sentiment-2 100%[===================>] 3.53M 4.79MB/s in 0.7s \n", 49 | "\n", 50 | "2019-08-21 15:42:18 (4.79 MB/s) - ‘Airline-Sentiment-2-w-AA.csv’ saved [3704908/3704908]\n", 51 | "\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "! wget https://www.figure-eight.com/wp-content/uploads/2016/03/Airline-Sentiment-2-w-AA.csv" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "We can now split the data into training and testing datasets." 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 5, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "al = pd.read_csv(\"Airline-Sentiment-2-w-AA.csv\", encoding='ISO-8859-1')\n", 73 | "train_al, test_al = model_selection.train_test_split(al, test_size=0.2, random_state=2018)\n", 74 | "train_al.to_csv(\"train_airline_sentiment.csv\", index=False)\n", 75 | "test_al.to_csv(\"test_airline_sentiment.csv\", index=False)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "The first step is to establish a connection to Driverless AI using `Client`. Please key in your credentials and the url address." 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 6, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "address = 'http://ip_where_driverless_is_running:12345'\n", 92 | "username = 'username'\n", 93 | "password = 'password'\n", 94 | "h2oai = Client(address = address, username = username, password = password)\n", 95 | "# # make sure to use the same user name and password when signing in through the GUI" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Read the train and test files into Driverless AI using the `upload_dataset_sync` command." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 8, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "train_path = './train_airline_sentiment.csv'\n", 112 | "test_path = './test_airline_sentiment.csv'\n", 113 | "\n", 114 | "train = h2oai.upload_dataset_sync(train_path)\n", 115 | "test = h2oai.upload_dataset_sync(test_path)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "Now let us look at some basic information about the dataset." 
123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 10, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "Train Dataset: 20 x 11712\n", 135 | "Test Dataset: 20 x 2928\n" 136 | ] 137 | }, 138 | { 139 | "data": { 140 | "text/plain": [ 141 | "['_unit_id',\n", 142 | " '_golden',\n", 143 | " '_unit_state',\n", 144 | " '_trusted_judgments',\n", 145 | " '_last_judgment_at',\n", 146 | " 'airline_sentiment',\n", 147 | " 'airline_sentiment:confidence',\n", 148 | " 'negativereason',\n", 149 | " 'negativereason:confidence',\n", 150 | " 'airline',\n", 151 | " 'airline_sentiment_gold',\n", 152 | " 'name',\n", 153 | " 'negativereason_gold',\n", 154 | " 'retweet_count',\n", 155 | " 'text',\n", 156 | " 'tweet_coord',\n", 157 | " 'tweet_created',\n", 158 | " 'tweet_id',\n", 159 | " 'tweet_location',\n", 160 | " 'user_timezone']" 161 | ] 162 | }, 163 | "execution_count": 10, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "print('Train Dataset: ', len(train.columns), 'x', train.row_count)\n", 170 | "print('Test Dataset: ', len(test.columns), 'x', test.row_count)\n", 171 | "\n", 172 | "[c.name for c in train.columns]" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "We just need two columns for our experiment. `text` which contains the text of the tweet and `airline_sentiment` which contains the sentiment of the tweet (target column). We can drop the remaining columns for this experiment. \n", 180 | "\n", 181 | "We will enable tensorflow models and transformations to take advantage of CNN based text features." 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 22, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "['ACCURACY [6/10]:',\n", 193 | " '- Training data size: *11,712 rows, 2 cols*',\n", 194 | " '- Feature evolution: *[LightGBM, TensorFlow, XGBoostGBM]*, *3-fold CV**, 2 reps*',\n", 195 | " '- Final pipeline: *Ensemble (6 models), 3-fold CV*',\n", 196 | " '',\n", 197 | " 'TIME [4/10]:',\n", 198 | " '- Feature evolution: *4 individuals*, up to *56 iterations*',\n", 199 | " '- Early stopping: After *5* iterations of no improvement',\n", 200 | " '',\n", 201 | " 'INTERPRETABILITY [5/10]:',\n", 202 | " '- Feature pre-pruning strategy: None',\n", 203 | " '- XGBoost Monotonicity constraints: disabled',\n", 204 | " '- Feature engineering search space (where applicable): [CVCatNumEncode, CVTargetEncode, ClusterTE, Dates, Frequent, Interactions, IsHoliday, NumCatTE, NumToCatTE, Original, TextBiGRU, TextCNN, TextCharCNN, Text]',\n", 205 | " '',\n", 206 | " '[LightGBM, TensorFlow, XGBoostGBM] models to train:',\n", 207 | " '- Model and feature tuning: *192*',\n", 208 | " '- Feature evolution: *504*',\n", 209 | " '- Final pipeline: *6*',\n", 210 | " '',\n", 211 | " 'Estimated runtime: *minutes*']" 212 | ] 213 | }, 214 | "execution_count": 22, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "exp_preview = h2oai.get_experiment_preview_sync(\n", 221 | " dataset_key=train.key\n", 222 | " , validset_key=''\n", 223 | " , target_col='airline_sentiment'\n", 224 | " , classification=True\n", 225 | " , dropped_cols=[\"_unit_id\", \"_golden\", \"_unit_state\", \"_trusted_judgments\", \"_last_judgment_at\",\n", 226 | " \"airline_sentiment:confidence\", \"negativereason\", 
\"negativereason:confidence\", \"airline\",\n", 227 | " \"airline_sentiment_gold\", \"name\", \"negativereason_gold\", \"retweet_count\", \n", 228 | " \"tweet_coord\", \"tweet_created\", \"tweet_id\", \"tweet_location\", \"user_timezone\"]\n", 229 | " , accuracy=6\n", 230 | " , time=4\n", 231 | " , interpretability=5\n", 232 | " , is_time_series=False\n", 233 | " , enable_gpus=True\n", 234 | " , reproducible=False\n", 235 | " , resumed_experiment_id=''\n", 236 | " , config_overrides=\"\"\"\n", 237 | " enable_tensorflow='on'\n", 238 | " enable_tensorflow_charcnn='on'\n", 239 | " enable_tensorflow_textcnn='on'\n", 240 | " enable_tensorflow_textbigru='on'\n", 241 | " \"\"\"\n", 242 | ")\n", 243 | "exp_preview" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "Please note that the `Text` and `TextCNN` features are enabled for this experiment.\n", 251 | "\n", 252 | "Now we can start the experiment." 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 24, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "model = h2oai.start_experiment_sync(\n", 262 | " dataset_key=train.key,\n", 263 | " testset_key=test.key,\n", 264 | " target_col='airline_sentiment',\n", 265 | " scorer='F1',\n", 266 | " is_classification=True,\n", 267 | " cols_to_drop=[\"_unit_id\", \"_golden\", \"_unit_state\", \"_trusted_judgments\", \"_last_judgment_at\",\n", 268 | " \"airline_sentiment:confidence\", \"negativereason\", \"negativereason:confidence\", \"airline\",\n", 269 | " \"airline_sentiment_gold\", \"name\", \"negativereason_gold\", \"retweet_count\", \n", 270 | " \"tweet_coord\", \"tweet_created\", \"tweet_id\", \"tweet_location\", \"user_timezone\"],\n", 271 | " accuracy=6,\n", 272 | " time=2,\n", 273 | " interpretability=5,\n", 274 | " enable_gpus=True,\n", 275 | " config_overrides=\"\"\"\n", 276 | " enable_tensorflow='on'\n", 277 | " enable_tensorflow_charcnn='on'\n", 278 | " enable_tensorflow_textcnn='on'\n", 279 | " enable_tensorflow_textbigru='on'\n", 280 | " \"\"\"\n", 281 | ")" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 25, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "name": "stdout", 291 | "output_type": "stream", 292 | "text": [ 293 | "Modeling completed for model d272df9c-c466-11e9-b1a0-0242ac110002\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "print('Modeling completed for model ' + model.key)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 29, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "name": "stdout", 308 | "output_type": "stream", 309 | "text": [ 310 | "Logs available at ./test_preds.csv\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "logs = h2oai.download(model.log_file_path, '.')\n", 316 | "print('Logs available at', test_preds)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "We can download the predictions to the current folder." 
324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 28, 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "name": "stdout", 333 | "output_type": "stream", 334 | "text": [ 335 | "Test set predictions available at ./test_preds.csv\n" 336 | ] 337 | } 338 | ], 339 | "source": [ 340 | "test_preds = h2oai.download(model.test_predictions_path, '.')\n", 341 | "print('Test set predictions available at', test_preds)" 342 | ] 343 | } 344 | ], 345 | "metadata": { 346 | "kernelspec": { 347 | "display_name": "Python 3", 348 | "language": "python", 349 | "name": "python3" 350 | }, 351 | "language_info": { 352 | "codemirror_mode": { 353 | "name": "ipython", 354 | "version": 3 355 | }, 356 | "file_extension": ".py", 357 | "mimetype": "text/x-python", 358 | "name": "python", 359 | "nbconvert_exporter": "python", 360 | "pygments_lexer": "ipython3", 361 | "version": "3.6.5" 362 | } 363 | }, 364 | "nbformat": 4, 365 | "nbformat_minor": 2 366 | } 367 | -------------------------------------------------------------------------------- /driverlessai_experiments/nlp/custom_word2vec_embeddings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Pretrained Word Embeddings\n", 8 | "\n", 9 | "From Driverless AI version 1.7.0, text models can take in pretrained word embeddings through expert settings. There are several pre-trained word embeddings available in the open source domain like [Glove](https://nlp.stanford.edu/projects/glove/) and [Fasttext](https://fasttext.cc/docs/en/crawl-vectors.html). We can download these embeddings and use them in our models. These embeddings are trained on corpus like wikipedia, common crawl etc. \n", 10 | "\n", 11 | "We can also train our own embeddings on our domain dataset instead of using the publicly available ones. This one is particularly useful when there is a good amount of text data that is not tagged and want to use that information. This notebook is to help create custom pre-trained embeddings.\n", 12 | "\n", 13 | "The data used in this example is [US Airline Sentiment dataset](https://www.figure-eight.com/wp-content/uploads/2016/03/Airline-Sentiment-2-w-AA.csv) from [Figure Eight’s Data for Everyone](https://www.figure-eight.com/data-for-everyone/) library. The dataset is split into training and test with this [simple script](https://gist.github.com/woobe/bd79d9f4d7ea139c5d2eb4cf1de1e7db) and the train file is used for word embeddings creation. Please use your own text corpus inplace of this airline train file." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# Please enter the file name\n", 23 | "file_name = \"train_airline_sentiment.csv\"\n", 24 | "# Please enter the name of the text column\n", 25 | "col_name = \"text\"" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "Import the h2o module and H2OWord2vecEstimator" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "Checking whether there is an H2O instance running at http://localhost:54321 ..... 
not found.\n", 45 | "Attempting to start a local H2O server...\n", 46 | " Java Version: openjdk version \"11.0.1\" 2018-10-16; OpenJDK Runtime Environment 18.9 (build 11.0.1+13); OpenJDK 64-Bit Server VM 18.9 (build 11.0.1+13, mixed mode)\n", 47 | " Starting server from /Users/srk/envs/DS2/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar\n", 48 | " Ice root: /var/folders/db/49r_20s91bg8qhg08qf78x100000gn/T/tmp8m3vtkx0\n", 49 | " JVM stdout: /var/folders/db/49r_20s91bg8qhg08qf78x100000gn/T/tmp8m3vtkx0/h2o_srk_started_from_python.out\n", 50 | " JVM stderr: /var/folders/db/49r_20s91bg8qhg08qf78x100000gn/T/tmp8m3vtkx0/h2o_srk_started_from_python.err\n", 51 | " Server is running at http://127.0.0.1:54321\n", 52 | "Connecting to H2O server at http://127.0.0.1:54321 ... successful.\n" 53 | ] 54 | }, 55 | { 56 | "data": { 57 | "text/html": [ 58 | "
\n", 59 | "\n", 60 | "\n", 61 | "\n", 62 | "\n", 63 | "\n", 64 | "\n", 65 | "\n", 66 | "\n", 67 | "\n", 68 | "\n", 69 | "\n", 70 | "\n", 71 | "\n", 72 | "\n", 73 | "\n", 74 | "\n", 75 | "\n", 76 | "\n", 77 | "\n", 78 | "\n", 79 | "\n", 80 | "\n", 81 | "\n", 82 | "\n", 83 | "\n", 84 | "\n", 85 | "\n", 86 | "\n", 87 | "\n", 88 | "\n", 89 | "
H2O cluster uptime:01 secs
H2O cluster timezone:Asia/Kolkata
H2O data parsing timezone:UTC
H2O cluster version:3.24.0.4
H2O cluster version age:1 month and 24 days
H2O cluster name:H2O_from_python_srk_z7y5eb
H2O cluster total nodes:1
H2O cluster free memory:4 Gb
H2O cluster total cores:12
H2O cluster allowed cores:12
H2O cluster status:accepting new members, healthy
H2O connection url:http://127.0.0.1:54321
H2O connection proxy:None
H2O internal security:False
H2O API Extensions:Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4
Python version:3.6.5 final
" 90 | ], 91 | "text/plain": [ 92 | "-------------------------- ---------------------------------------------------\n", 93 | "H2O cluster uptime: 01 secs\n", 94 | "H2O cluster timezone: Asia/Kolkata\n", 95 | "H2O data parsing timezone: UTC\n", 96 | "H2O cluster version: 3.24.0.4\n", 97 | "H2O cluster version age: 1 month and 24 days\n", 98 | "H2O cluster name: H2O_from_python_srk_z7y5eb\n", 99 | "H2O cluster total nodes: 1\n", 100 | "H2O cluster free memory: 4 Gb\n", 101 | "H2O cluster total cores: 12\n", 102 | "H2O cluster allowed cores: 12\n", 103 | "H2O cluster status: accepting new members, healthy\n", 104 | "H2O connection url: http://127.0.0.1:54321\n", 105 | "H2O connection proxy:\n", 106 | "H2O internal security: False\n", 107 | "H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4\n", 108 | "Python version: 3.6.5 final\n", 109 | "-------------------------- ---------------------------------------------------" 110 | ] 111 | }, 112 | "metadata": {}, 113 | "output_type": "display_data" 114 | } 115 | ], 116 | "source": [ 117 | "import h2o\n", 118 | "h2o.init()\n", 119 | "from h2o.estimators.word2vec import H2OWord2vecEstimator" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "Import the dataset file. Please note that the input file should be a csv file with a valid header in the first line." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 3, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | "Parse progress: |█████████████████████████████████████████████████████████| 100%\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "df = h2o.import_file(file_name, header=1, sep=\",\")\n", 144 | "df = df[[col_name]].ascharacter()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Do some text preprocessing." 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 4, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "def tokenize(sentences):\n", 161 | " # tokenize the sentences\n", 162 | " tokenized = sentences.tokenize(\"\\\\W+\")\n", 163 | " # lower case the text column\n", 164 | " tokenized = tokenized.tolower()\n", 165 | " # filter out the sentences which has less than 2 characters or where text is missing\n", 166 | " tokenized = tokenized[(tokenized.nchar() >= 2) | (tokenized.isna()),:]\n", 167 | " return tokenized\n", 168 | "\n", 169 | "words = tokenize(df[col_name])" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "The next step is to build the word2vec model. We can also adjust the parameters of the word2vec mdoel. Please refer to the [documentation of H2oWord2vecEstimator](http://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/modeling.html#h2oword2vecestimator) for more details on the parameters. 
" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 5, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "Build word2vec model\n", 189 | "word2vec Model Build progress: |██████████████████████████████████████████| 100%\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "print(\"Build word2vec model\")\n", 195 | "w2v_model = H2OWord2vecEstimator(min_word_freq=3,\n", 196 | " vec_size=300,\n", 197 | " window_size=5,\n", 198 | " epochs=10,\n", 199 | " word_model=\"skip_gram\")\n", 200 | "w2v_model.train(training_frame=words)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "Save the word embeddings as text file. \n", 208 | "\n", 209 | "This file can be given as pre-trained word embedding input for Driverless AI. The option is present in `Expert Settings -> NLP -> Path to pretrained embeddings for TensorFlow NLP models` " 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 6, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "w2v_model.to_frame().as_data_frame().to_csv(\"w2vec.txt\", float_format='%.6f', sep=\" \", header=False, index=False)" 219 | ] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python 3", 225 | "language": "python", 226 | "name": "python3" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.6.5" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 2 243 | } 244 | -------------------------------------------------------------------------------- /driverlessai_experiments/nlp_timeseries/imgs/coffee.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/nlp_timeseries/imgs/coffee.gif -------------------------------------------------------------------------------- /driverlessai_experiments/nlp_timeseries/imgs/create_experiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/nlp_timeseries/imgs/create_experiment.png -------------------------------------------------------------------------------- /driverlessai_experiments/nlp_timeseries/imgs/mapbox.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/nlp_timeseries/imgs/mapbox.png -------------------------------------------------------------------------------- /driverlessai_experiments/nlp_timeseries/imgs/scpf_lb_progress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/nlp_timeseries/imgs/scpf_lb_progress.png -------------------------------------------------------------------------------- /driverlessai_experiments/nlp_timeseries/predictions/.gitkeep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/nlp_timeseries/predictions/.gitkeep -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | # Exclude specific files 2 | scratch.py 3 | 4 | # Exclude specific folders 5 | .idea 6 | data_fullts 7 | experiment_data 8 | tmp 9 | 10 | # Exclude files based on extensions 11 | *.jar 12 | *.csv 13 | *.pickle 14 | 15 | 16 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/01-generate-data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Commented. Enable for debugging 4 | # set -x 5 | 6 | force_overwrite=false 7 | current_dir="$(pwd)" 8 | script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 9 | conda_env_name="ts-pipeline-env" 10 | conda_env_def_file="environment.yml" 11 | ts_process_script="01_process_full_TS_csv.py" 12 | tmp_csv_file="temp.csv" 13 | fullts_data_directory="data_fullts" 14 | 15 | error_exit(){ 16 | echo "" 17 | echo "$1" 1>&2 18 | echo "" 19 | exit 1 20 | } 21 | 22 | print_usage(){ 23 | echo "Usage:" 24 | echo " bash $0 -d -o [-f | --force] [-h | --help]" 25 | echo "Options:" 26 | echo " -d Timeseries definition file. Must be JSON file." 27 | echo " -o Output file name. Will generate .csv, .pickle, and .svg files in ${fullts_data_directory} directory" 28 | echo " -f, --force Force overwrite of output file." 29 | echo " -h, --help Display usage information." 30 | echo "Details:" 31 | echo " Creates the master time series dataset for this pipeline demo. It simulates a larger database" 32 | echo " from which section of data will be extracted to train and then predict on" 33 | } 34 | 35 | check_or_download_tsimulus(){ 36 | if [[ ! -e tsimulus-cli.jar ]]; then 37 | local latest_tag=$(curl --silent 'https://api.github.com/repos/cetic/tsimulus-cli/releases/latest' | grep -Po '"tag_name": "\K.*?(?=")') 38 | curl https://github.com/cetic/tsimulus-cli/releases/download/"${latest_tag}"/tsimulus-cli.jar --o tsimulus-cli.jar -silent 39 | fi 40 | # finally check that the file does exist, or error out 41 | [[ -e "tsimulus-cli.jar" ]] || error_exit "Error downloading TSimulus CLI. Cannot continue" 42 | } 43 | 44 | generate_ts_data(){ 45 | # if flow reaches here, validation checks are assumed to be passed and output file is ok to overwrite if present 46 | java -jar tsimulus-cli.jar "${ts_def_file}" | tail -n +2 | sed -r 's/;/,/g' > "${fullts_data_directory}/${tmp_csv_file}" 47 | } 48 | 49 | check_create_condaenv(){ 50 | conda --version > /dev/null || error_exit "Conda required, please install miniconda or anaconada and configure PATH correctly." 
51 | local env_count=$(conda env list | grep "${conda_env_name}" | wc -l) 52 | if [[ "${env_count}" == 0 ]]; then 53 | # create conda environment from the yml file 54 | [[ -e "${conda_env_def_file}" ]] || error_exit "Conda environment creation file not found" 55 | conda env create -f "${conda_env_def_file}" || error_exit "Error creating conda environment" 56 | fi 57 | } 58 | 59 | process_ts_file(){ 60 | # if control reaches here, then conda environment is available 61 | [[ -e "${ts_process_script}" ]] || error_exit "Python script to process timeseries data not found" 62 | pushd "${fullts_data_directory}" > /dev/null && 63 | source activate "${conda_env_name}" && 64 | python "${script_dir}/${ts_process_script}" -i "${tmp_csv_file}" -o "${ts_out_file}" && 65 | mv "${tmp_csv_file}" "${ts_out_file}.csv" && 66 | conda deactivate && 67 | popd > /dev/null 68 | } 69 | 70 | parse_args_then_exec(){ 71 | # fail fast in case no parameters are passed 72 | [[ ! -z "${1}" ]] || { print_usage; error_exit "Timeseries definition file is mandatory"; } 73 | while [[ "$1" != "" ]]; do 74 | case "$1" in 75 | -d ) 76 | shift 77 | ts_def_file="$1" 78 | # error if such file does not exits 79 | [[ -e "${ts_def_file}" ]] || { print_usage; error_exit "Timeseries definition file does not exist"; } 80 | ;; 81 | -o ) 82 | shift 83 | ts_out_file="$1" 84 | ;; 85 | -f | --force ) 86 | force_overwrite=true 87 | ;; 88 | -h | --help ) 89 | print_usage 90 | exit 0 91 | ;; 92 | * ) 93 | print_usage 94 | error_exit "Error: Incorrect parameters passed" 95 | ;; 96 | esac 97 | shift 98 | done 99 | 100 | # If required parame 101 | [[ ! -z "${ts_def_file}" ]] || { print_usage; error_exit "Timeseries definition file is mandatory"; } 102 | [[ ! -z "${ts_out_file}" ]] || { print_usage; error_exit "Timeseries output file is mandatory"; } 103 | 104 | # check if output file exist. If exists, and overwrite option is not specified then show error 105 | if [[ -e "${fullts_data_directory}/${ts_out_file}.csv" || -e "${fullts_data_directory}/${ts_out_file}.pickle" ]] && [[ "${force_overwrite}" == false ]]; then 106 | print_usage 107 | error_exit "Cannot overwite existing file. Use -f option" 108 | fi 109 | 110 | # Make fullts_data directory if it does not exists. if, exists do nothing 111 | mkdir -p "${fullts_data_directory}" 112 | 113 | # check Java exists, if not exit with error 114 | java -version 2>/dev/null || error_exit "Java required. Please install java runtime" 115 | 116 | # check curl exists 117 | curl -V >/dev/null || error_exit "Curl required. Please install curl" 118 | 119 | # check tsimulus cli available, if not, download it 120 | check_or_download_tsimulus 121 | 122 | # generate Timeseries data based on the definition file 123 | generate_ts_data 124 | 125 | # Create conda environment if it does not exist 126 | check_create_condaenv 127 | 128 | # process the temp.csv file. 
Generate plots, save as feather for better read/write performance 129 | process_ts_file 130 | } 131 | 132 | main() { 133 | parse_args_then_exec $@ 134 | } 135 | 136 | main $@ -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/01_process_full_TS_csv.py: -------------------------------------------------------------------------------- 1 | import click 2 | import numpy as np 3 | import pandas as pd 4 | import seaborn as sns 5 | 6 | from pandas.plotting import register_matplotlib_converters 7 | 8 | 9 | @click.command() 10 | @click.option('-i', '--input', 'in_file', type=click.Path(exists=True), help='Input time series data file (csv)') 11 | @click.option('-o', '--output', 'output', type=click.STRING, help='Output file prefix.') 12 | def process(in_file, output): 13 | """ 14 | Process a time series file, create a plot, save the data as pickle. 15 | 16 | This function processes the time series csv file, provided as input. 17 | Creates a plot of the time series and saves it as output_plot.svg. 18 | It also converts the input csv file and stores it as output.pickle for faster processing. 19 | """ 20 | # Read csv to data frame. 21 | df = pd.read_csv(in_file, 22 | sep=',', 23 | names=['Timeslot', 'StoreID', 'Product', 'Sale'], 24 | parse_dates=['Timeslot'], 25 | infer_datetime_format=True) 26 | 27 | # Round Sale and convert from float to int64 28 | df['Sale'] = pd.Series.round(df['Sale']).apply(np.int64) 29 | df['StoreID'] = df['StoreID'].astype('category') 30 | df['Product'] = df['Product'].astype('category') 31 | 32 | # Set dataframe index to help easy slicing 33 | df.set_index('Timeslot', drop=False, inplace=True) 34 | 35 | # Create TS plots for each store id in a separate file 36 | register_matplotlib_converters() 37 | sns.set_context('notebook') 38 | 39 | sns.relplot(x='Timeslot', 40 | y='Sale', 41 | hue='StoreID', 42 | row='Product', 43 | kind='line', 44 | height=3, 45 | aspect=10, 46 | data=df).fig.savefig(output+'_plot.svg') 47 | 48 | # Store the file as pickle 49 | df.to_pickle(output+'.pickle') 50 | 51 | 52 | if __name__ == '__main__': 53 | process() 54 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/02-create-experiment-data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Commented. Enable for debugging 4 | # set -x 5 | 6 | current_dir="$(pwd)" 7 | script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 8 | conda_env_name="ts-pipeline-env" 9 | conda_env_def_file="environment.yml" 10 | process_script="02_extract_experiment_datasets.py" 11 | exp_data_dir_root="experiment_data" 12 | missing_data_percentage=0 13 | 14 | error_exit(){ 15 | echo "" 16 | echo "$1" 1>&2 17 | echo "" 18 | exit 1 19 | } 20 | 21 | print_usage(){ 22 | echo "Usage:" 23 | echo " bash $0 -i -s -e -g -t [-m ] [-h | --help]" 24 | echo "Options:" 25 | echo " -i Full time series dataset, created by 01-generate-data script. Provide .pickle file" 26 | echo " -s Starting date for Train data YYYY-MM-DD format. Train dataset will start from 00:00:00.000 hours for that date." 27 | echo " -e Ending date for Train data in YYYY-MM-DD format. Train dataset will include data for this date till 23:00:00 hours i.e. full 24 hour period." 28 | echo " -g Gap (in days) between last training date and first testing date." 29 | echo " -t Duration (in days) for which we are generating test data. 
It starts from gap days after the last date in train dataset." 30 | echo " -m Proportion of target data that is missing in both Training and Test dataset. Optional, defaults to 0." 31 | echo " -h, --help Display usage information." 32 | echo "Details:" 33 | echo " Creates train, gap and test datasets (csv and pickle) in the output directory. Also creates timeseries plots for train and test datasets. " 34 | echo " The output directory will be created in the format sYYYYMMDD-eYYYYMMDD-gdG-tdF-mMP, where" 35 | echo " - sYYYYMMDD-eYYYYMMDD is the training dataset start and end date" 36 | echo " - gdG is the gap duration" 37 | echo " - tdF is the test duration" 38 | echo " - mMP is proportion of missing data in Train and Test datasets" 39 | echo " When the script is executed with certain inputs which results in an output directory that already exists, no action is taken." 40 | } 41 | 42 | check_create_condaenv(){ 43 | conda --version > /dev/null || error_exit "Conda required, please install miniconda or anaconada and configure PATH correctly." 44 | local env_count=$(conda env list | grep "${conda_env_name}" | wc -l) 45 | if [[ "${env_count}" == 0 ]]; then 46 | # create conda environment from the yml file 47 | [[ -e "${conda_env_def_file}" ]] || error_exit "Conda environment creation file not found" 48 | conda env create -f "${conda_env_def_file}" || error_exit "Error creating conda environment" 49 | fi 50 | } 51 | 52 | process_ts_file(){ 53 | # if control reaches here, then conda environment is available 54 | [[ -e "${process_script}" ]] || error_exit "Python script to generate experiment data not found" 55 | pushd "${exp_data_dir_root}/${exp_data_dir}" > /dev/null && 56 | source activate "${conda_env_name}" && 57 | python "${script_dir}/${process_script}" -i "${script_dir}/${ts_full_data_file}" \ 58 | -s "${formatted_start_date}" \ 59 | -e "${formatted_end_date}" \ 60 | -g "${gap_duration}" \ 61 | -t "${test_duration}" \ 62 | -m "${missing_data_percentage}" && 63 | conda deactivate && 64 | popd > /dev/null 65 | } 66 | 67 | parse_args_then_exec(){ 68 | # fail fast in case no parameters are passed 69 | [[ ! -z "${1}" ]] || { print_usage; error_exit "Expected parameters not passed during script invocation"; } 70 | while [[ "$1" != "" ]]; do 71 | case "$1" in 72 | -i ) 73 | shift 74 | ts_full_data_file="$1" 75 | # If file exists, proceed; else print message and exit with error code 76 | [[ -f "${ts_full_data_file}" ]] || { print_usage; error_exit "Provided time series full data file does not exist."; } 77 | ;; 78 | -s ) 79 | shift 80 | start_date="$1" 81 | # convert input to expected date format and check with input. if they match, input is in expected format, so proceed ; else error 82 | formatted_start_date=$(date "+%F" -d "${start_date}" 2>/dev/null) 83 | [[ "${formatted_start_date}" == "${start_date}" ]] || { print_usage; error_exit "Invalid start date or date format. Use YYYY-MM-DD format."; } 84 | ;; 85 | -e ) 86 | shift 87 | end_date="$1" 88 | # error is date is not in the valid format 89 | formatted_end_date=$(date "+%F" -d "${end_date}" 2>/dev/null) 90 | [[ "${formatted_end_date}" == "${end_date}" ]] || { print_usage; error_exit "Invalid end date or date format. Use YYYY-MM-DD format."; } 91 | ;; 92 | -g ) 93 | shift 94 | gap_duration="$1" 95 | [[ "${gap_duration}" =~ ^[0-9]+$ ]] || { print_usage; error_exit "Gap duration (days) is expected to be an integer. 
If no gap is needed pass 0."; } 96 | ;; 97 | -t ) 98 | shift 99 | test_duration="$1" 100 | # error is date is not in the valid format 101 | [[ "${test_duration}" =~ ^[1-9][0-9]*$ ]] || { print_usage; error_exit "Test data duration (days) is expected to be a non-zero integer."; } 102 | ;; 103 | -m ) 104 | shift 105 | missing_data_percentage="$1" 106 | # error is date is not in the valid format 107 | [[ "${missing_data_percentage}" =~ ^[0-9]{1,2}$ ]] || { print_usage; error_exit "Proportion (%) of missing data to create in Train and Test datasets. Optional, defaults to 0."; } 108 | ;; 109 | -h | --help ) 110 | print_usage 111 | exit 0 112 | ;; 113 | * ) 114 | print_usage 115 | error_exit "Error: Incorrect parameters passed" 116 | ;; 117 | esac 118 | shift 119 | done 120 | 121 | # If required parameters are missing, print usage and exit 122 | [[ ! -z "${ts_full_data_file}" ]] || { print_usage; error_exit "Timeseries input data file is mandatory"; } 123 | [[ -f "${ts_full_data_file}" ]] || { print_usage; error_exit "Provided timeseries input data file is missing"; } 124 | [[ ! -z "${formatted_start_date}" ]] || { print_usage; error_exit "Training data start date is mandatory"; } 125 | [[ ! -z "${formatted_end_date}" ]] || { print_usage; error_exit "Training data end date is mandatory"; } 126 | [[ ! -z "${gap_duration}" ]] || { print_usage; error_exit "Gap duration is mandatory. If no gap, pass 0 as the value"; } 127 | [[ ! -z "${test_duration}" ]] || { print_usage; error_exit "Test data duration is mandatory"; } 128 | 129 | # Check if experiment data directory exists, if so dont proceed. If it does not exist, create it. 130 | exp_data_dir="s${formatted_start_date}-e${formatted_end_date}-gd${gap_duration}-td${test_duration}-m${missing_data_percentage}" 131 | [[ ! -d "${exp_data_dir_root}/${exp_data_dir}" ]] || error_exit "Experiment data directory ${exp_data_dir_root}/${exp_data_dir} already exists. No action taken." 132 | mkdir -p "${exp_data_dir_root}/${exp_data_dir}" 133 | 134 | # Create conda environment if it does not exist 135 | check_create_condaenv 136 | 137 | # process the temp.csv file. 
Generate plots, save as feather for better read/write performance 138 | process_ts_file 139 | } 140 | 141 | main() { 142 | parse_args_then_exec $@ 143 | } 144 | 145 | main $@ -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/02_extract_experiment_datasets.py: -------------------------------------------------------------------------------- 1 | import click 2 | import random 3 | 4 | import datetime as dt 5 | import numpy as np 6 | import pandas as pd 7 | import seaborn as sns 8 | 9 | from pandas.plotting import register_matplotlib_converters 10 | 11 | 12 | @click.command() 13 | @click.option('-i', '--input', 'input_pickle', type=click.Path(exists=True, 14 | file_okay=True, 15 | dir_okay=False, 16 | readable=True), 17 | required=True, 18 | help='Full time series dataset pickle file from which to extract experiment data.') 19 | @click.option('-s', '--start', 'train_start_date', 20 | required=True, 21 | type=click.DateTime(formats=['%Y-%m-%d']), 22 | help='Start date for training data.') 23 | @click.option('-e', '--end', 'train_end_date', 24 | required=True, 25 | type=click.DateTime(formats=['%Y-%m-%d']), 26 | help='End date for training data.') 27 | @click.option('-g', '--gap', 'gap_duration', 28 | required=True, 29 | type=click.INT, 30 | help='Gap (in days) between training and test data') 31 | @click.option('-t', '--test', 'test_duration', 32 | required=True, 33 | type=click.INT, 34 | help='Duration (in days) for the testing dataset.') 35 | @click.option('-m', '--missing', 'missing_data_percentage', 36 | default=0, 37 | required=False, 38 | type=click.INT, 39 | help='Proportion (in %) of missing data in train and test datasets. Optional, defaults to 0') 40 | def process(input_pickle, 41 | train_start_date, 42 | train_end_date, 43 | gap_duration, 44 | test_duration, 45 | missing_data_percentage): 46 | """ 47 | Creates train and test datasets (csv and pickle) in the output directory. 48 | Also creates timeseries plots for both the files. 49 | 50 | :param input_pickle: Full time series dataset pickle file from which to extract experiment data. 51 | :param train_start_date: Start date for training dataset 52 | :param train_end_date: End date for training datset 53 | :param gap_duration: Gap (in days) between training and testing dataset. 54 | :param test_duration: Duration (in days) of the testing dataset. 55 | :param missing_data_percentage: Proportion of missing data in train and test datasets. Optional, defaults to 0. 
56 | :return: None 57 | """ 58 | # Read the input data file 59 | df = pd.read_pickle(input_pickle) 60 | 61 | # Calculate data slice times 62 | train_end_date = train_end_date.replace(hour=23) 63 | gap_start_date = train_end_date + dt.timedelta(hours=1) 64 | gap_end_date = gap_start_date + dt.timedelta(days=gap_duration, hours=-1) 65 | test_start_date = gap_end_date + dt.timedelta(hours=1) 66 | test_end_date = test_start_date + dt.timedelta(days=test_duration, hours=-1) 67 | 68 | # Slice data 69 | train_df = df[train_start_date:train_end_date].copy() 70 | test_df = df[test_start_date:test_end_date].copy() 71 | 72 | # Add missing data 73 | if missing_data_percentage != 0: 74 | create_missing_data(train_df, missing_data_percentage, 3) 75 | create_missing_data(test_df, missing_data_percentage, 3) 76 | 77 | # Plot train and test data 78 | create_plots(train_df, 'train') 79 | create_plots(test_df, 'test') 80 | 81 | # Save as CSV and pickle 82 | save_datasets(train_df, 'train', as_csv=True, as_pickle=True) 83 | save_datasets(test_df, 'test', as_csv=True, as_pickle=True) 84 | 85 | # Handle gap data 86 | if gap_duration != 0: 87 | gap_df = df[gap_start_date:gap_end_date].copy() 88 | create_missing_data(gap_df, missing_data_percentage, 3) 89 | save_datasets(gap_df, 'gap', as_csv=True, as_pickle=True) 90 | 91 | def create_plots(data_frame, 92 | filename_prefix): 93 | """ 94 | Create timeseries plot for the passed dataframe 95 | 96 | :param data_frame: Input time series dataframe to plot 97 | :param filename_prefix: File name prefix. Generated file will be filename_prefix_plot.svg 98 | :return: None 99 | """ 100 | sns.relplot(x='Timeslot', 101 | y='Sale', 102 | hue='StoreID', 103 | row='Product', 104 | kind='line', 105 | height=3, 106 | aspect=10, 107 | data=data_frame).fig.savefig(filename_prefix+'_plot.svg') 108 | 109 | 110 | def create_missing_data(df, 111 | missing_data_percentage, 112 | target_col_index): 113 | """ 114 | Creates missing data in the target column specified by the index (target_col_index). 115 | Proportion of rows for which missing data is created is determined by missing_data_percentage 116 | 117 | :param df: Input time series dataframe to inject NaN into. 118 | :param missing_data_percentage: Proportion of rows to mark as missing target data 119 | :param target_col_index: Index of the column (target) in which to create missing data 120 | :return: None 121 | """ 122 | rows, _ = df.shape 123 | df.iloc[sorted(random.sample(range(rows), round(rows * missing_data_percentage/100))), target_col_index] = np.nan 124 | 125 | 126 | def save_datasets(df: pd.DataFrame, 127 | filename: str, 128 | as_pickle=True, 129 | as_csv=True): 130 | """ 131 | Saves the input dataframe as pickle and csv files, by default. 
132 | 133 | :param df: The dataframe to save 134 | :param filename: File name to save as, output file will be filename.csv and filename.pickle 135 | :param as_pickle: Flag to save file as pickle, by default True 136 | :param as_csv: Flag to save file as csv, by default True 137 | :return: None 138 | """ 139 | if as_pickle: 140 | df.to_pickle(filename+'.pickle') 141 | if as_csv: 142 | df.to_csv(filename+'.csv', 143 | sep=",", header=True, index=False) 144 | 145 | 146 | if __name__ == '__main__': 147 | # Set sns and matplotlib options 148 | register_matplotlib_converters() 149 | sns.set_context('notebook') 150 | 151 | # process the dataframe 152 | process() 153 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/03-default-experiment-configs.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_key": "", 3 | "testset_key": "", 4 | "validset_key": "", 5 | "target_col": "Sale", 6 | "fold_col": "", 7 | "weight_col": "", 8 | "orig_time_col": "Timeslot", 9 | "time_col": "Timeslot", 10 | "is_classification": false, 11 | "cols_to_drop": [], 12 | "enable_gpus": false, 13 | "seed": 1234, 14 | "accuracy": 6, 15 | "time": 3, 16 | "interpretability": 8, 17 | "scorer": "RMSE", 18 | "time_groups_columns": [ 19 | "Timeslot", 20 | "StoreID", 21 | "Product" 22 | ], 23 | "time_period_in_seconds": 3600, 24 | "num_prediction_periods": 24, 25 | "num_gap_periods": 0, 26 | "is_timeseries": true, 27 | "config_overrides": "recipe = \"auto\"\nenable_xgboost = \"auto\"\nenable_lightgbm = \"auto\"\nenable_rf = \"auto\"\nenable_glm = \"auto\"\nenable_tensorflow = \"off\"\nenable_rulefit = \"off\"\nenable_ftrl = \"off\"\nparameter_tuning_num_models = -1\nfixed_ensemble_level = -1\ncheck_distribution_shift = true\ndrop_features_distribution_shift_threshold_auc = 0.6\ntarget_transformer = \"auto\"\nenable_target_encoding = true\ntime_series_recipe = true\noverride_lag_sizes = \"\"\nprob_lag_non_targets = 0.1\nmake_python_scoring_pipeline = true\nmake_mojo_scoring_pipeline = false\nrulefit_max_num_rules = -1\nfeature_brain_level = 2\nquantile_imbalanced_sampling = false\nholiday_features = true\nseed = 1234\nforce_64bit_precision = false\nmin_num_rows = 100\nmax_orig_cols_selected = 10000\nnfeatures_max = -1\nfeature_evolution_data_size = 100000000\nfeature_engineering_effort = 5\nmax_feature_interaction_depth = 8\nmax_relative_cardinality = 0.95\nstring_col_as_text_threshold = 0.3\ntensorflow_max_epochs = 10\nenable_tensorflow_textcnn = false\nenable_tensorflow_textbigru = false\nenable_tensorflow_charcnn = false\ntensorflow_max_epochs_nlp = 2\nmin_dai_iterations = 0\nmax_nestimators = 3000\nmax_nestimators_feature_evolution_factor = 0.2\nmax_learning_rate = 0.5\nmax_cores = -1\nnum_gpus_per_model = 1\nnum_gpus_per_experiment = -1\ngpu_id_start = 0\ncompute_correlation = false\nhigh_correlation_value_to_report = 0.95\ndump_modelparams_every_scored_indiv = false\ndump_varimp_every_scored_indiv = false\ndetailed_traces = false\nconfig_overrides = \"\"\n" 28 | } 29 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/03-run-experiment.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Commented. 
Enable for debugging 4 | # set -x 5 | 6 | current_dir="$(pwd)" 7 | script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 8 | conda_env_name="ts-pipeline-env" 9 | conda_env_def_file="environment.yml" 10 | process_script="03_run_experiment.py" 11 | exp_data_dir_root="experiment_data" 12 | exp_run_dir_root="experiment_runs" 13 | exp_accuracy=1 14 | exp_time=1 15 | exp_interpretability=8 16 | exp_scorer="RMSE" 17 | cur_date=$(date +%y%m%d) 18 | temp_dir_name="run_${cur_date}_${BASHPID}" 19 | 20 | error_exit(){ 21 | echo "" 22 | echo "$1" 1>&2 23 | echo "" 24 | exit 1 25 | } 26 | 27 | print_usage(){ 28 | echo "Usage:" 29 | echo " bash $0 -d -c [-t | --test] [-h | --help]" 30 | echo "Options:" 31 | echo " -d Path (relative to this script) to the experiment data directory containing train.csv and test.csv files" 32 | echo " -c Path (relative to this script) to the default experiment config settings. Dataset details not needed in file." 33 | echo " -t, --test Include test dataset when executing the experiment (optional)." 34 | echo " -h, --help Display usage information." 35 | echo "Details:" 36 | echo " Executes an experiment on the Driverless AI server at DAI_HOST. The train dataset (train.csv) is obtained from " 37 | echo " the experiment_data_dir. Experiment configuration is obtained from experiment_config_file. The dataset key information" 38 | echo " in experiment_config_file can be left as it is. It will be obtained at runtime. " 39 | echo " " 40 | echo " The script expects below three environment variables to be set with Driverless AI connection information" 41 | echo " - DAI_HOST - Url where DAI is running. Include full URL till the port e.g. http://localhost:12345" 42 | echo " - DAI_USER - Username for connecting to Driverless AI" 43 | echo " - DAI_PASS - Password for the above user" 44 | echo " " 45 | echo " If the experiment completes successfully; python and mojo scoring pipelines are downloaded for the experiment. " 46 | } 47 | 48 | check_create_condaenv(){ 49 | conda --version > /dev/null || error_exit "Conda required, please install miniconda or anaconada and configure PATH correctly." 50 | local env_count=$(conda env list | grep "${conda_env_name}" | wc -l) 51 | if [[ "${env_count}" == 0 ]]; then 52 | # create conda environment from the yml file 53 | [[ -e "${conda_env_def_file}" ]] || error_exit "Conda environment creation file not found" 54 | conda env create -f "${conda_env_def_file}" || error_exit "Error creating conda environment" 55 | fi 56 | } 57 | 58 | run_experiment(){ 59 | # Make the temporary directory for this experiment run 60 | mkdir -p "${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}" && echo "Created temporary directory ${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}" 61 | # pushd this directory 62 | # call python file. Pass DAI credentials. full path for train,test datasets and config file. 
aLso pass project name 63 | # read read experiment.json and get the experiment key 64 | # popd 65 | # rename the temporary directory to the experiment key 66 | # if control reaches here, then conda environment is available 67 | [[ -e "${process_script}" ]] || error_exit "Python script to generate experiment data not found" 68 | pushd "${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}" > /dev/null && 69 | source activate "${conda_env_name}" && 70 | python "${script_dir}/${process_script}" -h "${dai_host}" \ 71 | -u "${dai_user}" \ 72 | -p "${dai_pass}" \ 73 | -d "${script_dir}/${exp_data_dir}/train.csv" \ 74 | -c "${script_dir}/${exp_config_file}" \ 75 | -j "${project_name}" \ 76 | ${include_test_data:+ -t "${script_dir}/${exp_data_dir}/test.csv"} && 77 | conda deactivate && 78 | popd > /dev/null 79 | 80 | # remove temp directory if experiment.json does not exist. 81 | [[ -f "${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}/experiment.json" ]] || { rm -rf "${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}"; } 82 | 83 | # if the experiment.json exists, get experiment key from the json and rename dir to the key 84 | if [[ -f "${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}/experiment.json" ]]; then 85 | exp_key=$(cat "${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}/experiment.json" | grep -Po '"key": "\K[a-z]*?(?=",)') 86 | if [[ ! -z "${exp_key}" ]]; then 87 | mv "${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}" "${exp_data_dir}/${exp_run_dir_root}/${exp_key}" 88 | fi 89 | fi 90 | } 91 | 92 | parse_args_then_exec(){ 93 | # fail fast in case no parameters are passed 94 | [[ ! -z "${1}" ]] || { print_usage; error_exit "Expected parameters not passed during script invocation"; } 95 | 96 | # fail fast if required environment variables are not defined; if defined get the values 97 | [[ ! -z "${DAI_HOST}" ]] || error_exit "Expected environment variable DAI_HOST is not defined." 98 | [[ ! -z "${DAI_USER}" ]] || error_exit "Expected environment variable DAI_USER is not defined." 99 | [[ ! -z "${DAI_PASS}" ]] || error_exit "Expected environment variable DAI_PASS is not defined." 100 | dai_host="${DAI_HOST}" 101 | dai_user="${DAI_USER}" 102 | dai_pass="${DAI_PASS}" 103 | 104 | 105 | while [[ "$1" != "" ]]; do 106 | case "$1" in 107 | -d ) 108 | shift 109 | exp_data_dir="$1" 110 | # If directory exists, proceed; else print message and exit with error code 111 | [[ -d "${exp_data_dir}" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_data_dir} does not exist."; } 112 | [[ -f "${exp_data_dir}/train.csv" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_data_dir} does not contain train.csv dataset."; } 113 | ;; 114 | -c ) 115 | shift 116 | exp_config_file="$1" 117 | # If directory exists, proceed; else print message and exit with error code 118 | [[ -f "${exp_config_file}" ]] || { print_usage; error_exit "Experiment configuration file ${script_dir}/${exp_config_file} does not exist."; } 119 | ;; 120 | -t | --test ) 121 | include_test_data="yes" 122 | ;; 123 | -h | --help ) 124 | print_usage 125 | exit 0 126 | ;; 127 | * ) 128 | print_usage 129 | error_exit "Error: Incorrect parameters passed" 130 | ;; 131 | esac 132 | shift 133 | done 134 | # check if needed parameters are provided 135 | [[ ! -z "${exp_data_dir}" ]] || { print_usage; error_exit "Experiment data directory is mandatory"; } 136 | [[ ! 
-z "${exp_config_file}" ]] || { print_usage; error_exit "Experiment config file is mandatory"; } 137 | 138 | # if test data is to be included check if file exists 139 | if [[ "${include_test_data}" == "yes" ]]; then 140 | [[ -f "${exp_data_dir}/test.csv" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_data_dir} does not contain test.csv dataset."; } 141 | fi 142 | 143 | # setup project_name from exp_data_dir 144 | project_name=$(basename ${exp_data_dir}) 145 | 146 | # Create conda environment if it does not exist 147 | check_create_condaenv 148 | 149 | run_experiment 150 | } 151 | 152 | main() { 153 | parse_args_then_exec $@ 154 | } 155 | 156 | main $@ 157 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/03_run_experiment.py: -------------------------------------------------------------------------------- 1 | import click 2 | import json 3 | import os 4 | 5 | import h2oai_client as h2o 6 | 7 | @click.command() 8 | @click.option('-h', '--host', 'dai_host', 9 | required=True, 10 | help='Driverless AI host url e.g http://hostname:12345') 11 | @click.option('-u', '--user', 'dai_user', 12 | required=True, 13 | help='Driverless AI username') 14 | @click.option('-p', '--pass', 'dai_pass', 15 | required=True, 16 | help='Driverless AI password') 17 | @click.option('-d', '--train', 'train_ds', type=click.Path(exists=True, 18 | file_okay=True, 19 | dir_okay=False, 20 | readable=True), 21 | required=True, 22 | help='Training dataset CSV file path.') 23 | @click.option('-c', '--config', 'exp_config', type=click.Path(exists=True, 24 | file_okay=True, 25 | dir_okay=False, 26 | readable=True), 27 | required=True, 28 | help='Default experiment config file.') 29 | @click.option('-j', '--project', 'project_name', 30 | required=True, 31 | help='Project name to use for organizing the experiment. If does not exist, new project is created.') 32 | @click.option('-t', '--test', 'test_ds', type=click.Path(exists=True, 33 | file_okay=True, 34 | dir_okay=False, 35 | readable=True), 36 | required=False, 37 | default=None, 38 | help='Testing dataset CSV file path.') 39 | def process(dai_host, 40 | dai_user, 41 | dai_pass, 42 | train_ds, 43 | exp_config, 44 | project_name, 45 | test_ds): 46 | """ 47 | 48 | :param dai_host: Driverless AI host URL e.g. 
http://localhost:12345 49 | :param dai_user: Driverless AI user name 50 | :param dai_pass: Driverless AI password 51 | :param train_ds: path to training dataset csv file 52 | :param exp_config: path to experiment config json file 53 | :param project_name: Project name to organize datasets and experiments 54 | :param test_ds: path to testing dataset csv file (optional) 55 | :return: None 56 | """ 57 | # print all the passed parameters 58 | # import inspect 59 | # _, _, _, values = inspect.getargvalues(inspect.currentframe()) 60 | # print(values) 61 | 62 | # Create a connection to Driverless AI 63 | con = h2o.Client(address=dai_host, 64 | username=dai_user, 65 | password=dai_pass) 66 | 67 | # Get project key 68 | project_key = get_project_key(con, project_name) 69 | 70 | # Upload datasets and link to project 71 | test_ds_key = None 72 | train_ds_key = upload_dataset_to_project(con, project_key, train_ds, "Training") 73 | if test_ds is not None: 74 | test_ds_key = upload_dataset_to_project(con, project_key, test_ds, "Testing") 75 | 76 | # Read experiment config file and overwrite needed configs, save the config on file system 77 | with open(exp_config, 'r') as read_file: 78 | experiment_configs = json.load(read_file) 79 | experiment_configs['dataset_key'] = train_ds_key 80 | if test_ds_key is not None: 81 | experiment_configs['testset_key'] = test_ds_key 82 | with open('experiment-config.json', 'w') as write_file: 83 | json.dump(experiment_configs, write_file, indent=4) 84 | 85 | # Execute the experiment, link to project 86 | experiment: h2o.Model = con.start_experiment_sync(**experiment_configs) 87 | con.link_experiment_to_project(project_key,experiment.key) 88 | 89 | # build mojo pipeline 90 | mojo: h2o.MojoPipeline = con.build_mojo_pipeline_sync(experiment.key) 91 | 92 | # download mojo and python scoring pipelines and experiment summary 93 | con.download(experiment.scoring_pipeline_path, "") 94 | con.download(experiment.summary_path, "") 95 | con.download(mojo.file_path, "") 96 | 97 | # Finally save experiment.json 98 | with open('experiment.json', 'w') as write_file: 99 | json.dump(experiment.dump(), write_file, indent=4) 100 | 101 | 102 | 103 | def upload_dataset_to_project(con: h2o.Client, 104 | project_key: str, 105 | dataset_file: str, 106 | dataset_type: str): 107 | """ 108 | Uploads the data provided in dataset_file path to Driverless AI and links to the project. If the project already 109 | has a dataset of the specified type and filename linked, then it is not re-uploaded. For the uploaded dataset, the 110 | dataset_key of the newly uploaded dataset is returned. If it is not uploaded, then key of the dataset matching the 111 | file name is returned. 
112 | 113 | :param con: Connection to H2O Driverless AI 114 | :param project_key: Key of the project to link the dataset to 115 | :param dataset_file: File path of the dataset to upload and link to project 116 | :param dataset_type: Either 'Training' or 'Testing' 117 | :return: dataset_key 118 | """ 119 | file_name = os.path.basename(dataset_file) 120 | datasets = con.get_datasets_for_project(project_key, dataset_type) 121 | dataset = next((x for x in datasets if x.name == file_name), None) 122 | if dataset is None: 123 | dataset = con.upload_dataset_sync(file_path=dataset_file) 124 | con.link_dataset_to_project(project_key=project_key, 125 | dataset_key=dataset.key, 126 | dataset_type=dataset_type) 127 | return dataset.key 128 | 129 | 130 | def get_project_key(con: h2o.Client, 131 | project_name: str) -> str: 132 | """ 133 | Returns the key of the project with name matching project_name. If such a project does not exist, a new project is 134 | created and its key is returned. 135 | 136 | :param con: Client to H2O Driverless AI 137 | :param project_name: Name of the project 138 | :return: 139 | """ 140 | projects = con.list_projects(offset=0, limit=1000) 141 | project = next((x for x in projects if x.name == project_name), None) 142 | if project is None: 143 | key = con.create_project(project_name, project_name) 144 | return key 145 | return project.key 146 | 147 | 148 | if __name__ == '__main__': 149 | # Call the main processing function 150 | process() 151 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/04-create-tta-scoring-files.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Commented. Enable for debugging 4 | # set -x 5 | 6 | current_dir="$(pwd)" 7 | script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 8 | conda_env_name="ts-pipeline-env" 9 | conda_env_def_file="environment.yml" 10 | process_script="04_generate_tta_files.py" 11 | exp_data_dir_root="experiment_data" 12 | exp_data_dir_regex="s([0-9-]+)-e([0-9-]+)-gd([0-9]+)-td([0-9]+)-m[0-9]+" 13 | tta_dir_prefix="tta-scoring-data" 14 | predict_duration=24 # daily 15 | roll_duration=1 # hourly 16 | 17 | error_exit(){ 18 | echo "" 19 | echo "$1" 1>&2 20 | echo "" 21 | exit 1 22 | } 23 | 24 | print_usage(){ 25 | echo "Usage:" 26 | echo " bash $0 -i [-p ] [-r ] [-h | --help]" 27 | echo "Options:" 28 | echo " -i Experiment data directory containing train, gap, and test csv and pickle files" 29 | echo " -p Duration (in hours) of data to predict in each scoring data frame. Optional, defaults to 24 hours i.e 1 day" 30 | echo " -r Duration (in hours) by which to roll the data window and score for next predict duration. Optional, defaults to 1 hour" 31 | echo " -h, --help Display usage information." 32 | echo "Details:" 33 | echo " Creates TTA and rolling window based scoring dataframes (csv and pickle) in the output directory." 34 | echo " The output directory will be created in the format tta-scoring-data-pdP-rdR, where" 35 | echo " - pdP is the predict duration" 36 | echo " - rdR is the rolling duration" 37 | echo " The output directory will be created as a subdirectory of " 38 | echo " When the script is executed with certain inputs which results in an output directory that already exists, no action is taken." 39 | } 40 | 41 | parse_args_then_exec(){ 42 | # fail fast in case no parameters are passed 43 | [[ ! 
-z "${1}" ]] || { print_usage; error_exit "Expected parameters not passed during script invocation"; } 44 | while [[ "$1" != "" ]]; do 45 | case "$1" in 46 | -i ) 47 | shift 48 | exp_data_dir="$1" 49 | # If directory exists, proceed; else print message and exit with error code 50 | [[ -d "${exp_data_dir}" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_data_dir} does not exist."; } 51 | [[ -f "${exp_data_dir}/train.pickle" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_data_dir} does not contain train.pickle dataset."; } 52 | [[ -f "${exp_data_dir}/test.pickle" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_data_dir} does not contain test.pickle dataset."; } 53 | ;; 54 | -p ) 55 | shift 56 | predict_duration="$1" 57 | # error is date is not in the valid format 58 | [[ "${predict_duration}" =~ ^[1-9][0-9]*$ ]] || { print_usage; error_exit "Predict duration (hours) is expected to be a non-zero integer."; } 59 | ;; 60 | -r ) 61 | shift 62 | roll_duration="$1" 63 | # error is date is not in the valid format 64 | [[ "${roll_duration}" =~ ^[1-9][0-9]*$ ]] || { print_usage; error_exit "Roll duration (hours) is expected to be a non-zero integer."; } 65 | ;; 66 | -h | --help ) 67 | print_usage 68 | exit 0 69 | ;; 70 | * ) 71 | print_usage 72 | error_exit "Error: Incorrect parameters passed" 73 | ;; 74 | esac 75 | shift 76 | done 77 | 78 | # If required parameters are missing, print usage and exit 79 | [[ ! -z "${exp_data_dir}" ]] || { print_usage; error_exit "Experiment data directory is mandatory"; } 80 | 81 | # Check if experiment data directory is in the correct format 82 | exp_data_dir_base=$(basename ${exp_data_dir}) 83 | [[ ${exp_data_dir_base} =~ ${exp_data_dir_regex} ]] || { error_exit "Experiment data directory ${exp_data_dir_base} is not in the correct format."; } 84 | 85 | # Extract information from data directory name 86 | start_date=${BASH_REMATCH[1]} 87 | end_date=${BASH_REMATCH[2]} 88 | gap_duration=${BASH_REMATCH[3]} 89 | test_duration=${BASH_REMATCH[4]} 90 | 91 | # Generate tta directory 92 | tta_dir="${tta_dir_prefix}-pd${predict_duration}-rd${roll_duration}" 93 | [[ ! -d "${exp_data_dir_root}/${exp_data_dir_base}/${tta_dir}" ]] || error_exit "TTA data directory ${exp_data_dir_root}/${exp_data_dir}/${tta_dir} already exists. No action taken." 94 | mkdir -p "${exp_data_dir_root}/${exp_data_dir_base}/${tta_dir}/score" 95 | mkdir -p "${exp_data_dir_root}/${exp_data_dir_base}/${tta_dir}/predicted" 96 | 97 | # Create conda environment if it does not exist 98 | check_create_condaenv 99 | 100 | generate_tta_scoring_files 101 | } 102 | 103 | check_create_condaenv(){ 104 | conda --version > /dev/null || error_exit "Conda required, please install miniconda or anaconada and configure PATH correctly." 
105 | local env_count=$(conda env list | grep "${conda_env_name}" | wc -l) 106 | if [[ "${env_count}" == 0 ]]; then 107 | # create conda environment from the yml file 108 | [[ -e "${conda_env_def_file}" ]] || error_exit "Conda environment creation file not found" 109 | conda env create -f "${conda_env_def_file}" || error_exit "Error creating conda environment" 110 | fi 111 | } 112 | 113 | generate_tta_scoring_files(){ 114 | # if control reaches here, then conda environment is available 115 | [[ -e "${process_script}" ]] || error_exit "Python script to generate experiment data not found" 116 | pushd "${exp_data_dir_root}/${exp_data_dir_base}" > /dev/null && 117 | source activate "${conda_env_name}" && 118 | python "${script_dir}/${process_script}" -o "${tta_dir}" \ 119 | -s "${start_date}" \ 120 | -e "${end_date}" \ 121 | -g "${gap_duration}" \ 122 | -t "${test_duration}" \ 123 | -p "${predict_duration}" \ 124 | -r "${roll_duration}" && 125 | conda deactivate && 126 | popd > /dev/null 127 | } 128 | 129 | main() { 130 | parse_args_then_exec $@ 131 | } 132 | 133 | main $@ -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/04_generate_tta_files.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | import datetime as dt 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | @click.command() 9 | @click.option('-o', '--outdir', 'tta_dir', type=click.Path(exists=True, 10 | file_okay=False, 11 | dir_okay=True, 12 | readable=True, 13 | writable=True), 14 | required=True, 15 | help='Output data directory where the TTA scoring files will be generated.') 16 | @click.option('-s', '--start', 'train_start_date', 17 | required=True, 18 | type=click.DateTime(formats=['%Y-%m-%d']), 19 | help='Start date for training data.') 20 | @click.option('-e', '--end', 'train_end_date', 21 | required=True, 22 | type=click.DateTime(formats=['%Y-%m-%d']), 23 | help='End date for training data.') 24 | @click.option('-g', '--gap', 'gap_duration', 25 | required=True, 26 | type=click.INT, 27 | help='Gap (in days) between training and test data') 28 | @click.option('-t', '--test', 'test_duration', 29 | required=True, 30 | type=click.INT, 31 | help='Duration (in days) for the testing dataset.') 32 | @click.option('-p', '--predict', 'predict_duration', 33 | required=True, 34 | type=click.INT, 35 | help='Duration (in hours) of data to predict in each scoring data frame.') 36 | @click.option('-r', '--roll', 'roll_duration', 37 | required=True, 38 | type=click.INT, 39 | help='Duration (in hours) by which to roll the data window for the next scoring cycle.') 40 | def process(tta_dir, 41 | train_start_date, 42 | train_end_date, 43 | gap_duration, 44 | test_duration, 45 | predict_duration, 46 | roll_duration): 47 | """ 48 | Creates TTA (test time augmentation) and rolling window based scoring dataframes from the test data 49 | in the output directory. These scoring files can then be passed to Driverless AI Scoring module for 50 | scoring. 51 | 52 | :param tta_dir: Output directory to create the TTA scoring data files. 53 | :param train_start_date: Start date for training dataset 54 | :param train_end_date: End date for training datset 55 | :param gap_duration: Gap (in days) between training and testing dataset. 56 | :param test_duration: Duration (in days) of the testing dataset. 57 | :param predict_duration: Duration (in hours) for which we are predicting in each scoring call. 
58 | :param roll_duration: Duration (in hours) by which to roll the data window fo the next scoring call. 59 | :return: None 60 | """ 61 | # Note the shell wrapper is taking care of changing to the appropriate data directory, so the train, gap and test 62 | # files will be in the current directory. The TTA file directory can be created here 63 | 64 | train_end_date = train_end_date.replace(hour=23) 65 | gap_start_date = train_end_date + dt.timedelta(hours=1) 66 | gap_end_date = gap_start_date + dt.timedelta(days=gap_duration, hours=-1) 67 | test_start_date = gap_end_date + dt.timedelta(hours=1) 68 | test_end_date = test_start_date + dt.timedelta(days=test_duration, hours=-1) 69 | 70 | rolling_slots = get_tta_scoring_slots(gap_start_date, gap_end_date, 71 | test_start_date, test_end_date, 72 | predict_duration, roll_duration) 73 | 74 | # Read the dataframes. 75 | df = pd.read_pickle('test.pickle') 76 | if gap_duration > 0: 77 | gap_df = pd.read_pickle('gap.pickle') 78 | df = pd.concat([gap_df, df]) 79 | 80 | for slot in rolling_slots: 81 | tta_df = df[slot['tta_start']:slot['tta_end']].copy() 82 | score_df = df[slot['score_start']:slot['score_end']].copy() 83 | score_df['Sale'] = np.nan 84 | bind_df = pd.concat([tta_df, score_df]) 85 | file_name = f"{slot['roll_counter_str']}-ss{slot['score_start']}-se{slot['score_end']}" 86 | save_datasets(bind_df, 87 | tta_dir + "/score/" + file_name, 88 | as_csv=True, 89 | as_pickle=False) 90 | 91 | 92 | #%% Define another function 93 | def get_tta_scoring_slots(gs: dt.datetime, 94 | ge: dt.datetime, 95 | ts: dt.datetime, 96 | te: dt.datetime, 97 | pd: int, 98 | rd: int): 99 | """ 100 | Print the TTA scoring info in the following format 101 | TNNNN-ScoreTime-TTAstarttime-TTAendtime-PRDstarttime-PRDendtime 102 | :param gs: Gap start 103 | :param ge: Gap end 104 | :param ts: Test Start 105 | :param te: Test end 106 | :param pd: Predict Duration (hours) should be > 0 107 | :param rd: Roll Duration (hours) should be > 0 108 | :return: List of dicts containing the tta slot information 109 | """ 110 | slots_list = [] 111 | if ge > gs: 112 | tta_start = gs 113 | else: 114 | tta_start = ts 115 | score_pointer = ts 116 | roll_counter = 0 117 | while score_pointer <= te - dt.timedelta(hours=pd-1): 118 | tta_end = tta_start + dt.timedelta(hours=(roll_counter-1)*rd) 119 | score_start = score_pointer 120 | score_end = score_pointer + dt.timedelta(hours=pd-1) 121 | d = { 122 | 'roll_counter': roll_counter, 123 | 'roll_counter_str': f"{roll_counter:05d}", 124 | 'tta_start': tta_start, 125 | 'tta_end': tta_end, 126 | 'score_start': score_start, 127 | 'score_end': score_end 128 | } 129 | slots_list.append(d) 130 | score_pointer = score_pointer + dt.timedelta(hours=rd) 131 | roll_counter = roll_counter + 1 132 | return slots_list 133 | 134 | 135 | def save_datasets(df: pd.DataFrame, 136 | filename: str, 137 | as_pickle=True, 138 | as_csv=True): 139 | """ 140 | Saves the input dataframe as pickle and csv files, by default. 
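    In this pipeline the TTA step calls it with as_csv=True and as_pickle=False, so each
    rolling-window scoring frame is written out as a single CSV file.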
141 | 142 | :param df: The dataframe to save 143 | :param filename: File name to save as, output file will be filename.csv and filename.pickle 144 | :param as_pickle: Flag to save file as pickle, by default True 145 | :param as_csv: Flag to save file as csv, by default True 146 | :return: None 147 | """ 148 | if as_pickle: 149 | df.to_pickle(filename+'.pickle') 150 | if as_csv: 151 | df.to_csv(filename+'.csv', 152 | sep=",", header=True, index=False) 153 | 154 | 155 | if __name__ == '__main__': 156 | # Set sns and matplotlib options 157 | 158 | # process the dataframe 159 | process() 160 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/05-score-tta-files.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Commented. Enable for debugging 4 | # set -x 5 | 6 | current_dir="$(pwd)" 7 | script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 8 | process_script="05_score_tta_files.py" 9 | exp_data_dir_root="experiment_data" 10 | tta_dir_prefix="tta-scoring-data" 11 | use_pipeline="python" 12 | use_method="module" 13 | 14 | error_exit(){ 15 | echo "" 16 | echo "$1" 1>&2 17 | echo "" 18 | exit 1 19 | } 20 | 21 | print_usage(){ 22 | echo "Usage:" 23 | echo " bash $0 -e -s [-p ] [-m ] [-h | --help]" 24 | echo "Options:" 25 | echo " -e Experiment run directory containing scorer.zip. Will have same name as experiment in Driverless AI" 26 | echo " -s TTA scoring data directory created in step 04. Name will start with ${tta_dir_prefix}" 27 | echo " -p Optional, defaults to python. Use Driverless AI Python or Mojo (Java) pipeline for scoring" 28 | echo " -m Optional, defaults to module. Score using python module in code or using HTTP JSON or DataFrame API endpoint" 29 | echo " -h, --help Display usage information." 30 | echo "Details:" 31 | echo " Scores the files in scoring data directory using the scoring pipeline for selected experiment. Also creates the necessary" 32 | echo " environments with dependencies for the scoring pipeline to work." 33 | echo " Scoring files will be picked from the 'score' sub-directory of selected scoring data directory." 34 | echo " Output files will be generated in the 'predicted' sub-directory of selected scoring data directory." 35 | echo " Scoring method 'api' sends the prediction dataframe as JSON to API server for batch scoring; 'api2' uses base64 encoded Pandas DataFrame" 36 | } 37 | 38 | parse_args_validate_then_exec(){ 39 | # fail fast in case no parameters are passed 40 | [[ ! 
-z "${1}" ]] || { print_usage; error_exit "Expected parameters not passed during script invocation"; } 41 | while [[ "$1" != "" ]]; do 42 | case "$1" in 43 | -e ) 44 | shift 45 | exp_run_dir="$1" 46 | # If directory exists, proceed; else print message and exit with error code 47 | [[ -d "${exp_run_dir}" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_run_dir} does not exist."; } 48 | [[ -f "${exp_run_dir}/experiment.json" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_run_dir} does not contain experiment.json file."; } 49 | [[ -f "${exp_run_dir}/experiment-config.json" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_run_dir} does not contain experiment-config.json."; } 50 | experiment_name=$(basename "${exp_run_dir}") 51 | ;; 52 | -s ) 53 | shift 54 | scoring_data_dir="$1" 55 | # If directory exists, proceed; else print message and exit with error code 56 | [[ -d "${scoring_data_dir}/score" ]] || { print_usage; error_exit "Scoring data directory ${script_dir}/${scoring_data_dir}/score does not exist."; } 57 | files_to_score=$(ls "${scoring_data_dir}/score" | wc -l) 58 | [[ "${files_to_score}" -gt "0" ]] || { print_usage; error_exit "No files to score in scoring data directory ${script_dir}/${scoring_data_dir}/score."; } 59 | ;; 60 | -p ) 61 | shift 62 | use_pipeline="$1" 63 | [[ "${use_pipeline}" =~ ^(python|mojo)$ ]] || { print_usage; error_exit "Incorrect pipeline option. Only 'python' and 'mojo' are supported."; } 64 | ;; 65 | -m ) 66 | shift 67 | use_method="$1" 68 | [[ "${use_method}" =~ ^(module|api|api2)$ ]] || { print_usage; error_exit "Incorrect method option. Only 'module' and 'api' are supported."; } 69 | ;; 70 | -h | --help ) 71 | print_usage 72 | exit 0 73 | ;; 74 | * ) 75 | print_usage 76 | error_exit "Error: Incorrect parameters passed" 77 | ;; 78 | esac 79 | shift 80 | done 81 | 82 | 83 | # If required parameters are missing, print usage and exit 84 | [[ ! -z "${exp_run_dir}" ]] || { print_usage; error_exit "Experiment run directory is mandatory"; } 85 | [[ ! -z "${scoring_data_dir}" ]] || { print_usage; error_exit "Scoring data directory is mandatory"; } 86 | 87 | # Check if experiment run dir has required pipeline.zip file based on the selected pipeline option 88 | case "${use_pipeline}" in 89 | python ) 90 | [[ -f "${exp_run_dir}/scorer.zip" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_run_dir} does not contain python scoring pipeline scorer.zip."; } 91 | ;; 92 | mojo ) 93 | [[ -f "${exp_run_dir}/mojo.zip" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_run_dir} does not contain mojo scoring pipeline mojo.zip."; } 94 | error_exit "Mojo pipeline option not yet supported for Test Time Augmentation scoring for Time Series experiments. Please use python type" 95 | ;; 96 | * ) 97 | print_usage 98 | error_exit "Incorrect pipeline option, only 'python' and 'mojo' are supported" 99 | ;; 100 | esac 101 | 102 | # Check prediction duration to tta scoring data and experiment match. 
103 | # The prediction duration used in step 04 (TTA scoring file generation) should match the prediction duration (num_prediction_periods) used when the experiment was trained 104 | scoring_data_dir_base=$(basename "${scoring_data_dir}") 105 | scoring_data_dir_regex="tta-scoring-data-pd([0-9-]+)-rd([0-9-]+)" 106 | [[ "${scoring_data_dir_base}" =~ ${scoring_data_dir_regex} ]] || { error_exit "Scoring data directory ${scoring_data_dir_base} is not in the correct format."; } 107 | scoring_data_predict_duration=${BASH_REMATCH[1]} 108 | exp_config_predict_duration=$(cat "${exp_run_dir}/experiment-config.json" | grep -P -o '"num_prediction_periods": \K([0-9]+)') 109 | [[ "${scoring_data_predict_duration}" -eq "${exp_config_predict_duration}" ]] || { error_exit "Prediction duration mismatch. Experiment: ${exp_config_predict_duration}, Scoring Data: ${scoring_data_predict_duration}"; } 110 | 111 | # Check that the predicted directory does not already contain scored files for this experiment 112 | if [[ -d "${scoring_data_dir}/predicted/${experiment_name}" ]]; then 113 | files_scored=$(ls "${scoring_data_dir}/predicted/${experiment_name}" | wc -l) 114 | [[ "${files_scored}" -eq "0" ]] || { print_usage; error_exit "Scored files already exist in directory ${script_dir}/${scoring_data_dir}/predicted/${experiment_name}."; } 115 | fi 116 | 117 | # Check that experiment data dir is common for experiment and tta scoring data 118 | # Get experiment data directory from the experiment run directory 119 | experiment_data_dir_regex="^([0-9a-z_/\-]+)/experiment_runs/.*" 120 | [[ "${exp_run_dir}" =~ ${experiment_data_dir_regex} ]] || { error_exit "Experiment run directory ${exp_run_dir} is not in the correct format."; } 121 | experiment_data_dir=${BASH_REMATCH[1]} 122 | # Get experiment data directory from the scoring data directory 123 | score_experiment_data_dir_regex="^([0-9a-z_/\-]+)/tta-scoring-data.*" 124 | [[ "${scoring_data_dir}" =~ ${score_experiment_data_dir_regex} ]] || { error_exit "Scoring data directory ${scoring_data_dir} is not in the correct format."; } 125 | score_experiment_data_dir=${BASH_REMATCH[1]} 126 | # Ensure they are the same 127 | [[ "${experiment_data_dir}" == "${score_experiment_data_dir}" ]] || { error_exit "Experiment Run and Scoring data do not have the same experiment data directory."; } 128 | 129 | 130 | # Create conda environment if it does not exist 131 | check_create_condaenv 132 | 133 | case "${use_method}" in 134 | module ) 135 | score_tta_files_using_module 136 | ;; 137 | api ) 138 | score_tta_files_using_api 139 | ;; 140 | api2 ) 141 | score_tta_files_using_api2 142 | ;; 143 | * ) 144 | print_usage 145 | error_exit "Incorrect method option, only 'module', 'api' and 'api2' are supported" 146 | ;; 147 | esac 148 | 149 | } 150 | 151 | check_create_condaenv(){ 152 | conda --version > /dev/null || error_exit "Conda required, please install miniconda or anaconda and configure PATH correctly." 153 | unzip -v > /dev/null || error_exit "Unzip required, please install unzip."
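    # The conda environment name is not hard-coded here; after scorer.zip is unpacked below, it is
    # read from the "name:" field of scoring-pipeline/environment.yml (via grep -P and \K), so the
    # same environment that ships with the scoring pipeline is created and reused for scoring.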
154 | # check if scoring-pipeline is already unzipped, if not unzip it 155 | [[ -d "${exp_run_dir}/scoring-pipeline" ]] || { pushd ${exp_run_dir} > /dev/null && unzip scorer.zip && popd > /dev/null; } 156 | conda_env_name=$(grep -P -o 'name: \K([a-z2_]+)' "${exp_run_dir}/scoring-pipeline/environment.yml") 157 | local env_count=$(conda env list | grep "${conda_env_name}" | wc -l) 158 | if [[ "${env_count}" == 0 ]]; then 159 | # create conda environment from the yml file 160 | [[ -e "${exp_run_dir}/scoring-pipeline/environment.yml" ]] || error_exit "Conda environment creation file not found" 161 | conda env create -f "${exp_run_dir}/scoring-pipeline/environment.yml" || error_exit "Error creating conda environment" 162 | source activate "${conda_env_name}" && 163 | conda install -y -c conda-forge click tqdm starlette uvicorn && 164 | conda deactivate 165 | fi 166 | } 167 | 168 | score_tta_files_using_module(){ 169 | # if control reaches here, then conda environment is available 170 | [[ -e "${process_script}" ]] || error_exit "Python script ${process_script} data not found" 171 | pushd "${scoring_data_dir}" > /dev/null && 172 | source activate "${conda_env_name}" && 173 | python "${script_dir}/${process_script}" -n "${experiment_name}" \ 174 | -t "${script_dir}/${experiment_data_dir}/test.pickle" \ 175 | -g "${script_dir}/${experiment_data_dir}/gap.pickle" \ 176 | --module && 177 | conda deactivate && 178 | rm -rf tmp && 179 | popd > /dev/null 180 | } 181 | 182 | score_tta_files_using_api(){ 183 | # if control reaches here, then conda environment is available 184 | [[ -e "${process_script}" ]] || error_exit "Python script ${process_script} data not found" 185 | 186 | # Hack to get http_server working for TTA 187 | # More info - Read Warning in 188 | # https://github.com/h2oai/driverlessai-tutorials/tree/master/driverlessai_experiments/timeseries/ts-full-pipeline#step-05-score-tta-files 189 | # We check if the line is already added in the file, if so we dont add it again. 
190 | # If not added already, then we find out the line # in the file where we add this line and then add it 191 | # The idea will work for all use cases, but the code is specific to this data/experiment 192 | # for your experiment, make corresponding changes 193 | grep -q "pd.Series(\[r\['Sale'] if" "${script_dir}/${exp_run_dir}/scoring-pipeline/http_server.py" || { 194 | line_no=$(grep -n "pd.Series(\[r\['" "${script_dir}/${exp_run_dir}/scoring-pipeline/http_server.py" | tail -n 1 | cut -d ":" -f 1) 195 | inject_lino=$(expr ${line_no} + 1) 196 | sed -i "${inject_lino}i\ pd.Series([r['Sale'] if r['Sale'] != None else None for r in rows], name='Sale', dtype='float')" "${script_dir}/${exp_run_dir}/scoring-pipeline/http_server.py" 197 | } 198 | 199 | pushd "${scoring_data_dir}" > /dev/null && 200 | source activate "${conda_env_name}" && 201 | (python "${script_dir}/${exp_run_dir}/scoring-pipeline/http_server.py" --port=9090 > /dev/null 2>&1 &) && 202 | sleep 20 && 203 | python "${script_dir}/${process_script}" -n "${experiment_name}" \ 204 | -t "${script_dir}/${experiment_data_dir}/test.pickle" \ 205 | -g "${script_dir}/${experiment_data_dir}/gap.pickle" \ 206 | --api-json && 207 | pkill -f http_server.py && 208 | conda deactivate && 209 | rm -rf tmp && 210 | popd > /dev/null 211 | } 212 | 213 | score_tta_files_using_api2(){ 214 | # if control reaches here, then conda environment is available 215 | [[ -e "${process_script}" ]] || error_exit "Python script ${process_script} data not found" 216 | pushd "${scoring_data_dir}" > /dev/null && 217 | source activate "${conda_env_name}" && 218 | (python "${script_dir}/11_http_server2.py" -n ${experiment_name} -p 9090 > /dev/null 2>&1 &) && 219 | sleep 20 && 220 | python "${script_dir}/${process_script}" -n "${experiment_name}" \ 221 | -t "${script_dir}/${experiment_data_dir}/test.pickle" \ 222 | -g "${script_dir}/${experiment_data_dir}/gap.pickle" \ 223 | --api-df && 224 | pkill -f 11_http_server2.py && 225 | conda deactivate && 226 | rm -rf tmp && 227 | popd > /dev/null 228 | } 229 | 230 | main(){ 231 | parse_args_validate_then_exec $@ 232 | } 233 | 234 | main $@ 235 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/05_score_tta_files.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import click 3 | import glob 4 | import importlib 5 | import json 6 | import os 7 | import re 8 | import requests 9 | 10 | import datetime as dt 11 | import pandas as pd 12 | 13 | from tqdm import tqdm 14 | from io import BytesIO 15 | 16 | 17 | @click.command() 18 | @click.option('-n', '--name', 'experiment_name', 19 | required=True, 20 | help='Experiment name.') 21 | @click.option('-t', '--test', 'test_ds_file', type=click.Path(exists=True, 22 | file_okay=True, 23 | dir_okay=False, 24 | readable=True), 25 | required=True, 26 | help='Testing dataset CSV file path.') 27 | @click.option('-g', '--gap', 'gap_ds_file', type=click.Path(exists=False, 28 | file_okay=True, 29 | dir_okay=False, 30 | readable=True), 31 | required=False, 32 | help='Gap dataset CSV file path.') 33 | @click.option('--module', 'method', flag_value='module', default=True) 34 | @click.option('--api-json', 'method', flag_value='api-json', default=True) 35 | @click.option('--api-df', 'method', flag_value='api-df', default=True) 36 | def process(experiment_name, 37 | test_ds_file, 38 | gap_ds_file, 39 | method): 40 | """ 41 | Score the TTA files in the 'score' 
directory, and create corresponding prediction files in the 42 | 'predict/ directory. Also calculate the metric (RMSE) to measure how good is the 43 | prediction for that file. 44 | 45 | :param experiment_name: Name of the experiment run 46 | :param test_ds_file: Path of the test dataset file used for RMSE calculation 47 | :param gap_ds_file: Path of the gap dataset file used for RMSE calculation 48 | :param method: Score using imported module or use HTTP API using JSON (api-json) or DataFrame (api-df) 49 | :return: None 50 | """ 51 | # Note the shell wrapper ensures this python file is executed in the TTA scoring data directory. 52 | 53 | # print(experiment_name) 54 | # print(test_ds) 55 | 56 | 57 | # Load the test datasset 58 | # Read csv to data frame. 59 | # test_ds = pd.read_csv(test_ds_file, 60 | # sep=',', 61 | # names=['Timeslot', 'StoreID', 'Product', 'Sale'], 62 | # parse_dates=['Timeslot'], 63 | # infer_datetime_format=True) 64 | test_ds = pd.read_pickle(test_ds_file) 65 | if gap_ds_file is not None and os.path.exists(gap_ds_file): 66 | gap_ds = pd.read_pickle(gap_ds_file) 67 | test_ds = pd.concat([gap_ds, test_ds]) 68 | 69 | # Create the output directory if it does not exists 70 | os.makedirs(f'predicted/{experiment_name}', exist_ok=True) 71 | 72 | # Compile the regex 73 | regex = re.compile(r'([0-9]{5})-ss([0-9 -:]{19})-se([0-9 -:]{19})') 74 | 75 | # Glob all files to score, from the 'score' directory and then process each of them 76 | for file in tqdm(glob.glob('score/*.csv')): 77 | # Extract scoring duration from the file name. Calculate how many data points it makes 78 | # Per hour is 8 data points 79 | file_name = os.path.splitext(os.path.basename(file))[0] 80 | capture_groups = regex.match(file_name) 81 | file_order = capture_groups.group(1) 82 | score_start_time = dt.datetime.strptime(capture_groups.group(2), r'%Y-%m-%d %H:%M:%S') 83 | score_end_time = dt.datetime.strptime(capture_groups.group(3), r'%Y-%m-%d %H:%M:%S') 84 | last_n_values = (((score_end_time - score_start_time).seconds // 3600) + 1) * 8 85 | 86 | # Load dataset to score and score it 87 | score_ds = pd.read_csv(file) 88 | if method == 'module': 89 | preds_ds = score_using_module(experiment_name, score_ds) 90 | elif method == 'api-json': 91 | preds_ds = score_using_http_api(score_ds) 92 | elif method == 'api-df': 93 | preds_ds = score_using_http_api2(score_ds) 94 | 95 | # Rename the predicted Sale column as Sale_hat and concat it to the original dataset 96 | preds_ds.columns = ['Sale_hat'] 97 | preds_ds = pd.concat([score_ds, preds_ds], axis=1) 98 | 99 | # Get actual and predicted value arrays. 100 | # Actuals are obtained from test data using score start and end time to slice 101 | # Predicted data frame even predicts and returns TTA data. 
So use last_n_values to slice it 102 | actual_values = test_ds.loc[score_start_time:score_end_time, 'Sale'].values 103 | predicted_values = preds_ds['Sale_hat'].values[-last_n_values:] 104 | 105 | # Ensure the arrays match 106 | assert len(actual_values) == len(predicted_values) 107 | df = pd.DataFrame({'actual': actual_values, 'predicted': predicted_values}) 108 | # Note that we drop the rows in case there is an NaN in actuals to calculate RMSE 109 | df.dropna(inplace=True) 110 | rmse = ((df['predicted'] - df['actual']) ** 2).mean() ** 0.5 111 | 112 | if method == 'module': 113 | file_name = f'predicted/{experiment_name}/{file_order}-mod-m{rmse}' 114 | elif method == 'api-json': 115 | file_name = f'predicted/{experiment_name}/{file_order}-api-json-m{rmse}' 116 | elif method == 'api-df': 117 | file_name = f'predicted/{experiment_name}/{file_order}-api-df-m{rmse}' 118 | 119 | # Save the predictions 120 | save_datasets(preds_ds, 121 | file_name, 122 | as_pickle=False, 123 | as_csv=True) 124 | 125 | 126 | def score_using_module(experiment_name: str, 127 | df: pd.DataFrame): 128 | """ 129 | Score the input dataframe using python module 130 | 131 | :param experiment_name: Name of the experiment 132 | :param df: Input pandas dataframe to score 133 | :return: A pandas DataFrame with the predictions 134 | """ 135 | # Get DAI scorer 136 | scorer = get_dai_scorer(experiment_name) 137 | return scorer.score_batch(df) 138 | 139 | 140 | def score_using_http_api(df: pd.DataFrame): 141 | """ 142 | Score the input dataframe using the HTTP api endpoint. Assumes that the HTTP endpoint is 143 | started by the wrapper script and listening on localhost:9090 at the /rpc endpoint 144 | 145 | :param df: Input pandas dataframe to score 146 | :return: A pandas DataFrame with the predictions 147 | """ 148 | d = { 149 | "id": 1, 150 | "method": "score_batch", 151 | "params": {} 152 | } 153 | d['params']['rows'] = json.loads(df.to_json(orient='records')) 154 | 155 | # Send the post to HTTP endpoint 156 | headers = {'Content-Type': 'application/json'} 157 | r = requests.post(url="http://localhost:9090/rpc", 158 | json=d, 159 | headers=headers) 160 | results_list = r.json()['result'] 161 | preds_list = [val for sub_list in results_list for val in sub_list] 162 | return pd.DataFrame(preds_list, columns=['Sale']) 163 | 164 | 165 | def score_using_http_api2(df: pd.DataFrame): 166 | buf = BytesIO() 167 | df.to_pickle(buf, compression=None) 168 | buf.seek(0) 169 | d = dict(id=1, method='score_batch', payload=base64.b64encode(buf.getvalue()).decode()) 170 | buf.close() 171 | # Send the post to HTTP endpoint 172 | post_headers = {'Content-Type': 'application/json'} 173 | r = requests.post(url="http://localhost:9090/predict", 174 | data=json.dumps(d), 175 | headers=post_headers) 176 | if r: 177 | buf = BytesIO(base64.b64decode(r.json()['payload'])) 178 | buf.seek(0) 179 | return pd.read_pickle(buf, compression=None) 180 | 181 | 182 | def get_dai_scorer(experiment_name: str): 183 | """ 184 | Import the Driverless AI scoring module dynamically based on the experiment name passed, and return 185 | the corresponding scorer object 186 | 187 | :param experiment_name: Name of DAI experiment for which to return the scoring object 188 | :return: A Scoring object of type scoring_h2oai_experiment_.scorer.Scorer 189 | """ 190 | scoring_module_name = 'scoring_h2oai_experiment_{}'.format(experiment_name) 191 | scoring_module = importlib.import_module(scoring_module_name) 192 | scoring_class = getattr(scoring_module, 'Scorer') 193 | 194 | 
scorer = scoring_class() 195 | return scorer 196 | 197 | 198 | def save_datasets(df: pd.DataFrame, 199 | filename: str, 200 | as_pickle=True, 201 | as_csv=True): 202 | """ 203 | Saves the input dataframe as pickle and csv files, by default. 204 | 205 | :param df: The dataframe to save 206 | :param filename: File name to save as, output file will be filename.csv and filename.pickle 207 | :param as_pickle: Flag to save file as pickle, by default True 208 | :param as_csv: Flag to save file as csv, by default True 209 | :return: None 210 | """ 211 | if as_pickle: 212 | df.to_pickle(filename+'.pickle') 213 | if as_csv: 214 | df.to_csv(filename+'.csv', 215 | sep=",", header=True, index=False) 216 | 217 | 218 | if __name__ == '__main__': 219 | # process the dataframe 220 | process() 221 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/10_plot_score_metric.py: -------------------------------------------------------------------------------- 1 | import click 2 | import glob 3 | import os 4 | import re 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import seaborn as sns 9 | 10 | from pandas.plotting import register_matplotlib_converters 11 | 12 | @click.command() 13 | @click.option('-p', '--predictions-dir', 'preds_dir', type=click.Path(exists=True, 14 | file_okay=False, 15 | dir_okay=True, 16 | readable=True, 17 | writable=True), 18 | required=True, 19 | help='Predictions data directory.') 20 | def process(preds_dir): 21 | """ 22 | Reads the scored files in predictions directory, extracts the metric from the filename and plots 23 | a graph to compare the metric divergence between API and Python module based scoring. 24 | 25 | :param preds_dir: Predictions data directory. 26 | :return: None 27 | """ 28 | # First glob all module files 29 | mod_list = [] 30 | mod_regex = re.compile(r'([0-9]{5})-mod-m([0-9.]+).csv') 31 | for m in glob.glob(f'{preds_dir}/*-mod-*.csv'): 32 | file_name = os.path.basename(m) 33 | capture_group = mod_regex.match(file_name) 34 | mod_list.append({ 35 | 'order_id': capture_group.group(1), 36 | 'Module': capture_group.group(2) 37 | }) 38 | 39 | # Next glob all api files 40 | api_json_list = [] 41 | api_regex = re.compile(r'([0-9]{5})-api-json-m([0-9.]+).csv') 42 | for a in glob.glob(f'{preds_dir}/*-api-json-*.csv'): 43 | file_name = os.path.basename(a) 44 | capture_group = api_regex.match(file_name) 45 | api_json_list.append({ 46 | 'order_id': capture_group.group(1), 47 | 'API-JSON': capture_group.group(2) 48 | }) 49 | 50 | # Next glob all api files 51 | api_df_list = [] 52 | api_regex = re.compile(r'([0-9]{5})-api-df-m([0-9.]+).csv') 53 | for a in glob.glob(f'{preds_dir}/*-api-df-*.csv'): 54 | file_name = os.path.basename(a) 55 | capture_group = api_regex.match(file_name) 56 | api_df_list.append({ 57 | 'order_id': capture_group.group(1), 58 | 'API-DF': capture_group.group(2) 59 | }) 60 | 61 | assert len(mod_list) == len(api_json_list) == len(api_df_list), \ 62 | 'Unequal files scored by Module, JSON API and DataFrame API.' 
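    # Align the three per-method metric tables on the zero-padded order id taken from the file names,
    # then melt them to long format (one row per file/method pair) so seaborn can draw one RMSE line
    # per scoring method on a shared axis.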
63 | 64 | mod_df = pd.DataFrame(mod_list) 65 | api_json_df = pd.DataFrame(api_json_list) 66 | api_df_df = pd.DataFrame(api_df_list) 67 | 68 | # Merge all dataframes on a common column 69 | mod_df.set_index('order_id', inplace=True) 70 | api_json_df.set_index('order_id', inplace=True) 71 | api_df_df.set_index('order_id', inplace=True) 72 | df: pd.DataFrame = pd.concat([mod_df, api_json_df, api_df_df], axis=1, sort=False) 73 | df.reset_index(inplace=True) 74 | df.sort_values(by='index',inplace=True) 75 | df['index'] = df['index'].astype(np.int16) 76 | df['Module'] = df['Module'].astype(np.float64) 77 | df['API-JSON'] = df['API-JSON'].astype(np.float64) 78 | df['API-DF'] = df['API-DF'].astype(np.float64) 79 | df = pd.melt(df, 80 | id_vars=['index'], 81 | var_name='Method', 82 | value_name='RMSE') 83 | 84 | # Create TS plots for each store id in a separate file 85 | register_matplotlib_converters() 86 | sns.set_context('notebook') 87 | 88 | sns.relplot(x='index', 89 | y='RMSE', 90 | hue='Method', 91 | kind='line', 92 | height=7, 93 | aspect=2, 94 | data=df).fig.savefig(f'{preds_dir}/metrics_plot.svg') 95 | 96 | 97 | if __name__ == '__main__': 98 | # process the dataframe 99 | process() 100 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/11_http_server2.py: -------------------------------------------------------------------------------- 1 | from starlette.applications import Starlette 2 | from starlette.responses import JSONResponse 3 | 4 | import base64 5 | import click 6 | import importlib 7 | import json 8 | import pandas 9 | import uvicorn 10 | 11 | from io import BytesIO 12 | 13 | 14 | # Create a global scorer and assign to None for now 15 | scorer = None 16 | 17 | app = Starlette(debug=True) 18 | 19 | 20 | @app.route("/predict", methods=['POST']) 21 | async def predict(request): 22 | request_content_json = json.loads(await request.body()) 23 | buf = BytesIO(base64.b64decode(request_content_json['payload'])) 24 | buf.seek(0) 25 | score_ds = pandas.read_pickle(buf, compression=None) 26 | buf.close() 27 | if scorer is not None and type(score_ds).__name__ == 'DataFrame': 28 | pred_ds = scorer.score_batch(score_ds) 29 | buf = BytesIO() 30 | pred_ds.to_pickle(buf, compression=None) 31 | buf.seek(0) 32 | return JSONResponse(content={'payload': base64.b64encode(buf.getvalue()).decode()}, 33 | status_code=200) 34 | else: 35 | return JSONResponse(content={'payload': 'Error scorer could not load or request payload not pandas DataFrame'}, 36 | status_code=500) 37 | 38 | 39 | @click.command() 40 | @click.option('-n', '--name', 'experiment_name', 41 | required=True, 42 | type=click.types.STRING, 43 | help='Experiment Name') 44 | @click.option('-p', '--port', 'port', 45 | required=False, 46 | type=click.types.INT, 47 | default=9090) 48 | def process(experiment_name, 49 | port): 50 | """ 51 | Executes a HTTP prediction server for the Driverless AI python pipeline. 52 | Will create a '/predict' endpoint that will respond to only HTTP posts. Expected input for the endpoint 53 | is a pandas DataFrame for batch scoring using the 'score_batch' operation of the DAI python scoring pipeline. 54 | The pandas DataFrame should be pickled and then Base64 encoded and then sent in the Request body. 
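    The client in 05_score_tta_files.py, for example, posts JSON shaped like
    {"id": 1, "method": "score_batch", "payload": "<base64 of a pickled DataFrame>"}; only the
    'payload' field is read by this server, and the predictions are returned the same way under a
    'payload' key.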
55 | 56 | :param experiment_name: Name of the Driverless AI experiment for which the scoring pipeline is used 57 | :param port: Port number to listen to for input data to predict 58 | :return: 59 | """ 60 | # Make function aware about the global variable scorer, and then set it 61 | global scorer 62 | scorer = experiment_name 63 | scoring_module_name = 'scoring_h2oai_experiment_{}'.format(experiment_name) 64 | scoring_module = importlib.import_module(scoring_module_name) 65 | scoring_class = getattr(scoring_module, 'Scorer') 66 | scorer = scoring_class() 67 | 68 | # Refer to the list of supported kwargs 69 | # https://github.com/encode/uvicorn/blob/e95e995781c7d1d8661b4f94631e3adb77c85237/uvicorn/main.py#L196 70 | uvicorn.run(app, 71 | host='0.0.0.0', 72 | port=port) 73 | 74 | 75 | if __name__ == "__main__": 76 | process() 77 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/environment.yml: -------------------------------------------------------------------------------- 1 | name: ts-pipeline-env 2 | channels: 3 | - h2oai 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - python=3.6.7 8 | - numpy 9 | - pandas 10 | - click 11 | - matplotlib 12 | - seaborn 13 | - tqdm 14 | - h2oai_client=1.6.3 15 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/images/TTA - Rolling Window.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/timeseries/ts-full-pipeline/images/TTA - Rolling Window.odp -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/images/TTA-RollWindow-duration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/timeseries/ts-full-pipeline/images/TTA-RollWindow-duration.png -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/images/metrics_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/timeseries/ts-full-pipeline/images/metrics_plot.png -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/ts-definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "generators": [ 3 | { 4 | "name": "store-s1-baseline", 5 | "type": "constant", 6 | "value": 8 7 | }, 8 | { 9 | "name": "store-s2-baseline", 10 | "type": "constant", 11 | "value": 4 12 | }, 13 | { 14 | "name": "burger-monthly-baseline", 15 | "type": "monthly", 16 | "points": { 17 | "january": 3.3, 18 | "february": 3.7, 19 | "march": 6.8, 20 | "april": 9.8, 21 | "may": 13.6, 22 | "june": 16.2, 23 | "july": 18.4, 24 | "august": 18, 25 | "september": 14.9, 26 | "october": 11.1, 27 | "november": 4.3, 28 | "december": 5.9 29 | } 30 | }, 31 | { 32 | "name": "taco-monthly-baseline", 33 | "type": "monthly", 34 | "points": { 35 | "january": 5.3, 36 | "february": 5.7, 37 | "march": 8.8, 38 | 
"april": 11.8, 39 | "may": 15.6, 40 | "june": 18.2, 41 | "july": 22.4, 42 | "august": 20, 43 | "september": 16.9, 44 | "october": 13.1, 45 | "november": 7.3, 46 | "december": 6.9 47 | } 48 | }, 49 | { 50 | "name": "soda-monthly-baseline", 51 | "type": "monthly", 52 | "points": { 53 | "january": 8.3, 54 | "february": 8.7, 55 | "march": 9.8, 56 | "april": 11.8, 57 | "may": 19.6, 58 | "june": 26.2, 59 | "july": 38.4, 60 | "august": 28, 61 | "september": 18.9, 62 | "october": 13.1, 63 | "november": 8.8, 64 | "december": 12.9 65 | } 66 | }, 67 | { 68 | "name": "coffee-monthly-baseline", 69 | "type": "monthly", 70 | "points": { 71 | "january": 18.3, 72 | "february": 17.7, 73 | "march": 13.8, 74 | "april": 9.8, 75 | "may": 8.6, 76 | "june": 7.5, 77 | "july": 7.4, 78 | "august": 8.5, 79 | "september": 12.9, 80 | "october": 13.1, 81 | "november": 18.8, 82 | "december": 23.9 83 | } 84 | }, 85 | { 86 | "name": "weekly-variation", 87 | "type": "weekly", 88 | "points": { 89 | "monday": -5.5, 90 | "tuesday": -5.25, 91 | "wednesday": -8.5, 92 | "friday": 5.35, 93 | "saturday": 9.5, 94 | "sunday": 7.23 95 | } 96 | }, 97 | { 98 | "name": "daily-variation", 99 | "type": "daily", 100 | "points": { 101 | "00:00:00.000": -5, 102 | "02:00:00.000": -5.9, 103 | "04:00:00.000": -7, 104 | "06:00:00.000": -2.6, 105 | "08:00:00.000": 6.7, 106 | "10:00:00.000": 2.2, 107 | "12:00:00.000": 9, 108 | "14:00:00.000": 3, 109 | "16:00:00.000": 1.3, 110 | "18:00:00.000": 6.9, 111 | "20:00:00.000": 5.3, 112 | "22:00:00.000": -2.7 113 | } 114 | }, 115 | { 116 | "name": "result", 117 | "type": "aggregate", 118 | "aggregator": "sum", 119 | "generators": [ 120 | "weekly-variation", 121 | "daily-variation" 122 | ] 123 | }, 124 | { 125 | "name": "s1-burger", 126 | "type": "aggregate", 127 | "aggregator": "max", 128 | "generators": [ 129 | { 130 | "type": "constant", 131 | "value": 0 132 | }, 133 | { 134 | "type": "aggregate", 135 | "aggregator": "sum", 136 | "generators": [ 137 | "store-s1-baseline", 138 | "burger-monthly-baseline", 139 | "result" 140 | ] 141 | } 142 | ] 143 | }, 144 | { 145 | "name": "s2-burger", 146 | "type": "aggregate", 147 | "aggregator": "max", 148 | "generators": [ 149 | { 150 | "type": "constant", 151 | "value": 0 152 | }, 153 | { 154 | "type": "aggregate", 155 | "aggregator": "sum", 156 | "generators": [ 157 | "store-s2-baseline", 158 | "burger-monthly-baseline", 159 | "result" 160 | ] 161 | } 162 | ] 163 | }, 164 | { 165 | "name": "s1-taco", 166 | "type": "aggregate", 167 | "aggregator": "max", 168 | "generators": [ 169 | { 170 | "type": "constant", 171 | "value": 0 172 | }, 173 | { 174 | "type": "aggregate", 175 | "aggregator": "sum", 176 | "generators": [ 177 | "store-s1-baseline", 178 | "taco-monthly-baseline", 179 | "result" 180 | ] 181 | } 182 | ] 183 | }, 184 | { 185 | "name": "s2-taco", 186 | "type": "aggregate", 187 | "aggregator": "max", 188 | "generators": [ 189 | { 190 | "type": "constant", 191 | "value": 0 192 | }, 193 | { 194 | "type": "aggregate", 195 | "aggregator": "sum", 196 | "generators": [ 197 | "store-s2-baseline", 198 | "taco-monthly-baseline", 199 | "result" 200 | ] 201 | } 202 | ] 203 | }, 204 | { 205 | "name": "s1-soda", 206 | "type": "aggregate", 207 | "aggregator": "max", 208 | "generators": [ 209 | { 210 | "type": "constant", 211 | "value": 0 212 | }, 213 | { 214 | "type": "aggregate", 215 | "aggregator": "sum", 216 | "generators": [ 217 | "store-s1-baseline", 218 | "soda-monthly-baseline", 219 | "result" 220 | ] 221 | } 222 | ] 223 | }, 224 | { 225 | "name": "s2-soda", 226 | 
"type": "aggregate", 227 | "aggregator": "max", 228 | "generators": [ 229 | { 230 | "type": "constant", 231 | "value": 0 232 | }, 233 | { 234 | "type": "aggregate", 235 | "aggregator": "sum", 236 | "generators": [ 237 | "store-s2-baseline", 238 | "soda-monthly-baseline", 239 | "result" 240 | ] 241 | } 242 | ] 243 | }, 244 | { 245 | "name": "s1-coffee", 246 | "type": "aggregate", 247 | "aggregator": "max", 248 | "generators": [ 249 | { 250 | "type": "constant", 251 | "value": 0 252 | }, 253 | { 254 | "type": "aggregate", 255 | "aggregator": "sum", 256 | "generators": [ 257 | "store-s1-baseline", 258 | "coffee-monthly-baseline", 259 | "result" 260 | ] 261 | } 262 | ] 263 | }, 264 | { 265 | "name": "s2-coffee", 266 | "type": "aggregate", 267 | "aggregator": "max", 268 | "generators": [ 269 | { 270 | "type": "constant", 271 | "value": 0 272 | }, 273 | { 274 | "type": "aggregate", 275 | "aggregator": "sum", 276 | "generators": [ 277 | "store-s2-baseline", 278 | "coffee-monthly-baseline", 279 | "result" 280 | ] 281 | } 282 | ] 283 | } 284 | ], 285 | "exported": [ 286 | { 287 | "name": "S1;BURGER", 288 | "generator": "s1-burger", 289 | "frequency": 3600000 290 | }, 291 | { 292 | "name": "S2;BURGER", 293 | "generator": "s2-burger", 294 | "frequency": 3600000 295 | }, 296 | { 297 | "name": "S1;TACO", 298 | "generator": "s1-taco", 299 | "frequency": 3600000 300 | }, 301 | { 302 | "name": "S2;TACO", 303 | "generator": "s2-taco", 304 | "frequency": 3600000 305 | }, 306 | { 307 | "name": "S1;SODA", 308 | "generator": "s1-soda", 309 | "frequency": 3600000 310 | }, 311 | { 312 | "name": "S2;SODA", 313 | "generator": "s2-soda", 314 | "frequency": 3600000 315 | }, 316 | { 317 | "name": "S1;COFFEE", 318 | "generator": "s1-coffee", 319 | "frequency": 3600000 320 | }, 321 | { 322 | "name": "S2;COFFEE", 323 | "generator": "s2-soda", 324 | "frequency": 3600000 325 | } 326 | ], 327 | "from": "2016-01-01 00:00:00.000", 328 | "to": "2017-12-31 23:59:59.999" 329 | } -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/walmart_timeseries_experiment/images/import_data_sets_stock.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/timeseries/walmart_timeseries_experiment/images/import_data_sets_stock.png -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/walmart_timeseries_experiment/images/launching_experiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/timeseries/walmart_timeseries_experiment/images/launching_experiment.png -------------------------------------------------------------------------------- /interpretable_ml/README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Interpretability Code Samples/Tutorials for Driverless AI 2 | -------------------------------------------------------------------------------- /interpretable_ml/data/default_of_credit_card_clients.xls: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/interpretable_ml/data/default_of_credit_card_clients.xls -------------------------------------------------------------------------------- /scoring-pipeline-deployment/R/Shiny_Example/1_Data_Recoding.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | ################ DATA PROCESSING ################ 3 | ################################################################################################ 4 | 5 | setwd("/Users/felix/Code/h2oai/driverlessai-tutorials/scoring-pipeline-deployment/R/Shiny_Example/") 6 | 7 | dataset <- read.csv("CreditCard.csv", header = TRUE, stringsAsFactors = TRUE) 8 | str(dataset) 9 | 10 | #### remove ID column #### 11 | dataset = dataset[-c(1)] 12 | names(dataset) 13 | 14 | 15 | 16 | #### Recoding GENDER 17 | table(dataset$SEX) 18 | dataset$SEX <- ifelse(dataset$SEX == 1, "Male", "Female") 19 | table(dataset$SEX) 20 | 21 | #### Recoding EDUCATION 22 | table(dataset$EDUCATION) 23 | dataset$EDUCATION[dataset$EDUCATION > 3] <- "Others" 24 | dataset$EDUCATION[dataset$EDUCATION == 0] <- "No Schooling" 25 | dataset$EDUCATION[dataset$EDUCATION == 1] <- "Graduate School" 26 | dataset$EDUCATION[dataset$EDUCATION == 2] <- "University" 27 | dataset$EDUCATION[dataset$EDUCATION == 3] <- "High School" 28 | table(dataset$EDUCATION) 29 | 30 | #### Recoding MARITAL STATUS 31 | table(dataset$MARRIAGE) 32 | dataset$MARRIAGE[dataset$MARRIAGE == 0 | dataset$MARRIAGE == 3] <- "Others" 33 | dataset$MARRIAGE[dataset$MARRIAGE == 1] <- "Married" 34 | dataset$MARRIAGE[dataset$MARRIAGE == 2] <- "Single" 35 | table(dataset$MARRIAGE) 36 | 37 | #### Target 38 | table(dataset$default.payment.next.month) 39 | dataset$default.payment.next.month = ifelse(dataset$default.payment.next.month==0, "0_Non-Default", "1_Default") 40 | 41 | write.csv(dataset, "CreditCardRe.csv", row.names = FALSE) 42 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/R/Shiny_Example/2_DAI_Interaction.R: -------------------------------------------------------------------------------- 1 | 2 | # https://support.rstudio.com/hc/en-us/articles/200486138-Changing-R-versions-for-RStudio-desktop 3 | 4 | ## http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/r_install_client.html#prerequisites 5 | 6 | #install.packages('curl') #given incorrectly as rcurl 7 | #install.packages('jsonlite') 8 | #install.packages('rlang') 9 | #install.packages('methods') 10 | 11 | ############################################################################################# 12 | ########## INSTALL DRIVERLESSAI R CLIENT ########### 13 | ############################################################################################# 14 | getwd() 15 | setwd("/Users/felix/Code/h2oai/driverlessai-tutorials/scoring-pipeline-deployment/R/Shiny_Example/") 16 | #install.packages('dai_1.9.1.tar.gz', type = 'source', repos = NULL) 17 | library(dai) 18 | 19 | ############################################################################################# 20 | ########## DAI Connect ########### 21 | ############################################################################################# 22 | url = 'http://ec2-52-206-210-31.compute-1.amazonaws.com:12345' 23 | username = 'h2oai' 24 | password = 'i-0f244cddd419191cd' 25 | dai.connect(uri = url, username = username, 
password = password) 26 | 27 | ############################################################################################# 28 | ########## DAI Data Upload/Delete ########### 29 | ############################################################################################# 30 | dataset_daiFrame = as.DAIFrame(dataset) 31 | #or 32 | cc_dai <- dai.upload_dataset("CreditCardRe.csv", progress = TRUE) 33 | 34 | View(dai.list_datasets()) 35 | 36 | dai_frame <- dai.get_frame('ef2d496e-6461-11eb-b42c-0242ac110002') 37 | dai.rm(dai_frame) 38 | 39 | View(dai.list_datasets()) 40 | 41 | cc_df <- as.data.frame(cc_dai) 42 | str(cc_df) 43 | 44 | 45 | 46 | ############################################################################################# 47 | ########## DAI Dataset Visuals ########### 48 | ############################################################################################# 49 | 50 | library(vegawidget) 51 | 52 | ### Parallel Coordinates Plot 53 | dai.parallel_coordinates_plot(cc_dai) 54 | dai.parallel_coordinates_plot( 55 | cc_dai, 56 | variable_names = NULL, 57 | permute = FALSE, 58 | transpose = FALSE, 59 | cluster = TRUE, 60 | render = TRUE, 61 | progress = TRUE 62 | ) 63 | 64 | ### Distribution 65 | dai.dotplot(cc_dai, variable_name = 'PAY_0', mark = "point") 66 | #dai.histogram(cc_dai, variable_name = 'LIMIT_BAL', number_of_bars = 5) 67 | 68 | ## Linear Regression 69 | dai.loess_regression_plot(cc_dai, x_variable_name = 'BILL_AMT1', y_variable_name = 'BILL_AMT2' ) 70 | #dai.linear_regression_plot(cc_dai, x_variable_name = 'PAY_AMT1', y_variable_name = 'PAY_AMT2' ) 71 | 72 | ############################################################################################# 73 | ########## DAI Split Dataset ########### 74 | ############################################################################################# 75 | 76 | dai.split_dataset( 77 | dataset = cc_dai, 78 | output_name1 = 'CreditCardRe_Train', 79 | output_name2 = 'CreditCardRe_Test', 80 | ratio = 0.8, 81 | seed = 1234, 82 | target = 'default.payment.next.month', 83 | fold_col = NULL, 84 | time_col = NULL, 85 | progress = TRUE 86 | ) 87 | View(dai.list_datasets()) 88 | 89 | train_dai_frame = dai.get_frame(key = '1cd2a352-63cf-11eb-831f-0242ac110002') 90 | test_dai_frame = dai.get_frame(key = '1cd2cf12-63cf-11eb-831f-0242ac110002') 91 | 92 | View(train_dai_frame) 93 | View(train_dai_frame$columns) 94 | 95 | ############################################################################################# 96 | ########## DAI New Experiment ########### 97 | ############################################################################################# 98 | 99 | View(dai.list_models()) 100 | 101 | default_model = dai.train(training_frame = train_dai_frame, 102 | target_col = 'default.payment.next.month', 103 | is_classification = TRUE, 104 | experiment_name = 'Default', 105 | testing_frame = test_dai_frame) 106 | 107 | simple_model = dai.train(training_frame = train_dai_frame, 108 | target_col = 'default.payment.next.month', 109 | is_classification = TRUE, 110 | testing_frame = test_dai_frame, 111 | scorer = 'F1', 112 | accuracy = 1, 113 | time = 1, 114 | interpretability = 10, 115 | experiment_name = 'Basic') 116 | 117 | 118 | glm_model= dai.train(training_frame = train_dai_frame, 119 | target_col = 'default.payment.next.month', 120 | is_classification = TRUE, 121 | testing_frame = test_dai_frame, 122 | scorer = 'AUC', 123 | accuracy = 1, 124 | time = 1, 125 | interpretability = 10, 126 | experiment_name = 'Config_Override', 127 | 
config_overrides = c('make_autoreport = true', 128 | 'autodoc_population_stability_index = true', 129 | 'enable_glm="on"', 130 | 'enable_decision_tree="off"', 131 | 'enable_xgboost_gbm = "off"', 132 | 'enable_lightgbm = "off"', 133 | 'make_python_scoring_pipeline = "off"', 134 | 'make_mojo_scoring_pipeline = "off"' 135 | )) 136 | View(dai.list_models()) 137 | 138 | # suggested_params = dai.suggest_model_params( 139 | # training_frame = train_dai_frame, 140 | # target_col = 'default.payment.next.month', 141 | # is_classification = TRUE, 142 | # is_timeseries = FALSE, 143 | # is_image = FALSE, 144 | # config_overrides = "", 145 | # cols_to_drop = NULL 146 | # ) 147 | # 148 | # View(suggested_params) 149 | # suggested_params_model = do.call(dai.train, suggested_params) 150 | 151 | View(dai.list_models()) 152 | fetched_model = dai.get_model(key = 'c1224714-63d4-11eb-831f-0242ac110002') 153 | dai.set_model_desc(fetched_model, 'prod_model') 154 | #dai.rm(fetched_model) 155 | 156 | 157 | ############################################################################################# 158 | ########## DAI Reuse/Refit a Model ########### 159 | ############################################################################################# 160 | 161 | View(dai.list_models()) 162 | 163 | summary(fetched_model) 164 | 165 | another_expert_model= dai.train(training_frame = train_dai_frame, 166 | target_col = 'default.payment.next.month', 167 | is_classification = TRUE, 168 | testing_frame = test_dai_frame, 169 | scorer = 'AUCPR', 170 | experiment_name = 'NewExpSameParams', 171 | resumed_model = fetched_model, 172 | resume_method = 'same') 173 | ### ^^ When trying new experiments with same parameters, config_override changes are NOT used 174 | 175 | refit_expert_model= dai.train(training_frame = cc_dai, 176 | target_col = 'default.payment.next.month', 177 | is_classification = TRUE, 178 | testing_frame = test_dai_frame, 179 | scorer = 'MCC', 180 | accuracy = 1, 181 | time = 0, 182 | interpretability = 10, 183 | experiment_name = 'RefitFinalModel', 184 | resumed_model = fetched_model, 185 | resume_method = 'refit') 186 | 187 | ### ^^ When refitting final model, time setting is forced to 0 188 | 189 | 190 | ############################################################################################# 191 | ########## Retrieving / Downloading Artefacts ########### 192 | ############################################################################################# 193 | 194 | View(dai.list_models()) 195 | 196 | final_model = dai.get_model(key = '0e856d32-63db-11eb-831f-0242ac110002') 197 | 198 | ##### Predictions ##### 199 | dai.autoreport(final_model, path = "../", force = TRUE, progress = TRUE) 200 | 201 | ##### Predictions ##### 202 | dai.download_file(final_model$train_predictions_path, dest_path = "../", force = TRUE, progress = TRUE) 203 | dai.download_file(final_model$test_predictions_path, dest_path = "../", force = TRUE, progress = TRUE) 204 | 205 | ##### Summary and Log Files ##### 206 | dai.download_file(final_model$summary_path, dest_path = ".", force = TRUE, progress = TRUE) 207 | dai.download_file(final_model$log_file_path, dest_path = ".", force = TRUE, progress = TRUE) 208 | 209 | ##### Download MOJO ##### 210 | dai.download_mojo(final_model, path = getwd(), force = TRUE, progress = TRUE) 211 | 212 | 213 | ############################################################################################# 214 | ########## MLI Interpretation - CAUTION - Low Level Code / BUG ########### 215 | 
############################################################################################# 216 | 217 | # library(jsonlite) 218 | # 219 | # dai.interpret_model <- function(model, dataset, target_col, progress = TRUE) { 220 | # print(model$key) 221 | # print(dataset$key) 222 | # key <- dai:::.dai_do_rpc("api_run_interpretation", list("interpret_params" = list( 223 | # dai_model = list(key = unbox(model$key), display_name = unbox(model$description)), 224 | # dataset = list(key = unbox(dataset$key), display_name = unbox(dataset$name)), 225 | # target_col = unbox(target_col), 226 | # use_raw_features = unbox(TRUE), 227 | # prediction_col = unbox(''), 228 | # weight_col = unbox(''), 229 | # drop_cols = list(), 230 | # klime_cluster_col = unbox(''), 231 | # nfolds = unbox(0), 232 | # sample = unbox(TRUE), 233 | # sample_num_rows = unbox(-1), 234 | # qbin_cols = list(), 235 | # qbin_count = unbox(0), 236 | # lime_method = unbox("k-LIME"), 237 | # dt_tree_depth = unbox(3), 238 | # vars_to_pdp = unbox(10), 239 | # config_overrides = NULL, 240 | # dia_cols = list() 241 | # ))) 242 | # 243 | # print("key is set") 244 | # print(key) 245 | # 246 | # return(dai:::wait_for_job(function() dai:::get_interpretation_job(key), progress = progress)$entity) 247 | # } 248 | # 249 | # mli <- dai.interpret_model(final_model, train_dai_frame, 'default.payment.next.month') 250 | 251 | 252 | 253 | 254 | 255 | 256 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/R/Shiny_Example/3_DAI_Model_Prediction.R: -------------------------------------------------------------------------------- 1 | # https://support.rstudio.com/hc/en-us/articles/200486138-Changing-R-versions-for-RStudio-desktop 2 | 3 | ## http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/r_install_client.html#prerequisites 4 | 5 | #install.packages('curl') #given incorrectly as rcurl 6 | #install.packages('jsonlite') 7 | #install.packages('rlang') 8 | #install.packages('methods') 9 | 10 | ############################################################################################# 11 | ########## INSTALL DRIVERLESSAI R CLIENT ########### 12 | ############################################################################################# 13 | getwd() 14 | setwd("/Users/felix/Code/h2oai/driverlessai-tutorials/scoring-pipeline-deployment/R/Shiny_Example/") 15 | #install.packages('dai_1.9.1.tar.gz', type = 'source', repos = NULL) 16 | library(dai) 17 | 18 | 19 | 20 | ############################################################################################# 21 | ########## DAI Connect ########### 22 | ############################################################################################# 23 | url = 'http://ec2-54-204-68-13.compute-1.amazonaws.com:12345' 24 | username = 'h2oai' 25 | password = 'i-0f244cddd419191cd' 26 | dai.connect(uri = url, username = username, password = password) 27 | 28 | ############################################################################################# 29 | ########## DAI Model Prediction ########### 30 | ############################################################################################# 31 | 32 | View(dai.list_models()) 33 | final_model = dai.get_model(key = '0e856d32-63db-11eb-831f-0242ac110002') 34 | 35 | new_data = read.csv("CreditCardRe_Test.csv") 36 | new_data_dai = as.DAIFrame(new_data) 37 | preds = predict(final_model, newdata = new_data_dai) 38 | 39 | pred_shap_contribs = predict(final_model, newdata = new_data_dai, pred_contribs = 
TRUE) 40 | pred_orig_contribs = predict(final_model, newdata = new_data_dai, pred_contribs = TRUE, pred_contribs_original = TRUE) 41 | 42 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/R/Shiny_Example/4_MOJO_Predictions.R: -------------------------------------------------------------------------------- 1 | ############################################################################################# 2 | ########## MOJO Model Prediction ########### 3 | ############################################################################################# 4 | rm(list = ls()) # remove all objects including dai 5 | #install dependencies and daimojo package 6 | #install.packages('Rcpp') 7 | #install.packages("~/Downloads/daimojo_2.5.8_x86_64-darwin.tar.gz", type = 'source', repos=NULL) 8 | 9 | #install.packages('data.table') 10 | getwd() 11 | #setwd("/Users/felix/Code/h2oai/driverlessai-tutorials/scoring-pipeline-deployment/R/Shiny_Example/") 12 | 13 | library(daimojo) 14 | library(data.table) 15 | ### set DRIVERLESS_AI_LICENSE_KEY 16 | #Sys.setenv("DRIVERLESS_AI_LICENSE_KEY"="paste your key here") 17 | Sys.getenv("DRIVERLESS_AI_LICENSE_KEY") 18 | model = daimojo::load.mojo("mojo-pipeline/pipeline.mojo") 19 | daimojo::create.time(model) 20 | daimojo::feature.names(model) 21 | col_class <- setNames(daimojo::feature.types(model), daimojo::feature.names(model)) 22 | daimojo::feature.types(model) 23 | daimojo::missing.values(model) 24 | daimojo::uuid(model) 25 | 26 | new_data <- fread("./mojo-pipeline/example.csv", colClasses=col_class, header=TRUE, sep=",") 27 | str(new_data) 28 | 29 | daimojo::predict.mojo(m = model, newdata = new_data) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/R/Shiny_Example/Data_Preprocessing_for_app.R: -------------------------------------------------------------------------------- 1 | ################# 2 | 3 | 4 | library(dplyr) 5 | library(daimojo) 6 | options(scipen = 99999) 7 | 8 | #install.packages("daimojo_2.4.8_x86_64-darwin.tar.gz", repos = NULL, type = "source") # to be downloaded from DAI under "Download MOJO Piepline" 9 | # http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/scoring-pipeline-cpp.html#downloading-the-scoring-pipeline-runtimes 10 | 11 | #path set to driverlessai-tutorials 12 | setwd("/Users/felix/Code/h2oai/driverlessai-tutorials/scoring-pipeline-deployment/R/Shiny_Example/") 13 | dataset <- read.csv("CreditCard-train.csv", header = TRUE, stringsAsFactors = TRUE) 14 | target <- "default.payment.next.month" 15 | 16 | 17 | dataset$SEX <- ifelse(dataset$SEX == 1, "Female", "Male") 18 | table(dataset$SEX) 19 | 20 | dataset$EDUCATION[dataset$EDUCATION > 3] <- "Others" 21 | dataset$EDUCATION[dataset$EDUCATION == 0] <- "No Schooling" 22 | dataset$EDUCATION[dataset$EDUCATION == 1] <- "Graduate School" 23 | dataset$EDUCATION[dataset$EDUCATION == 2] <- "University" 24 | dataset$EDUCATION[dataset$EDUCATION == 3] <- "High School" 25 | 26 | 27 | table(dataset$EDUCATION) 28 | 29 | dataset$MARRIAGE[dataset$MARRIAGE == 0] <- "Others" 30 | dataset$MARRIAGE[dataset$MARRIAGE == 1] <- "Married" 31 | dataset$MARRIAGE[dataset$MARRIAGE == 2] <- "Single" 32 | dataset$MARRIAGE[dataset$MARRIAGE == 3] <- "Others" 33 | 34 | table(dataset$MARRIAGE) 35 | 36 | 37 | class_vec <- data.frame(sapply(dataset, class)) 38 | class_vec$columns <- rownames(class_vec) 39 | rownames(class_vec) <- NULL 40 | 41 | colnames(class_vec) <- 
c("class", "variables") 42 | class_vec$class <- as.character(class_vec$class) 43 | 44 | class_vec <- class_vec[class_vec$variables != target, ] 45 | 46 | class_vec <- class_vec[class_vec$variables != "ID", ] 47 | 48 | int_cols <- class_vec$variables[class_vec$class %in% c("integer", "numeric")] 49 | 50 | for(i in int_cols ){ 51 | if(length(unique(dataset[,i])) < 50) 52 | class_vec$class[class_vec$variables == i] <- "numeric_cat" 53 | 54 | } 55 | 56 | summary(dataset) 57 | str(dataset) 58 | 59 | dataset$LIMIT_BAL <- as.numeric(dataset$LIMIT_BAL) #present as integer 60 | dataset$PAY_0 <- as.numeric(dataset$PAY_0) #present as integer 61 | dataset$PAY_2 <- as.numeric(dataset$PAY_2) #present as integer 62 | dataset$PAY_3 <- as.numeric(dataset$PAY_3) #present as integer 63 | dataset$PAY_4 <- as.numeric(dataset$PAY_4) #present as integer 64 | dataset$PAY_5 <- as.numeric(dataset$PAY_5) #present as integer 65 | dataset$PAY_6 <- as.numeric(dataset$PAY_6) #present as integer 66 | dataset$PAY_AMT1 <- as.numeric(dataset$PAY_AMT1) #present as integer 67 | dataset$PAY_AMT2 <- as.numeric(dataset$PAY_AMT2) #present as integer 68 | dataset$PAY_AMT3 <- as.numeric(dataset$PAY_AMT3) #present as integer 69 | dataset$PAY_AMT4 <- as.numeric(dataset$PAY_AMT4) #present as integer 70 | dataset$PAY_AMT5 <- as.numeric(dataset$PAY_AMT5) #present as integer 71 | dataset$PAY_AMT6 <- as.numeric(dataset$PAY_AMT6) #present as integer 72 | dataset$BILL_AMT1 <- as.numeric(dataset$BILL_AMT1) #present as integer 73 | dataset$BILL_AMT2 <- as.numeric(dataset$BILL_AMT2) #present as integer 74 | dataset$BILL_AMT3 <- as.numeric(dataset$BILL_AMT3) #present as integer 75 | dataset$BILL_AMT4 <- as.numeric(dataset$BILL_AMT4) #present as integer 76 | dataset$BILL_AMT5 <- as.numeric(dataset$BILL_AMT5) #present as integer 77 | dataset$BILL_AMT6 <- as.numeric(dataset$BILL_AMT6) #present as integer 78 | 79 | #int_cols <- class_vec$variables[class_vec$class %in% c("integer", "numeric")] 80 | #numeric_cat_cols <- class_vec$variables[class_vec$class %in% c("numeric_cat")] 81 | #cat_cols <- class_vec$variables[class_vec$class %in% c("str", "factor", "character")] 82 | #bool_cols <- class_vec$variables[class_vec$class %in% c("logical")] 83 | 84 | summary(dataset) 85 | 86 | predictor_colnames <- colnames(dataset)[colnames(dataset) != target] 87 | predictor_colnames <- predictor_colnames[predictor_colnames != "ID"] 88 | 89 | predictions_df <- read.csv("train_preds_custom.csv", header = TRUE) 90 | colnames(predictions_df)[27] <- "prob_pred" 91 | 92 | 93 | dataset_w_pred <- inner_join(dataset, predictions_df[, c("ID", "prob_pred")], by = "ID") 94 | colnames(dataset_w_pred)[25] <- "Actual_Target" 95 | dataset_w_pred$Actual_Target <- as.factor(dataset_w_pred$Actual_Target) 96 | 97 | # TODO - REMOVE LICENSE KEY 98 | Sys.setenv(DRIVERLESS_AI_LICENSE_KEY = paste0("paste your DAI License key here")) 99 | m <- daimojo::load.mojo("mojo-pipeline/pipeline.mojo") 100 | 101 | create.time(m) 102 | uuid(m) 103 | predict.mojo(m, dataset) 104 | 105 | daimojo::predict(m, dataset) 106 | 107 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/README.md: -------------------------------------------------------------------------------- 1 | Scoring Pipeline Deployment Examples 2 | ==================================== 3 | 4 | Driverless AI scoring pipelines can be deployed independently of the machine 5 | where Driverless AI is running. 
This essentially helps you to separate the
6 | concerns of Model Training from Model Deployment. This capability gives you
7 | immense flexibility on how you can deploy your scoring pipelines to production.
8 |
9 | This directory contains example code that shows how to deploy DAI
10 | scoring pipelines (python and mojo) on new machines.
11 |
12 | Refer to the `python` and `java` directories for detailed examples on how to deploy
13 | the corresponding pipeline in various scenarios.
14 |
15 |
16 | Disclaimer
17 | ----------
18 |
19 | The scoring pipeline wrapper code shared in this directory is created to provide you
20 | a sample starting point and is not intended to be directly deployed to production as is.
21 | You can use this starting point and build over it to solve your deployment needs ensuring
22 | that your security etc. requirements are met.
23 |
24 |
25 |
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/java/README.md: --------------------------------------------------------------------------------
1 | Mojo (Java) Scoring Pipeline Deployment Examples
2 | ================================================
3 |
4 | To be created...
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/centos/docker/Dockerfile: --------------------------------------------------------------------------------
1 | FROM centos:centos7
2 |
3 | # These commands run as root
4 | # Install base dependencies
5 | RUN yum -y update && \
6 | yum install -y epel-release && \
7 | yum -y groupinstall 'Development Tools' && \
8 | yum install -y openblas-devel openblas python36-virtualenv python36-pip wget unzip java && \
9 | ln -s /usr/bin/virtualenv-3.6 /usr/bin/virtualenv && \
10 | ln -s /usr/bin/pip-3.6 /usr/bin/pip && \
11 | ln -sf /usr/bin/python3 /usr/bin/python
12 |
13 | ENV LANG en_US.UTF-8
14 | ENV LANGUAGE en_US:en
15 | ENV LC_ALL en_US.UTF-8
16 | ENV HOME /home/newuser
17 |
18 | # Create new user
19 | RUN useradd -ms /bin/bash newuser
20 |
21 | # Create a new user to run the pipeline
22 | USER newuser
23 | WORKDIR /home/newuser
24 |
25 | # Commands below run as newuser
26 | COPY --chown=newuser:newuser payload/scorer.zip ./
27 | COPY --chown=newuser:newuser payload/license.sig .driverlessai/
28 |
29 | RUN unzip scorer.zip
30 |
31 | WORKDIR scoring-pipeline
32 |
33 | RUN bash run_example.sh
34 |
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/centos/docker/README.md: --------------------------------------------------------------------------------
1 | Python Scoring Pipeline Wrapper using Docker
2 | ============================================
3 |
4 | This directory contains sample code that explains the steps needed to deploy a python scoring pipeline
5 | obtained from H2O Driverless AI in a CentOS 7 docker container. This directory acts as the build
6 | context for the docker build step.
7 |
8 |
9 | Prerequisites
10 | -------------
11 |
12 | The following pre-requisites are needed
13 | - [Docker](https://www.docker.com/)
14 |
15 | Follow the installation instructions for your platform and get Docker CE (or EE) installed on the machine.
16 |
17 |
18 | Code Structure
19 | --------------
20 |
21 | The code assumes a directory structure as below:
22 |
23 | ```
24 | top-dir: A directory with the below structure. Name can be anything.
This is the build context for the docker build command
25 | - README.md: This file with the details you are reading
26 | - Dockerfile: The docker image build script
27 | - payload: A directory that contains files to be used in the docker container for deployment
28 | - scorer.zip: The DAI python scoring pipeline. (You need to put this file here)
29 | - license.sig: Valid Driverless AI license file. (You need to provide your license file here)
30 | ```
31 |
32 | Instructions
33 | ------------
34 |
35 | 1. Install Docker. Ensure you can invoke it using `docker version`. It should display the client and server versions of Docker
36 | 2. Change to `top-dir`, which contains the files as mentioned in the above section
37 | 3. Copy the scoring pipeline `scorer.zip` in the `payload` directory. You may need to create the `payload` directory.
38 | 4. Copy Driverless AI license `license.sig` in the `payload` directory
39 | 5. Issue the command `docker build -t scoretest .`. This will
40 | - Create a CentOS 7 based docker container
41 | - Install required dependencies, Python, etc.
42 | - Create a virtual environment for the scoring pipeline by installing all needed dependencies
43 | - Run `example.py` from the scoring pipeline
44 |
45 | As part of the build process you will see the scores being produced for the test data in `example.py`. This example
46 | shows how to use the DAI python scoring pipeline as a python module. There are other options like HTTP service and TCP service that can be created too.
47 |
48 | You can run the docker container in interactive mode and experiment with the HTTP and TCP endpoints.
49 |
50 | Execute the command `docker run -it --rm scoretest:latest`. Once connected you will be in the `scoring-pipeline` directory.
51 |
52 | To run `example.py` you can follow the below steps once you are inside the container
53 |
54 | ```
55 | . env/bin/activate # activate environment for required experiment
56 | python example.py # to run example.py manually
57 | ```
58 |
59 | Similarly, you can run the HTTP and TCP server python files too.
60 |
61 |
62 | Disclaimer
63 | ----------
64 |
65 | The scoring pipeline wrapper code shared in this directory is created to provide you
66 | a sample starting point and is not intended to be directly deployed to production as is.
67 | You can use this starting point and build over it to solve your deployment needs ensuring
68 | that your security etc. requirements are met.
69 |
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/centos/vagrant/README.md: --------------------------------------------------------------------------------
1 | Python Scoring Pipeline Wrapper using Vagrant
2 | =============================================
3 |
4 | This directory contains sample code that explains the steps needed to deploy a python scoring pipeline
5 | obtained from H2O Driverless AI in a CentOS 7 virtual machine in Vagrant.
6 |
7 |
8 | Prerequisites
9 | -------------
10 |
11 | The following pre-requisites are needed
12 | - [VirtualBox](https://www.virtualbox.org/): A free virtualization provider
13 | - [Vagrant](https://www.vagrantup.com/): A tool for building and managing virtual machines
14 | - [Vagrant Disk Resize plugin](https://github.com/sprotheroe/vagrant-disksize): A Vagrant plugin to manage disk sizes
15 |
16 | Follow the installation instructions for your platform and get them installed in the above order.
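Before moving on to the code layout, it is worth confirming that the tooling is actually available on your PATH; catching a problem here avoids a failed `vagrant up` later. A minimal sanity check, assuming default installations of VirtualBox and Vagrant on the host, looks like this:

```
# Confirm the prerequisites are installed and visible on the PATH
VBoxManage --version        # VirtualBox command line tool, prints the installed version
vagrant --version           # Vagrant version
vagrant plugin list         # should include vagrant-disksize
# If the plugin is missing, install it:
vagrant plugin install vagrant-disksize
```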
17 |
18 |
19 | Code Structure
20 | --------------
21 |
22 | The code assumes a directory structure as below:
23 |
24 | ```
25 | top-dir: A directory with the below structure. Name of directory can be anything.
26 | - README.md: This file with the details you are reading
27 | - Vagrantfile: File providing the definition of the virtual machine to create using Vagrant
28 | - bootstrap.sh: The shell provisioner, installs core CentOS packages
29 | - payload.sh: Shell provisioner, installs Miniconda, creates scoring environment, runs pipeline
30 | - payload: A directory that contains files which can be used in the virtual machine for deployment
31 | - scorer.zip: The DAI python scoring pipeline. (You need to put this file here)
32 | - license.sig: Valid Driverless AI license file. (You need to provide your license file here)
33 | ```
34 |
35 | Instructions
36 | ------------
37 |
38 | 1. Install VirtualBox
39 | 2. Install Vagrant. Ensure you can invoke it using `vagrant --version`
40 | 3. Install Vagrant Disk Size plugin `vagrant plugin install vagrant-disksize`
41 | 4. Go to `top-dir`, which contains the files as mentioned in the above section
42 | 5. Copy the scoring pipeline `scorer.zip` in the `payload` directory. You may need to create the `payload` directory.
43 | 6. Copy Driverless AI license `license.sig` in the `payload` directory
44 | 7. Issue the command `vagrant up`. This will
45 | - Create a CentOS 7 based virtual machine
46 | - Bootstrap it, i.e. install all dependencies, Miniconda, Python, etc.
47 | - Create a conda environment for the scoring pipeline by installing all needed dependencies
48 | - Run `example.py` from the scoring pipeline
49 |
50 | You can SSH to the machine using the command `vagrant ssh` from the `top-dir` directory. Once connected it is like
51 | working on any CentOS terminal.
52 |
53 | To run `example.py` you can follow the below steps once you are connected using SSH
54 |
55 | ```
56 | conda env list # shows conda environments available on the system
57 | conda activate environment_name # activate environment for required experiment (experiment key is in name)
58 | python example.py # to run example.py manually
59 | ```
60 |
61 | Similarly, you can run the HTTP and TCP server python files too.
62 |
63 | Multiple Deployments on same Host
64 | ---------------------------------
65 |
66 | Each DAI experiment python deployment pipeline should be contained in its own virtual python environment.
67 | We support both `conda` and `pip + virtualenv` based virtual environments. This separation enables flexibility
68 | to have multiple experiment scoring pipelines to be deployed on the same machine without interfering with
69 | each other.
70 |
71 |
72 | Disclaimer
73 | ----------
74 |
75 | The scoring pipeline wrapper code shared in this directory is created to provide you
76 | a sample starting point and is not intended to be directly deployed to production as is.
77 | You can use this starting point and build over it to solve your deployment needs ensuring
78 | that your security etc. requirements are met.
79 |
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/centos/vagrant/Vagrantfile: --------------------------------------------------------------------------------
1 | # -*- mode: ruby -*-
2 | # vi: set ft=ruby :
3 |
4 | Vagrant.configure("2") do |config|
5 | # More boxes at https://vagrantcloud.com/search.
6 | config.vm.box = "centos/7" 7 | config.vm.network "private_network", ip: "192.168.33.10" 8 | # config.vm.network "forwarded_port", guest: 80, host: 8080 9 | 10 | # HDD size for guest machine 11 | config.disksize.size = '10GB' 12 | 13 | config.vm.provider "virtualbox" do |vb| 14 | vb.memory = "8192" 15 | end 16 | 17 | # Provisioning 18 | # File 19 | config.vm.provision "file", source: "payload/scorer.zip", destination: "/home/vagrant/scorer.zip" 20 | config.vm.provision "file", source: "payload/license.sig", destination: "/home/vagrant/.driverlessai/license.sig" 21 | 22 | # Shell - bootstraping 23 | config.vm.provision "shell", path: "bootstrap.sh", name: "bootstrap", privileged: true 24 | # Shell - user install 25 | config.vm.provision "shell", path: "payload.sh", name: "payload", privileged: false 26 | end 27 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/centos/vagrant/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | yum -y update 4 | yum -y groupinstall 'Development Tools' 5 | yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm 6 | yum -y install openblas-devel openblas python36-virtualenv python36-pip 7 | 8 | # create links 9 | ln -s /usr/bin/virtualenv-3.6 /usr/bin/virtualenv 10 | ln -s /usr/bin/pip-3.6 /usr/bin/pip 11 | ln -sf /usr/bin/python3 /usr/bin/python 12 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/centos/vagrant/payload.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | unzip scorer.zip && cd scoring-pipeline 4 | bash run_example.sh --pm pip -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/centos/vagrant/payload/README.md: -------------------------------------------------------------------------------- 1 | Payload Directory 2 | ================= 3 | 4 | Put the following two files in this directory 5 | 6 | - `scorer.zip` 7 | - `license.sig` 8 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/pyspark/README.md: -------------------------------------------------------------------------------- 1 | Python Scoring Pipeline using PySpark 2 | ============================================ 3 | 4 | This directory contains sample code that explains the steps needed to deploy a python scoring pipeline 5 | obtained from H2O Driverless AI on a Spark cluster. 6 | 7 | 8 | Prerequisites 9 | ------------- 10 | 11 | The following pre-requisites are needed. 12 | 1. Conda and [conda-pack](https://conda.github.io/conda-pack/) installed. This is needed to build Python environment/code to distribute among cluster. 13 | - To install conda-pack: `conda install -c conda-forge conda-pack` 14 | 2. Install `openblas` on all nodes (driver and executors that will run the Python code). 15 | - Install openblas on Spark driver and all executors: 16 | a. CentOS: `sudo yum install -y openblas-devel` or use rpm 0.3.3: https://centos.pkgs.org/7/epel-x86_64/openblas-0.3.3-2.el7.x86_64.rpm.html 17 | b. Ubuntu: `sudo apt-get install libopenblas-dev` 18 | 3. Install git on Spark driver and all executors, e.g. 
`sudo yum install git` 19 | 20 | Code Structure 21 | -------------- 22 | 23 | The process assumes a directory structure as below: 24 | 25 | ``` 26 | top-dir: A directory with the below structure. This example uses the home directory of current user. 27 | - README.md: This file with the details you are reading 28 | - py_scorer_testing: A directory that contains files to be used for deployment 29 | - scorer.zip: The DAI python scoring pipeline. (You need to put this file here to extract files needed below) 30 | - license.sig: Valid Driverless AI license file. (You need to provide your license file here) 31 | - get_predictions.py: PySpark script (example given) used for running batch scoring 32 | - py_scorer_env.tar.gz: conda-pack generated following instructions below 33 | - dai_contrib.tar.gz: (optional) compressed tmp folder generated following instructions below (necessary if your model used custom recipes) 34 | ``` 35 | 36 | Instructions 37 | ------------ 38 | 39 | 1. Upload Python Scoring Pipeline (scorer.zip) and license.sig onto Spark driver. 40 | 2. Copy your input_dataset.csv to HDFS for the cluster to access. Or, if using spark locally, store the dataset on the driver. 41 | 3. Create scorer folder and unzip scorer on Spark driver. 42 | `mkdir py_scorer_testing` 43 | `cd py_scorer_testing` 44 | Move scorer.zip into py_scorer_testing 45 | `unzip scorer.zip` 46 | 47 | 4. Create Python Env using environment.yml found in scorer.zip: 48 | `conda env create --name py_scorer_env -f scoring-pipeline/environment.yml` 49 | `conda activate py_scorer_env` 50 | If model was created before DAI 1.8.5, you will need to install gitdb: 51 | `pip install --upgrade gitdb2==2.0.6 gitdb==0.6.4` 52 | 53 | 5. Create conda-pack of new Env: 54 | `cd py_scorer_testing` 55 | `conda env list` OR `conda list` 56 | `conda pack -n py_scorer_env -o py_scorer_env.tar.gz` 57 | 58 | 6. Create tar.gz of DAI’s tmp folder (this step is necessary if your model used custom recipes) 59 | `tar -czvf dai_contrib.tar.gz -C scoring-pipeline/tmp/contrib .` 60 | Note that you cannot use tmp due to conflict of Spark already having tmp folder 61 | 62 | 7. Download `get_predictions.py` from this repo and add to `py_scorer_testing` folder 63 | 64 | 8. Set up env vars (some may not be needed for YARN cluster mode) 65 | `export ARROW_PRE_0_15_IPC_FORMAT=1` (due to [pyarrow issue](https://stackoverflow.com/questions/58269115/how-to-enable-apache-arrow-in-pyspark)) 66 | `export DRIVERLESS_AI_LICENSE_FILE=~/py_scorer_testing/license.sig` 67 | `export PYSPARK_PYTHON=./py_scorer_env/bin/python` 68 | `export SPARK_HOME=/path/to/spark` (e.g. ~/spark/spark-2.4.5-bin-hadoop2.7) 69 | `export HADOOP_CONF_DIR=/etc/hadoop/conf` (may need to modify if don't have default hadoop path) 70 | 71 | 9. Run `kinit` if Hadoop is secured with Kerberos 72 | 73 | 10. cd into conda envs, e.g. `cd ~/miniconda3/envs` 74 | 75 | 11. 
Submit Spark Job `get_predictions.py` 76 | ``` 77 | PYTHONIOENCODING=utf8 \ 78 | PYSPARK_PYTHON=./py_scorer_env/bin/python \ 79 | spark-submit \ 80 | --master yarn \ 81 | --deploy-mode cluster \ 82 | --num-executors 2 --driver-memory 2g --executor-memory 4g \ 83 | --archives ../../py_scorer_testing/py_scorer_env.tar.gz#py_scorer_env,../../py_scorer_testing/dai_contrib.tar.gz#tmp/contrib \ 84 | --conf spark.executorEnv.PATH=`echo $PATH` \ 85 | --conf spark.executorEnv.PYSPARK_PYTHON=./py_scorer_env/bin/python \ 86 | --conf spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT=1 \ 87 | --conf spark.executorEnv.PYTHONIOENCODING=utf8 \ 88 | --conf spark.yarn.appMasterEnv.PYTHONIOENCODING=utf8 \ 89 | --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=./py_scorer_env/bin/python \ 90 | --conf spark.executorEnv.DRIVERLESS_AI_LICENSE_KEY=`cat ~/py_scorer_testing/license.sig` \ 91 | --conf spark.driver.maxResultSize=2g \ 92 | ~/py_scorer_testing/get_predictions.py hdfs:///user/path/to/input_dataset.csv hdfs:///user/path/to/output_dataset.csv 93 | ``` 94 | Note: utf encodings (used above) may be needed for certain NLP models and spark.executorEnv.PATH for initialization 95 | 96 | Disclaimer 97 | ---------- 98 | 99 | The scoring pipeline wrapper code shared in this directory is created to provide you 100 | a sample starting point and is not intended to be directly deployed to production as is. 101 | You can use this starting point and build over it to solve your deployment needs ensuring 102 | that your security etc. requirements are met. 103 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/pyspark/get_predictions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark.sql import SparkSession 3 | from pyspark import SparkFiles 4 | spark = SparkSession.builder.getOrCreate() 5 | 6 | input_path = sys.argv[1] 7 | output_path = sys.argv[2] 8 | df = spark.read.csv(input_path, header=True, inferSchema=True) 9 | names = df.columns 10 | 11 | import pandas as pd 12 | from pyspark.sql.functions import col, pandas_udf, size 13 | from pyspark.sql.types import DoubleType, ArrayType 14 | 15 | def predict(*series) -> pd.Series: 16 | import pandas as pd 17 | import numpy as np 18 | from numpy import nan 19 | from scipy.special._ufuncs import expit 20 | from scoring_h2oai_experiment_336ccd12_cbb4_11ea_8496_ac1f6b68b7be import Scorer # update with your key 21 | scorer = Scorer() 22 | merged = pd.concat(series, axis=1) 23 | merged.columns = names 24 | output = scorer.score_batch(merged) 25 | return pd.Series(output.values.tolist()) 26 | 27 | 28 | predict_udf = pandas_udf(predict, returnType=ArrayType(DoubleType())) 29 | columns = [col(name) for name in df.columns] 30 | withPredictions = df.withColumn("prediction", predict_udf(*columns)) 31 | 32 | # If working with multi-class, can expand prediction, e.g. 3 classes: 33 | num_cols = withPredictions.withColumn("size", size(col("prediction"))).agg({"size": "max"}).head()[0] # To be performant, specify the value, e.g. 
num_cols=3 34 | withPredictions = withPredictions.select(col("*"), *(col('prediction').getItem(i).alias(f'prediction_{i}') for i in range(num_cols))) 35 | withPredictions = withPredictions.drop(col("prediction")) -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/README.md: -------------------------------------------------------------------------------- 1 | Python Scoring Pipeline Deployment Examples 2 | =========================================== 3 | 4 | Driverless AI scoring pipelines can be deployed independently of the machine 5 | where Driverless AI is running. This essentially helps you to separate the 6 | concerns of Model Training from Model Deployment. This capability gives you 7 | immense flexibility on how you can deploy your scoring pipelines to production. 8 | 9 | This directory lists example code that shows how to deploy Python Scoring Pipeline 10 | in various scenarios 11 | 12 | Bare-metal or Virtual Linux Environments 13 | ---------------------------------------- 14 | 15 | The `vagrant` directory contains example code that explains how to get DAI 16 | python scoring pipeline installed and running on a Ubuntu 18.04 linux. The example 17 | uses Ubuntu 10.04 running on Virtualbox managed via Vagrant. The example can be 18 | used the understand the steps needed to get the scoring pipeline working, which 19 | can be adjusted per your scenarios. 20 | 21 | 22 | Containerised Environments 23 | -------------------------- 24 | 25 | The `docker` directory contains example code to show how to create a Ubuntu 18.04 26 | based container that can be used to deploy the python scoring pipeline. 27 | 28 | 29 | Disclaimer 30 | ---------- 31 | 32 | The scoring pipeline wrapper code shared in this directory is created to provide you 33 | a sample starting point and is not intended to be directly deployed to production as is. 34 | You can use this starting point and build over it to solve your deployment needs ensuring 35 | that your security etc. requirements are met. 
36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/docker/.gitignore: -------------------------------------------------------------------------------- 1 | .idea -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:bionic 2 | 3 | # These commands run as root 4 | # Install base dependencies 5 | RUN apt-get update && \ 6 | apt install -y \ 7 | build-essential \ 8 | libmagic-dev \ 9 | libopenblas-dev \ 10 | git \ 11 | locales \ 12 | unzip \ 13 | wget 14 | 15 | RUN locale-gen en_US.UTF-8 16 | ENV LANG en_US.UTF-8 17 | ENV LANGUAGE en_US:en 18 | ENV LC_ALL en_US.UTF-8 19 | ENV HOME /home/newuser 20 | 21 | # Create new user 22 | RUN useradd -ms /bin/bash newuser 23 | 24 | # Create a new user to run the pipeline 25 | USER newuser 26 | WORKDIR /home/newuser 27 | 28 | # Commands below run as newuser 29 | COPY --chown=newuser:newuser payload/scorer.zip ./ 30 | COPY --chown=newuser:newuser payload/license.sig .driverlessai/ 31 | 32 | # install Miniconda 33 | RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ 34 | bash miniconda.sh -b -p $HOME/miniconda3 && \ 35 | echo 'export PATH=$HOME/miniconda3/bin:$PATH' >> .bashrc && \ 36 | unzip scorer.zip 37 | 38 | WORKDIR scoring-pipeline 39 | 40 | RUN export PATH="$HOME/miniconda3/bin:$PATH" && \ 41 | bash run_example.sh --pm conda -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/docker/Dockerfile-pip-batch: -------------------------------------------------------------------------------- 1 | FROM ubuntu:bionic 2 | 3 | # Similar to Dockerfile, but uses PIP to install dependencies without creating environment 4 | # No user is created. Installs as root. 
5 | # Use as example code and modify as needed 6 | 7 | # These commands run as root 8 | # Install base dependencies 9 | RUN apt-get update && \ 10 | apt install -y \ 11 | build-essential \ 12 | libmagic-dev \ 13 | libopenblas-dev \ 14 | openjdk-8-jre \ 15 | git \ 16 | locales \ 17 | python3-pip python3-virtualenv\ 18 | unzip \ 19 | wget 20 | 21 | RUN locale-gen en_US.UTF-8 22 | ENV LANG="en_US.UTF-8" 23 | ENV LANGUAGE="en_US:en" 24 | ENV LC_ALL="en_US.UTF-8" 25 | ENV HOME="/root" 26 | 27 | WORKDIR $HOME 28 | 29 | COPY payload/scorer.zip ./ 30 | COPY payload/license.sig .driverlessai/ 31 | 32 | RUN unzip scorer.zip 33 | 34 | COPY batch_scorer.py scoring-pipeline 35 | 36 | WORKDIR scoring-pipeline 37 | 38 | RUN python3 -m virtualenv -p python3.6 env && \ 39 | env/bin/python -m pip install --upgrade --upgrade-strategy only-if-needed pip==19.3.1 pkginfo==1.5.0.1 && \ 40 | env/bin/python -m pip install --upgrade --upgrade-strategy only-if-needed -r requirements.txt -c full_constraints.txt && \ 41 | env/bin/python -m pip uninstall -y tensorflow && \ 42 | env/bin/python -m pip uninstall -y tensorflow-gpu && \ 43 | env/bin/python -m pip install tensorflow==1.13.1 --upgrade --upgrade-strategy only-if-needed -c full_constraints.txt && \ 44 | tf_path=`env/bin/python -c "import os ; import importlib.util ; tf_loader = importlib.util.find_spec('tensorflow') ; print(os.path.dirname(tf_loader.origin))"` && \ 45 | rm -rf ${tf_path}_cpu && mv ${tf_path} ${tf_path}_cpu && \ 46 | env/bin/python -m pip install tensorflow_gpu==1.13.1 --upgrade --upgrade-strategy only-if-needed -c full_constraints.txt && \ 47 | rm -rf ${tf_path}_gpu && mv ${tf_path} ${tf_path}_gpu 48 | 49 | RUN import_statement=$(grep -E 'from scoring_h2oai_experiment' example.py) && \ 50 | sed -i "s/INJECT_EXPERIMENT_IMPORT/${import_statement}/g" batch_scorer.py 51 | 52 | CMD ["env/bin/python", "batch_scorer.py"] 53 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/docker/Dockerfile-pip-http: -------------------------------------------------------------------------------- 1 | FROM ubuntu:bionic 2 | 3 | # Similar to Dockerfile, but uses PIP to install dependencies without creating environment 4 | # No user is created. Installs as root. 
5 | # Use as example code and modify as needed 6 | 7 | # These commands run as root 8 | # Install base dependencies 9 | RUN apt-get update && \ 10 | apt install -y \ 11 | build-essential \ 12 | libmagic-dev \ 13 | libopenblas-dev \ 14 | openjdk-8-jre \ 15 | git \ 16 | locales \ 17 | python3-pip python3-virtualenv\ 18 | unzip \ 19 | wget 20 | 21 | RUN locale-gen en_US.UTF-8 22 | ENV LANG="en_US.UTF-8" 23 | ENV LANGUAGE="en_US:en" 24 | ENV LC_ALL="en_US.UTF-8" 25 | ENV HOME="/root" 26 | 27 | WORKDIR $HOME 28 | 29 | COPY payload/scorer.zip ./ 30 | COPY payload/license.sig .driverlessai/ 31 | 32 | RUN unzip scorer.zip 33 | 34 | WORKDIR scoring-pipeline 35 | 36 | RUN python3 -m virtualenv -p python3.6 env && \ 37 | env/bin/python -m pip install --upgrade --upgrade-strategy only-if-needed pip==19.3.1 pkginfo==1.5.0.1 && \ 38 | env/bin/python -m pip install --upgrade --upgrade-strategy only-if-needed -r requirements.txt -c full_constraints.txt && \ 39 | env/bin/python -m pip install --upgrade --upgrade-strategy only-if-needed -r http_server_requirements.txt -c full_constraints.txt && \ 40 | env/bin/python -m pip uninstall -y tensorflow && \ 41 | env/bin/python -m pip uninstall -y tensorflow-gpu && \ 42 | env/bin/python -m pip install tensorflow==1.13.1 --upgrade --upgrade-strategy only-if-needed -c full_constraints.txt && \ 43 | tf_path=`env/bin/python -c "import os ; import importlib.util ; tf_loader = importlib.util.find_spec('tensorflow') ; print(os.path.dirname(tf_loader.origin))"` && \ 44 | rm -rf ${tf_path}_cpu && mv ${tf_path} ${tf_path}_cpu && \ 45 | env/bin/python -m pip install tensorflow_gpu==1.13.1 --upgrade --upgrade-strategy only-if-needed -c full_constraints.txt && \ 46 | rm -rf ${tf_path}_gpu && mv ${tf_path} ${tf_path}_gpu 47 | 48 | EXPOSE 9090 49 | 50 | CMD ["env/bin/python", "http_server.py", "--port=9090"] 51 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/docker/README.md: -------------------------------------------------------------------------------- 1 | Python Scoring Pipeline Wrapper using Docker 2 | ============================================ 3 | 4 | This directory contains sample code that explains the steps needed to deploy a python scoring pipeline 5 | obtained from H2O Driverless AI in a Ubuntu 18.04 docker container. This directory acts as the build 6 | context for the docker build step. 7 | 8 | 9 | Prerequisites 10 | ------------- 11 | 12 | The following pre-requisites are needed 13 | - [Docker](https://www.docker.com/) 14 | 15 | Follow the installation instructions for your platform and get Docker Ce (or EE) installed on the machine. 16 | 17 | 18 | Code Structure 19 | -------------- 20 | 21 | The code assumes a directory structure as below: 22 | 23 | ``` 24 | top-dir: A directory with the below structure. Name can be anything. This is the build context for docker build command 25 | - README.md: This file with the details you are reading 26 | - Dockerfile: The docker image build script 27 | - payload: A directory that contains files to be used in the docker container for deployment 28 | - scorer.zip: The DAI python scoring pipeline. (You need to put this file here) 29 | - license.sig: Valid Driverless AI license file. (You need to provide your license file here) 30 | ``` 31 | 32 | Docker Container to expose HTTP REST endpoint for scoring 33 | --------------------------------------------------------- 34 | 35 | 1. Install Docker. Ensure you can invoke it using `docker version`. 
It should display the client and server versions of Docker
36 | 2. Change to `top-dir`, which contains the files as mentioned in the above section
37 | 3. Copy the scoring pipeline `scorer.zip` in the `payload` directory. You may need to create the `payload` directory.
38 | 4. Copy Driverless AI license `license.sig` in the `payload` directory
39 | 5. Issue the command `docker build -f Dockerfile-pip-http -t score_python_http .`. This will
40 | - Create a Ubuntu 18.04 based docker container
41 | - Install required system dependencies, Python 3.6, pip, etc.
42 | - Install all python package dependencies needed for the scoring pipeline to work
43 | - Run `http_server.py` from the scoring pipeline and expose the REST scoring server at port 9090
44 |
45 | Execute the command `docker run -p 9090:9090 score_python_http:latest` and you will notice the python scoring server start and accept connections.
46 |
47 | In the `scorer.zip` file you put in the `payload` directory there is a sample http client you can use to test this server. Extract the file `run_http_client.sh` and execute it while the docker container is still listening. You will see the predictions being returned.
48 |
49 | Docker Container for Batch scoring
50 | ----------------------------------
51 |
52 | 1. Install Docker. Ensure you can invoke it using `docker version`. It should display the client and server versions of Docker
53 | 2. Change to `top-dir`, which contains the files as mentioned in the above section
54 | 3. Copy the scoring pipeline `scorer.zip` in the `payload` directory. You may need to create the `payload` directory.
55 | 4. Copy Driverless AI license `license.sig` in the `payload` directory
56 | 5. Issue the command `docker build -f Dockerfile-pip-batch -t score_python_batch .`. This will
57 | - Create a Ubuntu 18.04 based docker container
58 | - Install required system dependencies, Python 3.6, pip, etc.
59 | - Install all python package dependencies needed for the scoring pipeline to work
60 | - Run the `batch_scorer.py` in the container such that it scores the file `/data/input.csv` and writes the predictions to `/data/output.csv`
61 |
62 | Execute the command `docker run -v some_dir_with_input_data:/data score_python_batch:latest`. Here `some_dir_with_input_data` is some directory on the machine where you are executing the docker run command. The file you want to score should be present in that directory with the name `input.csv`. The user executing the docker run command should have read and write permissions on the directory `some_dir_with_input_data` to be able to create the prediction output file `output.csv` in that same directory
63 | 6. Once `output.csv` is generated you can combine the two files column-wise in Linux using the command `paste -d ',' input.csv output.csv`
64 |
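Putting the batch-scoring steps together, an end-to-end run looks roughly like the sketch below. The directory `~/batch_data`, the input file `my_new_data.csv` and the combined file `scored.csv` are placeholder names used only for this illustration; the image tag matches the build command above:

```
# Build the batch scoring image from top-dir (payload/ must already contain scorer.zip and license.sig)
docker build -f Dockerfile-pip-batch -t score_python_batch .

# Stage the data to score; the container expects it at /data/input.csv
mkdir -p ~/batch_data
cp my_new_data.csv ~/batch_data/input.csv

# Score the file; predictions are written to ~/batch_data/output.csv
docker run -v ~/batch_data:/data score_python_batch:latest

# Combine the inputs and predictions column-wise
paste -d ',' ~/batch_data/input.csv ~/batch_data/output.csv > ~/batch_data/scored.csv
```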
65 | Disclaimer
66 | ----------
67 |
68 | The scoring pipeline wrapper code shared in this directory is created to provide you
69 | a sample starting point and is not intended to be directly deployed to production as is.
70 | You can use this starting point and build over it to solve your deployment needs ensuring
71 | that your security etc. requirements are met.
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/docker/batch_scorer.py: --------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from numpy import nan
4 | from scipy.special._ufuncs import expit
5 | import datatable as dt
6 | INJECT_EXPERIMENT_IMPORT  # placeholder token; the Dockerfile-pip-batch build replaces it with the experiment-specific Scorer import
7 |
8 | scorer = Scorer()
9 |
10 | input_dt = dt.fread("/data/input.csv", na_strings=['', '?', 'None', 'nan', 'NA', 'N/A', 'unknown', 'inf', '-inf', '1.7976931348623157e+308', '-1.7976931348623157e+308'])
11 | output_dt = scorer.score_batch(input_dt, apply_data_recipes=False)
12 | dt.Frame(output_dt).to_csv("/data/output.csv")
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/docker/payload/README.md: --------------------------------------------------------------------------------
1 | Payload Directory
2 | =================
3 |
4 | Put the following two files in this directory
5 |
6 | - `scorer.zip`
7 | - `license.sig`
8 |
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/vagrant/README.md: --------------------------------------------------------------------------------
1 | Python Scoring Pipeline Wrapper using Vagrant
2 | =============================================
3 |
4 | This directory contains sample code that explains the steps needed to deploy a python scoring pipeline
5 | obtained from H2O Driverless AI in a Ubuntu 18.04 virtual machine in Vagrant.
6 |
7 |
8 | Prerequisites
9 | -------------
10 |
11 | The following pre-requisites are needed
12 | - [VirtualBox](https://www.virtualbox.org/): A free virtualization provider
13 | - [Vagrant](https://www.vagrantup.com/): A tool for building and managing virtual machines
14 | - [Vagrant Disk Resize plugin](https://github.com/sprotheroe/vagrant-disksize): A Vagrant plugin to manage disk sizes
15 |
16 | Follow the installation instructions for your platform and get them installed in the above order.
17 |
18 |
19 | Code Structure
20 | --------------
21 |
22 | The code assumes a directory structure as below:
23 |
24 | ```
25 | top-dir: A directory with the below structure. Name of directory can be anything.
26 | - README.md: This file with the details you are reading
27 | - Vagrantfile: File providing the definition of the virtual machine to create using Vagrant
28 | - bootstrap.sh: The shell provisioner, installs core ubuntu packages
29 | - payload.sh: Shell provisioner, installs Miniconda, creates scoring environment, runs pipeline
30 | - payload: A directory that contains files which can be used in the virtual machine for deployment
31 | - scorer.zip: The DAI python scoring pipeline. (You need to put this file here)
32 | - license.sig: Valid Driverless AI license file. (You need to provide your license file here)
33 | ```
34 |
35 | Instructions
36 | ------------
37 |
38 | 1. Install VirtualBox
39 | 2. Install Vagrant. Ensure you can invoke it using `vagrant --version`
40 | 3. Install Vagrant Disk Size plugin `vagrant plugin install vagrant-disksize`
41 | 4. Go to `top-dir`, which contains the files as mentioned in the above section
42 | 5. Copy the scoring pipeline `scorer.zip` in the `payload` directory. You may need to create the `payload` directory.
43 | 6. Copy Driverless AI license `license.sig` in the `payload` directory
44 | 7. Issue the command `vagrant up`.
This will 45 | - Create a Ubuntu 18.04 based virtual machine 46 | - Bootstrap it i.e. install all dependencies, miniconda, python etc.. 47 | - Create a conda environment for the scoring pipeline by installing all needed dependencies 48 | - Run `example.py` from the scoring pipeline 49 | 50 | You can SSH to the machine using the command `vagrant ssh` from `top-dir` directory. Once connected it is like 51 | working on any Ubuntu terminal. 52 | 53 | To run `example.py` you can follow the below steps once you are connected using SSH 54 | 55 | ``` 56 | conda env list # shows conda environments available on the system 57 | conda activate environment_name # activate environment for required experiment (experiment key is in name) 58 | python example.py # to run example.py manually 59 | ``` 60 | 61 | Similarly, you can run the HTTP and TCP server python files too. 62 | 63 | Multiple Deployments on same Host 64 | --------------------------------- 65 | 66 | Each DAI experiment python deployment pipeline should be contained in its own virtual python environment. 67 | We support both `conda` and `pip + virtualenv` based virtual environments. This separation enables flexibility 68 | to have multiple experiment scoring pipelines to be deployed on the same machine without interfering with 69 | each other. 70 | 71 | 72 | Disclaimer 73 | ---------- 74 | 75 | The scoring pipeline wrapper code shared in this directory is created to provide you 76 | a sample starting point and is not intended to be directly deployed to production as is. 77 | You can use this starting point and build over it to solve your deployment needs ensuring 78 | that your security etc. requirements are met. 79 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/vagrant/Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | Vagrant.configure("2") do |config| 5 | # More boxes at https://vagrantcloud.com/search. 
6 | config.vm.box = "ubuntu/bionic64" 7 | config.vm.network "private_network", ip: "192.168.33.10" 8 | # config.vm.network "forwarded_port", guest: 80, host: 8080 9 | 10 | # HDD size for guest machine 11 | config.disksize.size = '10GB' 12 | 13 | config.vm.provider "virtualbox" do |vb| 14 | vb.memory = "8192" 15 | end 16 | 17 | # Provisioning 18 | # File 19 | config.vm.provision "file", source: "payload/scorer.zip", destination: "/home/vagrant/scorer.zip" 20 | config.vm.provision "file", source: "payload/license.sig", destination: "/home/vagrant/.driverlessai/license.sig" 21 | 22 | # Shell - bootstraping 23 | config.vm.provision "shell", path: "bootstrap.sh", name: "bootstrap", privileged: true 24 | # Shell - user install 25 | config.vm.provision "shell", path: "payload.sh", name: "payload", privileged: false 26 | end 27 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/vagrant/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt-get -y update 4 | apt-get -y upgrade 5 | apt-get -y install unzip build-essential libopenblas-dev 6 | 7 | 8 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/vagrant/payload.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh 4 | bash ~/miniconda.sh -b -p $HOME/miniconda3 5 | echo 'export PATH=$HOME/miniconda3/bin:$PATH' >> ~/.bashrc 6 | export PATH=$HOME/miniconda3/bin:$PATH 7 | unzip scorer.zip && cd scoring-pipeline 8 | bash run_example.sh --pm conda -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/vagrant/payload/README.md: -------------------------------------------------------------------------------- 1 | Payload Directory 2 | ================= 3 | 4 | Put the following two files in this directory 5 | 6 | - `scorer.zip` 7 | - `license.sig` 8 | --------------------------------------------------------------------------------