├── .gitignore ├── README.md ├── autoviz └── autoviz_client_example.ipynb ├── compliant_driverlessai ├── README.md ├── data │ ├── CreditCard-test.csv │ ├── compliance_maralutu.csv │ ├── dai_10_10_1_9cf011b9.csv │ ├── dai_6_4_6_nulafigi.csv │ └── mono_xgb_nisonote.csv └── notebooks │ ├── compliant_dia_gender.ipynb │ ├── compliant_dia_marriage.ipynb │ ├── dai_10_10_1_gender.ipynb │ ├── dai_10_10_1_marriage.ipynb │ ├── dai_6_4_6_gender.ipynb │ ├── dai_6_4_6_marriage.ipynb │ ├── mono_xgb_dia_gender.ipynb │ ├── mono_xgb_dia_marriage.ipynb │ └── requirements.txt ├── dai_installation ├── Aws │ ├── Rhel7.7.md │ └── images │ │ └── 01_compute_config.png ├── Azure │ ├── Ubuntu16.04.md │ └── images │ │ ├── 01_nvidia_driver_install.gif │ │ ├── 01_select_compute_ubuntu.png │ │ ├── 02_compute_config.png │ │ ├── 02_cuda_install.gif │ │ ├── 03_create_new_disk.png │ │ ├── 03_docker_install.gif │ │ ├── 04_dai_install_e13.gif │ │ ├── 04_new_disk_attached.png │ │ ├── 05_configure_networking.png │ │ └── 06_configure_nsg_ports_open.png └── README.md ├── dai_python_client ├── README.md ├── algorithm_family_comparison.ipynb └── common_workflow.ipynb ├── driverlessai_experiments ├── iid │ ├── credit_card_experiment │ │ ├── credit_card_default.ipynb │ │ ├── credit_card_models_by_accuracy_time_complexity.Rmd │ │ ├── credit_card_models_by_accuracy_time_complexity.html │ │ └── images │ │ │ ├── download_mojo.png │ │ │ ├── exp_running_creditcard.png │ │ │ ├── experiment_complete_creditcard.png │ │ │ ├── experiment_list_complete.png │ │ │ ├── experiment_list_running.png │ │ │ ├── import_data_sets_creditcard.png │ │ │ ├── mli_external.png │ │ │ ├── mli_list.png │ │ │ ├── model_diagnostics_complete.png │ │ │ ├── model_diagnostics_setup.png │ │ │ ├── py_client_link.png │ │ │ ├── set_columns_creditcard.png │ │ │ ├── set_parameters_creditcard.png │ │ │ ├── sign_in_home_page_0.png │ │ │ └── skip_sign_in_home_page_1.png │ ├── imbalanced │ │ ├── images │ │ │ ├── compare_weighted_experiments.png │ │ │ ├── py_client_link.png │ │ │ └── weighted_project.png │ │ └── imbalanced_experiment.ipynb │ ├── model_family_comparison │ │ └── model_family_comparison.ipynb │ └── reject_inference │ │ └── Reject_Inference_with_Fuzzy_Augment.Rmd ├── nlp │ ├── airline_sentiment_experiment │ │ ├── demo_nlp_airline_sentiment.ipynb │ │ └── nlp_airline_sentiment_mli.ipynb │ └── custom_word2vec_embeddings.ipynb ├── nlp_timeseries │ ├── imgs │ │ ├── coffee.gif │ │ ├── create_experiment.png │ │ ├── mapbox.png │ │ └── scpf_lb_progress.png │ ├── kaggle_see_click_predict_fix.ipynb │ └── predictions │ │ └── .gitkeep └── timeseries │ ├── stock_timeseries_experiment │ └── demo_stock_timeseries.ipynb │ ├── ts-full-pipeline │ ├── .gitignore │ ├── 01-generate-data.sh │ ├── 01_process_full_TS_csv.py │ ├── 02-create-experiment-data.sh │ ├── 02_extract_experiment_datasets.py │ ├── 03-default-experiment-configs.json │ ├── 03-run-experiment.sh │ ├── 03_run_experiment.py │ ├── 04-create-tta-scoring-files.sh │ ├── 04_generate_tta_files.py │ ├── 05-score-tta-files.sh │ ├── 05_score_tta_files.py │ ├── 10_plot_score_metric.py │ ├── 11_http_server2.py │ ├── README.md │ ├── environment.yml │ ├── images │ │ ├── TTA - Rolling Window.odp │ │ ├── TTA-RollWindow-duration.png │ │ ├── metrics_plot.png │ │ └── metrics_plot.svg │ └── ts-definition.json │ └── walmart_timeseries_experiment │ ├── images │ ├── import_data_sets_stock.png │ └── launching_experiment.png │ ├── timeseries_model_rollingwindow.ipynb │ └── training_timeseries_model.ipynb ├── interpretable_ml ├── DAIDIA.ipynb ├── 
FormatReasonCodes.ipynb ├── MLIDTSurrogate.ipynb ├── MLIPDPICE.ipynb ├── MLIResidualAnalysis.ipynb ├── MLISensitivityAnalysis.ipynb ├── README.md ├── TimeSeriesDiagnostics.ipynb └── data │ ├── credit_test.csv │ ├── credit_train.csv │ ├── default_of_credit_card_clients.xls │ ├── klime_frame.csv │ └── shapley.csv └── scoring-pipeline-deployment ├── R └── Shiny_Example │ ├── 1_Data_Recoding.R │ ├── 2_DAI_Interaction.R │ ├── 3_DAI_Model_Prediction.R │ ├── 4_MOJO_Predictions.R │ ├── CreditCard.csv │ ├── CreditCardRe.csv │ ├── Data_Preprocessing_for_app.R │ ├── full_app.R │ ├── simple_app.R │ └── train_preds_custom.csv ├── README.md ├── java └── README.md └── python ├── centos ├── docker │ ├── Dockerfile │ └── README.md └── vagrant │ ├── README.md │ ├── Vagrantfile │ ├── bootstrap.sh │ ├── payload.sh │ └── payload │ └── README.md ├── pyspark ├── README.md └── get_predictions.py └── ubuntu ├── README.md ├── docker ├── .gitignore ├── Dockerfile ├── Dockerfile-pip-batch ├── Dockerfile-pip-http ├── README.md ├── batch_scorer.py └── payload │ └── README.md └── vagrant ├── README.md ├── Vagrantfile ├── bootstrap.sh ├── payload.sh └── payload └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | # C extensions 6 | *.so 7 | # Distribution / packaging 8 | .Python 9 | build/ 10 | develop-eggs/ 11 | dist/ 12 | downloads/ 13 | eggs/ 14 | .eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | MANIFEST 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | # Installer logs 31 | pip-log.txt 32 | pip-delete-this-directory.txt 33 | # Unit test / coverage reports 34 | htmlcov/ 35 | .tox/ 36 | .coverage 37 | .coverage.* 38 | .cache 39 | nosetests.xml 40 | coverage.xml 41 | *.cover 42 | .hypothesis/ 43 | .pytest_cache/ 44 | # Translations 45 | *.mo 46 | *.pot 47 | # Django stuff: 48 | *.log 49 | local_settings.py 50 | db.sqlite3 51 | # Flask stuff: 52 | instance/ 53 | .webassets-cache 54 | # Scrapy stuff: 55 | .scrapy 56 | # Sphinx documentation 57 | docs/_build/ 58 | # PyBuilder 59 | target/ 60 | # Jupyter Notebook 61 | .ipynb_checkpoints 62 | # pyenv 63 | .python-version 64 | # celery beat schedule file 65 | celerybeat-schedule 66 | # SageMath parsed files 67 | *.sage.py 68 | # Environments 69 | .env 70 | .venv 71 | env/ 72 | venv/ 73 | ENV/ 74 | env.bak/ 75 | venv.bak/ 76 | # Spyder project settings 77 | .spyderproject 78 | .spyproject 79 | # Rope project settings 80 | .ropeproject 81 | # mkdocs documentation 82 | /site 83 | # mypy 84 | .mypy_cache/ 85 | # Scoring pipeline payloads 86 | scorer.zip 87 | mojo.zip 88 | # DAI license 89 | license.sig 90 | # VAgrant 91 | .vagrant 92 | .DS_Store 93 | .Rproj.user 94 | driverlessai-tutorials.Rproj 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Driverless AI Code Samples and Tutorials 2 | 3 | This repository provides code examples and tutorials demonstrating use of Driverless AI. 
4 | 5 | -------------------------------------------------------------------------------- /compliant_driverlessai/README.md: -------------------------------------------------------------------------------- 1 | # Compliant Driverless AI 2 | 3 | # Contents 4 | 5 | * Jupyter notebooks for `compliant` modelling in Driverless AI (version 1.6.2): 6 | * `Compliant` mode: 7 | * Documentation about `Compliant` mode in Driverless AI is [here](http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/expert-settings.html?highlight=compliant#pipeline-building-recipe) 8 | * Disparate Impact Analysis for different `gender` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/compliant_dia_gender.ipynb) 9 | * Disparate Impact Analysis for different `marriage` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/compliant_dia_marriage.ipynb) 10 | * Monotonic XGBoost modelling in Driverless AI: 11 | * Documentation about monotonicity constraints in Driverless AI is [here](http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/experiment-settings.html?highlight=monotonic#interpretability) 12 | * Disparate Impact Analysis for different `gender` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/mono_xgb_dia_gender.ipynb) 13 | * Disparate Impact Analysis for different `marriage` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/mono_xgb_dia_marriage.ipynb) 14 | * DAI Experiment with settings 6/4/6 (Accuracy/Time/Interpretability): 15 | * Disparate Impact Analysis for different `gender` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/dai_6_4_6_gender.ipynb) 16 | * Disparate Impact Analysis for different `marriage` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/dai_6_4_6_marriage.ipynb) 17 | * DAI Experiment with Settings 10/10/1 (Accruracy/Time/Interpretability): 18 | * Disparate Impact Analysis for different `gender` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/dai_10_10_1_gender.ipynb) 19 | * Disparate Impact Analysis for different `marriage` levels in the [UCI creditcard dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) is [here](https://github.com/h2oai/driverlessai-tutorials/blob/master/compliant_driverlessai/notebooks/dai_10_10_1_marriage.ipynb) 20 | -------------------------------------------------------------------------------- /compliant_driverlessai/notebooks/requirements.txt: 
-------------------------------------------------------------------------------- 1 | numpy==1.14.5 2 | pandas==0.22.0 3 | matplotlib==2.2.2 4 | xlrd==1.1.0 5 | shap==0.24.0 6 | scikit-learn==0.19.2 7 | jupyter==1.0.0 8 | eli5==0.8 9 | h2o==3.20.0.8 10 | seaborn==0.9.0 11 | datatable==0.8.0 12 | -------------------------------------------------------------------------------- /dai_installation/Aws/Rhel7.7.md: -------------------------------------------------------------------------------- 1 | Install H2O Driverless AI on base RHEL 7.7 on AWS EC2 Instances 2 | ======================================================= 3 | 4 | > While this guide describes installing Driverless AI from scratch on AWS, it can also be used on a bare-metal machine or on any other cloud VM from the `Install Nvidia driver` step onwards. 5 | 6 | Create base RHEL 7.7 Server 7 | ----------------------------------- 8 | 9 | - **Select OS** 10 | - Log in to the AWS console and create a new EC2 compute instance. 11 | - Search for the RHEL 7.7 AMI. 12 | - Select the RHEL 7.7 AMI and start configuring the EC2 instance. 13 | - **Select EC2 Instance Type** 14 | - Choose the instance type. For this exercise, we choose g4dn.16xlarge, which 15 | has 1 GPU and 64 CPU cores. Consider the proper [EC2 instance type][1] based on your use case. 16 | - **Configure Instance** 17 | - Configure instance settings for network, IAM role, shutdown policy, etc., 18 | as required. 19 | - **Configure Storage** 20 | - An SSD is the recommended persistent store for Driverless AI. 21 | - For this setup, I installed DAI on the same disk as the OS. 22 | - To increase the OS disk size after the VM is running, you will first need to stop the VM. 23 | - If you are doing the RPM install, DAI will put the bulk of its data in the `/opt/h2oai/dai` directory. So if you are attaching an additional drive, ensure that you mount it at `/opt`. 24 | - If you are going with a docker based approach, you can mount the disk at any mount point, as you will be mapping host directories as volumes into the docker container. 25 | - **Configure Security Group** 26 | - Configure the security group as needed. 27 | - At a minimum, ensure that your compute instance has a public IP. 28 | - Configure the Network Security Group to allow incoming connections on port 22 (for SSH) and port 12345 (for the Driverless AI web UI). 29 | ![AWS compute configuration](images/01_compute_config.png) 30 | 31 | > H2O Driverless AI uses Tensorflow built against CUDA 10.0, hence this is the recommended CUDA version to use. Per the [Nvidia Compatibility Matrix][2], Nvidia driver version 384.XX is the minimum version needed and was the default when CUDA 9.0 shipped. Per [Nvidia Hardware Support][3], driver 384.xx does not support the latest Turing architecture cards. 32 | > The latest Nvidia driver we have tested to work with CUDA 10.0 and Driverless AI is the 440.82+ branch. We install 450.XX in the steps below. 33 | 34 | Install pciutils 35 | ---------------------- 36 | 37 | - Once the server is up, ssh to it. 38 | - Disable SELinux to avoid interference with the Nvidia driver installation 39 | ```shell 40 | sudo vi /etc/sysconfig/selinux 41 | SELINUX=disabled 42 | ``` 43 | - Ensure the GPUs are detected using pciutils.
To install pciutils: 44 | ```shell 45 | sudo yum -y install pciutils 46 | ``` 47 | - Check which GPU card is installed 48 | ```shell 49 | lspci | grep -e VGA -ie NVIDIA 50 | ``` 51 | - The output of lspci should look similar to 52 | ``` 53 | 00:02.0 VGA compatible controller: Intel Corporation 4th Gen ... 54 | 01:00.0 VGA compatible controller: Nvidia Corporation ... 55 | ``` 56 | 57 | Disable Nouveau driver 58 | ---------------------- 59 | 60 | - The `nouveau` driver is an alternative to the Nvidia driver that is generally installed on the server. It does not work with `CUDA` and needs to be disabled. 61 | - If Nouveau drivers are installed and loaded, then you need to follow the steps for your Linux version to [Disable Nouveau Drivers][4]. For RHEL, the steps are 62 | 63 | ```shell 64 | cat <<EOF | sudo tee /etc/modprobe.d/blacklist-nouveau.conf 65 | blacklist nouveau 66 | options nouveau modeset=0 67 | EOF 68 | sudo dracut --force 69 | sudo reboot now 70 | # After the reboot, verify that nouveau is no longer loaded 71 | lsmod | grep nouveau > /dev/null && echo "WARNING: nouveau still active" || echo "Success" 98 | ``` 99 | 100 | Install Nvidia driver 101 | ---------------------------------- 102 | Follow these installation steps if you do not want the EPEL repository and DKMS libraries installed on the server. These steps assume that you have already ssh'd into the server. 103 | 104 | - Add the tech preview repository. For detailed instructions, refer to [5]. 105 | ```shell 106 | sudo yum-config-manager --add-repo=http://developer.download.nvidia.com/compute/cuda/preview/repos/rhel7/x86_64/techpreview_nvidia_rh_drv.repo 107 | ``` 108 | - Install the NVIDIA yum plugin. 109 | ```shell 110 | sudo yum install yum-plugin-nvidia 111 | ``` 112 | - Verify that you have a supported kernel installed. 113 | ``` 114 | uname -r 115 | 116 | # The above cmd should show a similar output 117 | 3.10.0-957.12.2.el7.x86_64 118 | ``` 119 | 120 | - Install the required dependencies 121 | ``` 122 | sudo yum -y install kernel-devel kernel-headers gcc acpid make 123 | ``` 124 | - Upgrade the kernel and reboot 125 | ``` 126 | sudo yum upgrade kernel 127 | sudo reboot now 128 | ``` 129 | - Once the server is up, ssh to it again 130 | - Confirm that the GPU card is still detected 131 | ``` 132 | lspci | grep -e VGA -ie NVIDIA 133 | ``` 134 | 135 | - Navigate to the [Nvidia Unix driver archive][6], and select `Linux x86_64` > `Latest Long Lived Branch`. Here we choose version `450.80.02`. 136 | - Select Download; it should download a file similar to `NVIDIA-Linux-x86_64-450.80.02.run`. 137 | - Alternatively, from the ssh session, download the file to the server using `wget <download-url>`. 138 | - Install the downloaded package 139 | ```shell 140 | sudo chmod +x ./NVIDIA-Linux-$(uname -m)-*.run 141 | sudo ./NVIDIA-Linux-$(uname -m)-*.run 142 | ``` 143 | - At this point you will need to restart the machine. This ensures that the Nvidia drivers are correctly loaded into the kernel. 144 | 145 | Set Nvidia Persistence mode 146 | --------------------------- 147 | 148 | - Driverless AI requires persistence mode to be enabled on each GPU that will be used with DAI 149 | - To manually enable persistence mode on all GPUs, issue the command `sudo nvidia-smi -pm 1` 150 | - To validate, issue the command `nvidia-smi` and verify that the persistence mode setting is turned ON. 151 | 152 | > At this point your system setup tasks are complete. You can now proceed with a native RPM package install of Driverless AI, or install `Docker CE` and `nvidia-runtime` for a docker based installation of Driverless AI.
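Optionally, run a quick end-to-end check of the GPU stack before installing Driverless AI. The snippet below is only a convenience check and relies solely on tools already used in this guide (`lsmod`, `nvidia-smi`):

```shell
# nouveau must not be loaded, otherwise the Nvidia driver cannot bind to the GPUs
lsmod | grep nouveau > /dev/null && echo "WARNING: nouveau still active" || echo "nouveau disabled"

# Each GPU should report the expected driver version and persistence mode Enabled.
# Any GPU that still shows 'Disabled' can be fixed with: sudo nvidia-smi -pm 1
nvidia-smi --query-gpu=name,driver_version,persistence_mode --format=csv
```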
153 | 154 | Install Driverless AI native RPM package 155 | ---------------------------------------- 156 | 157 | - If you want a docker container based Driverless AI install, skip this section and proceed from Install Docker CE onwards. 158 | - If you want a native RPM based install, follow the steps in this section and do not follow any of the docker installation sections below. 159 | - [Download the latest Driverless AI][8] RPM package from [https://www.h2o.ai/download/#driverless-ai][8]. You can copy the URL and issue the command `wget <download-url>` to download the file. 160 | - Issue the command `sudo rpm -i <downloaded-file>.rpm` to install Driverless AI. 161 | - Proceed to the Driverless AI documentation to understand the steps to [manage Driverless AI, i.e. start, stop, uninstall, and update][9] 162 | 163 | Great, you should now be done with the native installation of Driverless AI. 164 | 165 | Continue from here if you are doing a Docker based install of H2O Driverless AI 166 | 167 | Install Docker CE 168 | ----------------- 169 | 170 | - Using the Red Hat subscription manager, check whether the `extras` repository exists. Refer to the gist [10] for detailed information 171 | ``` 172 | subscription-manager repos --list | grep -i extras 173 | ``` 174 | - Enable the repository 175 | ``` 176 | sudo subscription-manager repos --enable rhel-7-server-extras-rpms 177 | ``` 178 | - List docker packages 179 | ``` 180 | sudo yum list "*docker*" 181 | ``` 182 | - Install docker 183 | ``` 184 | sudo yum -y install docker 185 | ``` 186 | - Start docker and check the docker version 187 | ``` 188 | sudo systemctl start docker.service 189 | docker -v 190 | ``` 191 | - Stop docker 192 | ``` 193 | sudo systemctl stop docker.service 194 | ``` 195 | 196 | Install nvidia-docker 197 | ---------------------- 198 | 199 | - Ensure docker is started and enabled. 200 | ``` 201 | sudo systemctl start docker && sudo systemctl enable docker 202 | ``` 203 | 204 | - Set up the stable repository and the GPG key 205 | ```shell 206 | distribution=$(. /etc/os-release;echo $ID$VERSION_ID) 207 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | sudo tee /etc/yum.repos.d/nvidia-docker.repo 208 | ``` 209 | - Install nvidia-docker 210 | ```shell 211 | sudo yum install nvidia-docker 212 | ``` 213 | - Install the nvidia-container-toolkit package 214 | ```shell 215 | sudo yum clean expire-cache 216 | sudo yum install nvidia-container-toolkit -y 217 | ``` 218 | - Restart docker and test the setup using a base CUDA container 219 | ```shell 220 | sudo systemctl restart docker 221 | sudo docker run --rm -e NVIDIA_VISIBLE_DEVICES=all nvidia/cuda:11.0-base nvidia-smi 222 | 223 | ``` 224 | - This completes the nvidia-docker installation. 225 | [nvidia-docker install reference][11] 226 | 227 | Install H2O Driverless AI as a docker container 228 | ----------------------------------- 229 | 230 | - [Download the latest Driverless AI][12] docker image from [https://www.h2o.ai/download/#driverless-ai][12] 231 | - Load the downloaded image into docker using the command `docker load < dai_image_name.tar.gz`. Substitute the correct file name. 232 | - Proceed with [installing Driverless AI][13] following the directions from step 5 onwards on that page.
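As a rough illustration of what those directions boil down to, the sketch below shows one way to start the loaded image. It is only a sketch: the image name, tag, and the authoritative set of flags for your Driverless AI version come from the installation page linked above, so treat `<dai-image-name>:<tag>` and the directory layout as placeholders.

```shell
# Assumes the image has already been loaded with `docker load` as described above.
# Create host directories that are mapped into the container for data, logs,
# the license file, and temporary experiment artifacts.
mkdir -p "$HOME/dai/data" "$HOME/dai/log" "$HOME/dai/license" "$HOME/dai/tmp"
cd "$HOME/dai"

# --runtime=nvidia uses the nvidia-docker runtime installed earlier, and port 12345
# matches the Driverless AI web UI port opened in the security group.
sudo docker run \
    --runtime=nvidia \
    --pid=host --init --rm \
    -u "$(id -u):$(id -g)" \
    -p 12345:12345 \
    -v "${PWD}/data:/data" \
    -v "${PWD}/log:/log" \
    -v "${PWD}/license:/license" \
    -v "${PWD}/tmp:/tmp" \
    <dai-image-name>:<tag>
```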
233 | 234 | [1]: https://aws.amazon.com/emr/pricing/ 235 | [2]: https://docs.nvidia.com/deploy/cuda-compatibility/index.html#binary-compatibility__table-toolkit-driver 236 | [3]: https://docs.nvidia.com/deploy/cuda-compatibility/index.html#support-hardware 237 | [4]: https://docs.nvidia.com/cuda/archive/9.0/cuda-installation-guide-linux/index.html#runfile-nouveau 238 | [5]: http://developer.download.nvidia.com/compute/cuda/preview/repos/rhel7/x86_64/README.html 239 | [6]: https://www.nvidia.com/en-us/drivers/unix/ 240 | [7]: https://docs.nvidia.com/deploy/driver-persistence/index.html#usage 241 | [8]: https://www.h2o.ai/download/#driverless-ai 242 | [9]: http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/install/linux-rpm.html#installing-driverless-ai 243 | [10]: https://gist.github.com/WelshSean/d55289acba43d9c305fbffda2befe201 244 | [11]: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-rhel-7 245 | [12]: https://www.h2o.ai/download/#driverless-ai 246 | [13]: http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/install/rhel.html#install-on-rhel-with-gpus -------------------------------------------------------------------------------- /dai_installation/Aws/images/01_compute_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Aws/images/01_compute_config.png -------------------------------------------------------------------------------- /dai_installation/Azure/images/01_nvidia_driver_install.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/01_nvidia_driver_install.gif -------------------------------------------------------------------------------- /dai_installation/Azure/images/01_select_compute_ubuntu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/01_select_compute_ubuntu.png -------------------------------------------------------------------------------- /dai_installation/Azure/images/02_compute_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/02_compute_config.png -------------------------------------------------------------------------------- /dai_installation/Azure/images/02_cuda_install.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/02_cuda_install.gif -------------------------------------------------------------------------------- /dai_installation/Azure/images/03_create_new_disk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/03_create_new_disk.png -------------------------------------------------------------------------------- /dai_installation/Azure/images/03_docker_install.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/03_docker_install.gif -------------------------------------------------------------------------------- /dai_installation/Azure/images/04_dai_install_e13.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/04_dai_install_e13.gif -------------------------------------------------------------------------------- /dai_installation/Azure/images/04_new_disk_attached.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/04_new_disk_attached.png -------------------------------------------------------------------------------- /dai_installation/Azure/images/05_configure_networking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/05_configure_networking.png -------------------------------------------------------------------------------- /dai_installation/Azure/images/06_configure_nsg_ports_open.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/dai_installation/Azure/images/06_configure_nsg_ports_open.png -------------------------------------------------------------------------------- /dai_installation/README.md: -------------------------------------------------------------------------------- 1 | H2O.ai Driverless AI installation from scratch 2 | ============================================== 3 | 4 | This directory lists guides to manually set up [H2O.ai Driverless AI][1] on bare-metal machines and various clouds. 5 | 6 | While each guide mentions a specific cloud provider (which is where I tried the steps), the guides are not tied to that provider and should work on other clouds like AWS and GCP, and even on a bare-metal machine. 7 | 8 | **[Azure/Ubuntu16.04.md](Azure/Ubuntu16.04.md)** 9 | 10 | - Guide to set up Driverless AI from scratch on Ubuntu 16.04 LTS. 11 | - We install the following, in order: 12 | - Nvidia drivers 13 | - CUDA 9.0 14 | - docker-ce 15 | - nvidia-docker, and then configure the GPU cards for use with H2O Driverless AI 16 | - We explain the process using a VM on Azure, but the setup steps should be valid for bare metal as well as VMs in other clouds. 17 | 18 | **[Aws/Rhel7.7.md](Aws/Rhel7.7.md)** 19 | 20 | - Guide to set up Driverless AI from scratch on RHEL 7.7. 21 | - We install the following, in order: 22 | - Nvidia drivers 23 | - docker-ce 24 | - nvidia-docker, and then configure the GPU cards for use with H2O Driverless AI 25 | - We explain the process using an EC2 instance on AWS, but the setup steps should be valid for bare metal as well as VMs in other clouds.
26 | 27 | [1]: https://www.h2o.ai/products/h2o-driverless-ai/ -------------------------------------------------------------------------------- /dai_python_client/README.md: -------------------------------------------------------------------------------- 1 | # Python Client Examples: driverlessai 2 | 3 | The intuitive, static Python client for Drivierless AI. 4 | 5 | ### Install 6 | 7 | Install with `pip install driverlessai` or `conda install -c h2oai driverlessai`. 8 | 9 | Upgrade with `pip install --upgrade driverlessai` or `conda update -c h2oai driverlessai`. 10 | 11 | ### Documentation 12 | 13 | http://docs.h2o.ai/driverless-ai/pyclient/docs/html/index.html 14 | 15 | ## Available Examples 16 | 17 | * [algorithm_family_comparison](https://github.com/h2oai/driverlessai-tutorials/blob/master/dai_python_client/algorithm_family_comparison.ipynb) - Compare complexity of algorithm vs. accuracy of experiments 18 | 1. Notebook Setup 19 | 2. Connect to Driverless AI 20 | 3. Load a Dataset 21 | 4. Split Dataset 22 | 5. Run Experiments 23 | 6. View Results 24 | * [common_workflow](https://github.com/h2oai/driverlessai-tutorials/blob/master/dai_python_client/common_workflow.ipynb) - Common DAI UI flow from python 25 | 1. Connect to Driverless AI 26 | 2. Documentation 27 | 3. Data 28 | 4. Recipes 29 | 5. Modeling 30 | 6. Launching Machine Learning Interpretability 31 | 7. Disconnect 32 | 33 | Please send feedback to help improve the client and documentation to support@h2o.ai. 34 | -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/download_mojo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/download_mojo.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/exp_running_creditcard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/exp_running_creditcard.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/experiment_complete_creditcard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/experiment_complete_creditcard.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/experiment_list_complete.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/experiment_list_complete.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/experiment_list_running.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/experiment_list_running.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/import_data_sets_creditcard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/import_data_sets_creditcard.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/mli_external.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/mli_external.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/mli_list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/mli_list.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/model_diagnostics_complete.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/model_diagnostics_complete.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/model_diagnostics_setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/model_diagnostics_setup.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/py_client_link.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/py_client_link.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/set_columns_creditcard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/set_columns_creditcard.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/set_parameters_creditcard.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/set_parameters_creditcard.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/sign_in_home_page_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/sign_in_home_page_0.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/credit_card_experiment/images/skip_sign_in_home_page_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/credit_card_experiment/images/skip_sign_in_home_page_1.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/imbalanced/images/compare_weighted_experiments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/imbalanced/images/compare_weighted_experiments.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/imbalanced/images/py_client_link.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/imbalanced/images/py_client_link.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/imbalanced/images/weighted_project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/iid/imbalanced/images/weighted_project.png -------------------------------------------------------------------------------- /driverlessai_experiments/iid/reject_inference/Reject_Inference_with_Fuzzy_Augment.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: html_document 3 | editor_options: 4 | chunk_output_type: console 5 | --- 6 | 7 | ```{r} 8 | library(DiagrammeR) 9 | library(data.table) 10 | library(ggplot2) 11 | library(scales) 12 | library(ggthemes) 13 | library(R.utils) 14 | ``` 15 | 16 | # Reject Inference Workflow 17 | 18 | ```{r} 19 | mermaid(" 20 | graph TB 21 | 22 | subgraph Application Data 23 | AllApplicantsDS[\"All Applicants\"] --> UnknownGoodBadDS[\"Rejects\"] 24 | AllApplicantsDS --> KnownGoodBadDS[\"Accepts\"] 25 | KnownGoodBadDS -.- KnownGoodDS[\"Loans Paid Off (Good)\"] 26 | KnownGoodBadDS -.- KnownBadDS[\"Charged Off (Bad)\"] 27 | end 28 | 29 | subgraph Accepted Applicants Model 30 | Training1((\"Training\")) --> KnownGoodBadModel(-\"Accepted Loans Model\"-) 31 | KnownGoodBadModel --> Scoring1((\"Scoring\")) 32 | end 33 | 34 | subgraph Fuzzy Augmentation 35 | ScoredUnknownDS[\"Rejects Scored\"] --> UnknownDSLabeledGoodDS[\"Label and Weight as Good\"] 36 | ScoredUnknownDS --> 
UnknownDSLabeledBadDS[\"Label and Weight as Bad\"] 37 | AllGoodBadWeightedDS[\"Accepted Applicants plus Weighted Rejects\"] 38 | UnknownDSLabeledGoodDS --> AllGoodBadWeightedDS 39 | UnknownDSLabeledBadDS --> AllGoodBadWeightedDS 40 | end 41 | 42 | FinalAllApplicantsScoredDS[\"All Scored on Augmented Model\"] 43 | 44 | subgraph Augmented Model 45 | Training2((\"Training\")) --> FinalRejectInferenceModel(-\"Final Reject Inference Model\"-) 46 | FinalRejectInferenceModel --> Scoring2((\"Scoring\")) 47 | end 48 | 49 | KnownGoodBadDS --> Training1 50 | UnknownGoodBadDS --> Scoring1 51 | Scoring1 --> ScoredUnknownDS 52 | KnownGoodBadDS --> AllGoodBadWeightedDS 53 | AllGoodBadWeightedDS --> Training2 54 | AllApplicantsDS --> Scoring2 55 | Scoring2 --> FinalAllApplicantsScoredDS 56 | 57 | ") 58 | 59 | ``` 60 | 61 | # Loading Dataset 62 | 63 | Original dataset contains loans with either paid off or charged off status. A tiny fraction of loans doesn't have status and thus treated as rejected. Since its number is insufficient to similuate reject inference use case half of loans with status will be assigned to rejected (no status). At the end, two datasets will represent rejected and accepted loans: "KnownGoodBad.csv" and "UnknownGoodBad.csv". 64 | 65 | ```{r createInitialDatasets} 66 | # make changes to where your file located 67 | data_dir = "~/Projects/Playground/data/US_Small_Business_Admin_Loans/" 68 | 69 | # Source: 70 | # https://amstat.tandfonline.com/doi/full/10.1080/10691898.2018.1434342 71 | 72 | sba_national = fread(paste0(data_dir,"SBAnational.csv")) 73 | 74 | # separate data based on labels and randomness 75 | sba_national[MIS_Status!="", Target := MIS_Status=="CHGOFF"] 76 | table(sba_national$Target, useNA = "ifany") 77 | 78 | sba_national[, Reject_Status := MIS_Status==""] 79 | table(sba_national$Reject_Status, useNA = "ifany") 80 | 81 | sba_national[Target==TRUE, Reject_Status := 82 | sample(c(TRUE, FALSE), nrow(sba_national[Target==TRUE,]), replace = TRUE, prob = c(.5,.5))] 83 | sba_national[Target==FALSE, Reject_Status := 84 | sample(c(TRUE, FALSE), nrow(sba_national[Target==FALSE,]), replace = TRUE, prob = c(.5,.5))] 85 | table(sba_national$Reject_Status, useNA = "ifany") 86 | 87 | # make loan number character-based 88 | sba_national[ , LoanNr_ChkDgt := paste0('#', as.character(LoanNr_ChkDgt))] 89 | # parse money amounts to numeric 90 | cols = c("DisbursementGross", "BalanceGross", "ChgOffPrinGr", "GrAppv", "SBA_Appv") 91 | sba_national[ , (cols) := lapply(.SD, FUN = function(x){ 92 | as.numeric(gsub(",", "", substring(x, 2))) 93 | }), .SDcols = cols] 94 | 95 | unknownGoodBad = sba_national[Reject_Status==TRUE] 96 | knownGoodBad = sba_national[Reject_Status==FALSE] 97 | 98 | fwrite(unknownGoodBad, file = paste0(data_dir, "UnknownGoodBad.csv")) 99 | gzip(paste0(data_dir,'UnknownGoodBad.csv'), destname=paste0(data_dir,'UnknownGoodBad.csv.gz'), 100 | remove=FALSE, overwrite=TRUE) 101 | fwrite(knownGoodBad, file = paste0(data_dir, "KnownGoodBad.csv")) 102 | gzip(paste0(data_dir,'KnownGoodBad.csv'), destname=paste0(data_dir,'KnownGoodBad.csv.gz'), 103 | remove=FALSE, overwrite=TRUE) 104 | ``` 105 | 106 | # Connect to Driverless AI 107 | 108 | ```{r connectDAI, include=FALSE} 109 | library(dai) 110 | 111 | dai_uri = "" 112 | usr = "h2oai" 113 | pwd = "" 114 | dai.connect(uri = dai_uri, username = usr, password = pwd, force_version = FALSE) 115 | ``` 116 | 117 | ```{r connectDAIvisible, eval=FALSE, include=TRUE} 118 | dai_uri = "http://mydai.instance.com:12345" 119 | usr = "mydaiuser" 120 
| pwd = "mydaipassword" 121 | dai.connect(uri = dai_uri, username = usr, password = pwd, force_version = FALSE) 122 | ``` 123 | 124 | # Import data into Driverless AI 125 | 126 | Import datasets for both accepted and rejected loans, then split accepted loans into training and test partitions to train 1st loan default model. 127 | 128 | ```{r findOrCreateDatasets} 129 | existing_datasets = data.table(dai.list_datasets(limit = 1000)) 130 | if(nrow(existing_datasets) > 0 && 131 | nrow(existing_datasets[name=='KnownGoodBad.csv.gz']) == 1) { 132 | known_key = existing_datasets[name=='KnownGoodBad.csv.gz','key'][[1,1]] 133 | known_data = dai.get_frame(known_key) 134 | }else { 135 | known_data = dai.upload_dataset(paste0(data_dir, "KnownGoodBad.csv.gz")) 136 | } 137 | 138 | if(nrow(existing_datasets) > 0 && 139 | nrow(existing_datasets[name=="KnownGoodBad_train"]) == 1 && 140 | nrow(existing_datasets[name=="KnownGoodBad_test"]) == 1) { 141 | known_train_key = existing_datasets[name=="KnownGoodBad_train",'key'][[1,1]] 142 | known_train_set = dai.get_frame(known_train_key) 143 | known_test_key = existing_datasets[name=="KnownGoodBad_test",'key'][[1,1]] 144 | known_test_set = dai.get_frame(known_test_key) 145 | }else { 146 | partitions = dai.split_dataset(dataset = known_data, 147 | output_name1 = "KnownGoodBad_train", output_name2 = "KnownGoodBad_test", 148 | ratio = 0.8, seed = 75252, target = "Target") 149 | known_train_set = partitions[[1]] 150 | known_test_set = partitions[[2]] 151 | } 152 | ``` 153 | 154 | # Train Primary Default Model 155 | 156 | Build classification model for loan defaults. 157 | 158 | ```{r buildKnownModel} 159 | existing_models = data.table(dai.list_models(offset = 0, limit = 1000)[,c("key","description")]) 160 | if(nrow(existing_models) > 0 && 161 | nrow(existing_models[description=="known-goodbad-445"]) == 1) { 162 | known_model_key = existing_models[description=="known-goodbad-445","key"][[1,1]] 163 | known_model = dai.get_model(known_model_key) 164 | }else { 165 | known_model = dai.train(training_frame = known_train_set, testing_frame = known_test_set, 166 | target_col = "Target", is_classification = TRUE, is_timeseries = FALSE, 167 | cols_to_drop = c("MIS_Status","ChgOffPrinGr","ChgOffDate","LoanNr_ChkDgt"), 168 | time = 4, accuracy = 4, interpretability = 5, 169 | experiment_name = "known-goodbad-445", 170 | enable_gpus = TRUE, seed = 75252, 171 | config_overrides = "make_python_scoring_pipeline = 'off'") 172 | } 173 | ``` 174 | 175 | # Scoring Test Set and Visualizing 176 | ```{r} 177 | known_model_key = existing_models[description=="2.known_goodbad_glm","key"][[1,1]] 178 | known_model_glm = dai.get_model(known_model_key) 179 | test_scored = predict(known_model_glm, newdata = known_test_set, 180 | include_columns = c("LoanNr_ChkDgt","Target"), return_df = TRUE) 181 | ggplot(test_scored) + 182 | # geom_histogram(aes(Target.1, fill=factor(Target)), alpha=0.7, bins = 100, position = "dodge") + 183 | geom_density(aes(LoanNr_ChkDgt, Target.1, color=factor(Target)), alpha=0.7) + 184 | theme_tufte(base_size = 12, base_family = 'Palatino', ticks = FALSE) 185 | ``` 186 | 187 | 188 | # Scoring Rejected Loans 189 | 190 | Imported rejected loan dataset and score on primary default loan model. 
191 | ```{r importAndScoreRejects} 192 | if(nrow(existing_datasets) > 0 && 193 | nrow(existing_datasets[name=='UnknownGoodBad.csv.gz']) == 1) { 194 | unknown_key = existing_datasets[name=='UnknownGoodBad.csv.gz','key'][[1,1]] 195 | unknown_data = dai.get_frame(known_key) 196 | }else { 197 | unknown_data = dai.upload_dataset(paste0(data_dir, "UnknownGoodBad.csv.gz")) 198 | } 199 | 200 | unknown_scored = predict(known_model, newdata = unknown_data, 201 | include_columns = "LoanNr_ChkDgt", return_df = TRUE) 202 | ``` 203 | 204 | Manufacture new weighted dataset for Reject Inference with Fuzzy Augmentation 205 | 206 | ```{r} 207 | unknownScored = data.frame(unknown_scored) 208 | N = nrow(sba_national) # total number of rejected and accepted loans 209 | 210 | unknownGoodOnly = data.table(unknownGoodBad) 211 | unknownGoodOnly[unknownScored, c("Target", "weight", "weight_btb") := 212 | list(FALSE, as.double(`Target.0`), as.double(`Target.0`)/N), on='LoanNr_ChkDgt'] 213 | 214 | unknownBadOnly = data.table(unknownGoodBad) 215 | unknownBadOnly[unknownScored, c("Target", "weight", "weight_btb") := 216 | list(TRUE, as.double(`Target.1`), as.double(`Target.1`)/N), on='LoanNr_ChkDgt'] 217 | 218 | allGoodBad = rbindlist(list(knownGoodBad[, c("weight", "weight_btb") := list(1, 1/N)], 219 | unknownGoodOnly, 220 | unknownBadOnly)) 221 | 222 | fwrite(allGoodBad, file = paste0(data_dir, "AllGoodBad.csv")) 223 | gzip(paste0(data_dir,'AllGoodBad.csv'), destname=paste0(data_dir,'AllGoodBad.csv.gz'), 224 | remove=FALSE, overwrite=TRUE) 225 | 226 | fwrite(allGoodBad[, -c("weight","weight_btb")], file = paste0(data_dir, "AllGoodBad_noweight.csv")) 227 | gzip(paste0(data_dir,'AllGoodBad_noweight.csv'), destname=paste0(data_dir,'AllGoodBad_noweight.csv.gz'), 228 | remove=FALSE, overwrite=TRUE) 229 | 230 | all_data = dai.upload_dataset(paste0(data_dir, "AllGoodBad.csv.gz")) 231 | all_data_noweight = dai.upload_dataset(paste0(data_dir, "AllGoodBad_noweight.csv.gz")) 232 | ``` 233 | 234 | 235 | ```{r buildAllGoodBadModel} 236 | if(nrow(existing_datasets) > 0 && 237 | nrow(existing_datasets[name=='AllGoodBad_train']) >= 1 && 238 | nrow(existing_datasets[name=='AllGoodBad_test']) >= 1) { 239 | alltrain_set_key = existing_datasets[name=='AllGoodBad_train','key'][[1,1]] 240 | alltrain_set = dai.get_frame(alltrain_set_key) 241 | alltest_set_key = existing_datasets[name=='AllGoodBad_test','key'][[1,1]] 242 | alltest_set = dai.get_frame(alltest_set_key) 243 | }else { 244 | partitions = dai.split_dataset(dataset = all_data, 245 | output_name1 = "AllGoodBad_train", output_name2 = "AllGoodBad_test", 246 | ratio = 0.8, seed = 75252, target = "Target") 247 | alltrain_set = partitions[[1]] 248 | alltest_set = partitions[[2]] 249 | } 250 | 251 | all_model = dai.train(training_frame = alltrain_set, testing_frame = alltest_set, 252 | target_col = "Target", weight_col = "weight", 253 | is_classification = TRUE, is_timeseries = FALSE, 254 | cols_to_drop = c("MIS_Status","ChgOffPrinGr","ChgOffDate","LoanNr_ChkDgt", 255 | "weight_btb"), 256 | time = 4, accuracy = 4, interpretability = 5, 257 | experiment_name = "all-goodbad-445", 258 | enable_gpus = TRUE, seed = 75252, 259 | config_overrides = "make_python_scoring_pipeline = 'off'") 260 | 261 | all_model_btb = dai.train(training_frame = alltrain_set, testing_frame = alltest_set, 262 | target_col = "Target", weight_col = "weight_btb", 263 | is_classification = TRUE, is_timeseries = FALSE, 264 | cols_to_drop = c("MIS_Status","ChgOffPrinGr","ChgOffDate","LoanNr_ChkDgt", 265 | "weight"), 266 | 
time = 4, accuracy = 4, interpretability = 5, 267 | experiment_name = "all-goodbad-btb-445", 268 | enable_gpus = TRUE, seed = 75252, 269 | config_overrides = "make_python_scoring_pipeline = 'off'") 270 | ``` 271 | 272 | # Make Final Model Predictions on Rejected Loans 273 | 274 | ```{r predictRejectsOnFinalModel} 275 | existing_datasets = data.table(dai.list_datasets(limit = 1000)) 276 | test_data_key = existing_datasets[name=='KnownGoodBad_test','key'][[1,1]] 277 | test_data = dai.get_frame(test_data_key) 278 | 279 | existing_models = data.table(dai.list_models(offset = 0, limit = 1000)[,c("key","description")]) 280 | all_good_bad_model_key = existing_models[description=="known-goodbad-445","key"][[1,1]] 281 | all_good_bad_model = dai.get_model(all_good_bad_model_key) 282 | 283 | alltest_final_scored = predict(all_good_bad_model, newdata = test_data, 284 | include_columns = c("LoanNr_ChkDgt","Target","UrbanRural"), return_df = TRUE) 285 | alltest_final_scored$Target = factor(alltest_final_scored$Target) 286 | alltest_final_scored$UrbanRural = factor(alltest_final_scored$UrbanRural) 287 | 288 | ggplot(alltest_final_scored, aes(x=Target.1, fill=Target)) + 289 | geom_histogram(bins=50, position="stack", color="black") + 290 | theme_tufte(ticks=TRUE) + geom_rangeframe() + 291 | theme(legend.position = "bottom") 292 | 293 | ggplot(alltest_final_scored, aes(x=Target.1)) + 294 | geom_density(alpha = .7, trim=TRUE) + 295 | theme_tufte() + geom_rangeframe() + # geom_rug() + 296 | theme(legend.position = "bottom") 297 | 298 | ggplot(alltest_final_scored, aes(x=Target.1, fill=Target)) + 299 | geom_histogram(bins=50, position="dodge", color="black") + 300 | theme_tufte(ticks=TRUE) + geom_rangeframe() + 301 | theme(legend.position = "bottom") 302 | 303 | ggplot(alltest_final_scored, aes(x=Target.1, fill=Target)) + 304 | geom_density(alpha = .7, trim=TRUE) + 305 | # facet_wrap(~UrbanRural, ncol=1, scales = "free_y") + 306 | theme_tufte(ticks=TRUE) + geom_rangeframe() + 307 | theme(legend.position = "bottom") 308 | ``` 309 | 310 | -------------------------------------------------------------------------------- /driverlessai_experiments/nlp/airline_sentiment_experiment/demo_nlp_airline_sentiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Driverless AI NLP Demo - Airline Sentiment Dataset ###\n", 8 | "\n", 9 | "In this notebook, we will see how to use Driverless AI python client to build text classification models using the Airline sentiment twitter dataset.\n", 10 | "\n", 11 | "Import the necessary python modules to get started including the Driverless AI client. If not already installed, please download and install the python client from Driverless AI GUI." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 4, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "from sklearn import model_selection\n", 22 | "from h2oai_client import Client" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "The below code downloads the twitter airline sentiment dataset and save it in the current folder. 
" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "--2019-08-21 15:42:17-- https://www.figure-eight.com/wp-content/uploads/2016/03/Airline-Sentiment-2-w-AA.csv\n", 42 | "Resolving www.figure-eight.com (www.figure-eight.com)... 54.164.48.21, 54.165.94.158\n", 43 | "Connecting to www.figure-eight.com (www.figure-eight.com)|54.164.48.21|:443... connected.\n", 44 | "HTTP request sent, awaiting response... 200 OK\n", 45 | "Length: 3704908 (3.5M) [application/octet-stream]\n", 46 | "Saving to: ‘Airline-Sentiment-2-w-AA.csv’\n", 47 | "\n", 48 | "Airline-Sentiment-2 100%[===================>] 3.53M 4.79MB/s in 0.7s \n", 49 | "\n", 50 | "2019-08-21 15:42:18 (4.79 MB/s) - ‘Airline-Sentiment-2-w-AA.csv’ saved [3704908/3704908]\n", 51 | "\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "! wget https://www.figure-eight.com/wp-content/uploads/2016/03/Airline-Sentiment-2-w-AA.csv" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "We can now split the data into training and testing datasets." 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 5, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "al = pd.read_csv(\"Airline-Sentiment-2-w-AA.csv\", encoding='ISO-8859-1')\n", 73 | "train_al, test_al = model_selection.train_test_split(al, test_size=0.2, random_state=2018)\n", 74 | "train_al.to_csv(\"train_airline_sentiment.csv\", index=False)\n", 75 | "test_al.to_csv(\"test_airline_sentiment.csv\", index=False)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "The first step is to establish a connection to Driverless AI using `Client`. Please key in your credentials and the url address." 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 6, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "address = 'http://ip_where_driverless_is_running:12345'\n", 92 | "username = 'username'\n", 93 | "password = 'password'\n", 94 | "h2oai = Client(address = address, username = username, password = password)\n", 95 | "# # make sure to use the same user name and password when signing in through the GUI" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Read the train and test files into Driverless AI using the `upload_dataset_sync` command." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 8, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "train_path = './train_airline_sentiment.csv'\n", 112 | "test_path = './test_airline_sentiment.csv'\n", 113 | "\n", 114 | "train = h2oai.upload_dataset_sync(train_path)\n", 115 | "test = h2oai.upload_dataset_sync(test_path)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "Now let us look at some basic information about the dataset." 
123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 10, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "Train Dataset: 20 x 11712\n", 135 | "Test Dataset: 20 x 2928\n" 136 | ] 137 | }, 138 | { 139 | "data": { 140 | "text/plain": [ 141 | "['_unit_id',\n", 142 | " '_golden',\n", 143 | " '_unit_state',\n", 144 | " '_trusted_judgments',\n", 145 | " '_last_judgment_at',\n", 146 | " 'airline_sentiment',\n", 147 | " 'airline_sentiment:confidence',\n", 148 | " 'negativereason',\n", 149 | " 'negativereason:confidence',\n", 150 | " 'airline',\n", 151 | " 'airline_sentiment_gold',\n", 152 | " 'name',\n", 153 | " 'negativereason_gold',\n", 154 | " 'retweet_count',\n", 155 | " 'text',\n", 156 | " 'tweet_coord',\n", 157 | " 'tweet_created',\n", 158 | " 'tweet_id',\n", 159 | " 'tweet_location',\n", 160 | " 'user_timezone']" 161 | ] 162 | }, 163 | "execution_count": 10, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "print('Train Dataset: ', len(train.columns), 'x', train.row_count)\n", 170 | "print('Test Dataset: ', len(test.columns), 'x', test.row_count)\n", 171 | "\n", 172 | "[c.name for c in train.columns]" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "We just need two columns for our experiment. `text` which contains the text of the tweet and `airline_sentiment` which contains the sentiment of the tweet (target column). We can drop the remaining columns for this experiment. \n", 180 | "\n", 181 | "We will enable tensorflow models and transformations to take advantage of CNN based text features." 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 22, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "['ACCURACY [6/10]:',\n", 193 | " '- Training data size: *11,712 rows, 2 cols*',\n", 194 | " '- Feature evolution: *[LightGBM, TensorFlow, XGBoostGBM]*, *3-fold CV**, 2 reps*',\n", 195 | " '- Final pipeline: *Ensemble (6 models), 3-fold CV*',\n", 196 | " '',\n", 197 | " 'TIME [4/10]:',\n", 198 | " '- Feature evolution: *4 individuals*, up to *56 iterations*',\n", 199 | " '- Early stopping: After *5* iterations of no improvement',\n", 200 | " '',\n", 201 | " 'INTERPRETABILITY [5/10]:',\n", 202 | " '- Feature pre-pruning strategy: None',\n", 203 | " '- XGBoost Monotonicity constraints: disabled',\n", 204 | " '- Feature engineering search space (where applicable): [CVCatNumEncode, CVTargetEncode, ClusterTE, Dates, Frequent, Interactions, IsHoliday, NumCatTE, NumToCatTE, Original, TextBiGRU, TextCNN, TextCharCNN, Text]',\n", 205 | " '',\n", 206 | " '[LightGBM, TensorFlow, XGBoostGBM] models to train:',\n", 207 | " '- Model and feature tuning: *192*',\n", 208 | " '- Feature evolution: *504*',\n", 209 | " '- Final pipeline: *6*',\n", 210 | " '',\n", 211 | " 'Estimated runtime: *minutes*']" 212 | ] 213 | }, 214 | "execution_count": 22, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "exp_preview = h2oai.get_experiment_preview_sync(\n", 221 | " dataset_key=train.key\n", 222 | " , validset_key=''\n", 223 | " , target_col='airline_sentiment'\n", 224 | " , classification=True\n", 225 | " , dropped_cols=[\"_unit_id\", \"_golden\", \"_unit_state\", \"_trusted_judgments\", \"_last_judgment_at\",\n", 226 | " \"airline_sentiment:confidence\", \"negativereason\", 
\"negativereason:confidence\", \"airline\",\n", 227 | " \"airline_sentiment_gold\", \"name\", \"negativereason_gold\", \"retweet_count\", \n", 228 | " \"tweet_coord\", \"tweet_created\", \"tweet_id\", \"tweet_location\", \"user_timezone\"]\n", 229 | " , accuracy=6\n", 230 | " , time=4\n", 231 | " , interpretability=5\n", 232 | " , is_time_series=False\n", 233 | " , enable_gpus=True\n", 234 | " , reproducible=False\n", 235 | " , resumed_experiment_id=''\n", 236 | " , config_overrides=\"\"\"\n", 237 | " enable_tensorflow='on'\n", 238 | " enable_tensorflow_charcnn='on'\n", 239 | " enable_tensorflow_textcnn='on'\n", 240 | " enable_tensorflow_textbigru='on'\n", 241 | " \"\"\"\n", 242 | ")\n", 243 | "exp_preview" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "Please note that the `Text` and `TextCNN` features are enabled for this experiment.\n", 251 | "\n", 252 | "Now we can start the experiment." 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 24, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "model = h2oai.start_experiment_sync(\n", 262 | " dataset_key=train.key,\n", 263 | " testset_key=test.key,\n", 264 | " target_col='airline_sentiment',\n", 265 | " scorer='F1',\n", 266 | " is_classification=True,\n", 267 | " cols_to_drop=[\"_unit_id\", \"_golden\", \"_unit_state\", \"_trusted_judgments\", \"_last_judgment_at\",\n", 268 | " \"airline_sentiment:confidence\", \"negativereason\", \"negativereason:confidence\", \"airline\",\n", 269 | " \"airline_sentiment_gold\", \"name\", \"negativereason_gold\", \"retweet_count\", \n", 270 | " \"tweet_coord\", \"tweet_created\", \"tweet_id\", \"tweet_location\", \"user_timezone\"],\n", 271 | " accuracy=6,\n", 272 | " time=2,\n", 273 | " interpretability=5,\n", 274 | " enable_gpus=True,\n", 275 | " config_overrides=\"\"\"\n", 276 | " enable_tensorflow='on'\n", 277 | " enable_tensorflow_charcnn='on'\n", 278 | " enable_tensorflow_textcnn='on'\n", 279 | " enable_tensorflow_textbigru='on'\n", 280 | " \"\"\"\n", 281 | ")" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 25, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "name": "stdout", 291 | "output_type": "stream", 292 | "text": [ 293 | "Modeling completed for model d272df9c-c466-11e9-b1a0-0242ac110002\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "print('Modeling completed for model ' + model.key)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 29, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "name": "stdout", 308 | "output_type": "stream", 309 | "text": [ 310 | "Logs available at ./test_preds.csv\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "logs = h2oai.download(model.log_file_path, '.')\n", 316 | "print('Logs available at', test_preds)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "We can download the predictions to the current folder." 
324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 28, 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "name": "stdout", 333 | "output_type": "stream", 334 | "text": [ 335 | "Test set predictions available at ./test_preds.csv\n" 336 | ] 337 | } 338 | ], 339 | "source": [ 340 | "test_preds = h2oai.download(model.test_predictions_path, '.')\n", 341 | "print('Test set predictions available at', test_preds)" 342 | ] 343 | } 344 | ], 345 | "metadata": { 346 | "kernelspec": { 347 | "display_name": "Python 3", 348 | "language": "python", 349 | "name": "python3" 350 | }, 351 | "language_info": { 352 | "codemirror_mode": { 353 | "name": "ipython", 354 | "version": 3 355 | }, 356 | "file_extension": ".py", 357 | "mimetype": "text/x-python", 358 | "name": "python", 359 | "nbconvert_exporter": "python", 360 | "pygments_lexer": "ipython3", 361 | "version": "3.6.5" 362 | } 363 | }, 364 | "nbformat": 4, 365 | "nbformat_minor": 2 366 | } 367 | -------------------------------------------------------------------------------- /driverlessai_experiments/nlp/custom_word2vec_embeddings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Pretrained Word Embeddings\n", 8 | "\n", 9 | "From Driverless AI version 1.7.0, text models can take in pretrained word embeddings through expert settings. There are several pre-trained word embeddings available in the open source domain like [Glove](https://nlp.stanford.edu/projects/glove/) and [Fasttext](https://fasttext.cc/docs/en/crawl-vectors.html). We can download these embeddings and use them in our models. These embeddings are trained on corpus like wikipedia, common crawl etc. \n", 10 | "\n", 11 | "We can also train our own embeddings on our domain dataset instead of using the publicly available ones. This one is particularly useful when there is a good amount of text data that is not tagged and want to use that information. This notebook is to help create custom pre-trained embeddings.\n", 12 | "\n", 13 | "The data used in this example is [US Airline Sentiment dataset](https://www.figure-eight.com/wp-content/uploads/2016/03/Airline-Sentiment-2-w-AA.csv) from [Figure Eight’s Data for Everyone](https://www.figure-eight.com/data-for-everyone/) library. The dataset is split into training and test with this [simple script](https://gist.github.com/woobe/bd79d9f4d7ea139c5d2eb4cf1de1e7db) and the train file is used for word embeddings creation. Please use your own text corpus inplace of this airline train file." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# Please enter the file name\n", 23 | "file_name = \"train_airline_sentiment.csv\"\n", 24 | "# Please enter the name of the text column\n", 25 | "col_name = \"text\"" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "Import the h2o module and H2OWord2vecEstimator" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "Checking whether there is an H2O instance running at http://localhost:54321 ..... 
not found.\n", 45 | "Attempting to start a local H2O server...\n", 46 | " Java Version: openjdk version \"11.0.1\" 2018-10-16; OpenJDK Runtime Environment 18.9 (build 11.0.1+13); OpenJDK 64-Bit Server VM 18.9 (build 11.0.1+13, mixed mode)\n", 47 | " Starting server from /Users/srk/envs/DS2/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar\n", 48 | " Ice root: /var/folders/db/49r_20s91bg8qhg08qf78x100000gn/T/tmp8m3vtkx0\n", 49 | " JVM stdout: /var/folders/db/49r_20s91bg8qhg08qf78x100000gn/T/tmp8m3vtkx0/h2o_srk_started_from_python.out\n", 50 | " JVM stderr: /var/folders/db/49r_20s91bg8qhg08qf78x100000gn/T/tmp8m3vtkx0/h2o_srk_started_from_python.err\n", 51 | " Server is running at http://127.0.0.1:54321\n", 52 | "Connecting to H2O server at http://127.0.0.1:54321 ... successful.\n" 53 | ] 54 | }, 55 | { 56 | "data": { 57 | "text/html": [ 58 | "
\n", 59 | "\n", 60 | "\n", 61 | "\n", 62 | "\n", 63 | "\n", 64 | "\n", 65 | "\n", 66 | "\n", 67 | "\n", 68 | "\n", 69 | "\n", 70 | "\n", 71 | "\n", 72 | "\n", 73 | "\n", 74 | "\n", 75 | "\n", 76 | "\n", 77 | "\n", 78 | "\n", 79 | "\n", 80 | "\n", 81 | "\n", 82 | "\n", 83 | "\n", 84 | "\n", 85 | "\n", 86 | "\n", 87 | "\n", 88 | "\n", 89 | "
H2O cluster uptime:01 secs
H2O cluster timezone:Asia/Kolkata
H2O data parsing timezone:UTC
H2O cluster version:3.24.0.4
H2O cluster version age:1 month and 24 days
H2O cluster name:H2O_from_python_srk_z7y5eb
H2O cluster total nodes:1
H2O cluster free memory:4 Gb
H2O cluster total cores:12
H2O cluster allowed cores:12
H2O cluster status:accepting new members, healthy
H2O connection url:http://127.0.0.1:54321
H2O connection proxy:None
H2O internal security:False
H2O API Extensions:Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4
Python version:3.6.5 final
" 90 | ], 91 | "text/plain": [ 92 | "-------------------------- ---------------------------------------------------\n", 93 | "H2O cluster uptime: 01 secs\n", 94 | "H2O cluster timezone: Asia/Kolkata\n", 95 | "H2O data parsing timezone: UTC\n", 96 | "H2O cluster version: 3.24.0.4\n", 97 | "H2O cluster version age: 1 month and 24 days\n", 98 | "H2O cluster name: H2O_from_python_srk_z7y5eb\n", 99 | "H2O cluster total nodes: 1\n", 100 | "H2O cluster free memory: 4 Gb\n", 101 | "H2O cluster total cores: 12\n", 102 | "H2O cluster allowed cores: 12\n", 103 | "H2O cluster status: accepting new members, healthy\n", 104 | "H2O connection url: http://127.0.0.1:54321\n", 105 | "H2O connection proxy:\n", 106 | "H2O internal security: False\n", 107 | "H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4\n", 108 | "Python version: 3.6.5 final\n", 109 | "-------------------------- ---------------------------------------------------" 110 | ] 111 | }, 112 | "metadata": {}, 113 | "output_type": "display_data" 114 | } 115 | ], 116 | "source": [ 117 | "import h2o\n", 118 | "h2o.init()\n", 119 | "from h2o.estimators.word2vec import H2OWord2vecEstimator" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "Import the dataset file. Please note that the input file should be a csv file with a valid header in the first line." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 3, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | "Parse progress: |█████████████████████████████████████████████████████████| 100%\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "df = h2o.import_file(file_name, header=1, sep=\",\")\n", 144 | "df = df[[col_name]].ascharacter()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Do some text preprocessing." 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 4, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "def tokenize(sentences):\n", 161 | " # tokenize the sentences\n", 162 | " tokenized = sentences.tokenize(\"\\\\W+\")\n", 163 | " # lower case the text column\n", 164 | " tokenized = tokenized.tolower()\n", 165 | " # filter out the sentences which has less than 2 characters or where text is missing\n", 166 | " tokenized = tokenized[(tokenized.nchar() >= 2) | (tokenized.isna()),:]\n", 167 | " return tokenized\n", 168 | "\n", 169 | "words = tokenize(df[col_name])" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "The next step is to build the word2vec model. We can also adjust the parameters of the word2vec mdoel. Please refer to the [documentation of H2oWord2vecEstimator](http://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/modeling.html#h2oword2vecestimator) for more details on the parameters. 
" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 5, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "Build word2vec model\n", 189 | "word2vec Model Build progress: |██████████████████████████████████████████| 100%\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "print(\"Build word2vec model\")\n", 195 | "w2v_model = H2OWord2vecEstimator(min_word_freq=3,\n", 196 | " vec_size=300,\n", 197 | " window_size=5,\n", 198 | " epochs=10,\n", 199 | " word_model=\"skip_gram\")\n", 200 | "w2v_model.train(training_frame=words)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "Save the word embeddings as text file. \n", 208 | "\n", 209 | "This file can be given as pre-trained word embedding input for Driverless AI. The option is present in `Expert Settings -> NLP -> Path to pretrained embeddings for TensorFlow NLP models` " 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 6, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "w2v_model.to_frame().as_data_frame().to_csv(\"w2vec.txt\", float_format='%.6f', sep=\" \", header=False, index=False)" 219 | ] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python 3", 225 | "language": "python", 226 | "name": "python3" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.6.5" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 2 243 | } 244 | -------------------------------------------------------------------------------- /driverlessai_experiments/nlp_timeseries/imgs/coffee.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/nlp_timeseries/imgs/coffee.gif -------------------------------------------------------------------------------- /driverlessai_experiments/nlp_timeseries/imgs/create_experiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/nlp_timeseries/imgs/create_experiment.png -------------------------------------------------------------------------------- /driverlessai_experiments/nlp_timeseries/imgs/mapbox.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/nlp_timeseries/imgs/mapbox.png -------------------------------------------------------------------------------- /driverlessai_experiments/nlp_timeseries/imgs/scpf_lb_progress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/nlp_timeseries/imgs/scpf_lb_progress.png -------------------------------------------------------------------------------- /driverlessai_experiments/nlp_timeseries/predictions/.gitkeep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/nlp_timeseries/predictions/.gitkeep -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | # Exclude specific files 2 | scratch.py 3 | 4 | # Exclude specific folders 5 | .idea 6 | data_fullts 7 | experiment_data 8 | tmp 9 | 10 | # Exclude files based on extensions 11 | *.jar 12 | *.csv 13 | *.pickle 14 | 15 | 16 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/01-generate-data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Commented. Enable for debugging 4 | # set -x 5 | 6 | force_overwrite=false 7 | current_dir="$(pwd)" 8 | script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 9 | conda_env_name="ts-pipeline-env" 10 | conda_env_def_file="environment.yml" 11 | ts_process_script="01_process_full_TS_csv.py" 12 | tmp_csv_file="temp.csv" 13 | fullts_data_directory="data_fullts" 14 | 15 | error_exit(){ 16 | echo "" 17 | echo "$1" 1>&2 18 | echo "" 19 | exit 1 20 | } 21 | 22 | print_usage(){ 23 | echo "Usage:" 24 | echo " bash $0 -d -o [-f | --force] [-h | --help]" 25 | echo "Options:" 26 | echo " -d Timeseries definition file. Must be JSON file." 27 | echo " -o Output file name. Will generate .csv, .pickle, and .svg files in ${fullts_data_directory} directory" 28 | echo " -f, --force Force overwrite of output file." 29 | echo " -h, --help Display usage information." 30 | echo "Details:" 31 | echo " Creates the master time series dataset for this pipeline demo. It simulates a larger database" 32 | echo " from which section of data will be extracted to train and then predict on" 33 | } 34 | 35 | check_or_download_tsimulus(){ 36 | if [[ ! -e tsimulus-cli.jar ]]; then 37 | local latest_tag=$(curl --silent 'https://api.github.com/repos/cetic/tsimulus-cli/releases/latest' | grep -Po '"tag_name": "\K.*?(?=")') 38 | curl https://github.com/cetic/tsimulus-cli/releases/download/"${latest_tag}"/tsimulus-cli.jar --o tsimulus-cli.jar -silent 39 | fi 40 | # finally check that the file does exist, or error out 41 | [[ -e "tsimulus-cli.jar" ]] || error_exit "Error downloading TSimulus CLI. Cannot continue" 42 | } 43 | 44 | generate_ts_data(){ 45 | # if flow reaches here, validation checks are assumed to be passed and output file is ok to overwrite if present 46 | java -jar tsimulus-cli.jar "${ts_def_file}" | tail -n +2 | sed -r 's/;/,/g' > "${fullts_data_directory}/${tmp_csv_file}" 47 | } 48 | 49 | check_create_condaenv(){ 50 | conda --version > /dev/null || error_exit "Conda required, please install miniconda or anaconada and configure PATH correctly." 
51 | local env_count=$(conda env list | grep "${conda_env_name}" | wc -l) 52 | if [[ "${env_count}" == 0 ]]; then 53 | # create conda environment from the yml file 54 | [[ -e "${conda_env_def_file}" ]] || error_exit "Conda environment creation file not found" 55 | conda env create -f "${conda_env_def_file}" || error_exit "Error creating conda environment" 56 | fi 57 | } 58 | 59 | process_ts_file(){ 60 | # if control reaches here, then conda environment is available 61 | [[ -e "${ts_process_script}" ]] || error_exit "Python script to process timeseries data not found" 62 | pushd "${fullts_data_directory}" > /dev/null && 63 | source activate "${conda_env_name}" && 64 | python "${script_dir}/${ts_process_script}" -i "${tmp_csv_file}" -o "${ts_out_file}" && 65 | mv "${tmp_csv_file}" "${ts_out_file}.csv" && 66 | conda deactivate && 67 | popd > /dev/null 68 | } 69 | 70 | parse_args_then_exec(){ 71 | # fail fast in case no parameters are passed 72 | [[ ! -z "${1}" ]] || { print_usage; error_exit "Timeseries definition file is mandatory"; } 73 | while [[ "$1" != "" ]]; do 74 | case "$1" in 75 | -d ) 76 | shift 77 | ts_def_file="$1" 78 | # error if such file does not exits 79 | [[ -e "${ts_def_file}" ]] || { print_usage; error_exit "Timeseries definition file does not exist"; } 80 | ;; 81 | -o ) 82 | shift 83 | ts_out_file="$1" 84 | ;; 85 | -f | --force ) 86 | force_overwrite=true 87 | ;; 88 | -h | --help ) 89 | print_usage 90 | exit 0 91 | ;; 92 | * ) 93 | print_usage 94 | error_exit "Error: Incorrect parameters passed" 95 | ;; 96 | esac 97 | shift 98 | done 99 | 100 | # If required parame 101 | [[ ! -z "${ts_def_file}" ]] || { print_usage; error_exit "Timeseries definition file is mandatory"; } 102 | [[ ! -z "${ts_out_file}" ]] || { print_usage; error_exit "Timeseries output file is mandatory"; } 103 | 104 | # check if output file exist. If exists, and overwrite option is not specified then show error 105 | if [[ -e "${fullts_data_directory}/${ts_out_file}.csv" || -e "${fullts_data_directory}/${ts_out_file}.pickle" ]] && [[ "${force_overwrite}" == false ]]; then 106 | print_usage 107 | error_exit "Cannot overwite existing file. Use -f option" 108 | fi 109 | 110 | # Make fullts_data directory if it does not exists. if, exists do nothing 111 | mkdir -p "${fullts_data_directory}" 112 | 113 | # check Java exists, if not exit with error 114 | java -version 2>/dev/null || error_exit "Java required. Please install java runtime" 115 | 116 | # check curl exists 117 | curl -V >/dev/null || error_exit "Curl required. Please install curl" 118 | 119 | # check tsimulus cli available, if not, download it 120 | check_or_download_tsimulus 121 | 122 | # generate Timeseries data based on the definition file 123 | generate_ts_data 124 | 125 | # Create conda environment if it does not exist 126 | check_create_condaenv 127 | 128 | # process the temp.csv file. 
Generate plots, save as feather for better read/write performance 129 | process_ts_file 130 | } 131 | 132 | main() { 133 | parse_args_then_exec $@ 134 | } 135 | 136 | main $@ -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/01_process_full_TS_csv.py: -------------------------------------------------------------------------------- 1 | import click 2 | import numpy as np 3 | import pandas as pd 4 | import seaborn as sns 5 | 6 | from pandas.plotting import register_matplotlib_converters 7 | 8 | 9 | @click.command() 10 | @click.option('-i', '--input', 'in_file', type=click.Path(exists=True), help='Input time series data file (csv)') 11 | @click.option('-o', '--output', 'output', type=click.STRING, help='Output file prefix.') 12 | def process(in_file, output): 13 | """ 14 | Process a time series file, create a plot, save the data as pickle. 15 | 16 | This function processes the time series csv file, provided as input. 17 | Creates a plot of the time series and saves it as output_plot.svg. 18 | It also converts the input csv file and stores it as output.pickle for faster processing. 19 | """ 20 | # Read csv to data frame. 21 | df = pd.read_csv(in_file, 22 | sep=',', 23 | names=['Timeslot', 'StoreID', 'Product', 'Sale'], 24 | parse_dates=['Timeslot'], 25 | infer_datetime_format=True) 26 | 27 | # Round Sale and convert from float to int64 28 | df['Sale'] = pd.Series.round(df['Sale']).apply(np.int64) 29 | df['StoreID'] = df['StoreID'].astype('category') 30 | df['Product'] = df['Product'].astype('category') 31 | 32 | # Set dataframe index to help easy slicing 33 | df.set_index('Timeslot', drop=False, inplace=True) 34 | 35 | # Create TS plots for each store id in a separate file 36 | register_matplotlib_converters() 37 | sns.set_context('notebook') 38 | 39 | sns.relplot(x='Timeslot', 40 | y='Sale', 41 | hue='StoreID', 42 | row='Product', 43 | kind='line', 44 | height=3, 45 | aspect=10, 46 | data=df).fig.savefig(output+'_plot.svg') 47 | 48 | # Store the file as pickle 49 | df.to_pickle(output+'.pickle') 50 | 51 | 52 | if __name__ == '__main__': 53 | process() 54 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/02-create-experiment-data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Commented. Enable for debugging 4 | # set -x 5 | 6 | current_dir="$(pwd)" 7 | script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 8 | conda_env_name="ts-pipeline-env" 9 | conda_env_def_file="environment.yml" 10 | process_script="02_extract_experiment_datasets.py" 11 | exp_data_dir_root="experiment_data" 12 | missing_data_percentage=0 13 | 14 | error_exit(){ 15 | echo "" 16 | echo "$1" 1>&2 17 | echo "" 18 | exit 1 19 | } 20 | 21 | print_usage(){ 22 | echo "Usage:" 23 | echo " bash $0 -i -s -e -g -t [-m ] [-h | --help]" 24 | echo "Options:" 25 | echo " -i Full time series dataset, created by 01-generate-data script. Provide .pickle file" 26 | echo " -s Starting date for Train data YYYY-MM-DD format. Train dataset will start from 00:00:00.000 hours for that date." 27 | echo " -e Ending date for Train data in YYYY-MM-DD format. Train dataset will include data for this date till 23:00:00 hours i.e. full 24 hour period." 28 | echo " -g Gap (in days) between last training date and first testing date." 29 | echo " -t Duration (in days) for which we are generating test data. 
It starts from gap days after the last date in train dataset." 30 | echo " -m Proportion of target data that is missing in both Training and Test dataset. Optional, defaults to 0." 31 | echo " -h, --help Display usage information." 32 | echo "Details:" 33 | echo " Creates train, gap and test datasets (csv and pickle) in the output directory. Also creates timeseries plots for train and test datasets. " 34 | echo " The output directory will be created in the format sYYYYMMDD-eYYYYMMDD-gdG-tdF-mMP, where" 35 | echo " - sYYYYMMDD-eYYYYMMDD is the training dataset start and end date" 36 | echo " - gdG is the gap duration" 37 | echo " - tdF is the test duration" 38 | echo " - mMP is proportion of missing data in Train and Test datasets" 39 | echo " When the script is executed with certain inputs which results in an output directory that already exists, no action is taken." 40 | } 41 | 42 | check_create_condaenv(){ 43 | conda --version > /dev/null || error_exit "Conda required, please install miniconda or anaconada and configure PATH correctly." 44 | local env_count=$(conda env list | grep "${conda_env_name}" | wc -l) 45 | if [[ "${env_count}" == 0 ]]; then 46 | # create conda environment from the yml file 47 | [[ -e "${conda_env_def_file}" ]] || error_exit "Conda environment creation file not found" 48 | conda env create -f "${conda_env_def_file}" || error_exit "Error creating conda environment" 49 | fi 50 | } 51 | 52 | process_ts_file(){ 53 | # if control reaches here, then conda environment is available 54 | [[ -e "${process_script}" ]] || error_exit "Python script to generate experiment data not found" 55 | pushd "${exp_data_dir_root}/${exp_data_dir}" > /dev/null && 56 | source activate "${conda_env_name}" && 57 | python "${script_dir}/${process_script}" -i "${script_dir}/${ts_full_data_file}" \ 58 | -s "${formatted_start_date}" \ 59 | -e "${formatted_end_date}" \ 60 | -g "${gap_duration}" \ 61 | -t "${test_duration}" \ 62 | -m "${missing_data_percentage}" && 63 | conda deactivate && 64 | popd > /dev/null 65 | } 66 | 67 | parse_args_then_exec(){ 68 | # fail fast in case no parameters are passed 69 | [[ ! -z "${1}" ]] || { print_usage; error_exit "Expected parameters not passed during script invocation"; } 70 | while [[ "$1" != "" ]]; do 71 | case "$1" in 72 | -i ) 73 | shift 74 | ts_full_data_file="$1" 75 | # If file exists, proceed; else print message and exit with error code 76 | [[ -f "${ts_full_data_file}" ]] || { print_usage; error_exit "Provided time series full data file does not exist."; } 77 | ;; 78 | -s ) 79 | shift 80 | start_date="$1" 81 | # convert input to expected date format and check with input. if they match, input is in expected format, so proceed ; else error 82 | formatted_start_date=$(date "+%F" -d "${start_date}" 2>/dev/null) 83 | [[ "${formatted_start_date}" == "${start_date}" ]] || { print_usage; error_exit "Invalid start date or date format. Use YYYY-MM-DD format."; } 84 | ;; 85 | -e ) 86 | shift 87 | end_date="$1" 88 | # error is date is not in the valid format 89 | formatted_end_date=$(date "+%F" -d "${end_date}" 2>/dev/null) 90 | [[ "${formatted_end_date}" == "${end_date}" ]] || { print_usage; error_exit "Invalid end date or date format. Use YYYY-MM-DD format."; } 91 | ;; 92 | -g ) 93 | shift 94 | gap_duration="$1" 95 | [[ "${gap_duration}" =~ ^[0-9]+$ ]] || { print_usage; error_exit "Gap duration (days) is expected to be an integer. 
If no gap is needed pass 0."; } 96 | ;; 97 | -t ) 98 | shift 99 | test_duration="$1" 100 | # error is date is not in the valid format 101 | [[ "${test_duration}" =~ ^[1-9][0-9]*$ ]] || { print_usage; error_exit "Test data duration (days) is expected to be a non-zero integer."; } 102 | ;; 103 | -m ) 104 | shift 105 | missing_data_percentage="$1" 106 | # error is date is not in the valid format 107 | [[ "${missing_data_percentage}" =~ ^[0-9]{1,2}$ ]] || { print_usage; error_exit "Proportion (%) of missing data to create in Train and Test datasets. Optional, defaults to 0."; } 108 | ;; 109 | -h | --help ) 110 | print_usage 111 | exit 0 112 | ;; 113 | * ) 114 | print_usage 115 | error_exit "Error: Incorrect parameters passed" 116 | ;; 117 | esac 118 | shift 119 | done 120 | 121 | # If required parameters are missing, print usage and exit 122 | [[ ! -z "${ts_full_data_file}" ]] || { print_usage; error_exit "Timeseries input data file is mandatory"; } 123 | [[ -f "${ts_full_data_file}" ]] || { print_usage; error_exit "Provided timeseries input data file is missing"; } 124 | [[ ! -z "${formatted_start_date}" ]] || { print_usage; error_exit "Training data start date is mandatory"; } 125 | [[ ! -z "${formatted_end_date}" ]] || { print_usage; error_exit "Training data end date is mandatory"; } 126 | [[ ! -z "${gap_duration}" ]] || { print_usage; error_exit "Gap duration is mandatory. If no gap, pass 0 as the value"; } 127 | [[ ! -z "${test_duration}" ]] || { print_usage; error_exit "Test data duration is mandatory"; } 128 | 129 | # Check if experiment data directory exists, if so dont proceed. If it does not exist, create it. 130 | exp_data_dir="s${formatted_start_date}-e${formatted_end_date}-gd${gap_duration}-td${test_duration}-m${missing_data_percentage}" 131 | [[ ! -d "${exp_data_dir_root}/${exp_data_dir}" ]] || error_exit "Experiment data directory ${exp_data_dir_root}/${exp_data_dir} already exists. No action taken." 132 | mkdir -p "${exp_data_dir_root}/${exp_data_dir}" 133 | 134 | # Create conda environment if it does not exist 135 | check_create_condaenv 136 | 137 | # process the temp.csv file. 
Generate plots, save as feather for better read/write performance 138 | process_ts_file 139 | } 140 | 141 | main() { 142 | parse_args_then_exec $@ 143 | } 144 | 145 | main $@ -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/02_extract_experiment_datasets.py: -------------------------------------------------------------------------------- 1 | import click 2 | import random 3 | 4 | import datetime as dt 5 | import numpy as np 6 | import pandas as pd 7 | import seaborn as sns 8 | 9 | from pandas.plotting import register_matplotlib_converters 10 | 11 | 12 | @click.command() 13 | @click.option('-i', '--input', 'input_pickle', type=click.Path(exists=True, 14 | file_okay=True, 15 | dir_okay=False, 16 | readable=True), 17 | required=True, 18 | help='Full time series dataset pickle file from which to extract experiment data.') 19 | @click.option('-s', '--start', 'train_start_date', 20 | required=True, 21 | type=click.DateTime(formats=['%Y-%m-%d']), 22 | help='Start date for training data.') 23 | @click.option('-e', '--end', 'train_end_date', 24 | required=True, 25 | type=click.DateTime(formats=['%Y-%m-%d']), 26 | help='End date for training data.') 27 | @click.option('-g', '--gap', 'gap_duration', 28 | required=True, 29 | type=click.INT, 30 | help='Gap (in days) between training and test data') 31 | @click.option('-t', '--test', 'test_duration', 32 | required=True, 33 | type=click.INT, 34 | help='Duration (in days) for the testing dataset.') 35 | @click.option('-m', '--missing', 'missing_data_percentage', 36 | default=0, 37 | required=False, 38 | type=click.INT, 39 | help='Proportion (in %) of missing data in train and test datasets. Optional, defaults to 0') 40 | def process(input_pickle, 41 | train_start_date, 42 | train_end_date, 43 | gap_duration, 44 | test_duration, 45 | missing_data_percentage): 46 | """ 47 | Creates train and test datasets (csv and pickle) in the output directory. 48 | Also creates timeseries plots for both the files. 49 | 50 | :param input_pickle: Full time series dataset pickle file from which to extract experiment data. 51 | :param train_start_date: Start date for training dataset 52 | :param train_end_date: End date for training datset 53 | :param gap_duration: Gap (in days) between training and testing dataset. 54 | :param test_duration: Duration (in days) of the testing dataset. 55 | :param missing_data_percentage: Proportion of missing data in train and test datasets. Optional, defaults to 0. 
56 | :return: None 57 | """ 58 | # Read the input data file 59 | df = pd.read_pickle(input_pickle) 60 | 61 | # Calculate data slice times 62 | train_end_date = train_end_date.replace(hour=23) 63 | gap_start_date = train_end_date + dt.timedelta(hours=1) 64 | gap_end_date = gap_start_date + dt.timedelta(days=gap_duration, hours=-1) 65 | test_start_date = gap_end_date + dt.timedelta(hours=1) 66 | test_end_date = test_start_date + dt.timedelta(days=test_duration, hours=-1) 67 | 68 | # Slice data 69 | train_df = df[train_start_date:train_end_date].copy() 70 | test_df = df[test_start_date:test_end_date].copy() 71 | 72 | # Add missing data 73 | if missing_data_percentage != 0: 74 | create_missing_data(train_df, missing_data_percentage, 3) 75 | create_missing_data(test_df, missing_data_percentage, 3) 76 | 77 | # Plot train and test data 78 | create_plots(train_df, 'train') 79 | create_plots(test_df, 'test') 80 | 81 | # Save as CSV and pickle 82 | save_datasets(train_df, 'train', as_csv=True, as_pickle=True) 83 | save_datasets(test_df, 'test', as_csv=True, as_pickle=True) 84 | 85 | # Handle gap data 86 | if gap_duration != 0: 87 | gap_df = df[gap_start_date:gap_end_date].copy() 88 | create_missing_data(gap_df, missing_data_percentage, 3) 89 | save_datasets(gap_df, 'gap', as_csv=True, as_pickle=True) 90 | 91 | def create_plots(data_frame, 92 | filename_prefix): 93 | """ 94 | Create timeseries plot for the passed dataframe 95 | 96 | :param data_frame: Input time series dataframe to plot 97 | :param filename_prefix: File name prefix. Generated file will be filename_prefix_plot.svg 98 | :return: None 99 | """ 100 | sns.relplot(x='Timeslot', 101 | y='Sale', 102 | hue='StoreID', 103 | row='Product', 104 | kind='line', 105 | height=3, 106 | aspect=10, 107 | data=data_frame).fig.savefig(filename_prefix+'_plot.svg') 108 | 109 | 110 | def create_missing_data(df, 111 | missing_data_percentage, 112 | target_col_index): 113 | """ 114 | Creates missing data in the target column specified by the index (target_col_index). 115 | Proportion of rows for which missing data is created is determined by missing_data_percentage 116 | 117 | :param df: Input time series dataframe to inject NaN into. 118 | :param missing_data_percentage: Proportion of rows to mark as missing target data 119 | :param target_col_index: Index of the column (target) in which to create missing data 120 | :return: None 121 | """ 122 | rows, _ = df.shape 123 | df.iloc[sorted(random.sample(range(rows), round(rows * missing_data_percentage/100))), target_col_index] = np.nan 124 | 125 | 126 | def save_datasets(df: pd.DataFrame, 127 | filename: str, 128 | as_pickle=True, 129 | as_csv=True): 130 | """ 131 | Saves the input dataframe as pickle and csv files, by default. 
132 | 133 | :param df: The dataframe to save 134 | :param filename: File name to save as, output file will be filename.csv and filename.pickle 135 | :param as_pickle: Flag to save file as pickle, by default True 136 | :param as_csv: Flag to save file as csv, by default True 137 | :return: None 138 | """ 139 | if as_pickle: 140 | df.to_pickle(filename+'.pickle') 141 | if as_csv: 142 | df.to_csv(filename+'.csv', 143 | sep=",", header=True, index=False) 144 | 145 | 146 | if __name__ == '__main__': 147 | # Set sns and matplotlib options 148 | register_matplotlib_converters() 149 | sns.set_context('notebook') 150 | 151 | # process the dataframe 152 | process() 153 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/03-default-experiment-configs.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_key": "", 3 | "testset_key": "", 4 | "validset_key": "", 5 | "target_col": "Sale", 6 | "fold_col": "", 7 | "weight_col": "", 8 | "orig_time_col": "Timeslot", 9 | "time_col": "Timeslot", 10 | "is_classification": false, 11 | "cols_to_drop": [], 12 | "enable_gpus": false, 13 | "seed": 1234, 14 | "accuracy": 6, 15 | "time": 3, 16 | "interpretability": 8, 17 | "scorer": "RMSE", 18 | "time_groups_columns": [ 19 | "Timeslot", 20 | "StoreID", 21 | "Product" 22 | ], 23 | "time_period_in_seconds": 3600, 24 | "num_prediction_periods": 24, 25 | "num_gap_periods": 0, 26 | "is_timeseries": true, 27 | "config_overrides": "recipe = \"auto\"\nenable_xgboost = \"auto\"\nenable_lightgbm = \"auto\"\nenable_rf = \"auto\"\nenable_glm = \"auto\"\nenable_tensorflow = \"off\"\nenable_rulefit = \"off\"\nenable_ftrl = \"off\"\nparameter_tuning_num_models = -1\nfixed_ensemble_level = -1\ncheck_distribution_shift = true\ndrop_features_distribution_shift_threshold_auc = 0.6\ntarget_transformer = \"auto\"\nenable_target_encoding = true\ntime_series_recipe = true\noverride_lag_sizes = \"\"\nprob_lag_non_targets = 0.1\nmake_python_scoring_pipeline = true\nmake_mojo_scoring_pipeline = false\nrulefit_max_num_rules = -1\nfeature_brain_level = 2\nquantile_imbalanced_sampling = false\nholiday_features = true\nseed = 1234\nforce_64bit_precision = false\nmin_num_rows = 100\nmax_orig_cols_selected = 10000\nnfeatures_max = -1\nfeature_evolution_data_size = 100000000\nfeature_engineering_effort = 5\nmax_feature_interaction_depth = 8\nmax_relative_cardinality = 0.95\nstring_col_as_text_threshold = 0.3\ntensorflow_max_epochs = 10\nenable_tensorflow_textcnn = false\nenable_tensorflow_textbigru = false\nenable_tensorflow_charcnn = false\ntensorflow_max_epochs_nlp = 2\nmin_dai_iterations = 0\nmax_nestimators = 3000\nmax_nestimators_feature_evolution_factor = 0.2\nmax_learning_rate = 0.5\nmax_cores = -1\nnum_gpus_per_model = 1\nnum_gpus_per_experiment = -1\ngpu_id_start = 0\ncompute_correlation = false\nhigh_correlation_value_to_report = 0.95\ndump_modelparams_every_scored_indiv = false\ndump_varimp_every_scored_indiv = false\ndetailed_traces = false\nconfig_overrides = \"\"\n" 28 | } 29 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/03-run-experiment.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Commented. 
Enable for debugging 4 | # set -x 5 | 6 | current_dir="$(pwd)" 7 | script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 8 | conda_env_name="ts-pipeline-env" 9 | conda_env_def_file="environment.yml" 10 | process_script="03_run_experiment.py" 11 | exp_data_dir_root="experiment_data" 12 | exp_run_dir_root="experiment_runs" 13 | exp_accuracy=1 14 | exp_time=1 15 | exp_interpretability=8 16 | exp_scorer="RMSE" 17 | cur_date=$(date +%y%m%d) 18 | temp_dir_name="run_${cur_date}_${BASHPID}" 19 | 20 | error_exit(){ 21 | echo "" 22 | echo "$1" 1>&2 23 | echo "" 24 | exit 1 25 | } 26 | 27 | print_usage(){ 28 | echo "Usage:" 29 | echo " bash $0 -d -c [-t | --test] [-h | --help]" 30 | echo "Options:" 31 | echo " -d Path (relative to this script) to the experiment data directory containing train.csv and test.csv files" 32 | echo " -c Path (relative to this script) to the default experiment config settings. Dataset details not needed in file." 33 | echo " -t, --test Include test dataset when executing the experiment (optional)." 34 | echo " -h, --help Display usage information." 35 | echo "Details:" 36 | echo " Executes an experiment on the Driverless AI server at DAI_HOST. The train dataset (train.csv) is obtained from " 37 | echo " the experiment_data_dir. Experiment configuration is obtained from experiment_config_file. The dataset key information" 38 | echo " in experiment_config_file can be left as it is. It will be obtained at runtime. " 39 | echo " " 40 | echo " The script expects below three environment variables to be set with Driverless AI connection information" 41 | echo " - DAI_HOST - Url where DAI is running. Include full URL till the port e.g. http://localhost:12345" 42 | echo " - DAI_USER - Username for connecting to Driverless AI" 43 | echo " - DAI_PASS - Password for the above user" 44 | echo " " 45 | echo " If the experiment completes successfully; python and mojo scoring pipelines are downloaded for the experiment. " 46 | } 47 | 48 | check_create_condaenv(){ 49 | conda --version > /dev/null || error_exit "Conda required, please install miniconda or anaconada and configure PATH correctly." 50 | local env_count=$(conda env list | grep "${conda_env_name}" | wc -l) 51 | if [[ "${env_count}" == 0 ]]; then 52 | # create conda environment from the yml file 53 | [[ -e "${conda_env_def_file}" ]] || error_exit "Conda environment creation file not found" 54 | conda env create -f "${conda_env_def_file}" || error_exit "Error creating conda environment" 55 | fi 56 | } 57 | 58 | run_experiment(){ 59 | # Make the temporary directory for this experiment run 60 | mkdir -p "${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}" && echo "Created temporary directory ${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}" 61 | # pushd this directory 62 | # call python file. Pass DAI credentials. full path for train,test datasets and config file. 
aLso pass project name 63 | # read read experiment.json and get the experiment key 64 | # popd 65 | # rename the temporary directory to the experiment key 66 | # if control reaches here, then conda environment is available 67 | [[ -e "${process_script}" ]] || error_exit "Python script to generate experiment data not found" 68 | pushd "${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}" > /dev/null && 69 | source activate "${conda_env_name}" && 70 | python "${script_dir}/${process_script}" -h "${dai_host}" \ 71 | -u "${dai_user}" \ 72 | -p "${dai_pass}" \ 73 | -d "${script_dir}/${exp_data_dir}/train.csv" \ 74 | -c "${script_dir}/${exp_config_file}" \ 75 | -j "${project_name}" \ 76 | ${include_test_data:+ -t "${script_dir}/${exp_data_dir}/test.csv"} && 77 | conda deactivate && 78 | popd > /dev/null 79 | 80 | # remove temp directory if experiment.json does not exist. 81 | [[ -f "${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}/experiment.json" ]] || { rm -rf "${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}"; } 82 | 83 | # if the experiment.json exists, get experiment key from the json and rename dir to the key 84 | if [[ -f "${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}/experiment.json" ]]; then 85 | exp_key=$(cat "${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}/experiment.json" | grep -Po '"key": "\K[a-z]*?(?=",)') 86 | if [[ ! -z "${exp_key}" ]]; then 87 | mv "${exp_data_dir}/${exp_run_dir_root}/${temp_dir_name}" "${exp_data_dir}/${exp_run_dir_root}/${exp_key}" 88 | fi 89 | fi 90 | } 91 | 92 | parse_args_then_exec(){ 93 | # fail fast in case no parameters are passed 94 | [[ ! -z "${1}" ]] || { print_usage; error_exit "Expected parameters not passed during script invocation"; } 95 | 96 | # fail fast if required environment variables are not defined; if defined get the values 97 | [[ ! -z "${DAI_HOST}" ]] || error_exit "Expected environment variable DAI_HOST is not defined." 98 | [[ ! -z "${DAI_USER}" ]] || error_exit "Expected environment variable DAI_USER is not defined." 99 | [[ ! -z "${DAI_PASS}" ]] || error_exit "Expected environment variable DAI_PASS is not defined." 100 | dai_host="${DAI_HOST}" 101 | dai_user="${DAI_USER}" 102 | dai_pass="${DAI_PASS}" 103 | 104 | 105 | while [[ "$1" != "" ]]; do 106 | case "$1" in 107 | -d ) 108 | shift 109 | exp_data_dir="$1" 110 | # If directory exists, proceed; else print message and exit with error code 111 | [[ -d "${exp_data_dir}" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_data_dir} does not exist."; } 112 | [[ -f "${exp_data_dir}/train.csv" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_data_dir} does not contain train.csv dataset."; } 113 | ;; 114 | -c ) 115 | shift 116 | exp_config_file="$1" 117 | # If directory exists, proceed; else print message and exit with error code 118 | [[ -f "${exp_config_file}" ]] || { print_usage; error_exit "Experiment configuration file ${script_dir}/${exp_config_file} does not exist."; } 119 | ;; 120 | -t | --test ) 121 | include_test_data="yes" 122 | ;; 123 | -h | --help ) 124 | print_usage 125 | exit 0 126 | ;; 127 | * ) 128 | print_usage 129 | error_exit "Error: Incorrect parameters passed" 130 | ;; 131 | esac 132 | shift 133 | done 134 | # check if needed parameters are provided 135 | [[ ! -z "${exp_data_dir}" ]] || { print_usage; error_exit "Experiment data directory is mandatory"; } 136 | [[ ! 
-z "${exp_config_file}" ]] || { print_usage; error_exit "Experiment config file is mandatory"; } 137 | 138 | # if test data is to be included check if file exists 139 | if [[ "${include_test_data}" == "yes" ]]; then 140 | [[ -f "${exp_data_dir}/test.csv" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_data_dir} does not contain test.csv dataset."; } 141 | fi 142 | 143 | # setup project_name from exp_data_dir 144 | project_name=$(basename ${exp_data_dir}) 145 | 146 | # Create conda environment if it does not exist 147 | check_create_condaenv 148 | 149 | run_experiment 150 | } 151 | 152 | main() { 153 | parse_args_then_exec $@ 154 | } 155 | 156 | main $@ 157 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/03_run_experiment.py: -------------------------------------------------------------------------------- 1 | import click 2 | import json 3 | import os 4 | 5 | import h2oai_client as h2o 6 | 7 | @click.command() 8 | @click.option('-h', '--host', 'dai_host', 9 | required=True, 10 | help='Driverless AI host url e.g http://hostname:12345') 11 | @click.option('-u', '--user', 'dai_user', 12 | required=True, 13 | help='Driverless AI username') 14 | @click.option('-p', '--pass', 'dai_pass', 15 | required=True, 16 | help='Driverless AI password') 17 | @click.option('-d', '--train', 'train_ds', type=click.Path(exists=True, 18 | file_okay=True, 19 | dir_okay=False, 20 | readable=True), 21 | required=True, 22 | help='Training dataset CSV file path.') 23 | @click.option('-c', '--config', 'exp_config', type=click.Path(exists=True, 24 | file_okay=True, 25 | dir_okay=False, 26 | readable=True), 27 | required=True, 28 | help='Default experiment config file.') 29 | @click.option('-j', '--project', 'project_name', 30 | required=True, 31 | help='Project name to use for organizing the experiment. If does not exist, new project is created.') 32 | @click.option('-t', '--test', 'test_ds', type=click.Path(exists=True, 33 | file_okay=True, 34 | dir_okay=False, 35 | readable=True), 36 | required=False, 37 | default=None, 38 | help='Testing dataset CSV file path.') 39 | def process(dai_host, 40 | dai_user, 41 | dai_pass, 42 | train_ds, 43 | exp_config, 44 | project_name, 45 | test_ds): 46 | """ 47 | 48 | :param dai_host: Driverless AI host URL e.g. 
http://localhost:12345 49 | :param dai_user: Driverless AI user name 50 | :param dai_pass: Driverless AI password 51 | :param train_ds: path to training dataset csv file 52 | :param exp_config: path to experiment config json file 53 | :param project_name: Project name to organize datasets and experiments 54 | :param test_ds: path to testing dataset csv file (optional) 55 | :return: None 56 | """ 57 | # print all the passed parameters 58 | # import inspect 59 | # _, _, _, values = inspect.getargvalues(inspect.currentframe()) 60 | # print(values) 61 | 62 | # Create a connection to Driverless AI 63 | con = h2o.Client(address=dai_host, 64 | username=dai_user, 65 | password=dai_pass) 66 | 67 | # Get project key 68 | project_key = get_project_key(con, project_name) 69 | 70 | # Upload datasets and link to project 71 | test_ds_key = None 72 | train_ds_key = upload_dataset_to_project(con, project_key, train_ds, "Training") 73 | if test_ds is not None: 74 | test_ds_key = upload_dataset_to_project(con, project_key, test_ds, "Testing") 75 | 76 | # Read experiment config file and overwrite needed configs, save the config on file system 77 | with open(exp_config, 'r') as read_file: 78 | experiment_configs = json.load(read_file) 79 | experiment_configs['dataset_key'] = train_ds_key 80 | if test_ds_key is not None: 81 | experiment_configs['testset_key'] = test_ds_key 82 | with open('experiment-config.json', 'w') as write_file: 83 | json.dump(experiment_configs, write_file, indent=4) 84 | 85 | # Execute the experiment, link to project 86 | experiment: h2o.Model = con.start_experiment_sync(**experiment_configs) 87 | con.link_experiment_to_project(project_key,experiment.key) 88 | 89 | # build mojo pipeline 90 | mojo: h2o.MojoPipeline = con.build_mojo_pipeline_sync(experiment.key) 91 | 92 | # download mojo and python scoring pipelines and experiment summary 93 | con.download(experiment.scoring_pipeline_path, "") 94 | con.download(experiment.summary_path, "") 95 | con.download(mojo.file_path, "") 96 | 97 | # Finally save experiment.json 98 | with open('experiment.json', 'w') as write_file: 99 | json.dump(experiment.dump(), write_file, indent=4) 100 | 101 | 102 | 103 | def upload_dataset_to_project(con: h2o.Client, 104 | project_key: str, 105 | dataset_file: str, 106 | dataset_type: str): 107 | """ 108 | Uploads the data provided in dataset_file path to Driverless AI and links to the project. If the project already 109 | has a dataset of the specified type and filename linked, then it is not re-uploaded. For the uploaded dataset, the 110 | dataset_key of the newly uploaded dataset is returned. If it is not uploaded, then key of the dataset matching the 111 | file name is returned. 
112 | 113 | :param con: Connection to H2O Driverless AI 114 | :param project_key: Key of the project to link the dataset to 115 | :param dataset_file: File path of the dataset to upload and link to project 116 | :param dataset_type: Either 'Training' or 'Testing' 117 | :return: dataset_key 118 | """ 119 | file_name = os.path.basename(dataset_file) 120 | datasets = con.get_datasets_for_project(project_key, dataset_type) 121 | dataset = next((x for x in datasets if x.name == file_name), None) 122 | if dataset is None: 123 | dataset = con.upload_dataset_sync(file_path=dataset_file) 124 | con.link_dataset_to_project(project_key=project_key, 125 | dataset_key=dataset.key, 126 | dataset_type=dataset_type) 127 | return dataset.key 128 | 129 | 130 | def get_project_key(con: h2o.Client, 131 | project_name: str) -> str: 132 | """ 133 | Returns the key of the project with name matching project_name. If such a project does not exist, a new project is 134 | created and its key is returned. 135 | 136 | :param con: Client to H2O Driverless AI 137 | :param project_name: Name of the project 138 | :return: 139 | """ 140 | projects = con.list_projects(offset=0, limit=1000) 141 | project = next((x for x in projects if x.name == project_name), None) 142 | if project is None: 143 | key = con.create_project(project_name, project_name) 144 | return key 145 | return project.key 146 | 147 | 148 | if __name__ == '__main__': 149 | # Call the main processing function 150 | process() 151 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/04-create-tta-scoring-files.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Commented. Enable for debugging 4 | # set -x 5 | 6 | current_dir="$(pwd)" 7 | script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 8 | conda_env_name="ts-pipeline-env" 9 | conda_env_def_file="environment.yml" 10 | process_script="04_generate_tta_files.py" 11 | exp_data_dir_root="experiment_data" 12 | exp_data_dir_regex="s([0-9-]+)-e([0-9-]+)-gd([0-9]+)-td([0-9]+)-m[0-9]+" 13 | tta_dir_prefix="tta-scoring-data" 14 | predict_duration=24 # daily 15 | roll_duration=1 # hourly 16 | 17 | error_exit(){ 18 | echo "" 19 | echo "$1" 1>&2 20 | echo "" 21 | exit 1 22 | } 23 | 24 | print_usage(){ 25 | echo "Usage:" 26 | echo " bash $0 -i [-p ] [-r ] [-h | --help]" 27 | echo "Options:" 28 | echo " -i Experiment data directory containing train, gap, and test csv and pickle files" 29 | echo " -p Duration (in hours) of data to predict in each scoring data frame. Optional, defaults to 24 hours i.e 1 day" 30 | echo " -r Duration (in hours) by which to roll the data window and score for next predict duration. Optional, defaults to 1 hour" 31 | echo " -h, --help Display usage information." 32 | echo "Details:" 33 | echo " Creates TTA and rolling window based scoring dataframes (csv and pickle) in the output directory." 34 | echo " The output directory will be created in the format tta-scoring-data-pdP-rdR, where" 35 | echo " - pdP is the predict duration" 36 | echo " - rdR is the rolling duration" 37 | echo " The output directory will be created as a subdirectory of " 38 | echo " When the script is executed with certain inputs which results in an output directory that already exists, no action is taken." 39 | } 40 | 41 | parse_args_then_exec(){ 42 | # fail fast in case no parameters are passed 43 | [[ ! 
-z "${1}" ]] || { print_usage; error_exit "Expected parameters not passed during script invocation"; } 44 | while [[ "$1" != "" ]]; do 45 | case "$1" in 46 | -i ) 47 | shift 48 | exp_data_dir="$1" 49 | # If directory exists, proceed; else print message and exit with error code 50 | [[ -d "${exp_data_dir}" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_data_dir} does not exist."; } 51 | [[ -f "${exp_data_dir}/train.pickle" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_data_dir} does not contain train.pickle dataset."; } 52 | [[ -f "${exp_data_dir}/test.pickle" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_data_dir} does not contain test.pickle dataset."; } 53 | ;; 54 | -p ) 55 | shift 56 | predict_duration="$1" 57 | # error is date is not in the valid format 58 | [[ "${predict_duration}" =~ ^[1-9][0-9]*$ ]] || { print_usage; error_exit "Predict duration (hours) is expected to be a non-zero integer."; } 59 | ;; 60 | -r ) 61 | shift 62 | roll_duration="$1" 63 | # error is date is not in the valid format 64 | [[ "${roll_duration}" =~ ^[1-9][0-9]*$ ]] || { print_usage; error_exit "Roll duration (hours) is expected to be a non-zero integer."; } 65 | ;; 66 | -h | --help ) 67 | print_usage 68 | exit 0 69 | ;; 70 | * ) 71 | print_usage 72 | error_exit "Error: Incorrect parameters passed" 73 | ;; 74 | esac 75 | shift 76 | done 77 | 78 | # If required parameters are missing, print usage and exit 79 | [[ ! -z "${exp_data_dir}" ]] || { print_usage; error_exit "Experiment data directory is mandatory"; } 80 | 81 | # Check if experiment data directory is in the correct format 82 | exp_data_dir_base=$(basename ${exp_data_dir}) 83 | [[ ${exp_data_dir_base} =~ ${exp_data_dir_regex} ]] || { error_exit "Experiment data directory ${exp_data_dir_base} is not in the correct format."; } 84 | 85 | # Extract information from data directory name 86 | start_date=${BASH_REMATCH[1]} 87 | end_date=${BASH_REMATCH[2]} 88 | gap_duration=${BASH_REMATCH[3]} 89 | test_duration=${BASH_REMATCH[4]} 90 | 91 | # Generate tta directory 92 | tta_dir="${tta_dir_prefix}-pd${predict_duration}-rd${roll_duration}" 93 | [[ ! -d "${exp_data_dir_root}/${exp_data_dir_base}/${tta_dir}" ]] || error_exit "TTA data directory ${exp_data_dir_root}/${exp_data_dir}/${tta_dir} already exists. No action taken." 94 | mkdir -p "${exp_data_dir_root}/${exp_data_dir_base}/${tta_dir}/score" 95 | mkdir -p "${exp_data_dir_root}/${exp_data_dir_base}/${tta_dir}/predicted" 96 | 97 | # Create conda environment if it does not exist 98 | check_create_condaenv 99 | 100 | generate_tta_scoring_files 101 | } 102 | 103 | check_create_condaenv(){ 104 | conda --version > /dev/null || error_exit "Conda required, please install miniconda or anaconada and configure PATH correctly." 
105 | local env_count=$(conda env list | grep "${conda_env_name}" | wc -l) 106 | if [[ "${env_count}" == 0 ]]; then 107 | # create conda environment from the yml file 108 | [[ -e "${conda_env_def_file}" ]] || error_exit "Conda environment creation file not found" 109 | conda env create -f "${conda_env_def_file}" || error_exit "Error creating conda environment" 110 | fi 111 | } 112 | 113 | generate_tta_scoring_files(){ 114 | # if control reaches here, then conda environment is available 115 | [[ -e "${process_script}" ]] || error_exit "Python script to generate experiment data not found" 116 | pushd "${exp_data_dir_root}/${exp_data_dir_base}" > /dev/null && 117 | source activate "${conda_env_name}" && 118 | python "${script_dir}/${process_script}" -o "${tta_dir}" \ 119 | -s "${start_date}" \ 120 | -e "${end_date}" \ 121 | -g "${gap_duration}" \ 122 | -t "${test_duration}" \ 123 | -p "${predict_duration}" \ 124 | -r "${roll_duration}" && 125 | conda deactivate && 126 | popd > /dev/null 127 | } 128 | 129 | main() { 130 | parse_args_then_exec $@ 131 | } 132 | 133 | main $@ -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/04_generate_tta_files.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | import datetime as dt 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | @click.command() 9 | @click.option('-o', '--outdir', 'tta_dir', type=click.Path(exists=True, 10 | file_okay=False, 11 | dir_okay=True, 12 | readable=True, 13 | writable=True), 14 | required=True, 15 | help='Output data directory where the TTA scoring files will be generated.') 16 | @click.option('-s', '--start', 'train_start_date', 17 | required=True, 18 | type=click.DateTime(formats=['%Y-%m-%d']), 19 | help='Start date for training data.') 20 | @click.option('-e', '--end', 'train_end_date', 21 | required=True, 22 | type=click.DateTime(formats=['%Y-%m-%d']), 23 | help='End date for training data.') 24 | @click.option('-g', '--gap', 'gap_duration', 25 | required=True, 26 | type=click.INT, 27 | help='Gap (in days) between training and test data') 28 | @click.option('-t', '--test', 'test_duration', 29 | required=True, 30 | type=click.INT, 31 | help='Duration (in days) for the testing dataset.') 32 | @click.option('-p', '--predict', 'predict_duration', 33 | required=True, 34 | type=click.INT, 35 | help='Duration (in hours) of data to predict in each scoring data frame.') 36 | @click.option('-r', '--roll', 'roll_duration', 37 | required=True, 38 | type=click.INT, 39 | help='Duration (in hours) by which to roll the data window for the next scoring cycle.') 40 | def process(tta_dir, 41 | train_start_date, 42 | train_end_date, 43 | gap_duration, 44 | test_duration, 45 | predict_duration, 46 | roll_duration): 47 | """ 48 | Creates TTA (test time augmentation) and rolling window based scoring dataframes from the test data 49 | in the output directory. These scoring files can then be passed to Driverless AI Scoring module for 50 | scoring. 51 | 52 | :param tta_dir: Output directory to create the TTA scoring data files. 53 | :param train_start_date: Start date for training dataset 54 | :param train_end_date: End date for training datset 55 | :param gap_duration: Gap (in days) between training and testing dataset. 56 | :param test_duration: Duration (in days) of the testing dataset. 57 | :param predict_duration: Duration (in hours) for which we are predicting in each scoring call. 
58 | :param roll_duration: Duration (in hours) by which to roll the data window fo the next scoring call. 59 | :return: None 60 | """ 61 | # Note the shell wrapper is taking care of changing to the appropriate data directory, so the train, gap and test 62 | # files will be in the current directory. The TTA file directory can be created here 63 | 64 | train_end_date = train_end_date.replace(hour=23) 65 | gap_start_date = train_end_date + dt.timedelta(hours=1) 66 | gap_end_date = gap_start_date + dt.timedelta(days=gap_duration, hours=-1) 67 | test_start_date = gap_end_date + dt.timedelta(hours=1) 68 | test_end_date = test_start_date + dt.timedelta(days=test_duration, hours=-1) 69 | 70 | rolling_slots = get_tta_scoring_slots(gap_start_date, gap_end_date, 71 | test_start_date, test_end_date, 72 | predict_duration, roll_duration) 73 | 74 | # Read the dataframes. 75 | df = pd.read_pickle('test.pickle') 76 | if gap_duration > 0: 77 | gap_df = pd.read_pickle('gap.pickle') 78 | df = pd.concat([gap_df, df]) 79 | 80 | for slot in rolling_slots: 81 | tta_df = df[slot['tta_start']:slot['tta_end']].copy() 82 | score_df = df[slot['score_start']:slot['score_end']].copy() 83 | score_df['Sale'] = np.nan 84 | bind_df = pd.concat([tta_df, score_df]) 85 | file_name = f"{slot['roll_counter_str']}-ss{slot['score_start']}-se{slot['score_end']}" 86 | save_datasets(bind_df, 87 | tta_dir + "/score/" + file_name, 88 | as_csv=True, 89 | as_pickle=False) 90 | 91 | 92 | #%% Define another function 93 | def get_tta_scoring_slots(gs: dt.datetime, 94 | ge: dt.datetime, 95 | ts: dt.datetime, 96 | te: dt.datetime, 97 | pd: int, 98 | rd: int): 99 | """ 100 | Print the TTA scoring info in the following format 101 | TNNNN-ScoreTime-TTAstarttime-TTAendtime-PRDstarttime-PRDendtime 102 | :param gs: Gap start 103 | :param ge: Gap end 104 | :param ts: Test Start 105 | :param te: Test end 106 | :param pd: Predict Duration (hours) should be > 0 107 | :param rd: Roll Duration (hours) should be > 0 108 | :return: List of dicts containing the tta slot information 109 | """ 110 | slots_list = [] 111 | if ge > gs: 112 | tta_start = gs 113 | else: 114 | tta_start = ts 115 | score_pointer = ts 116 | roll_counter = 0 117 | while score_pointer <= te - dt.timedelta(hours=pd-1): 118 | tta_end = tta_start + dt.timedelta(hours=(roll_counter-1)*rd) 119 | score_start = score_pointer 120 | score_end = score_pointer + dt.timedelta(hours=pd-1) 121 | d = { 122 | 'roll_counter': roll_counter, 123 | 'roll_counter_str': f"{roll_counter:05d}", 124 | 'tta_start': tta_start, 125 | 'tta_end': tta_end, 126 | 'score_start': score_start, 127 | 'score_end': score_end 128 | } 129 | slots_list.append(d) 130 | score_pointer = score_pointer + dt.timedelta(hours=rd) 131 | roll_counter = roll_counter + 1 132 | return slots_list 133 | 134 | 135 | def save_datasets(df: pd.DataFrame, 136 | filename: str, 137 | as_pickle=True, 138 | as_csv=True): 139 | """ 140 | Saves the input dataframe as pickle and csv files, by default. 
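    In this pipeline the TTA step calls it with as_csv=True and as_pickle=False, so each
    rolling-window scoring frame is written out as a single CSV file.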
141 | 142 | :param df: The dataframe to save 143 | :param filename: File name to save as, output file will be filename.csv and filename.pickle 144 | :param as_pickle: Flag to save file as pickle, by default True 145 | :param as_csv: Flag to save file as csv, by default True 146 | :return: None 147 | """ 148 | if as_pickle: 149 | df.to_pickle(filename+'.pickle') 150 | if as_csv: 151 | df.to_csv(filename+'.csv', 152 | sep=",", header=True, index=False) 153 | 154 | 155 | if __name__ == '__main__': 156 | # Set sns and matplotlib options 157 | 158 | # process the dataframe 159 | process() 160 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/05-score-tta-files.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Commented. Enable for debugging 4 | # set -x 5 | 6 | current_dir="$(pwd)" 7 | script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 8 | process_script="05_score_tta_files.py" 9 | exp_data_dir_root="experiment_data" 10 | tta_dir_prefix="tta-scoring-data" 11 | use_pipeline="python" 12 | use_method="module" 13 | 14 | error_exit(){ 15 | echo "" 16 | echo "$1" 1>&2 17 | echo "" 18 | exit 1 19 | } 20 | 21 | print_usage(){ 22 | echo "Usage:" 23 | echo " bash $0 -e -s [-p ] [-m ] [-h | --help]" 24 | echo "Options:" 25 | echo " -e Experiment run directory containing scorer.zip. Will have same name as experiment in Driverless AI" 26 | echo " -s TTA scoring data directory created in step 04. Name will start with ${tta_dir_prefix}" 27 | echo " -p Optional, defaults to python. Use Driverless AI Python or Mojo (Java) pipeline for scoring" 28 | echo " -m Optional, defaults to module. Score using python module in code or using HTTP JSON or DataFrame API endpoint" 29 | echo " -h, --help Display usage information." 30 | echo "Details:" 31 | echo " Scores the files in scoring data directory using the scoring pipeline for selected experiment. Also creates the necessary" 32 | echo " environments with dependencies for the scoring pipeline to work." 33 | echo " Scoring files will be picked from the 'score' sub-directory of selected scoring data directory." 34 | echo " Output files will be generated in the 'predicted' sub-directory of selected scoring data directory." 35 | echo " Scoring method 'api' sends the prediction dataframe as JSON to API server for batch scoring; 'api2' uses base64 encoded Pandas DataFrame" 36 | } 37 | 38 | parse_args_validate_then_exec(){ 39 | # fail fast in case no parameters are passed 40 | [[ ! 
-z "${1}" ]] || { print_usage; error_exit "Expected parameters not passed during script invocation"; } 41 | while [[ "$1" != "" ]]; do 42 | case "$1" in 43 | -e ) 44 | shift 45 | exp_run_dir="$1" 46 | # If directory exists, proceed; else print message and exit with error code 47 | [[ -d "${exp_run_dir}" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_run_dir} does not exist."; } 48 | [[ -f "${exp_run_dir}/experiment.json" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_run_dir} does not contain experiment.json file."; } 49 | [[ -f "${exp_run_dir}/experiment-config.json" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_run_dir} does not contain experiment-config.json."; } 50 | experiment_name=$(basename "${exp_run_dir}") 51 | ;; 52 | -s ) 53 | shift 54 | scoring_data_dir="$1" 55 | # If directory exists, proceed; else print message and exit with error code 56 | [[ -d "${scoring_data_dir}/score" ]] || { print_usage; error_exit "Scoring data directory ${script_dir}/${scoring_data_dir}/score does not exist."; } 57 | files_to_score=$(ls "${scoring_data_dir}/score" | wc -l) 58 | [[ "${files_to_score}" -gt "0" ]] || { print_usage; error_exit "No files to score in scoring data directory ${script_dir}/${scoring_data_dir}/score."; } 59 | ;; 60 | -p ) 61 | shift 62 | use_pipeline="$1" 63 | [[ "${use_pipeline}" =~ ^(python|mojo)$ ]] || { print_usage; error_exit "Incorrect pipeline option. Only 'python' and 'mojo' are supported."; } 64 | ;; 65 | -m ) 66 | shift 67 | use_method="$1" 68 | [[ "${use_method}" =~ ^(module|api|api2)$ ]] || { print_usage; error_exit "Incorrect method option. Only 'module' and 'api' are supported."; } 69 | ;; 70 | -h | --help ) 71 | print_usage 72 | exit 0 73 | ;; 74 | * ) 75 | print_usage 76 | error_exit "Error: Incorrect parameters passed" 77 | ;; 78 | esac 79 | shift 80 | done 81 | 82 | 83 | # If required parameters are missing, print usage and exit 84 | [[ ! -z "${exp_run_dir}" ]] || { print_usage; error_exit "Experiment run directory is mandatory"; } 85 | [[ ! -z "${scoring_data_dir}" ]] || { print_usage; error_exit "Scoring data directory is mandatory"; } 86 | 87 | # Check if experiment run dir has required pipeline.zip file based on the selected pipeline option 88 | case "${use_pipeline}" in 89 | python ) 90 | [[ -f "${exp_run_dir}/scorer.zip" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_run_dir} does not contain python scoring pipeline scorer.zip."; } 91 | ;; 92 | mojo ) 93 | [[ -f "${exp_run_dir}/mojo.zip" ]] || { print_usage; error_exit "Experiment data directory ${script_dir}/${exp_run_dir} does not contain mojo scoring pipeline mojo.zip."; } 94 | error_exit "Mojo pipeline option not yet supported for Test Time Augmentation scoring for Time Series experiments. Please use python type" 95 | ;; 96 | * ) 97 | print_usage 98 | error_exit "Incorrect pipeline option, only 'python' and 'mojo' are supported" 99 | ;; 100 | esac 101 | 102 | # Check prediction duration to tta scoring data and experiment match. 
103 | # The prediction duration used in step 04 (TTA scoring file generation) should match the prediction duration (num_prediction_periods) used when the experiment was trained 104 | scoring_data_dir_base=$(basename "${scoring_data_dir}") 105 | scoring_data_dir_regex="tta-scoring-data-pd([0-9-]+)-rd([0-9-]+)" 106 | [[ "${scoring_data_dir_base}" =~ ${scoring_data_dir_regex} ]] || { error_exit "Scoring data directory ${scoring_data_dir_base} is not in the correct format."; } 107 | scoring_data_predict_duration=${BASH_REMATCH[1]} 108 | exp_config_predict_duration=$(cat "${exp_run_dir}/experiment-config.json" | grep -P -o '"num_prediction_periods": \K([0-9]+)') 109 | [[ "${scoring_data_predict_duration}" -eq "${exp_config_predict_duration}" ]] || { error_exit "Prediction duration mismatch. Experiment: ${exp_config_predict_duration}, Scoring Data: ${scoring_data_predict_duration}"; } 110 | 111 | # Check that the predicted directory does not already contain scored files for this experiment 112 | if [[ -d "${scoring_data_dir}/predicted/${experiment_name}" ]]; then 113 | files_scored=$(ls "${scoring_data_dir}/predicted/${experiment_name}" | wc -l) 114 | [[ "${files_scored}" -eq "0" ]] || { print_usage; error_exit "Scored files already exist in directory ${script_dir}/${scoring_data_dir}/predicted/${experiment_name}."; } 115 | fi 116 | 117 | # Check that experiment data dir is common for experiment and tta scoring data 118 | # Get experiment data directory from the experiment run directory 119 | experiment_data_dir_regex="^([0-9a-z_/\-]+)/experiment_runs/.*" 120 | [[ "${exp_run_dir}" =~ ${experiment_data_dir_regex} ]] || { error_exit "Experiment run directory ${exp_run_dir} is not in the correct format."; } 121 | experiment_data_dir=${BASH_REMATCH[1]} 122 | # Get experiment data directory from the scoring data directory 123 | score_experiment_data_dir_regex="^([0-9a-z_/\-]+)/tta-scoring-data.*" 124 | [[ "${scoring_data_dir}" =~ ${score_experiment_data_dir_regex} ]] || { error_exit "Scoring data directory ${scoring_data_dir} is not in the correct format."; } 125 | score_experiment_data_dir=${BASH_REMATCH[1]} 126 | # Ensure they are the same 127 | [[ "${experiment_data_dir}" == "${score_experiment_data_dir}" ]] || { error_exit "Experiment Run and Scoring data do not have the same experiment data directory."; } 128 | 129 | 130 | # Create conda environment if it does not exist 131 | check_create_condaenv 132 | 133 | case "${use_method}" in 134 | module ) 135 | score_tta_files_using_module 136 | ;; 137 | api ) 138 | score_tta_files_using_api 139 | ;; 140 | api2 ) 141 | score_tta_files_using_api2 142 | ;; 143 | * ) 144 | print_usage 145 | error_exit "Incorrect method option, only 'module', 'api' and 'api2' are supported" 146 | ;; 147 | esac 148 | 149 | } 150 | 151 | check_create_condaenv(){ 152 | conda --version > /dev/null || error_exit "Conda required, please install miniconda or anaconda and configure PATH correctly." 153 | unzip -v > /dev/null || error_exit "Unzip required, please install unzip."
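    # The conda environment name is not hard-coded here; after scorer.zip is unpacked below, it is
    # read from the "name:" field of scoring-pipeline/environment.yml (via grep -P and \K), so the
    # same environment that ships with the scoring pipeline is created and reused for scoring.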
154 | # check if scoring-pipeline is already unzipped, if not unzip it 155 | [[ -d "${exp_run_dir}/scoring-pipeline" ]] || { pushd ${exp_run_dir} > /dev/null && unzip scorer.zip && popd > /dev/null; } 156 | conda_env_name=$(grep -P -o 'name: \K([a-z2_]+)' "${exp_run_dir}/scoring-pipeline/environment.yml") 157 | local env_count=$(conda env list | grep "${conda_env_name}" | wc -l) 158 | if [[ "${env_count}" == 0 ]]; then 159 | # create conda environment from the yml file 160 | [[ -e "${exp_run_dir}/scoring-pipeline/environment.yml" ]] || error_exit "Conda environment creation file not found" 161 | conda env create -f "${exp_run_dir}/scoring-pipeline/environment.yml" || error_exit "Error creating conda environment" 162 | source activate "${conda_env_name}" && 163 | conda install -y -c conda-forge click tqdm starlette uvicorn && 164 | conda deactivate 165 | fi 166 | } 167 | 168 | score_tta_files_using_module(){ 169 | # if control reaches here, then conda environment is available 170 | [[ -e "${process_script}" ]] || error_exit "Python script ${process_script} data not found" 171 | pushd "${scoring_data_dir}" > /dev/null && 172 | source activate "${conda_env_name}" && 173 | python "${script_dir}/${process_script}" -n "${experiment_name}" \ 174 | -t "${script_dir}/${experiment_data_dir}/test.pickle" \ 175 | -g "${script_dir}/${experiment_data_dir}/gap.pickle" \ 176 | --module && 177 | conda deactivate && 178 | rm -rf tmp && 179 | popd > /dev/null 180 | } 181 | 182 | score_tta_files_using_api(){ 183 | # if control reaches here, then conda environment is available 184 | [[ -e "${process_script}" ]] || error_exit "Python script ${process_script} data not found" 185 | 186 | # Hack to get http_server working for TTA 187 | # More info - Read Warning in 188 | # https://github.com/h2oai/driverlessai-tutorials/tree/master/driverlessai_experiments/timeseries/ts-full-pipeline#step-05-score-tta-files 189 | # We check if the line is already added in the file, if so we dont add it again. 
190 | # If not added already, then we find out the line # in the file where we add this line and then add it 191 | # The idea will work for all use cases, but the code is specific to this data/experiment 192 | # for your experiment, make corresponding changes 193 | grep -q "pd.Series(\[r\['Sale'] if" "${script_dir}/${exp_run_dir}/scoring-pipeline/http_server.py" || { 194 | line_no=$(grep -n "pd.Series(\[r\['" "${script_dir}/${exp_run_dir}/scoring-pipeline/http_server.py" | tail -n 1 | cut -d ":" -f 1) 195 | inject_lino=$(expr ${line_no} + 1) 196 | sed -i "${inject_lino}i\ pd.Series([r['Sale'] if r['Sale'] != None else None for r in rows], name='Sale', dtype='float')" "${script_dir}/${exp_run_dir}/scoring-pipeline/http_server.py" 197 | } 198 | 199 | pushd "${scoring_data_dir}" > /dev/null && 200 | source activate "${conda_env_name}" && 201 | (python "${script_dir}/${exp_run_dir}/scoring-pipeline/http_server.py" --port=9090 > /dev/null 2>&1 &) && 202 | sleep 20 && 203 | python "${script_dir}/${process_script}" -n "${experiment_name}" \ 204 | -t "${script_dir}/${experiment_data_dir}/test.pickle" \ 205 | -g "${script_dir}/${experiment_data_dir}/gap.pickle" \ 206 | --api-json && 207 | pkill -f http_server.py && 208 | conda deactivate && 209 | rm -rf tmp && 210 | popd > /dev/null 211 | } 212 | 213 | score_tta_files_using_api2(){ 214 | # if control reaches here, then conda environment is available 215 | [[ -e "${process_script}" ]] || error_exit "Python script ${process_script} data not found" 216 | pushd "${scoring_data_dir}" > /dev/null && 217 | source activate "${conda_env_name}" && 218 | (python "${script_dir}/11_http_server2.py" -n ${experiment_name} -p 9090 > /dev/null 2>&1 &) && 219 | sleep 20 && 220 | python "${script_dir}/${process_script}" -n "${experiment_name}" \ 221 | -t "${script_dir}/${experiment_data_dir}/test.pickle" \ 222 | -g "${script_dir}/${experiment_data_dir}/gap.pickle" \ 223 | --api-df && 224 | pkill -f 11_http_server2.py && 225 | conda deactivate && 226 | rm -rf tmp && 227 | popd > /dev/null 228 | } 229 | 230 | main(){ 231 | parse_args_validate_then_exec $@ 232 | } 233 | 234 | main $@ 235 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/05_score_tta_files.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import click 3 | import glob 4 | import importlib 5 | import json 6 | import os 7 | import re 8 | import requests 9 | 10 | import datetime as dt 11 | import pandas as pd 12 | 13 | from tqdm import tqdm 14 | from io import BytesIO 15 | 16 | 17 | @click.command() 18 | @click.option('-n', '--name', 'experiment_name', 19 | required=True, 20 | help='Experiment name.') 21 | @click.option('-t', '--test', 'test_ds_file', type=click.Path(exists=True, 22 | file_okay=True, 23 | dir_okay=False, 24 | readable=True), 25 | required=True, 26 | help='Testing dataset CSV file path.') 27 | @click.option('-g', '--gap', 'gap_ds_file', type=click.Path(exists=False, 28 | file_okay=True, 29 | dir_okay=False, 30 | readable=True), 31 | required=False, 32 | help='Gap dataset CSV file path.') 33 | @click.option('--module', 'method', flag_value='module', default=True) 34 | @click.option('--api-json', 'method', flag_value='api-json', default=True) 35 | @click.option('--api-df', 'method', flag_value='api-df', default=True) 36 | def process(experiment_name, 37 | test_ds_file, 38 | gap_ds_file, 39 | method): 40 | """ 41 | Score the TTA files in the 'score' 
directory, and create corresponding prediction files in the 42 | 'predict/ directory. Also calculate the metric (RMSE) to measure how good is the 43 | prediction for that file. 44 | 45 | :param experiment_name: Name of the experiment run 46 | :param test_ds_file: Path of the test dataset file used for RMSE calculation 47 | :param gap_ds_file: Path of the gap dataset file used for RMSE calculation 48 | :param method: Score using imported module or use HTTP API using JSON (api-json) or DataFrame (api-df) 49 | :return: None 50 | """ 51 | # Note the shell wrapper ensures this python file is executed in the TTA scoring data directory. 52 | 53 | # print(experiment_name) 54 | # print(test_ds) 55 | 56 | 57 | # Load the test datasset 58 | # Read csv to data frame. 59 | # test_ds = pd.read_csv(test_ds_file, 60 | # sep=',', 61 | # names=['Timeslot', 'StoreID', 'Product', 'Sale'], 62 | # parse_dates=['Timeslot'], 63 | # infer_datetime_format=True) 64 | test_ds = pd.read_pickle(test_ds_file) 65 | if gap_ds_file is not None and os.path.exists(gap_ds_file): 66 | gap_ds = pd.read_pickle(gap_ds_file) 67 | test_ds = pd.concat([gap_ds, test_ds]) 68 | 69 | # Create the output directory if it does not exists 70 | os.makedirs(f'predicted/{experiment_name}', exist_ok=True) 71 | 72 | # Compile the regex 73 | regex = re.compile(r'([0-9]{5})-ss([0-9 -:]{19})-se([0-9 -:]{19})') 74 | 75 | # Glob all files to score, from the 'score' directory and then process each of them 76 | for file in tqdm(glob.glob('score/*.csv')): 77 | # Extract scoring duration from the file name. Calculate how many data points it makes 78 | # Per hour is 8 data points 79 | file_name = os.path.splitext(os.path.basename(file))[0] 80 | capture_groups = regex.match(file_name) 81 | file_order = capture_groups.group(1) 82 | score_start_time = dt.datetime.strptime(capture_groups.group(2), r'%Y-%m-%d %H:%M:%S') 83 | score_end_time = dt.datetime.strptime(capture_groups.group(3), r'%Y-%m-%d %H:%M:%S') 84 | last_n_values = (((score_end_time - score_start_time).seconds // 3600) + 1) * 8 85 | 86 | # Load dataset to score and score it 87 | score_ds = pd.read_csv(file) 88 | if method == 'module': 89 | preds_ds = score_using_module(experiment_name, score_ds) 90 | elif method == 'api-json': 91 | preds_ds = score_using_http_api(score_ds) 92 | elif method == 'api-df': 93 | preds_ds = score_using_http_api2(score_ds) 94 | 95 | # Rename the predicted Sale column as Sale_hat and concat it to the original dataset 96 | preds_ds.columns = ['Sale_hat'] 97 | preds_ds = pd.concat([score_ds, preds_ds], axis=1) 98 | 99 | # Get actual and predicted value arrays. 100 | # Actuals are obtained from test data using score start and end time to slice 101 | # Predicted data frame even predicts and returns TTA data. 
So use last_n_values to slice it 102 | actual_values = test_ds.loc[score_start_time:score_end_time, 'Sale'].values 103 | predicted_values = preds_ds['Sale_hat'].values[-last_n_values:] 104 | 105 | # Ensure the arrays match 106 | assert len(actual_values) == len(predicted_values) 107 | df = pd.DataFrame({'actual': actual_values, 'predicted': predicted_values}) 108 | # Note that we drop the rows in case there is an NaN in actuals to calculate RMSE 109 | df.dropna(inplace=True) 110 | rmse = ((df['predicted'] - df['actual']) ** 2).mean() ** 0.5 111 | 112 | if method == 'module': 113 | file_name = f'predicted/{experiment_name}/{file_order}-mod-m{rmse}' 114 | elif method == 'api-json': 115 | file_name = f'predicted/{experiment_name}/{file_order}-api-json-m{rmse}' 116 | elif method == 'api-df': 117 | file_name = f'predicted/{experiment_name}/{file_order}-api-df-m{rmse}' 118 | 119 | # Save the predictions 120 | save_datasets(preds_ds, 121 | file_name, 122 | as_pickle=False, 123 | as_csv=True) 124 | 125 | 126 | def score_using_module(experiment_name: str, 127 | df: pd.DataFrame): 128 | """ 129 | Score the input dataframe using python module 130 | 131 | :param experiment_name: Name of the experiment 132 | :param df: Input pandas dataframe to score 133 | :return: A pandas DataFrame with the predictions 134 | """ 135 | # Get DAI scorer 136 | scorer = get_dai_scorer(experiment_name) 137 | return scorer.score_batch(df) 138 | 139 | 140 | def score_using_http_api(df: pd.DataFrame): 141 | """ 142 | Score the input dataframe using the HTTP api endpoint. Assumes that the HTTP endpoint is 143 | started by the wrapper script and listening on localhost:9090 at the /rpc endpoint 144 | 145 | :param df: Input pandas dataframe to score 146 | :return: A pandas DataFrame with the predictions 147 | """ 148 | d = { 149 | "id": 1, 150 | "method": "score_batch", 151 | "params": {} 152 | } 153 | d['params']['rows'] = json.loads(df.to_json(orient='records')) 154 | 155 | # Send the post to HTTP endpoint 156 | headers = {'Content-Type': 'application/json'} 157 | r = requests.post(url="http://localhost:9090/rpc", 158 | json=d, 159 | headers=headers) 160 | results_list = r.json()['result'] 161 | preds_list = [val for sub_list in results_list for val in sub_list] 162 | return pd.DataFrame(preds_list, columns=['Sale']) 163 | 164 | 165 | def score_using_http_api2(df: pd.DataFrame): 166 | buf = BytesIO() 167 | df.to_pickle(buf, compression=None) 168 | buf.seek(0) 169 | d = dict(id=1, method='score_batch', payload=base64.b64encode(buf.getvalue()).decode()) 170 | buf.close() 171 | # Send the post to HTTP endpoint 172 | post_headers = {'Content-Type': 'application/json'} 173 | r = requests.post(url="http://localhost:9090/predict", 174 | data=json.dumps(d), 175 | headers=post_headers) 176 | if r: 177 | buf = BytesIO(base64.b64decode(r.json()['payload'])) 178 | buf.seek(0) 179 | return pd.read_pickle(buf, compression=None) 180 | 181 | 182 | def get_dai_scorer(experiment_name: str): 183 | """ 184 | Import the Driverless AI scoring module dynamically based on the experiment name passed, and return 185 | the corresponding scorer object 186 | 187 | :param experiment_name: Name of DAI experiment for which to return the scoring object 188 | :return: A Scoring object of type scoring_h2oai_experiment_.scorer.Scorer 189 | """ 190 | scoring_module_name = 'scoring_h2oai_experiment_{}'.format(experiment_name) 191 | scoring_module = importlib.import_module(scoring_module_name) 192 | scoring_class = getattr(scoring_module, 'Scorer') 193 | 194 | 
scorer = scoring_class() 195 | return scorer 196 | 197 | 198 | def save_datasets(df: pd.DataFrame, 199 | filename: str, 200 | as_pickle=True, 201 | as_csv=True): 202 | """ 203 | Saves the input dataframe as pickle and csv files, by default. 204 | 205 | :param df: The dataframe to save 206 | :param filename: File name to save as, output file will be filename.csv and filename.pickle 207 | :param as_pickle: Flag to save file as pickle, by default True 208 | :param as_csv: Flag to save file as csv, by default True 209 | :return: None 210 | """ 211 | if as_pickle: 212 | df.to_pickle(filename+'.pickle') 213 | if as_csv: 214 | df.to_csv(filename+'.csv', 215 | sep=",", header=True, index=False) 216 | 217 | 218 | if __name__ == '__main__': 219 | # process the dataframe 220 | process() 221 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/10_plot_score_metric.py: -------------------------------------------------------------------------------- 1 | import click 2 | import glob 3 | import os 4 | import re 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import seaborn as sns 9 | 10 | from pandas.plotting import register_matplotlib_converters 11 | 12 | @click.command() 13 | @click.option('-p', '--predictions-dir', 'preds_dir', type=click.Path(exists=True, 14 | file_okay=False, 15 | dir_okay=True, 16 | readable=True, 17 | writable=True), 18 | required=True, 19 | help='Predictions data directory.') 20 | def process(preds_dir): 21 | """ 22 | Reads the scored files in predictions directory, extracts the metric from the filename and plots 23 | a graph to compare the metric divergence between API and Python module based scoring. 24 | 25 | :param preds_dir: Predictions data directory. 26 | :return: None 27 | """ 28 | # First glob all module files 29 | mod_list = [] 30 | mod_regex = re.compile(r'([0-9]{5})-mod-m([0-9.]+).csv') 31 | for m in glob.glob(f'{preds_dir}/*-mod-*.csv'): 32 | file_name = os.path.basename(m) 33 | capture_group = mod_regex.match(file_name) 34 | mod_list.append({ 35 | 'order_id': capture_group.group(1), 36 | 'Module': capture_group.group(2) 37 | }) 38 | 39 | # Next glob all api files 40 | api_json_list = [] 41 | api_regex = re.compile(r'([0-9]{5})-api-json-m([0-9.]+).csv') 42 | for a in glob.glob(f'{preds_dir}/*-api-json-*.csv'): 43 | file_name = os.path.basename(a) 44 | capture_group = api_regex.match(file_name) 45 | api_json_list.append({ 46 | 'order_id': capture_group.group(1), 47 | 'API-JSON': capture_group.group(2) 48 | }) 49 | 50 | # Next glob all api files 51 | api_df_list = [] 52 | api_regex = re.compile(r'([0-9]{5})-api-df-m([0-9.]+).csv') 53 | for a in glob.glob(f'{preds_dir}/*-api-df-*.csv'): 54 | file_name = os.path.basename(a) 55 | capture_group = api_regex.match(file_name) 56 | api_df_list.append({ 57 | 'order_id': capture_group.group(1), 58 | 'API-DF': capture_group.group(2) 59 | }) 60 | 61 | assert len(mod_list) == len(api_json_list) == len(api_df_list), \ 62 | 'Unequal files scored by Module, JSON API and DataFrame API.' 
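    # Align the three per-method metric tables on the zero-padded order id taken from the file names,
    # then melt them to long format (one row per file/method pair) so seaborn can draw one RMSE line
    # per scoring method on a shared axis.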
63 | 64 | mod_df = pd.DataFrame(mod_list) 65 | api_json_df = pd.DataFrame(api_json_list) 66 | api_df_df = pd.DataFrame(api_df_list) 67 | 68 | # Merge all dataframes on a common column 69 | mod_df.set_index('order_id', inplace=True) 70 | api_json_df.set_index('order_id', inplace=True) 71 | api_df_df.set_index('order_id', inplace=True) 72 | df: pd.DataFrame = pd.concat([mod_df, api_json_df, api_df_df], axis=1, sort=False) 73 | df.reset_index(inplace=True) 74 | df.sort_values(by='index',inplace=True) 75 | df['index'] = df['index'].astype(np.int16) 76 | df['Module'] = df['Module'].astype(np.float64) 77 | df['API-JSON'] = df['API-JSON'].astype(np.float64) 78 | df['API-DF'] = df['API-DF'].astype(np.float64) 79 | df = pd.melt(df, 80 | id_vars=['index'], 81 | var_name='Method', 82 | value_name='RMSE') 83 | 84 | # Create TS plots for each store id in a separate file 85 | register_matplotlib_converters() 86 | sns.set_context('notebook') 87 | 88 | sns.relplot(x='index', 89 | y='RMSE', 90 | hue='Method', 91 | kind='line', 92 | height=7, 93 | aspect=2, 94 | data=df).fig.savefig(f'{preds_dir}/metrics_plot.svg') 95 | 96 | 97 | if __name__ == '__main__': 98 | # process the dataframe 99 | process() 100 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/11_http_server2.py: -------------------------------------------------------------------------------- 1 | from starlette.applications import Starlette 2 | from starlette.responses import JSONResponse 3 | 4 | import base64 5 | import click 6 | import importlib 7 | import json 8 | import pandas 9 | import uvicorn 10 | 11 | from io import BytesIO 12 | 13 | 14 | # Create a global scorer and assign to None for now 15 | scorer = None 16 | 17 | app = Starlette(debug=True) 18 | 19 | 20 | @app.route("/predict", methods=['POST']) 21 | async def predict(request): 22 | request_content_json = json.loads(await request.body()) 23 | buf = BytesIO(base64.b64decode(request_content_json['payload'])) 24 | buf.seek(0) 25 | score_ds = pandas.read_pickle(buf, compression=None) 26 | buf.close() 27 | if scorer is not None and type(score_ds).__name__ == 'DataFrame': 28 | pred_ds = scorer.score_batch(score_ds) 29 | buf = BytesIO() 30 | pred_ds.to_pickle(buf, compression=None) 31 | buf.seek(0) 32 | return JSONResponse(content={'payload': base64.b64encode(buf.getvalue()).decode()}, 33 | status_code=200) 34 | else: 35 | return JSONResponse(content={'payload': 'Error scorer could not load or request payload not pandas DataFrame'}, 36 | status_code=500) 37 | 38 | 39 | @click.command() 40 | @click.option('-n', '--name', 'experiment_name', 41 | required=True, 42 | type=click.types.STRING, 43 | help='Experiment Name') 44 | @click.option('-p', '--port', 'port', 45 | required=False, 46 | type=click.types.INT, 47 | default=9090) 48 | def process(experiment_name, 49 | port): 50 | """ 51 | Executes a HTTP prediction server for the Driverless AI python pipeline. 52 | Will create a '/predict' endpoint that will respond to only HTTP posts. Expected input for the endpoint 53 | is a pandas DataFrame for batch scoring using the 'score_batch' operation of the DAI python scoring pipeline. 54 | The pandas DataFrame should be pickled and then Base64 encoded and then sent in the Request body. 
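    The client in 05_score_tta_files.py, for example, posts JSON shaped like
    {"id": 1, "method": "score_batch", "payload": "<base64 of a pickled DataFrame>"}; only the
    'payload' field is read by this server, and the predictions are returned the same way under a
    'payload' key.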
55 | 56 | :param experiment_name: Name of the Driverless AI experiment for which the scoring pipeline is used 57 | :param port: Port number to listen to for input data to predict 58 | :return: 59 | """ 60 | # Make function aware about the global variable scorer, and then set it 61 | global scorer 62 | scorer = experiment_name 63 | scoring_module_name = 'scoring_h2oai_experiment_{}'.format(experiment_name) 64 | scoring_module = importlib.import_module(scoring_module_name) 65 | scoring_class = getattr(scoring_module, 'Scorer') 66 | scorer = scoring_class() 67 | 68 | # Refer to the list of supported kwargs 69 | # https://github.com/encode/uvicorn/blob/e95e995781c7d1d8661b4f94631e3adb77c85237/uvicorn/main.py#L196 70 | uvicorn.run(app, 71 | host='0.0.0.0', 72 | port=port) 73 | 74 | 75 | if __name__ == "__main__": 76 | process() 77 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/environment.yml: -------------------------------------------------------------------------------- 1 | name: ts-pipeline-env 2 | channels: 3 | - h2oai 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - python=3.6.7 8 | - numpy 9 | - pandas 10 | - click 11 | - matplotlib 12 | - seaborn 13 | - tqdm 14 | - h2oai_client=1.6.3 15 | -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/images/TTA - Rolling Window.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/timeseries/ts-full-pipeline/images/TTA - Rolling Window.odp -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/images/TTA-RollWindow-duration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/timeseries/ts-full-pipeline/images/TTA-RollWindow-duration.png -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/images/metrics_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/timeseries/ts-full-pipeline/images/metrics_plot.png -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/ts-full-pipeline/ts-definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "generators": [ 3 | { 4 | "name": "store-s1-baseline", 5 | "type": "constant", 6 | "value": 8 7 | }, 8 | { 9 | "name": "store-s2-baseline", 10 | "type": "constant", 11 | "value": 4 12 | }, 13 | { 14 | "name": "burger-monthly-baseline", 15 | "type": "monthly", 16 | "points": { 17 | "january": 3.3, 18 | "february": 3.7, 19 | "march": 6.8, 20 | "april": 9.8, 21 | "may": 13.6, 22 | "june": 16.2, 23 | "july": 18.4, 24 | "august": 18, 25 | "september": 14.9, 26 | "october": 11.1, 27 | "november": 4.3, 28 | "december": 5.9 29 | } 30 | }, 31 | { 32 | "name": "taco-monthly-baseline", 33 | "type": "monthly", 34 | "points": { 35 | "january": 5.3, 36 | "february": 5.7, 37 | "march": 8.8, 38 | 
"april": 11.8, 39 | "may": 15.6, 40 | "june": 18.2, 41 | "july": 22.4, 42 | "august": 20, 43 | "september": 16.9, 44 | "october": 13.1, 45 | "november": 7.3, 46 | "december": 6.9 47 | } 48 | }, 49 | { 50 | "name": "soda-monthly-baseline", 51 | "type": "monthly", 52 | "points": { 53 | "january": 8.3, 54 | "february": 8.7, 55 | "march": 9.8, 56 | "april": 11.8, 57 | "may": 19.6, 58 | "june": 26.2, 59 | "july": 38.4, 60 | "august": 28, 61 | "september": 18.9, 62 | "october": 13.1, 63 | "november": 8.8, 64 | "december": 12.9 65 | } 66 | }, 67 | { 68 | "name": "coffee-monthly-baseline", 69 | "type": "monthly", 70 | "points": { 71 | "january": 18.3, 72 | "february": 17.7, 73 | "march": 13.8, 74 | "april": 9.8, 75 | "may": 8.6, 76 | "june": 7.5, 77 | "july": 7.4, 78 | "august": 8.5, 79 | "september": 12.9, 80 | "october": 13.1, 81 | "november": 18.8, 82 | "december": 23.9 83 | } 84 | }, 85 | { 86 | "name": "weekly-variation", 87 | "type": "weekly", 88 | "points": { 89 | "monday": -5.5, 90 | "tuesday": -5.25, 91 | "wednesday": -8.5, 92 | "friday": 5.35, 93 | "saturday": 9.5, 94 | "sunday": 7.23 95 | } 96 | }, 97 | { 98 | "name": "daily-variation", 99 | "type": "daily", 100 | "points": { 101 | "00:00:00.000": -5, 102 | "02:00:00.000": -5.9, 103 | "04:00:00.000": -7, 104 | "06:00:00.000": -2.6, 105 | "08:00:00.000": 6.7, 106 | "10:00:00.000": 2.2, 107 | "12:00:00.000": 9, 108 | "14:00:00.000": 3, 109 | "16:00:00.000": 1.3, 110 | "18:00:00.000": 6.9, 111 | "20:00:00.000": 5.3, 112 | "22:00:00.000": -2.7 113 | } 114 | }, 115 | { 116 | "name": "result", 117 | "type": "aggregate", 118 | "aggregator": "sum", 119 | "generators": [ 120 | "weekly-variation", 121 | "daily-variation" 122 | ] 123 | }, 124 | { 125 | "name": "s1-burger", 126 | "type": "aggregate", 127 | "aggregator": "max", 128 | "generators": [ 129 | { 130 | "type": "constant", 131 | "value": 0 132 | }, 133 | { 134 | "type": "aggregate", 135 | "aggregator": "sum", 136 | "generators": [ 137 | "store-s1-baseline", 138 | "burger-monthly-baseline", 139 | "result" 140 | ] 141 | } 142 | ] 143 | }, 144 | { 145 | "name": "s2-burger", 146 | "type": "aggregate", 147 | "aggregator": "max", 148 | "generators": [ 149 | { 150 | "type": "constant", 151 | "value": 0 152 | }, 153 | { 154 | "type": "aggregate", 155 | "aggregator": "sum", 156 | "generators": [ 157 | "store-s2-baseline", 158 | "burger-monthly-baseline", 159 | "result" 160 | ] 161 | } 162 | ] 163 | }, 164 | { 165 | "name": "s1-taco", 166 | "type": "aggregate", 167 | "aggregator": "max", 168 | "generators": [ 169 | { 170 | "type": "constant", 171 | "value": 0 172 | }, 173 | { 174 | "type": "aggregate", 175 | "aggregator": "sum", 176 | "generators": [ 177 | "store-s1-baseline", 178 | "taco-monthly-baseline", 179 | "result" 180 | ] 181 | } 182 | ] 183 | }, 184 | { 185 | "name": "s2-taco", 186 | "type": "aggregate", 187 | "aggregator": "max", 188 | "generators": [ 189 | { 190 | "type": "constant", 191 | "value": 0 192 | }, 193 | { 194 | "type": "aggregate", 195 | "aggregator": "sum", 196 | "generators": [ 197 | "store-s2-baseline", 198 | "taco-monthly-baseline", 199 | "result" 200 | ] 201 | } 202 | ] 203 | }, 204 | { 205 | "name": "s1-soda", 206 | "type": "aggregate", 207 | "aggregator": "max", 208 | "generators": [ 209 | { 210 | "type": "constant", 211 | "value": 0 212 | }, 213 | { 214 | "type": "aggregate", 215 | "aggregator": "sum", 216 | "generators": [ 217 | "store-s1-baseline", 218 | "soda-monthly-baseline", 219 | "result" 220 | ] 221 | } 222 | ] 223 | }, 224 | { 225 | "name": "s2-soda", 226 | 
"type": "aggregate", 227 | "aggregator": "max", 228 | "generators": [ 229 | { 230 | "type": "constant", 231 | "value": 0 232 | }, 233 | { 234 | "type": "aggregate", 235 | "aggregator": "sum", 236 | "generators": [ 237 | "store-s2-baseline", 238 | "soda-monthly-baseline", 239 | "result" 240 | ] 241 | } 242 | ] 243 | }, 244 | { 245 | "name": "s1-coffee", 246 | "type": "aggregate", 247 | "aggregator": "max", 248 | "generators": [ 249 | { 250 | "type": "constant", 251 | "value": 0 252 | }, 253 | { 254 | "type": "aggregate", 255 | "aggregator": "sum", 256 | "generators": [ 257 | "store-s1-baseline", 258 | "coffee-monthly-baseline", 259 | "result" 260 | ] 261 | } 262 | ] 263 | }, 264 | { 265 | "name": "s2-coffee", 266 | "type": "aggregate", 267 | "aggregator": "max", 268 | "generators": [ 269 | { 270 | "type": "constant", 271 | "value": 0 272 | }, 273 | { 274 | "type": "aggregate", 275 | "aggregator": "sum", 276 | "generators": [ 277 | "store-s2-baseline", 278 | "coffee-monthly-baseline", 279 | "result" 280 | ] 281 | } 282 | ] 283 | } 284 | ], 285 | "exported": [ 286 | { 287 | "name": "S1;BURGER", 288 | "generator": "s1-burger", 289 | "frequency": 3600000 290 | }, 291 | { 292 | "name": "S2;BURGER", 293 | "generator": "s2-burger", 294 | "frequency": 3600000 295 | }, 296 | { 297 | "name": "S1;TACO", 298 | "generator": "s1-taco", 299 | "frequency": 3600000 300 | }, 301 | { 302 | "name": "S2;TACO", 303 | "generator": "s2-taco", 304 | "frequency": 3600000 305 | }, 306 | { 307 | "name": "S1;SODA", 308 | "generator": "s1-soda", 309 | "frequency": 3600000 310 | }, 311 | { 312 | "name": "S2;SODA", 313 | "generator": "s2-soda", 314 | "frequency": 3600000 315 | }, 316 | { 317 | "name": "S1;COFFEE", 318 | "generator": "s1-coffee", 319 | "frequency": 3600000 320 | }, 321 | { 322 | "name": "S2;COFFEE", 323 | "generator": "s2-soda", 324 | "frequency": 3600000 325 | } 326 | ], 327 | "from": "2016-01-01 00:00:00.000", 328 | "to": "2017-12-31 23:59:59.999" 329 | } -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/walmart_timeseries_experiment/images/import_data_sets_stock.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/timeseries/walmart_timeseries_experiment/images/import_data_sets_stock.png -------------------------------------------------------------------------------- /driverlessai_experiments/timeseries/walmart_timeseries_experiment/images/launching_experiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/driverlessai_experiments/timeseries/walmart_timeseries_experiment/images/launching_experiment.png -------------------------------------------------------------------------------- /interpretable_ml/README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Interpretability Code Samples/Tutorials for Driverless AI 2 | -------------------------------------------------------------------------------- /interpretable_ml/data/default_of_credit_card_clients.xls: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/h2oai/driverlessai-tutorials/7a3f628bf5cddc8bcfb74d66b2207dcec6a8b364/interpretable_ml/data/default_of_credit_card_clients.xls -------------------------------------------------------------------------------- /scoring-pipeline-deployment/R/Shiny_Example/1_Data_Recoding.R: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | ################ DATA PROCESSING ################ 3 | ################################################################################################ 4 | 5 | setwd("/Users/felix/Code/h2oai/driverlessai-tutorials/scoring-pipeline-deployment/R/Shiny_Example/") 6 | 7 | dataset <- read.csv("CreditCard.csv", header = TRUE, stringsAsFactors = TRUE) 8 | str(dataset) 9 | 10 | #### remove ID column #### 11 | dataset = dataset[-c(1)] 12 | names(dataset) 13 | 14 | 15 | 16 | #### Recoding GENDER 17 | table(dataset$SEX) 18 | dataset$SEX <- ifelse(dataset$SEX == 1, "Male", "Female") 19 | table(dataset$SEX) 20 | 21 | #### Recoding EDUCATION 22 | table(dataset$EDUCATION) 23 | dataset$EDUCATION[dataset$EDUCATION > 3] <- "Others" 24 | dataset$EDUCATION[dataset$EDUCATION == 0] <- "No Schooling" 25 | dataset$EDUCATION[dataset$EDUCATION == 1] <- "Graduate School" 26 | dataset$EDUCATION[dataset$EDUCATION == 2] <- "University" 27 | dataset$EDUCATION[dataset$EDUCATION == 3] <- "High School" 28 | table(dataset$EDUCATION) 29 | 30 | #### Recoding MARITAL STATUS 31 | table(dataset$MARRIAGE) 32 | dataset$MARRIAGE[dataset$MARRIAGE == 0 | dataset$MARRIAGE == 3] <- "Others" 33 | dataset$MARRIAGE[dataset$MARRIAGE == 1] <- "Married" 34 | dataset$MARRIAGE[dataset$MARRIAGE == 2] <- "Single" 35 | table(dataset$MARRIAGE) 36 | 37 | #### Target 38 | table(dataset$default.payment.next.month) 39 | dataset$default.payment.next.month = ifelse(dataset$default.payment.next.month==0, "0_Non-Default", "1_Default") 40 | 41 | write.csv(dataset, "CreditCardRe.csv", row.names = FALSE) 42 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/R/Shiny_Example/2_DAI_Interaction.R: -------------------------------------------------------------------------------- 1 | 2 | # https://support.rstudio.com/hc/en-us/articles/200486138-Changing-R-versions-for-RStudio-desktop 3 | 4 | ## http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/r_install_client.html#prerequisites 5 | 6 | #install.packages('curl') #given incorrectly as rcurl 7 | #install.packages('jsonlite') 8 | #install.packages('rlang') 9 | #install.packages('methods') 10 | 11 | ############################################################################################# 12 | ########## INSTALL DRIVERLESSAI R CLIENT ########### 13 | ############################################################################################# 14 | getwd() 15 | setwd("/Users/felix/Code/h2oai/driverlessai-tutorials/scoring-pipeline-deployment/R/Shiny_Example/") 16 | #install.packages('dai_1.9.1.tar.gz', type = 'source', repos = NULL) 17 | library(dai) 18 | 19 | ############################################################################################# 20 | ########## DAI Connect ########### 21 | ############################################################################################# 22 | url = 'http://ec2-52-206-210-31.compute-1.amazonaws.com:12345' 23 | username = 'h2oai' 24 | password = 'i-0f244cddd419191cd' 25 | dai.connect(uri = url, username = username, 
password = password) 26 | 27 | ############################################################################################# 28 | ########## DAI Data Upload/Delete ########### 29 | ############################################################################################# 30 | dataset_daiFrame = as.DAIFrame(dataset) 31 | #or 32 | cc_dai <- dai.upload_dataset("CreditCardRe.csv", progress = TRUE) 33 | 34 | View(dai.list_datasets()) 35 | 36 | dai_frame <- dai.get_frame('ef2d496e-6461-11eb-b42c-0242ac110002') 37 | dai.rm(dai_frame) 38 | 39 | View(dai.list_datasets()) 40 | 41 | cc_df <- as.data.frame(cc_dai) 42 | str(cc_df) 43 | 44 | 45 | 46 | ############################################################################################# 47 | ########## DAI Dataset Visuals ########### 48 | ############################################################################################# 49 | 50 | library(vegawidget) 51 | 52 | ### Parallel Coordinates Plot 53 | dai.parallel_coordinates_plot(cc_dai) 54 | dai.parallel_coordinates_plot( 55 | cc_dai, 56 | variable_names = NULL, 57 | permute = FALSE, 58 | transpose = FALSE, 59 | cluster = TRUE, 60 | render = TRUE, 61 | progress = TRUE 62 | ) 63 | 64 | ### Distribution 65 | dai.dotplot(cc_dai, variable_name = 'PAY_0', mark = "point") 66 | #dai.histogram(cc_dai, variable_name = 'LIMIT_BAL', number_of_bars = 5) 67 | 68 | ## Linear Regression 69 | dai.loess_regression_plot(cc_dai, x_variable_name = 'BILL_AMT1', y_variable_name = 'BILL_AMT2' ) 70 | #dai.linear_regression_plot(cc_dai, x_variable_name = 'PAY_AMT1', y_variable_name = 'PAY_AMT2' ) 71 | 72 | ############################################################################################# 73 | ########## DAI Split Dataset ########### 74 | ############################################################################################# 75 | 76 | dai.split_dataset( 77 | dataset = cc_dai, 78 | output_name1 = 'CreditCardRe_Train', 79 | output_name2 = 'CreditCardRe_Test', 80 | ratio = 0.8, 81 | seed = 1234, 82 | target = 'default.payment.next.month', 83 | fold_col = NULL, 84 | time_col = NULL, 85 | progress = TRUE 86 | ) 87 | View(dai.list_datasets()) 88 | 89 | train_dai_frame = dai.get_frame(key = '1cd2a352-63cf-11eb-831f-0242ac110002') 90 | test_dai_frame = dai.get_frame(key = '1cd2cf12-63cf-11eb-831f-0242ac110002') 91 | 92 | View(train_dai_frame) 93 | View(train_dai_frame$columns) 94 | 95 | ############################################################################################# 96 | ########## DAI New Experiment ########### 97 | ############################################################################################# 98 | 99 | View(dai.list_models()) 100 | 101 | default_model = dai.train(training_frame = train_dai_frame, 102 | target_col = 'default.payment.next.month', 103 | is_classification = TRUE, 104 | experiment_name = 'Default', 105 | testing_frame = test_dai_frame) 106 | 107 | simple_model = dai.train(training_frame = train_dai_frame, 108 | target_col = 'default.payment.next.month', 109 | is_classification = TRUE, 110 | testing_frame = test_dai_frame, 111 | scorer = 'F1', 112 | accuracy = 1, 113 | time = 1, 114 | interpretability = 10, 115 | experiment_name = 'Basic') 116 | 117 | 118 | glm_model= dai.train(training_frame = train_dai_frame, 119 | target_col = 'default.payment.next.month', 120 | is_classification = TRUE, 121 | testing_frame = test_dai_frame, 122 | scorer = 'AUC', 123 | accuracy = 1, 124 | time = 1, 125 | interpretability = 10, 126 | experiment_name = 'Config_Override', 127 | 
config_overrides = c('make_autoreport = true', 128 | 'autodoc_population_stability_index = true', 129 | 'enable_glm="on"', 130 | 'enable_decision_tree="off"', 131 | 'enable_xgboost_gbm = "off"', 132 | 'enable_lightgbm = "off"', 133 | 'make_python_scoring_pipeline = "off"', 134 | 'make_mojo_scoring_pipeline = "off"' 135 | )) 136 | View(dai.list_models()) 137 | 138 | # suggested_params = dai.suggest_model_params( 139 | # training_frame = train_dai_frame, 140 | # target_col = 'default.payment.next.month', 141 | # is_classification = TRUE, 142 | # is_timeseries = FALSE, 143 | # is_image = FALSE, 144 | # config_overrides = "", 145 | # cols_to_drop = NULL 146 | # ) 147 | # 148 | # View(suggested_params) 149 | # suggested_params_model = do.call(dai.train, suggested_params) 150 | 151 | View(dai.list_models()) 152 | fetched_model = dai.get_model(key = 'c1224714-63d4-11eb-831f-0242ac110002') 153 | dai.set_model_desc(fetched_model, 'prod_model') 154 | #dai.rm(fetched_model) 155 | 156 | 157 | ############################################################################################# 158 | ########## DAI Reuse/Refit a Model ########### 159 | ############################################################################################# 160 | 161 | View(dai.list_models()) 162 | 163 | summary(fetched_model) 164 | 165 | another_expert_model= dai.train(training_frame = train_dai_frame, 166 | target_col = 'default.payment.next.month', 167 | is_classification = TRUE, 168 | testing_frame = test_dai_frame, 169 | scorer = 'AUCPR', 170 | experiment_name = 'NewExpSameParams', 171 | resumed_model = fetched_model, 172 | resume_method = 'same') 173 | ### ^^ When trying new experiments with same parameters, config_override changes are NOT used 174 | 175 | refit_expert_model= dai.train(training_frame = cc_dai, 176 | target_col = 'default.payment.next.month', 177 | is_classification = TRUE, 178 | testing_frame = test_dai_frame, 179 | scorer = 'MCC', 180 | accuracy = 1, 181 | time = 0, 182 | interpretability = 10, 183 | experiment_name = 'RefitFinalModel', 184 | resumed_model = fetched_model, 185 | resume_method = 'refit') 186 | 187 | ### ^^ When refitting final model, time setting is forced to 0 188 | 189 | 190 | ############################################################################################# 191 | ########## Retrieving / Downloading Artefacts ########### 192 | ############################################################################################# 193 | 194 | View(dai.list_models()) 195 | 196 | final_model = dai.get_model(key = '0e856d32-63db-11eb-831f-0242ac110002') 197 | 198 | ##### Predictions ##### 199 | dai.autoreport(final_model, path = "../", force = TRUE, progress = TRUE) 200 | 201 | ##### Predictions ##### 202 | dai.download_file(final_model$train_predictions_path, dest_path = "../", force = TRUE, progress = TRUE) 203 | dai.download_file(final_model$test_predictions_path, dest_path = "../", force = TRUE, progress = TRUE) 204 | 205 | ##### Summary and Log Files ##### 206 | dai.download_file(final_model$summary_path, dest_path = ".", force = TRUE, progress = TRUE) 207 | dai.download_file(final_model$log_file_path, dest_path = ".", force = TRUE, progress = TRUE) 208 | 209 | ##### Download MOJO ##### 210 | dai.download_mojo(final_model, path = getwd(), force = TRUE, progress = TRUE) 211 | 212 | 213 | ############################################################################################# 214 | ########## MLI Interpretation - CAUTION - Low Level Code / BUG ########### 215 | 
############################################################################################# 216 | 217 | # library(jsonlite) 218 | # 219 | # dai.interpret_model <- function(model, dataset, target_col, progress = TRUE) { 220 | # print(model$key) 221 | # print(dataset$key) 222 | # key <- dai:::.dai_do_rpc("api_run_interpretation", list("interpret_params" = list( 223 | # dai_model = list(key = unbox(model$key), display_name = unbox(model$description)), 224 | # dataset = list(key = unbox(dataset$key), display_name = unbox(dataset$name)), 225 | # target_col = unbox(target_col), 226 | # use_raw_features = unbox(TRUE), 227 | # prediction_col = unbox(''), 228 | # weight_col = unbox(''), 229 | # drop_cols = list(), 230 | # klime_cluster_col = unbox(''), 231 | # nfolds = unbox(0), 232 | # sample = unbox(TRUE), 233 | # sample_num_rows = unbox(-1), 234 | # qbin_cols = list(), 235 | # qbin_count = unbox(0), 236 | # lime_method = unbox("k-LIME"), 237 | # dt_tree_depth = unbox(3), 238 | # vars_to_pdp = unbox(10), 239 | # config_overrides = NULL, 240 | # dia_cols = list() 241 | # ))) 242 | # 243 | # print("key is set") 244 | # print(key) 245 | # 246 | # return(dai:::wait_for_job(function() dai:::get_interpretation_job(key), progress = progress)$entity) 247 | # } 248 | # 249 | # mli <- dai.interpret_model(final_model, train_dai_frame, 'default.payment.next.month') 250 | 251 | 252 | 253 | 254 | 255 | 256 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/R/Shiny_Example/3_DAI_Model_Prediction.R: -------------------------------------------------------------------------------- 1 | # https://support.rstudio.com/hc/en-us/articles/200486138-Changing-R-versions-for-RStudio-desktop 2 | 3 | ## http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/r_install_client.html#prerequisites 4 | 5 | #install.packages('curl') #given incorrectly as rcurl 6 | #install.packages('jsonlite') 7 | #install.packages('rlang') 8 | #install.packages('methods') 9 | 10 | ############################################################################################# 11 | ########## INSTALL DRIVERLESSAI R CLIENT ########### 12 | ############################################################################################# 13 | getwd() 14 | setwd("/Users/felix/Code/h2oai/driverlessai-tutorials/scoring-pipeline-deployment/R/Shiny_Example/") 15 | #install.packages('dai_1.9.1.tar.gz', type = 'source', repos = NULL) 16 | library(dai) 17 | 18 | 19 | 20 | ############################################################################################# 21 | ########## DAI Connect ########### 22 | ############################################################################################# 23 | url = 'http://ec2-54-204-68-13.compute-1.amazonaws.com:12345' 24 | username = 'h2oai' 25 | password = 'i-0f244cddd419191cd' 26 | dai.connect(uri = url, username = username, password = password) 27 | 28 | ############################################################################################# 29 | ########## DAI Model Prediction ########### 30 | ############################################################################################# 31 | 32 | View(dai.list_models()) 33 | final_model = dai.get_model(key = '0e856d32-63db-11eb-831f-0242ac110002') 34 | 35 | new_data = read.csv("CreditCardRe_Test.csv") 36 | new_data_dai = as.DAIFrame(new_data) 37 | preds = predict(final_model, newdata = new_data_dai) 38 | 39 | pred_shap_contribs = predict(final_model, newdata = new_data_dai, pred_contribs = 
TRUE) 40 | pred_orig_contribs = predict(final_model, newdata = new_data_dai, pred_contribs = TRUE, pred_contribs_original = TRUE) 41 | 42 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/R/Shiny_Example/4_MOJO_Predictions.R: -------------------------------------------------------------------------------- 1 | ############################################################################################# 2 | ########## MOJO Model Prediction ########### 3 | ############################################################################################# 4 | rm(list = ls()) # remove all objects including dai 5 | #install dependencies and daimojo package 6 | #install.packages('Rcpp') 7 | #install.packages("~/Downloads/daimojo_2.5.8_x86_64-darwin.tar.gz", type = 'source', repos=NULL) 8 | 9 | #install.packages('data.table') 10 | getwd() 11 | #setwd("/Users/felix/Code/h2oai/driverlessai-tutorials/scoring-pipeline-deployment/R/Shiny_Example/") 12 | 13 | library(daimojo) 14 | library(data.table) 15 | ### set DRIVERLESS_AI_LICENSE_KEY 16 | #Sys.setenv("DRIVERLESS_AI_LICENSE_KEY"="paste your key here") 17 | Sys.getenv("DRIVERLESS_AI_LICENSE_KEY") 18 | model = daimojo::load.mojo("mojo-pipeline/pipeline.mojo") 19 | daimojo::create.time(model) 20 | daimojo::feature.names(model) 21 | col_class <- setNames(daimojo::feature.types(model), daimojo::feature.names(model)) 22 | daimojo::feature.types(model) 23 | daimojo::missing.values(model) 24 | daimojo::uuid(model) 25 | 26 | new_data <- fread("./mojo-pipeline/example.csv", colClasses=col_class, header=TRUE, sep=",") 27 | str(new_data) 28 | 29 | daimojo::predict.mojo(m = model, newdata = new_data) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/R/Shiny_Example/Data_Preprocessing_for_app.R: -------------------------------------------------------------------------------- 1 | ################# 2 | 3 | 4 | library(dplyr) 5 | library(daimojo) 6 | options(scipen = 99999) 7 | 8 | #install.packages("daimojo_2.4.8_x86_64-darwin.tar.gz", repos = NULL, type = "source") # to be downloaded from DAI under "Download MOJO Piepline" 9 | # http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/scoring-pipeline-cpp.html#downloading-the-scoring-pipeline-runtimes 10 | 11 | #path set to driverlessai-tutorials 12 | setwd("/Users/felix/Code/h2oai/driverlessai-tutorials/scoring-pipeline-deployment/R/Shiny_Example/") 13 | dataset <- read.csv("CreditCard-train.csv", header = TRUE, stringsAsFactors = TRUE) 14 | target <- "default.payment.next.month" 15 | 16 | 17 | dataset$SEX <- ifelse(dataset$SEX == 1, "Female", "Male") 18 | table(dataset$SEX) 19 | 20 | dataset$EDUCATION[dataset$EDUCATION > 3] <- "Others" 21 | dataset$EDUCATION[dataset$EDUCATION == 0] <- "No Schooling" 22 | dataset$EDUCATION[dataset$EDUCATION == 1] <- "Graduate School" 23 | dataset$EDUCATION[dataset$EDUCATION == 2] <- "University" 24 | dataset$EDUCATION[dataset$EDUCATION == 3] <- "High School" 25 | 26 | 27 | table(dataset$EDUCATION) 28 | 29 | dataset$MARRIAGE[dataset$MARRIAGE == 0] <- "Others" 30 | dataset$MARRIAGE[dataset$MARRIAGE == 1] <- "Married" 31 | dataset$MARRIAGE[dataset$MARRIAGE == 2] <- "Single" 32 | dataset$MARRIAGE[dataset$MARRIAGE == 3] <- "Others" 33 | 34 | table(dataset$MARRIAGE) 35 | 36 | 37 | class_vec <- data.frame(sapply(dataset, class)) 38 | class_vec$columns <- rownames(class_vec) 39 | rownames(class_vec) <- NULL 40 | 41 | colnames(class_vec) <- 
c("class", "variables") 42 | class_vec$class <- as.character(class_vec$class) 43 | 44 | class_vec <- class_vec[class_vec$variables != target, ] 45 | 46 | class_vec <- class_vec[class_vec$variables != "ID", ] 47 | 48 | int_cols <- class_vec$variables[class_vec$class %in% c("integer", "numeric")] 49 | 50 | for(i in int_cols ){ 51 | if(length(unique(dataset[,i])) < 50) 52 | class_vec$class[class_vec$variables == i] <- "numeric_cat" 53 | 54 | } 55 | 56 | summary(dataset) 57 | str(dataset) 58 | 59 | dataset$LIMIT_BAL <- as.numeric(dataset$LIMIT_BAL) #present as integer 60 | dataset$PAY_0 <- as.numeric(dataset$PAY_0) #present as integer 61 | dataset$PAY_2 <- as.numeric(dataset$PAY_2) #present as integer 62 | dataset$PAY_3 <- as.numeric(dataset$PAY_3) #present as integer 63 | dataset$PAY_4 <- as.numeric(dataset$PAY_4) #present as integer 64 | dataset$PAY_5 <- as.numeric(dataset$PAY_5) #present as integer 65 | dataset$PAY_6 <- as.numeric(dataset$PAY_6) #present as integer 66 | dataset$PAY_AMT1 <- as.numeric(dataset$PAY_AMT1) #present as integer 67 | dataset$PAY_AMT2 <- as.numeric(dataset$PAY_AMT2) #present as integer 68 | dataset$PAY_AMT3 <- as.numeric(dataset$PAY_AMT3) #present as integer 69 | dataset$PAY_AMT4 <- as.numeric(dataset$PAY_AMT4) #present as integer 70 | dataset$PAY_AMT5 <- as.numeric(dataset$PAY_AMT5) #present as integer 71 | dataset$PAY_AMT6 <- as.numeric(dataset$PAY_AMT6) #present as integer 72 | dataset$BILL_AMT1 <- as.numeric(dataset$BILL_AMT1) #present as integer 73 | dataset$BILL_AMT2 <- as.numeric(dataset$BILL_AMT2) #present as integer 74 | dataset$BILL_AMT3 <- as.numeric(dataset$BILL_AMT3) #present as integer 75 | dataset$BILL_AMT4 <- as.numeric(dataset$BILL_AMT4) #present as integer 76 | dataset$BILL_AMT5 <- as.numeric(dataset$BILL_AMT5) #present as integer 77 | dataset$BILL_AMT6 <- as.numeric(dataset$BILL_AMT6) #present as integer 78 | 79 | #int_cols <- class_vec$variables[class_vec$class %in% c("integer", "numeric")] 80 | #numeric_cat_cols <- class_vec$variables[class_vec$class %in% c("numeric_cat")] 81 | #cat_cols <- class_vec$variables[class_vec$class %in% c("str", "factor", "character")] 82 | #bool_cols <- class_vec$variables[class_vec$class %in% c("logical")] 83 | 84 | summary(dataset) 85 | 86 | predictor_colnames <- colnames(dataset)[colnames(dataset) != target] 87 | predictor_colnames <- predictor_colnames[predictor_colnames != "ID"] 88 | 89 | predictions_df <- read.csv("train_preds_custom.csv", header = TRUE) 90 | colnames(predictions_df)[27] <- "prob_pred" 91 | 92 | 93 | dataset_w_pred <- inner_join(dataset, predictions_df[, c("ID", "prob_pred")], by = "ID") 94 | colnames(dataset_w_pred)[25] <- "Actual_Target" 95 | dataset_w_pred$Actual_Target <- as.factor(dataset_w_pred$Actual_Target) 96 | 97 | # TODO - REMOVE LICENSE KEY 98 | Sys.setenv(DRIVERLESS_AI_LICENSE_KEY = paste0("paste your DAI License key here")) 99 | m <- daimojo::load.mojo("mojo-pipeline/pipeline.mojo") 100 | 101 | create.time(m) 102 | uuid(m) 103 | predict.mojo(m, dataset) 104 | 105 | daimojo::predict(m, dataset) 106 | 107 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/README.md: -------------------------------------------------------------------------------- 1 | Scoring Pipeline Deployment Examples 2 | ==================================== 3 | 4 | Driverless AI scoring pipelines can be deployed independently of the machine 5 | where Driverless AI is running. 
This essentially helps you to separate the
6 | concerns of Model Training from Model Deployment. This capability gives you
7 | immense flexibility on how you can deploy your scoring pipelines to production.
8 |
9 | This directory contains example code that shows how to deploy DAI
10 | scoring pipelines (python and mojo) on new machines.
11 |
12 | Refer to the `python` and `java` directories for detailed examples on how to deploy
13 | the corresponding pipeline in various scenarios.
14 |
15 |
16 | Disclaimer
17 | ----------
18 |
19 | The scoring pipeline wrapper code shared in this directory is created to provide you
20 | a sample starting point and is not intended to be directly deployed to production as is.
21 | You can use this starting point and build over it to solve your deployment needs ensuring
22 | that your security etc. requirements are met.
23 |
24 |
25 |
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/java/README.md: --------------------------------------------------------------------------------
1 | Mojo (Java) Scoring Pipeline Deployment Examples
2 | ================================================
3 |
4 | To be created...
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/centos/docker/Dockerfile: --------------------------------------------------------------------------------
1 | FROM centos:centos7
2 |
3 | # These commands run as root
4 | # Install base dependencies
5 | RUN yum -y update && \
6 | yum install -y epel-release && \
7 | yum -y groupinstall 'Development Tools' && \
8 | yum install -y openblas-devel openblas python36-virtualenv python36-pip wget unzip java && \
9 | ln -s /usr/bin/virtualenv-3.6 /usr/bin/virtualenv && \
10 | ln -s /usr/bin/pip-3.6 /usr/bin/pip && \
11 | ln -sf /usr/bin/python3 /usr/bin/python
12 |
13 | ENV LANG en_US.UTF-8
14 | ENV LANGUAGE en_US:en
15 | ENV LC_ALL en_US.UTF-8
16 | ENV HOME /home/newuser
17 |
18 | # Create new user
19 | RUN useradd -ms /bin/bash newuser
20 |
21 | # Create a new user to run the pipeline
22 | USER newuser
23 | WORKDIR /home/newuser
24 |
25 | # Commands below run as newuser
26 | COPY --chown=newuser:newuser payload/scorer.zip ./
27 | COPY --chown=newuser:newuser payload/license.sig .driverlessai/
28 |
29 | RUN unzip scorer.zip
30 |
31 | WORKDIR scoring-pipeline
32 |
33 | RUN bash run_example.sh
34 |
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/centos/docker/README.md: --------------------------------------------------------------------------------
1 | Python Scoring Pipeline Wrapper using Docker
2 | ============================================
3 |
4 | This directory contains sample code that explains the steps needed to deploy a python scoring pipeline
5 | obtained from H2O Driverless AI in a CentOS 7 docker container. This directory acts as the build
6 | context for the docker build step.
7 |
8 |
9 | Prerequisites
10 | -------------
11 |
12 | The following pre-requisites are needed
13 | - [Docker](https://www.docker.com/)
14 |
15 | Follow the installation instructions for your platform and get Docker CE (or EE) installed on the machine.
16 |
17 |
18 | Code Structure
19 | --------------
20 |
21 | The code assumes a directory structure as below:
22 |
23 | ```
24 | top-dir: A directory with the below structure. Name can be anything.
This is the build context for the docker build command
25 | - README.md: This file with the details you are reading
26 | - Dockerfile: The docker image build script
27 | - payload: A directory that contains files to be used in the docker container for deployment
28 | - scorer.zip: The DAI python scoring pipeline. (You need to put this file here)
29 | - license.sig: Valid Driverless AI license file. (You need to provide your license file here)
30 | ```
31 |
32 | Instructions
33 | ------------
34 |
35 | 1. Install Docker. Ensure you can invoke it using `docker version`. It should display the client and server versions of Docker
36 | 2. Change to `top-dir`, which contains the files as mentioned in the above section
37 | 3. Copy the scoring pipeline `scorer.zip` in the `payload` directory. You may need to create the `payload` directory.
38 | 4. Copy Driverless AI license `license.sig` in the `payload` directory
39 | 5. Issue the command `docker build -t scoretest .`. This will
40 | - Create a CentOS 7 based docker container
41 | - Install required dependencies, Python, etc.
42 | - Create a virtual environment for the scoring pipeline by installing all needed dependencies
43 | - Run `example.py` from the scoring pipeline
44 |
45 | As part of the build process you will see the scores being produced for the test data in `example.py`. This example
46 | shows how to use the DAI python scoring pipeline as a python module. There are other options like HTTP service and TCP service that can be created too.
47 |
48 | You can run the docker container in interactive mode and experiment with the HTTP and TCP endpoints.
49 |
50 | Execute the command `docker run -it --rm scoretest:latest`. Once connected you will be in the `scoring-pipeline` directory.
51 |
52 | To run `example.py` you can follow the below steps once you are inside the container
53 |
54 | ```
55 | . env/bin/activate # activate environment for required experiment
56 | python example.py # to run example.py manually
57 | ```
58 |
59 | Similarly, you can run the HTTP and TCP server python files too.
60 |
61 |
62 | Disclaimer
63 | ----------
64 |
65 | The scoring pipeline wrapper code shared in this directory is created to provide you
66 | a sample starting point and is not intended to be directly deployed to production as is.
67 | You can use this starting point and build over it to solve your deployment needs ensuring
68 | that your security etc. requirements are met.
69 |
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/centos/vagrant/README.md: --------------------------------------------------------------------------------
1 | Python Scoring Pipeline Wrapper using Vagrant
2 | =============================================
3 |
4 | This directory contains sample code that explains the steps needed to deploy a python scoring pipeline
5 | obtained from H2O Driverless AI in a CentOS 7 virtual machine in Vagrant.
6 |
7 |
8 | Prerequisites
9 | -------------
10 |
11 | The following pre-requisites are needed
12 | - [VirtualBox](https://www.virtualbox.org/): A free virtualization provider
13 | - [Vagrant](https://www.vagrantup.com/): A tool for building and managing virtual machines
14 | - [Vagrant Disk Resize plugin](https://github.com/sprotheroe/vagrant-disksize): A Vagrant plugin to manage disk sizes
15 |
16 | Follow the installation instructions for your platform and get them installed in the above order.
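Before moving on to the code layout, it is worth confirming that the tooling is actually available on your PATH; catching a problem here avoids a failed `vagrant up` later. A minimal sanity check, assuming default installations of VirtualBox and Vagrant on the host, looks like this:

```
# Confirm the prerequisites are installed and visible on the PATH
VBoxManage --version        # VirtualBox command line tool, prints the installed version
vagrant --version           # Vagrant version
vagrant plugin list         # should include vagrant-disksize
# If the plugin is missing, install it:
vagrant plugin install vagrant-disksize
```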
17 |
18 |
19 | Code Structure
20 | --------------
21 |
22 | The code assumes a directory structure as below:
23 |
24 | ```
25 | top-dir: A directory with the below structure. Name of directory can be anything.
26 | - README.md: This file with the details you are reading
27 | - Vagrantfile: File providing the definition of the virtual machine to create using Vagrant
28 | - bootstrap.sh: The shell provisioner, installs core CentOS packages
29 | - payload.sh: Shell provisioner, installs Miniconda, creates scoring environment, runs pipeline
30 | - payload: A directory that contains files which can be used in the virtual machine for deployment
31 | - scorer.zip: The DAI python scoring pipeline. (You need to put this file here)
32 | - license.sig: Valid Driverless AI license file. (You need to provide your license file here)
33 | ```
34 |
35 | Instructions
36 | ------------
37 |
38 | 1. Install VirtualBox
39 | 2. Install Vagrant. Ensure you can invoke it using `vagrant --version`
40 | 3. Install Vagrant Disk Size plugin `vagrant plugin install vagrant-disksize`
41 | 4. Go to `top-dir`, which contains the files as mentioned in the above section
42 | 5. Copy the scoring pipeline `scorer.zip` in the `payload` directory. You may need to create the `payload` directory.
43 | 6. Copy Driverless AI license `license.sig` in the `payload` directory
44 | 7. Issue the command `vagrant up`. This will
45 | - Create a CentOS 7 based virtual machine
46 | - Bootstrap it, i.e. install all dependencies, Miniconda, Python, etc.
47 | - Create a conda environment for the scoring pipeline by installing all needed dependencies
48 | - Run `example.py` from the scoring pipeline
49 |
50 | You can SSH to the machine using the command `vagrant ssh` from the `top-dir` directory. Once connected it is like
51 | working on any CentOS terminal.
52 |
53 | To run `example.py` you can follow the below steps once you are connected using SSH
54 |
55 | ```
56 | conda env list # shows conda environments available on the system
57 | conda activate environment_name # activate environment for required experiment (experiment key is in name)
58 | python example.py # to run example.py manually
59 | ```
60 |
61 | Similarly, you can run the HTTP and TCP server python files too.
62 |
63 | Multiple Deployments on same Host
64 | ---------------------------------
65 |
66 | Each DAI experiment python deployment pipeline should be contained in its own virtual python environment.
67 | We support both `conda` and `pip + virtualenv` based virtual environments. This separation enables flexibility
68 | to have multiple experiment scoring pipelines to be deployed on the same machine without interfering with
69 | each other.
70 |
71 |
72 | Disclaimer
73 | ----------
74 |
75 | The scoring pipeline wrapper code shared in this directory is created to provide you
76 | a sample starting point and is not intended to be directly deployed to production as is.
77 | You can use this starting point and build over it to solve your deployment needs ensuring
78 | that your security etc. requirements are met.
79 |
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/centos/vagrant/Vagrantfile: --------------------------------------------------------------------------------
1 | # -*- mode: ruby -*-
2 | # vi: set ft=ruby :
3 |
4 | Vagrant.configure("2") do |config|
5 | # More boxes at https://vagrantcloud.com/search.
6 | config.vm.box = "centos/7" 7 | config.vm.network "private_network", ip: "192.168.33.10" 8 | # config.vm.network "forwarded_port", guest: 80, host: 8080 9 | 10 | # HDD size for guest machine 11 | config.disksize.size = '10GB' 12 | 13 | config.vm.provider "virtualbox" do |vb| 14 | vb.memory = "8192" 15 | end 16 | 17 | # Provisioning 18 | # File 19 | config.vm.provision "file", source: "payload/scorer.zip", destination: "/home/vagrant/scorer.zip" 20 | config.vm.provision "file", source: "payload/license.sig", destination: "/home/vagrant/.driverlessai/license.sig" 21 | 22 | # Shell - bootstraping 23 | config.vm.provision "shell", path: "bootstrap.sh", name: "bootstrap", privileged: true 24 | # Shell - user install 25 | config.vm.provision "shell", path: "payload.sh", name: "payload", privileged: false 26 | end 27 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/centos/vagrant/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | yum -y update 4 | yum -y groupinstall 'Development Tools' 5 | yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm 6 | yum -y install openblas-devel openblas python36-virtualenv python36-pip 7 | 8 | # create links 9 | ln -s /usr/bin/virtualenv-3.6 /usr/bin/virtualenv 10 | ln -s /usr/bin/pip-3.6 /usr/bin/pip 11 | ln -sf /usr/bin/python3 /usr/bin/python 12 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/centos/vagrant/payload.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | unzip scorer.zip && cd scoring-pipeline 4 | bash run_example.sh --pm pip -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/centos/vagrant/payload/README.md: -------------------------------------------------------------------------------- 1 | Payload Directory 2 | ================= 3 | 4 | Put the following two files in this directory 5 | 6 | - `scorer.zip` 7 | - `license.sig` 8 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/pyspark/README.md: -------------------------------------------------------------------------------- 1 | Python Scoring Pipeline using PySpark 2 | ============================================ 3 | 4 | This directory contains sample code that explains the steps needed to deploy a python scoring pipeline 5 | obtained from H2O Driverless AI on a Spark cluster. 6 | 7 | 8 | Prerequisites 9 | ------------- 10 | 11 | The following pre-requisites are needed. 12 | 1. Conda and [conda-pack](https://conda.github.io/conda-pack/) installed. This is needed to build Python environment/code to distribute among cluster. 13 | - To install conda-pack: `conda install -c conda-forge conda-pack` 14 | 2. Install `openblas` on all nodes (driver and executors that will run the Python code). 15 | - Install openblas on Spark driver and all executors: 16 | a. CentOS: `sudo yum install -y openblas-devel` or use rpm 0.3.3: https://centos.pkgs.org/7/epel-x86_64/openblas-0.3.3-2.el7.x86_64.rpm.html 17 | b. Ubuntu: `sudo apt-get install libopenblas-dev` 18 | 3. Install git on Spark driver and all executors, e.g. 
`sudo yum install git` 19 | 20 | Code Structure 21 | -------------- 22 | 23 | The process assumes a directory structure as below: 24 | 25 | ``` 26 | top-dir: A directory with the below structure. This example uses the home directory of current user. 27 | - README.md: This file with the details you are reading 28 | - py_scorer_testing: A directory that contains files to be used for deployment 29 | - scorer.zip: The DAI python scoring pipeline. (You need to put this file here to extract files needed below) 30 | - license.sig: Valid Driverless AI license file. (You need to provide your license file here) 31 | - get_predictions.py: PySpark script (example given) used for running batch scoring 32 | - py_scorer_env.tar.gz: conda-pack generated following instructions below 33 | - dai_contrib.tar.gz: (optional) compressed tmp folder generated following instructions below (necessary if your model used custom recipes) 34 | ``` 35 | 36 | Instructions 37 | ------------ 38 | 39 | 1. Upload Python Scoring Pipeline (scorer.zip) and license.sig onto Spark driver. 40 | 2. Copy your input_dataset.csv to HDFS for the cluster to access. Or, if using spark locally, store the dataset on the driver. 41 | 3. Create scorer folder and unzip scorer on Spark driver. 42 | `mkdir py_scorer_testing` 43 | `cd py_scorer_testing` 44 | Move scorer.zip into py_scorer_testing 45 | `unzip scorer.zip` 46 | 47 | 4. Create Python Env using environment.yml found in scorer.zip: 48 | `conda env create --name py_scorer_env -f scoring-pipeline/environment.yml` 49 | `conda activate py_scorer_env` 50 | If model was created before DAI 1.8.5, you will need to install gitdb: 51 | `pip install --upgrade gitdb2==2.0.6 gitdb==0.6.4` 52 | 53 | 5. Create conda-pack of new Env: 54 | `cd py_scorer_testing` 55 | `conda env list` OR `conda list` 56 | `conda pack -n py_scorer_env -o py_scorer_env.tar.gz` 57 | 58 | 6. Create tar.gz of DAI’s tmp folder (this step is necessary if your model used custom recipes) 59 | `tar -czvf dai_contrib.tar.gz -C scoring-pipeline/tmp/contrib .` 60 | Note that you cannot use tmp due to conflict of Spark already having tmp folder 61 | 62 | 7. Download `get_predictions.py` from this repo and add to `py_scorer_testing` folder 63 | 64 | 8. Set up env vars (some may not be needed for YARN cluster mode) 65 | `export ARROW_PRE_0_15_IPC_FORMAT=1` (due to [pyarrow issue](https://stackoverflow.com/questions/58269115/how-to-enable-apache-arrow-in-pyspark)) 66 | `export DRIVERLESS_AI_LICENSE_FILE=~/py_scorer_testing/license.sig` 67 | `export PYSPARK_PYTHON=./py_scorer_env/bin/python` 68 | `export SPARK_HOME=/path/to/spark` (e.g. ~/spark/spark-2.4.5-bin-hadoop2.7) 69 | `export HADOOP_CONF_DIR=/etc/hadoop/conf` (may need to modify if don't have default hadoop path) 70 | 71 | 9. Run `kinit` if Hadoop is secured with Kerberos 72 | 73 | 10. cd into conda envs, e.g. `cd ~/miniconda3/envs` 74 | 75 | 11. 
Submit Spark Job `get_predictions.py` 76 | ``` 77 | PYTHONIOENCODING=utf8 \ 78 | PYSPARK_PYTHON=./py_scorer_env/bin/python \ 79 | spark-submit \ 80 | --master yarn \ 81 | --deploy-mode cluster \ 82 | --num-executors 2 --driver-memory 2g --executor-memory 4g \ 83 | --archives ../../py_scorer_testing/py_scorer_env.tar.gz#py_scorer_env,../../py_scorer_testing/dai_contrib.tar.gz#tmp/contrib \ 84 | --conf spark.executorEnv.PATH=`echo $PATH` \ 85 | --conf spark.executorEnv.PYSPARK_PYTHON=./py_scorer_env/bin/python \ 86 | --conf spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT=1 \ 87 | --conf spark.executorEnv.PYTHONIOENCODING=utf8 \ 88 | --conf spark.yarn.appMasterEnv.PYTHONIOENCODING=utf8 \ 89 | --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=./py_scorer_env/bin/python \ 90 | --conf spark.executorEnv.DRIVERLESS_AI_LICENSE_KEY=`cat ~/py_scorer_testing/license.sig` \ 91 | --conf spark.driver.maxResultSize=2g \ 92 | ~/py_scorer_testing/get_predictions.py hdfs:///user/path/to/input_dataset.csv hdfs:///user/path/to/output_dataset.csv 93 | ``` 94 | Note: utf encodings (used above) may be needed for certain NLP models and spark.executorEnv.PATH for initialization 95 | 96 | Disclaimer 97 | ---------- 98 | 99 | The scoring pipeline wrapper code shared in this directory is created to provide you 100 | a sample starting point and is not intended to be directly deployed to production as is. 101 | You can use this starting point and build over it to solve your deployment needs ensuring 102 | that your security etc. requirements are met. 103 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/pyspark/get_predictions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark.sql import SparkSession 3 | from pyspark import SparkFiles 4 | spark = SparkSession.builder.getOrCreate() 5 | 6 | input_path = sys.argv[1] 7 | output_path = sys.argv[2] 8 | df = spark.read.csv(input_path, header=True, inferSchema=True) 9 | names = df.columns 10 | 11 | import pandas as pd 12 | from pyspark.sql.functions import col, pandas_udf, size 13 | from pyspark.sql.types import DoubleType, ArrayType 14 | 15 | def predict(*series) -> pd.Series: 16 | import pandas as pd 17 | import numpy as np 18 | from numpy import nan 19 | from scipy.special._ufuncs import expit 20 | from scoring_h2oai_experiment_336ccd12_cbb4_11ea_8496_ac1f6b68b7be import Scorer # update with your key 21 | scorer = Scorer() 22 | merged = pd.concat(series, axis=1) 23 | merged.columns = names 24 | output = scorer.score_batch(merged) 25 | return pd.Series(output.values.tolist()) 26 | 27 | 28 | predict_udf = pandas_udf(predict, returnType=ArrayType(DoubleType())) 29 | columns = [col(name) for name in df.columns] 30 | withPredictions = df.withColumn("prediction", predict_udf(*columns)) 31 | 32 | # If working with multi-class, can expand prediction, e.g. 3 classes: 33 | num_cols = withPredictions.withColumn("size", size(col("prediction"))).agg({"size": "max"}).head()[0] # To be performant, specify the value, e.g. 
num_cols=3 34 | withPredictions = withPredictions.select(col("*"), *(col('prediction').getItem(i).alias(f'prediction_{i}') for i in range(num_cols))) 35 | withPredictions = withPredictions.drop(col("prediction")) -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/README.md: -------------------------------------------------------------------------------- 1 | Python Scoring Pipeline Deployment Examples 2 | =========================================== 3 | 4 | Driverless AI scoring pipelines can be deployed independently of the machine 5 | where Driverless AI is running. This essentially helps you to separate the 6 | concerns of Model Training from Model Deployment. This capability gives you 7 | immense flexibility on how you can deploy your scoring pipelines to production. 8 | 9 | This directory lists example code that shows how to deploy Python Scoring Pipeline 10 | in various scenarios 11 | 12 | Bare-metal or Virtual Linux Environments 13 | ---------------------------------------- 14 | 15 | The `vagrant` directory contains example code that explains how to get DAI 16 | python scoring pipeline installed and running on a Ubuntu 18.04 linux. The example 17 | uses Ubuntu 10.04 running on Virtualbox managed via Vagrant. The example can be 18 | used the understand the steps needed to get the scoring pipeline working, which 19 | can be adjusted per your scenarios. 20 | 21 | 22 | Containerised Environments 23 | -------------------------- 24 | 25 | The `docker` directory contains example code to show how to create a Ubuntu 18.04 26 | based container that can be used to deploy the python scoring pipeline. 27 | 28 | 29 | Disclaimer 30 | ---------- 31 | 32 | The scoring pipeline wrapper code shared in this directory is created to provide you 33 | a sample starting point and is not intended to be directly deployed to production as is. 34 | You can use this starting point and build over it to solve your deployment needs ensuring 35 | that your security etc. requirements are met. 
36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/docker/.gitignore: -------------------------------------------------------------------------------- 1 | .idea -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:bionic 2 | 3 | # These commands run as root 4 | # Install base dependencies 5 | RUN apt-get update && \ 6 | apt install -y \ 7 | build-essential \ 8 | libmagic-dev \ 9 | libopenblas-dev \ 10 | git \ 11 | locales \ 12 | unzip \ 13 | wget 14 | 15 | RUN locale-gen en_US.UTF-8 16 | ENV LANG en_US.UTF-8 17 | ENV LANGUAGE en_US:en 18 | ENV LC_ALL en_US.UTF-8 19 | ENV HOME /home/newuser 20 | 21 | # Create new user 22 | RUN useradd -ms /bin/bash newuser 23 | 24 | # Create a new user to run the pipeline 25 | USER newuser 26 | WORKDIR /home/newuser 27 | 28 | # Commands below run as newuser 29 | COPY --chown=newuser:newuser payload/scorer.zip ./ 30 | COPY --chown=newuser:newuser payload/license.sig .driverlessai/ 31 | 32 | # install Miniconda 33 | RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ 34 | bash miniconda.sh -b -p $HOME/miniconda3 && \ 35 | echo 'export PATH=$HOME/miniconda3/bin:$PATH' >> .bashrc && \ 36 | unzip scorer.zip 37 | 38 | WORKDIR scoring-pipeline 39 | 40 | RUN export PATH="$HOME/miniconda3/bin:$PATH" && \ 41 | bash run_example.sh --pm conda -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/docker/Dockerfile-pip-batch: -------------------------------------------------------------------------------- 1 | FROM ubuntu:bionic 2 | 3 | # Similar to Dockerfile, but uses PIP to install dependencies without creating environment 4 | # No user is created. Installs as root. 
5 | # Use as example code and modify as needed 6 | 7 | # These commands run as root 8 | # Install base dependencies 9 | RUN apt-get update && \ 10 | apt install -y \ 11 | build-essential \ 12 | libmagic-dev \ 13 | libopenblas-dev \ 14 | openjdk-8-jre \ 15 | git \ 16 | locales \ 17 | python3-pip python3-virtualenv\ 18 | unzip \ 19 | wget 20 | 21 | RUN locale-gen en_US.UTF-8 22 | ENV LANG="en_US.UTF-8" 23 | ENV LANGUAGE="en_US:en" 24 | ENV LC_ALL="en_US.UTF-8" 25 | ENV HOME="/root" 26 | 27 | WORKDIR $HOME 28 | 29 | COPY payload/scorer.zip ./ 30 | COPY payload/license.sig .driverlessai/ 31 | 32 | RUN unzip scorer.zip 33 | 34 | COPY batch_scorer.py scoring-pipeline 35 | 36 | WORKDIR scoring-pipeline 37 | 38 | RUN python3 -m virtualenv -p python3.6 env && \ 39 | env/bin/python -m pip install --upgrade --upgrade-strategy only-if-needed pip==19.3.1 pkginfo==1.5.0.1 && \ 40 | env/bin/python -m pip install --upgrade --upgrade-strategy only-if-needed -r requirements.txt -c full_constraints.txt && \ 41 | env/bin/python -m pip uninstall -y tensorflow && \ 42 | env/bin/python -m pip uninstall -y tensorflow-gpu && \ 43 | env/bin/python -m pip install tensorflow==1.13.1 --upgrade --upgrade-strategy only-if-needed -c full_constraints.txt && \ 44 | tf_path=`env/bin/python -c "import os ; import importlib.util ; tf_loader = importlib.util.find_spec('tensorflow') ; print(os.path.dirname(tf_loader.origin))"` && \ 45 | rm -rf ${tf_path}_cpu && mv ${tf_path} ${tf_path}_cpu && \ 46 | env/bin/python -m pip install tensorflow_gpu==1.13.1 --upgrade --upgrade-strategy only-if-needed -c full_constraints.txt && \ 47 | rm -rf ${tf_path}_gpu && mv ${tf_path} ${tf_path}_gpu 48 | 49 | RUN import_statement=$(grep -E 'from scoring_h2oai_experiment' example.py) && \ 50 | sed -i "s/INJECT_EXPERIMENT_IMPORT/${import_statement}/g" batch_scorer.py 51 | 52 | CMD ["env/bin/python", "batch_scorer.py"] 53 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/docker/Dockerfile-pip-http: -------------------------------------------------------------------------------- 1 | FROM ubuntu:bionic 2 | 3 | # Similar to Dockerfile, but uses PIP to install dependencies without creating environment 4 | # No user is created. Installs as root. 
5 | # Use as example code and modify as needed 6 | 7 | # These commands run as root 8 | # Install base dependencies 9 | RUN apt-get update && \ 10 | apt install -y \ 11 | build-essential \ 12 | libmagic-dev \ 13 | libopenblas-dev \ 14 | openjdk-8-jre \ 15 | git \ 16 | locales \ 17 | python3-pip python3-virtualenv\ 18 | unzip \ 19 | wget 20 | 21 | RUN locale-gen en_US.UTF-8 22 | ENV LANG="en_US.UTF-8" 23 | ENV LANGUAGE="en_US:en" 24 | ENV LC_ALL="en_US.UTF-8" 25 | ENV HOME="/root" 26 | 27 | WORKDIR $HOME 28 | 29 | COPY payload/scorer.zip ./ 30 | COPY payload/license.sig .driverlessai/ 31 | 32 | RUN unzip scorer.zip 33 | 34 | WORKDIR scoring-pipeline 35 | 36 | RUN python3 -m virtualenv -p python3.6 env && \ 37 | env/bin/python -m pip install --upgrade --upgrade-strategy only-if-needed pip==19.3.1 pkginfo==1.5.0.1 && \ 38 | env/bin/python -m pip install --upgrade --upgrade-strategy only-if-needed -r requirements.txt -c full_constraints.txt && \ 39 | env/bin/python -m pip install --upgrade --upgrade-strategy only-if-needed -r http_server_requirements.txt -c full_constraints.txt && \ 40 | env/bin/python -m pip uninstall -y tensorflow && \ 41 | env/bin/python -m pip uninstall -y tensorflow-gpu && \ 42 | env/bin/python -m pip install tensorflow==1.13.1 --upgrade --upgrade-strategy only-if-needed -c full_constraints.txt && \ 43 | tf_path=`env/bin/python -c "import os ; import importlib.util ; tf_loader = importlib.util.find_spec('tensorflow') ; print(os.path.dirname(tf_loader.origin))"` && \ 44 | rm -rf ${tf_path}_cpu && mv ${tf_path} ${tf_path}_cpu && \ 45 | env/bin/python -m pip install tensorflow_gpu==1.13.1 --upgrade --upgrade-strategy only-if-needed -c full_constraints.txt && \ 46 | rm -rf ${tf_path}_gpu && mv ${tf_path} ${tf_path}_gpu 47 | 48 | EXPOSE 9090 49 | 50 | CMD ["env/bin/python", "http_server.py", "--port=9090"] 51 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/docker/README.md: -------------------------------------------------------------------------------- 1 | Python Scoring Pipeline Wrapper using Docker 2 | ============================================ 3 | 4 | This directory contains sample code that explains the steps needed to deploy a python scoring pipeline 5 | obtained from H2O Driverless AI in a Ubuntu 18.04 docker container. This directory acts as the build 6 | context for the docker build step. 7 | 8 | 9 | Prerequisites 10 | ------------- 11 | 12 | The following pre-requisites are needed 13 | - [Docker](https://www.docker.com/) 14 | 15 | Follow the installation instructions for your platform and get Docker Ce (or EE) installed on the machine. 16 | 17 | 18 | Code Structure 19 | -------------- 20 | 21 | The code assumes a directory structure as below: 22 | 23 | ``` 24 | top-dir: A directory with the below structure. Name can be anything. This is the build context for docker build command 25 | - README.md: This file with the details you are reading 26 | - Dockerfile: The docker image build script 27 | - payload: A directory that contains files to be used in the docker container for deployment 28 | - scorer.zip: The DAI python scoring pipeline. (You need to put this file here) 29 | - license.sig: Valid Driverless AI license file. (You need to provide your license file here) 30 | ``` 31 | 32 | Docker Container to expose HTTP REST endpoint for scoring 33 | --------------------------------------------------------- 34 | 35 | 1. Install Docker. Ensure you can invoke it using `docker version`. 
It should display the client and server versions of Docker
36 | 2. Change to `top-dir`, which contains the files as mentioned in the above section
37 | 3. Copy the scoring pipeline `scorer.zip` in the `payload` directory. You may need to create the `payload` directory.
38 | 4. Copy Driverless AI license `license.sig` in the `payload` directory
39 | 5. Issue the command `docker build -f Dockerfile-pip-http -t score_python_http .`. This will
40 | - Create a Ubuntu 18.04 based docker container
41 | - Install required system dependencies, Python 3.6, pip, etc.
42 | - Install all python package dependencies needed for the scoring pipeline to work
43 | - Run `http_server.py` from the scoring pipeline and expose the REST scoring server at port 9090
44 |
45 | Execute the command `docker run -p 9090:9090 score_python_http:latest` and you will notice the python scoring server start and accept connections.
46 |
47 | In the `scorer.zip` file you put in the `payload` directory there is a sample http client you can use to test this server. Extract the file `run_http_client.sh` and execute it while the docker container is still listening. You will see the predictions being returned.
48 |
49 | Docker Container for Batch scoring
50 | ----------------------------------
51 |
52 | 1. Install Docker. Ensure you can invoke it using `docker version`. It should display the client and server versions of Docker
53 | 2. Change to `top-dir`, which contains the files as mentioned in the above section
54 | 3. Copy the scoring pipeline `scorer.zip` in the `payload` directory. You may need to create the `payload` directory.
55 | 4. Copy Driverless AI license `license.sig` in the `payload` directory
56 | 5. Issue the command `docker build -f Dockerfile-pip-batch -t score_python_batch .`. This will
57 | - Create a Ubuntu 18.04 based docker container
58 | - Install required system dependencies, Python 3.6, pip, etc.
59 | - Install all python package dependencies needed for the scoring pipeline to work
60 | - Run the `batch_scorer.py` in the container such that it scores the file `/data/input.csv` and writes the predictions to `/data/output.csv`
61 |
62 | Execute the command `docker run -v some_dir_with_input_data:/data score_python_batch:latest`. Here `some_dir_with_input_data` is some directory on the machine where you are executing the docker run command. The file you want to score should be present in that directory with the name `input.csv`. The user executing the docker run command should have read and write permissions on the directory `some_dir_with_input_data` to be able to create the prediction output file `output.csv` in that same directory
63 | 6. Once `output.csv` is generated you can combine the two files column-wise in Linux using the command `paste -d ',' input.csv output.csv`
64 |
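Putting the batch-scoring steps together, an end-to-end run looks roughly like the sketch below. The directory `~/batch_data`, the input file `my_new_data.csv` and the combined file `scored.csv` are placeholder names used only for this illustration; the image tag matches the build command above:

```
# Build the batch scoring image from top-dir (payload/ must already contain scorer.zip and license.sig)
docker build -f Dockerfile-pip-batch -t score_python_batch .

# Stage the data to score; the container expects it at /data/input.csv
mkdir -p ~/batch_data
cp my_new_data.csv ~/batch_data/input.csv

# Score the file; predictions are written to ~/batch_data/output.csv
docker run -v ~/batch_data:/data score_python_batch:latest

# Combine the inputs and predictions column-wise
paste -d ',' ~/batch_data/input.csv ~/batch_data/output.csv > ~/batch_data/scored.csv
```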
65 | Disclaimer
66 | ----------
67 |
68 | The scoring pipeline wrapper code shared in this directory is created to provide you
69 | a sample starting point and is not intended to be directly deployed to production as is.
70 | You can use this starting point and build over it to solve your deployment needs ensuring
71 | that your security etc. requirements are met.
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/docker/batch_scorer.py: --------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from numpy import nan
4 | from scipy.special._ufuncs import expit
5 | import datatable as dt
6 | INJECT_EXPERIMENT_IMPORT  # placeholder token; the Dockerfile-pip-batch build replaces it with the experiment-specific Scorer import
7 |
8 | scorer = Scorer()
9 |
10 | input_dt = dt.fread("/data/input.csv", na_strings=['', '?', 'None', 'nan', 'NA', 'N/A', 'unknown', 'inf', '-inf', '1.7976931348623157e+308', '-1.7976931348623157e+308'])
11 | output_dt = scorer.score_batch(input_dt, apply_data_recipes=False)
12 | dt.Frame(output_dt).to_csv("/data/output.csv")
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/docker/payload/README.md: --------------------------------------------------------------------------------
1 | Payload Directory
2 | =================
3 |
4 | Put the following two files in this directory
5 |
6 | - `scorer.zip`
7 | - `license.sig`
8 |
-------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/vagrant/README.md: --------------------------------------------------------------------------------
1 | Python Scoring Pipeline Wrapper using Vagrant
2 | =============================================
3 |
4 | This directory contains sample code that explains the steps needed to deploy a python scoring pipeline
5 | obtained from H2O Driverless AI in a Ubuntu 18.04 virtual machine in Vagrant.
6 |
7 |
8 | Prerequisites
9 | -------------
10 |
11 | The following pre-requisites are needed
12 | - [VirtualBox](https://www.virtualbox.org/): A free virtualization provider
13 | - [Vagrant](https://www.vagrantup.com/): A tool for building and managing virtual machines
14 | - [Vagrant Disk Resize plugin](https://github.com/sprotheroe/vagrant-disksize): A Vagrant plugin to manage disk sizes
15 |
16 | Follow the installation instructions for your platform and get them installed in the above order.
17 |
18 |
19 | Code Structure
20 | --------------
21 |
22 | The code assumes a directory structure as below:
23 |
24 | ```
25 | top-dir: A directory with the below structure. Name of directory can be anything.
26 | - README.md: This file with the details you are reading
27 | - Vagrantfile: File providing the definition of the virtual machine to create using Vagrant
28 | - bootstrap.sh: The shell provisioner, installs core ubuntu packages
29 | - payload.sh: Shell provisioner, installs Miniconda, creates scoring environment, runs pipeline
30 | - payload: A directory that contains files which can be used in the virtual machine for deployment
31 | - scorer.zip: The DAI python scoring pipeline. (You need to put this file here)
32 | - license.sig: Valid Driverless AI license file. (You need to provide your license file here)
33 | ```
34 |
35 | Instructions
36 | ------------
37 |
38 | 1. Install VirtualBox
39 | 2. Install Vagrant. Ensure you can invoke it using `vagrant --version`
40 | 3. Install Vagrant Disk Size plugin `vagrant plugin install vagrant-disksize`
41 | 4. Go to `top-dir`, which contains the files as mentioned in the above section
42 | 5. Copy the scoring pipeline `scorer.zip` in the `payload` directory. You may need to create the `payload` directory.
43 | 6. Copy Driverless AI license `license.sig` in the `payload` directory
44 | 7. Issue the command `vagrant up`.
This will 45 | - Create a Ubuntu 18.04 based virtual machine 46 | - Bootstrap it i.e. install all dependencies, miniconda, python etc.. 47 | - Create a conda environment for the scoring pipeline by installing all needed dependencies 48 | - Run `example.py` from the scoring pipeline 49 | 50 | You can SSH to the machine using the command `vagrant ssh` from `top-dir` directory. Once connected it is like 51 | working on any Ubuntu terminal. 52 | 53 | To run `example.py` you can follow the below steps once you are connected using SSH 54 | 55 | ``` 56 | conda env list # shows conda environments available on the system 57 | conda activate environment_name # activate environment for required experiment (experiment key is in name) 58 | python example.py # to run example.py manually 59 | ``` 60 | 61 | Similarly, you can run the HTTP and TCP server python files too. 62 | 63 | Multiple Deployments on same Host 64 | --------------------------------- 65 | 66 | Each DAI experiment python deployment pipeline should be contained in its own virtual python environment. 67 | We support both `conda` and `pip + virtualenv` based virtual environments. This separation enables flexibility 68 | to have multiple experiment scoring pipelines to be deployed on the same machine without interfering with 69 | each other. 70 | 71 | 72 | Disclaimer 73 | ---------- 74 | 75 | The scoring pipeline wrapper code shared in this directory is created to provide you 76 | a sample starting point and is not intended to be directly deployed to production as is. 77 | You can use this starting point and build over it to solve your deployment needs ensuring 78 | that your security etc. requirements are met. 79 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/vagrant/Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | Vagrant.configure("2") do |config| 5 | # More boxes at https://vagrantcloud.com/search. 
6 | config.vm.box = "ubuntu/bionic64" 7 | config.vm.network "private_network", ip: "192.168.33.10" 8 | # config.vm.network "forwarded_port", guest: 80, host: 8080 9 | 10 | # HDD size for guest machine 11 | config.disksize.size = '10GB' 12 | 13 | config.vm.provider "virtualbox" do |vb| 14 | vb.memory = "8192" 15 | end 16 | 17 | # Provisioning 18 | # File 19 | config.vm.provision "file", source: "payload/scorer.zip", destination: "/home/vagrant/scorer.zip" 20 | config.vm.provision "file", source: "payload/license.sig", destination: "/home/vagrant/.driverlessai/license.sig" 21 | 22 | # Shell - bootstraping 23 | config.vm.provision "shell", path: "bootstrap.sh", name: "bootstrap", privileged: true 24 | # Shell - user install 25 | config.vm.provision "shell", path: "payload.sh", name: "payload", privileged: false 26 | end 27 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/vagrant/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt-get -y update 4 | apt-get -y upgrade 5 | apt-get -y install unzip build-essential libopenblas-dev 6 | 7 | 8 | -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/vagrant/payload.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh 4 | bash ~/miniconda.sh -b -p $HOME/miniconda3 5 | echo 'export PATH=$HOME/miniconda3/bin:$PATH' >> ~/.bashrc 6 | export PATH=$HOME/miniconda3/bin:$PATH 7 | unzip scorer.zip && cd scoring-pipeline 8 | bash run_example.sh --pm conda -------------------------------------------------------------------------------- /scoring-pipeline-deployment/python/ubuntu/vagrant/payload/README.md: -------------------------------------------------------------------------------- 1 | Payload Directory 2 | ================= 3 | 4 | Put the following two files in this directory 5 | 6 | - `scorer.zip` 7 | - `license.sig` 8 | --------------------------------------------------------------------------------