├── .gitignore ├── LICENSE ├── README.md ├── catsdogs ├── cicd-example │ ├── cloudbuild.yaml │ └── test.py └── tensorflow │ ├── data │ ├── datagcs.csv │ └── datalocal.csv │ ├── notebooks │ ├── data_prep.ipynb │ └── model_training_and_deployment_local.ipynb │ └── tfcloud │ ├── model_training_tfcloud.ipynb │ ├── requirements.txt │ └── run_tfcloud.py ├── census ├── catboost │ └── gcp_ai_platform │ │ ├── notebooks │ │ └── catboost_census_notebook.ipynb │ │ ├── scripts │ │ └── train-cloud.sh │ │ ├── setup.py │ │ └── trainer │ │ ├── __init__.py │ │ └── train.py └── xgboost │ └── gcp_ai_platform │ ├── notebooks │ └── xgboost_census_notebook.ipynb │ ├── scripts │ └── train-cloud.sh │ └── trainer │ ├── __init__.py │ └── train.py ├── fannie_mae_loans └── rapids_xgboost │ └── notebooks │ └── dask_rapids.ipynb ├── higgs └── rapids_xgboost │ └── notebooks │ ├── a100_higgs_rapids_xgboost.ipynb │ └── t4_higgs_rapids_xgboost.ipynb ├── mlflow-vertex ├── mlflow-databricks-vertex-deployment.ipynb └── mlflow-oss-vertex-deployment.ipynb └── tuning_llms ├── tuning_dialogsum.ipynb ├── tuning_legalbench.ipynb └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
95 | __pypackages__/
96 | 
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Mikhail Chrestkha
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Machine Learning Examples
2 | ## Organized by:
3 | ```markdown
4 | -dataset (e.g. Iris, MNIST, ImageNet)
5 | ---framework (e.g.
TensorFlow, XGBoost) 6 | -----notebooks 7 | -----scripts 8 | -------------------------------------------------------------------------------- /catsdogs/cicd-example/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /catsdogs/cicd-example/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tensorflow import keras 3 | from tensorflow.keras import layers 4 | 5 | # Model / data parameters 6 | num_classes = 10 7 | input_shape = (28, 28, 1) 8 | 9 | # the data, split between train and test sets 10 | (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data() 11 | 12 | # Scale images to the [0, 1] range 13 | x_train = x_train.astype("float32") / 255 14 | x_test = x_test.astype("float32") / 255 15 | # Make sure images have shape (28, 28, 1) 16 | x_train = np.expand_dims(x_train, -1) 17 | x_test = np.expand_dims(x_test, -1) 18 | print("x_train shape:", x_train.shape) 19 | print(x_train.shape[0], "train samples") 20 | print(x_test.shape[0], "test samples") 21 | 22 | 23 | # convert class vectors to binary class matrices 24 | y_train = keras.utils.to_categorical(y_train, num_classes) 25 | y_test = keras.utils.to_categorical(y_test, num_classes) 26 | 27 | model = keras.Sequential( 28 | [ 29 | keras.Input(shape=input_shape), 30 | layers.Conv2D(32, kernel_size=(3, 3), activation="relu"), 31 | layers.MaxPooling2D(pool_size=(2, 2)), 32 | layers.Conv2D(64, kernel_size=(3, 3), activation="relu"), 33 | layers.MaxPooling2D(pool_size=(2, 2)), 34 | layers.Flatten(), 35 | layers.Dropout(0.5), 36 | layers.Dense(num_classes, activation="softmax"), 37 | ] 38 | ) 39 | 40 | model.summary() 41 | 42 | batch_size = 128 43 | epochs = 2 44 | 45 | model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]) 46 | 47 | model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1) 48 | 49 | score = model.evaluate(x_test, y_test, verbose=0) 50 | print("Test loss:", score[0]) 51 | print("Test accuracy:", score[1]) 52 | -------------------------------------------------------------------------------- /catsdogs/tensorflow/notebooks/data_prep.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Libraries and datasets required" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "original dataset from https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip \\\n", 15 | "Images: 3000 (2000 Training, 1000 Validation) " 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 96, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "0.10.4\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "import os\n", 33 | "import pandas as pd\n", 34 | "import tfrecorder\n", 35 | "import wandb \n", 36 | "print(wandb.__version__)\n", 37 | "import tensorflow as tf\n", 38 | "import time" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 97, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "os.system('wandb login 3a6710e811d34207ea03768ba12e7ea6c8a9fefd')\n", 48 | "os.environ['WANDB_NOTEBOOK_NAME'] = 'data_prep.ipynb'" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": 
{}, 54 | "source": [ 55 | "### GCS Fuse to be able to use os utilities on GCS without copying data" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 98, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "#!gcsfuse --implicit-dirs mchrestkha-demo-env-ml-examples /home/jupyter/gcs/ \n", 65 | "#!fusermount -u /home/jupyter/gcs/" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### Collect all image URIs" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 108, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/html": [ 83 | "\n", 84 | " Tracking run with wandb version 0.10.4
\n", 85 | " Syncing run set_up_data_20201005_053719 to Weights & Biases (Documentation).
\n", 86 | " Project page: https://wandb.ai/mchrestkha/cats-dogs-keras
\n", 87 | " Run page: https://wandb.ai/mchrestkha/cats-dogs-keras/runs/596flvgm
\n", 88 | " Run data is saved locally in wandb/run-20201005_053719-596flvgm

\n", 89 | " " 90 | ], 91 | "text/plain": [ 92 | "" 93 | ] 94 | }, 95 | "metadata": {}, 96 | "output_type": "display_data" 97 | } 98 | ], 99 | "source": [ 100 | "RUN_NAME=time.strftime(\"set_up_data_%Y%m%d_%H%M%S\")\n", 101 | "run = wandb.init(project='cats-dogs-keras', job_type='data', name=RUN_NAME)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 115, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "wandb.log({\"cat_examples\": wandb.Image(\"/home/jupyter/gcs/catsdogs/train/cats/cat.1.jpg\", caption=\"Cat1\")})\n", 111 | "wandb.log({\"cat_examples\": wandb.Image(\"/home/jupyter/gcs/catsdogs/train/cats/cat.2.jpg\", caption=\"Cat2\")})\n", 112 | "wandb.log({\"dog_examples\": wandb.Image(\"/home/jupyter/gcs/catsdogs/train/dogs/dog.1.jpg\", caption=\"Dog1\")})\n", 113 | "wandb.log({\"dog_examples\": wandb.Image(\"/home/jupyter/gcs/catsdogs/train/dogs/dog.2.jpg\", caption=\"Dog2\")})" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "myDir='/home/jupyter/gcs/catsdogs/'\n", 123 | "format='.jpg'\n", 124 | "fileList = []\n", 125 | "for root, dirs, files in os.walk(myDir, topdown=False):\n", 126 | " for name in files:\n", 127 | " if name.endswith(format):\n", 128 | " fullName = os.path.join(root, name)\n", 129 | " fileList.append(fullName)\n", 130 | " \n", 131 | "fileList[:10]" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "df=pd.DataFrame(fileList)\n", 141 | "df.columns = ['image_uri']\n", 142 | "df.head()" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "df.loc[df['image_uri'].str.contains('train'), 'split'] = 'TRAIN'\n", 152 | "df.loc[df['image_uri'].str.contains('validation'), 'split'] = 'VALIDATION'\n", 153 | "df.loc[df['image_uri'].str.contains('|'.join(['train/cats', 'validation/cats'])), 'label'] = 'cats'\n", 154 | "df.loc[df['image_uri'].str.contains('|'.join(['train/dogs', 'validation/dogs'])), 'label'] = 'dogs'\n", 155 | "df = df[['split', 'image_uri', 'label']]\n", 156 | "df.head()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "dflocal=df.copy()\n", 166 | "dfgcs=df.copy()\n", 167 | "dfgcs['image_uri'] = dfgcs['image_uri'].str.replace('/home/jupyter/gcs','gs://mchrestkha-demo-env-ml-examples')\n", 168 | "dflocal.to_csv('../data/datalocal1.csv', index=False)\n", 169 | "dfgcs.to_csv('../data/datagcs1.csv', index=False)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "artifact = wandb.Artifact(name='training_images', type='dataset')\n", 179 | "artifact.add_reference('gs://mchrestkha-demo-env-ml-examples/catsdogs/train/')\n", 180 | "run.log_artifact(artifact)\n", 181 | "artifact = wandb.Artifact(name='validation_images', type='dataset')\n", 182 | "artifact.add_reference('gs://mchrestkha-demo-env-ml-examples/catsdogs/validation/')\n", 183 | "run.log_artifact(artifact)\n", 184 | "artifact = wandb.Artifact(name='image_uris_csv', type='dataset')\n", 185 | "artifact.add_file('../data/datagcs.csv')\n", 186 | "run.log_artifact(artifact)\n", 187 | "run.finish()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 
193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "RUN_NAME=time.strftime(\"generate_tfrecords_%Y%m%d_%H%M%S\")\n", 197 | "run = wandb.init(project='cats-dogs-keras',job_type='data', name=RUN_NAME)\n", 198 | "artifact = run.use_artifact('training_images:latest')\n", 199 | "artifact = run.use_artifact('validation_images:latest')\n", 200 | "artifact = run.use_artifact('image_uris_csv:latest')" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "FILENAME='../data/datagcs.csv'\n", 210 | "TFRECORD_OUTPUT='gs://mchrestkha-demo-env-ml-examples/catsdogs/tfrecords'\n", 211 | "PROJECT='mchrestkha-demo-env'\n", 212 | "REGION='us-west1'\n", 213 | "TFRECORDER_WHEEL='/home/jupyter/tfrecorder_wheel/tfrecorder-0.1.2-py3-none-any.whl'\n", 214 | "\n", 215 | "\n", 216 | "dfgcs = pd.read_csv(FILENAME)\n", 217 | "dfgcs.tensorflow.to_tfr(\n", 218 | " output_dir=TFRECORD_OUTPUT,\n", 219 | " runner='DataflowRunner',\n", 220 | " project=PROJECT,\n", 221 | " region=REGION,\n", 222 | " tfrecorder_wheel=TFRECORDER_WHEEL)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "dfgcs" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "artifact = wandb.Artifact(name='tfrecords', type='dataset')\n", 241 | "artifact.add_reference('gs://mchrestkha-demo-env-ml-examples/catsdogs/tfrecords/')\n", 242 | "run.log_artifact(artifact)\n", 243 | "run.finish()" 244 | ] 245 | } 246 | ], 247 | "metadata": { 248 | "environment": { 249 | "name": "tf2-2-3-gpu.2-3.m55", 250 | "type": "gcloud", 251 | "uri": "gcr.io/deeplearning-platform-release/tf2-2-3-gpu.2-3:m55" 252 | }, 253 | "kernelspec": { 254 | "display_name": "mchrestkha-env", 255 | "language": "python", 256 | "name": "mchrestkha-env" 257 | }, 258 | "language_info": { 259 | "codemirror_mode": { 260 | "name": "ipython", 261 | "version": 3 262 | }, 263 | "file_extension": ".py", 264 | "mimetype": "text/x-python", 265 | "name": "python", 266 | "nbconvert_exporter": "python", 267 | "pygments_lexer": "ipython3", 268 | "version": "3.7.8" 269 | } 270 | }, 271 | "nbformat": 4, 272 | "nbformat_minor": 4 273 | } 274 | -------------------------------------------------------------------------------- /catsdogs/tensorflow/notebooks/model_training_and_deployment_local.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import tensorflow as tf\n", 10 | "#import matplotlib.pyplot as plt\n", 11 | "#import IPython.display as display\n", 12 | "from tensorflow.keras.optimizers import RMSprop\n", 13 | "import tensorflow_cloud as tfc\n", 14 | "import time\n", 15 | "import wandb\n", 16 | "from wandb.keras import WandbCallback\n", 17 | "import os" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "os.system('wandb login 3a6710e811d34207ea03768ba12e7ea6c8a9fefd')\n", 27 | "os.environ['WANDB_NOTEBOOK_NAME'] = 'model_training_and_deployment_local.ipynb'" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | 
"RUN_NAME=time.strftime(\"train_%Y%m%d_%H%M%S\")\n", 37 | "run = wandb.init(project='cats-dogs-keras',job_type='train', name=RUN_NAME)\n", 38 | "artifact = run.use_artifact('tfrecords:latest')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "AUTOTUNE = tf.data.experimental.AUTOTUNE\n", 48 | "GCS_PATH = \"gs://mchrestkha-demo-env-ml-examples/catsdogs/tfrecords/tfrecorder-20200930-193548-to-tfr\"\n", 49 | "BATCH_SIZE = 5\n", 50 | "IMAGE_SIZE = [150, 150]" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "TRAINING_FILENAMES=tf.io.gfile.glob(GCS_PATH + \"/train*.tfrecord.gz\")\n", 60 | "VALID_FILENAMES=tf.io.gfile.glob(GCS_PATH + \"/validation*.tfrecord.gz\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "print(\"Train TFRecord Files:\", len(TRAINING_FILENAMES))\n", 70 | "print(\"Validation TFRecord Files:\", len(VALID_FILENAMES))" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "def read_tfrecord(example):\n", 80 | " tfr_format = {\n", 81 | " \"image\": tf.io.FixedLenFeature([], tf.string),\n", 82 | " \"image_channels\": tf.io.FixedLenFeature([], tf.int64),\n", 83 | " \"image_height\": tf.io.FixedLenFeature([], tf.int64),\n", 84 | " \"image_name\": tf.io.FixedLenFeature([], tf.string),\n", 85 | " \"image_width\": tf.io.FixedLenFeature([], tf.int64),\n", 86 | " \"label\": tf.io.FixedLenFeature([], tf.int64),\n", 87 | " \"split\": tf.io.FixedLenFeature([], tf.string),\n", 88 | " }\n", 89 | " image_features= tf.io.parse_single_example(example, tfr_format)\n", 90 | " image_channels=image_features['image_channels']\n", 91 | " image_width=image_features['image_width']\n", 92 | " image_height=image_features['image_height']\n", 93 | " label=image_features['label']\n", 94 | " image_b64_bytes=image_features['image']\n", 95 | " image_decoded=tf.io.decode_base64(image_b64_bytes)\n", 96 | " image_raw = tf.io.decode_raw(image_decoded, out_type=tf.uint8)\n", 97 | " image = tf.reshape(image_raw, tf.stack([image_height, image_width, image_channels]))\n", 98 | " image_resized = tf.cast(tf.image.resize(image, size=[*IMAGE_SIZE]),tf.uint8)\n", 99 | " return image_resized, label" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "def get_dataset(filenames):\n", 109 | " dataset = tf.data.TFRecordDataset(filenames=filenames, compression_type='GZIP') \n", 110 | " dataset = dataset.map(read_tfrecord)\n", 111 | " dataset = dataset.shuffle(200)\n", 112 | " dataset = dataset.batch(BATCH_SIZE)\n", 113 | " return dataset" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "train_dataset = get_dataset(TRAINING_FILENAMES)\n", 123 | "valid_dataset = get_dataset(VALID_FILENAMES)\n", 124 | "# image_batch, label_batch = next(iter(train_dataset))\n", 125 | "# image_batch[0].numpy()\n", 126 | "# for n in range(2):\n", 127 | "# plt.imshow(image_batch[n]) " 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "model = tf.keras.models.Sequential([\n", 137 | " 
tf.keras.layers.Conv2D(16, (3,3), activation='relu', input_shape=(150, 150, 3)),\n", 138 | " tf.keras.layers.MaxPooling2D(2, 2),\n", 139 | " tf.keras.layers.Conv2D(32, (3,3), activation='relu'),\n", 140 | " tf.keras.layers.MaxPooling2D(2,2),\n", 141 | " tf.keras.layers.Conv2D(64, (3,3), activation='relu'),\n", 142 | " tf.keras.layers.MaxPooling2D(2,2),\n", 143 | " tf.keras.layers.Conv2D(64, (3,3), activation='relu'),\n", 144 | " tf.keras.layers.MaxPooling2D(2,2),\n", 145 | " tf.keras.layers.Flatten(),\n", 146 | " tf.keras.layers.Dense(256, activation='relu'),\n", 147 | " tf.keras.layers.Dense(1, activation='sigmoid')\n", 148 | "])\n", 149 | "\n", 150 | "model.summary()\n", 151 | "model.compile(loss='binary_crossentropy',\n", 152 | " optimizer=RMSprop(lr=1e-4),\n", 153 | " metrics=['accuracy'])" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "model.fit(\n", 163 | " train_dataset,\n", 164 | " epochs=10,\n", 165 | " validation_data=valid_dataset,\n", 166 | " verbose=2,\n", 167 | " callbacks=[WandbCallback()]\n", 168 | ")" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "MODEL_PATH=time.strftime(\"gs://mchrestkha-demo-env-ml-examples/catsdogs/models/model_%Y%m%d_%H%M%S\")\n", 178 | "model.save(MODEL_PATH)\n", 179 | "\n", 180 | "\n", 181 | "artifact = wandb.Artifact(name='model', type='model')\n", 182 | "artifact.add_reference(MODEL_PATH)\n", 183 | "run.log_artifact(artifact)\n", 184 | "run.finish()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "os.environ['MODEL_PATH'] = MODEL_PATH #to be later used gcloud bash script\n", 194 | "print(os.environ['MODEL_PATH'])" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "%%bash\n", 204 | "MODEL_VERSION=\"v1\"\n", 205 | "MODEL_NAME=\"cats_dogs_classifier3\"\n", 206 | "REGION=\"us-central1\"\n", 207 | "\n", 208 | "gcloud ai-platform models create $MODEL_NAME \\\n", 209 | " --regions $REGION\n", 210 | "\n", 211 | "gcloud ai-platform versions create $MODEL_VERSION \\\n", 212 | " --model $MODEL_NAME \\\n", 213 | " --runtime-version 2.2 \\\n", 214 | " --python-version 3.7 \\\n", 215 | " --framework tensorflow \\\n", 216 | " --origin $MODEL_PATH" 217 | ] 218 | } 219 | ], 220 | "metadata": { 221 | "environment": { 222 | "name": "tf2-2-3-gpu.2-3.m55", 223 | "type": "gcloud", 224 | "uri": "gcr.io/deeplearning-platform-release/tf2-2-3-gpu.2-3:m55" 225 | }, 226 | "kernelspec": { 227 | "display_name": "mchrestkha-env", 228 | "language": "python", 229 | "name": "mchrestkha-env" 230 | }, 231 | "language_info": { 232 | "codemirror_mode": { 233 | "name": "ipython", 234 | "version": 3 235 | }, 236 | "file_extension": ".py", 237 | "mimetype": "text/x-python", 238 | "name": "python", 239 | "nbconvert_exporter": "python", 240 | "pygments_lexer": "ipython3", 241 | "version": "3.7.8" 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 4 246 | } 247 | -------------------------------------------------------------------------------- /catsdogs/tensorflow/tfcloud/model_training_tfcloud.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": 
{}, 7 | "outputs": [], 8 | "source": [ 9 | "import tensorflow as tf\n", 10 | "from tensorflow.keras.optimizers import RMSprop\n", 11 | "import tensorflow_cloud as tfc\n", 12 | "import time\n", 13 | "import os" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "AUTOTUNE = tf.data.experimental.AUTOTUNE\n", 23 | "GCS_PATH = \"gs://mchrestkha-demo-env-ml-examples/catsdogs/tfrecords/tfrecorder-20200930-193548-to-tfr\"\n", 24 | "BATCH_SIZE = 5\n", 25 | "IMAGE_SIZE = [150, 150]" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "TRAINING_FILENAMES=tf.io.gfile.glob(GCS_PATH + \"/train*.tfrecord.gz\")\n", 35 | "VALID_FILENAMES=tf.io.gfile.glob(GCS_PATH + \"/validation*.tfrecord.gz\")" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "print(\"Train TFRecord Files:\", len(TRAINING_FILENAMES))\n", 45 | "print(\"Validation TFRecord Files:\", len(VALID_FILENAMES))" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "def read_tfrecord(example):\n", 55 | " tfr_format = {\n", 56 | " \"image\": tf.io.FixedLenFeature([], tf.string),\n", 57 | " \"image_channels\": tf.io.FixedLenFeature([], tf.int64),\n", 58 | " \"image_height\": tf.io.FixedLenFeature([], tf.int64),\n", 59 | " \"image_name\": tf.io.FixedLenFeature([], tf.string),\n", 60 | " \"image_width\": tf.io.FixedLenFeature([], tf.int64),\n", 61 | " \"label\": tf.io.FixedLenFeature([], tf.int64),\n", 62 | " \"split\": tf.io.FixedLenFeature([], tf.string),\n", 63 | " }\n", 64 | " image_features= tf.io.parse_single_example(example, tfr_format)\n", 65 | " image_channels=image_features['image_channels']\n", 66 | " image_width=image_features['image_width']\n", 67 | " image_height=image_features['image_height']\n", 68 | " label=image_features['label']\n", 69 | " image_b64_bytes=image_features['image']\n", 70 | " image_decoded=tf.io.decode_base64(image_b64_bytes)\n", 71 | " image_raw = tf.io.decode_raw(image_decoded, out_type=tf.uint8)\n", 72 | " image = tf.reshape(image_raw, tf.stack([image_height, image_width, image_channels]))\n", 73 | " image_resized = tf.cast(tf.image.resize(image, size=[*IMAGE_SIZE]),tf.uint8)\n", 74 | " return image_resized, label" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "def get_dataset(filenames):\n", 84 | " dataset = tf.data.TFRecordDataset(filenames=filenames, compression_type='GZIP') \n", 85 | " dataset = dataset.map(read_tfrecord)\n", 86 | " dataset = dataset.shuffle(200)\n", 87 | " dataset = dataset.batch(BATCH_SIZE)\n", 88 | " return dataset" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "train_dataset = get_dataset(TRAINING_FILENAMES)\n", 98 | "valid_dataset = get_dataset(VALID_FILENAMES)\n", 99 | "# image_batch, label_batch = next(iter(train_dataset))\n", 100 | "# image_batch[0].numpy()\n", 101 | "# for n in range(2):\n", 102 | "# plt.imshow(image_batch[n]) " 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "model = tf.keras.models.Sequential([\n", 112 | " 
tf.keras.layers.Conv2D(16, (3,3), activation='relu', input_shape=(150, 150, 3)),\n", 113 | " tf.keras.layers.MaxPooling2D(2, 2),\n", 114 | " tf.keras.layers.Conv2D(32, (3,3), activation='relu'),\n", 115 | " tf.keras.layers.MaxPooling2D(2,2),\n", 116 | " tf.keras.layers.Conv2D(64, (3,3), activation='relu'),\n", 117 | " tf.keras.layers.MaxPooling2D(2,2),\n", 118 | " tf.keras.layers.Conv2D(64, (3,3), activation='relu'),\n", 119 | " tf.keras.layers.MaxPooling2D(2,2),\n", 120 | " tf.keras.layers.Flatten(),\n", 121 | " tf.keras.layers.Dense(256, activation='relu'),\n", 122 | " tf.keras.layers.Dense(1, activation='sigmoid')\n", 123 | "])\n", 124 | "\n", 125 | "#model.summary()\n", 126 | "model.compile(loss='binary_crossentropy',\n", 127 | " optimizer=RMSprop(lr=1e-4),\n", 128 | " metrics=['accuracy'])" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "model.fit(\n", 138 | " train_dataset,\n", 139 | " epochs=10,\n", 140 | " validation_data=valid_dataset,\n", 141 | " verbose=2)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "model.save(time.strftime(\"gs://mchrestkha-demo-env-ml-examples/catsdogs/models/model_%Y%m%d_%H%M%S\"))" 151 | ] 152 | } 153 | ], 154 | "metadata": { 155 | "environment": { 156 | "name": "tf2-2-3-gpu.2-3.m55", 157 | "type": "gcloud", 158 | "uri": "gcr.io/deeplearning-platform-release/tf2-2-3-gpu.2-3:m55" 159 | }, 160 | "kernelspec": { 161 | "display_name": "mchrestkha-env", 162 | "language": "python", 163 | "name": "mchrestkha-env" 164 | }, 165 | "language_info": { 166 | "codemirror_mode": { 167 | "name": "ipython", 168 | "version": 3 169 | }, 170 | "file_extension": ".py", 171 | "mimetype": "text/x-python", 172 | "name": "python", 173 | "nbconvert_exporter": "python", 174 | "pygments_lexer": "ipython3", 175 | "version": "3.7.8" 176 | } 177 | }, 178 | "nbformat": 4, 179 | "nbformat_minor": 4 180 | } 181 | -------------------------------------------------------------------------------- /catsdogs/tensorflow/tfcloud/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==2.12.1 2 | tensorflow-cloud==0.1.7 3 | wandb==0.10.4 4 | 5 | -------------------------------------------------------------------------------- /catsdogs/tensorflow/tfcloud/run_tfcloud.py: -------------------------------------------------------------------------------- 1 | import tensorflow_cloud as tfc 2 | tfc.run(entry_point='model_training_tfcloud.ipynb', 3 | # chief_config=tfc.COMMON_MACHINE_CONFIGS['T4_4X'], 4 | requirements_txt='requirements.txt') -------------------------------------------------------------------------------- /census/catboost/gcp_ai_platform/notebooks/catboost_census_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Used https://github.com/GoogleCloudPlatform/cloudml-samples/blob/master/xgboost/notebooks/census_training/train.py as a starting point and adjusted to CatBoost" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 37, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "#Google Cloud Libraries\n", 17 | "from google.cloud import storage\n", 18 | "\n", 19 | "\n", 20 | "#System Libraries\n", 21 | "import datetime\n", 22 | "import subprocess\n", 23 | "\n", 
24 | "#Data Libraries\n", 25 | "import pandas as pd\n", 26 | "import numpy as np\n", 27 | "\n", 28 | "#ML Libraries\n", 29 | "from sklearn.model_selection import train_test_split\n", 30 | "from sklearn.metrics import accuracy_score\n", 31 | "from sklearn.preprocessing import LabelEncoder\n", 32 | "from sklearn.model_selection import train_test_split\n", 33 | "import xgboost as xgb\n", 34 | "from catboost import CatBoostClassifier, Pool, cv\n", 35 | "from catboost import CatBoost, Pool\n", 36 | "\n" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 50, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "I see 1 GPU devices\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "from catboost.utils import get_gpu_device_count\n", 54 | "print('I see %i GPU devices' % get_gpu_device_count())" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 61, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# Fill in your Cloud Storage bucket name\n", 64 | "BUCKET_ID = \"mchrestkha-demo-env-ml-examples\"\n", 65 | "\n", 66 | "census_data_filename = 'adult.data.csv'\n", 67 | "\n", 68 | "# Public bucket holding the census data\n", 69 | "bucket = storage.Client().bucket('cloud-samples-data')\n", 70 | "\n", 71 | "# Path to the data inside the public bucket\n", 72 | "data_dir = 'ai-platform/census/data/'\n", 73 | "\n", 74 | "# Download the data\n", 75 | "blob = bucket.blob(''.join([data_dir, census_data_filename]))\n", 76 | "blob.download_to_filename(census_data_filename)\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 38, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# these are the column labels from the census data files\n", 86 | "COLUMNS = (\n", 87 | " 'age',\n", 88 | " 'workclass',\n", 89 | " 'fnlwgt',\n", 90 | " 'education',\n", 91 | " 'education-num',\n", 92 | " 'marital-status',\n", 93 | " 'occupation',\n", 94 | " 'relationship',\n", 95 | " 'race',\n", 96 | " 'sex',\n", 97 | " 'capital-gain',\n", 98 | " 'capital-loss',\n", 99 | " 'hours-per-week',\n", 100 | " 'native-country',\n", 101 | " 'income-level'\n", 102 | ")\n", 103 | "# categorical columns contain data that need to be turned into numerical values before being used by XGBoost\n", 104 | "CATEGORICAL_COLUMNS = (\n", 105 | " 'workclass',\n", 106 | " 'education',\n", 107 | " 'marital-status',\n", 108 | " 'occupation',\n", 109 | " 'relationship',\n", 110 | " 'race',\n", 111 | " 'sex',\n", 112 | " 'native-country'\n", 113 | ")\n", 114 | "\n", 115 | "# Load the training census dataset\n", 116 | "with open(census_data_filename, 'r') as train_data:\n", 117 | " raw_training_data = pd.read_csv(train_data, header=None, names=COLUMNS)\n", 118 | "# remove column we are trying to predict ('income-level') from features list\n", 119 | "X = raw_training_data.drop('income-level', axis=1)\n", 120 | "# create training labels list\n", 121 | "#train_labels = (raw_training_data['income-level'] == ' >50K')\n", 122 | "y = raw_training_data['income-level']" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 39, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# Since the census data set has categorical features, we need to convert\n", 139 | "# them to numerical values.\n", 140 | "# convert data in categorical columns to numerical 
values\n", 141 | "X_enc=X\n", 142 | "encoders = {col:LabelEncoder() for col in CATEGORICAL_COLUMNS}\n", 143 | "for col in CATEGORICAL_COLUMNS:\n", 144 | " X_enc[col] = encoders[col].fit_transform(X[col])\n", 145 | " " 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 40, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "y_enc=LabelEncoder().fit_transform(y)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 43, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "X_train, X_validation, y_train, y_validation = train_test_split(X_enc, y_enc, train_size=0.75, random_state=42)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "print(type(y))\n", 173 | "print(type(y_enc))" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 58, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "Learning rate set to 0.069772\n", 186 | "0:\tlearn: 0.6282687\ttest: 0.6273059\tbest: 0.6273059 (0)\ttotal: 11.3ms\tremaining: 11.2s\n", 187 | "50:\tlearn: 0.3021165\ttest: 0.3008721\tbest: 0.3008721 (50)\ttotal: 530ms\tremaining: 9.87s\n", 188 | "100:\tlearn: 0.2857407\ttest: 0.2886646\tbest: 0.2886646 (100)\ttotal: 1.03s\tremaining: 9.14s\n", 189 | "150:\tlearn: 0.2748276\ttest: 0.2825841\tbest: 0.2825841 (150)\ttotal: 1.53s\tremaining: 8.59s\n", 190 | "200:\tlearn: 0.2660846\ttest: 0.2787806\tbest: 0.2787806 (200)\ttotal: 2.02s\tremaining: 8.04s\n", 191 | "250:\tlearn: 0.2594067\ttest: 0.2771832\tbest: 0.2771832 (250)\ttotal: 2.52s\tremaining: 7.52s\n", 192 | "Stopped by overfitting detector (20 iterations wait)\n", 193 | "\n", 194 | "bestTest = 0.2770424728\n", 195 | "bestIteration = 257\n", 196 | "\n", 197 | "Shrink model to first 258 iterations.\n", 198 | "CPU times: user 9.63 s, sys: 788 ms, total: 10.4 s\n", 199 | "Wall time: 2.85 s\n" 200 | ] 201 | }, 202 | { 203 | "data": { 204 | "text/plain": [ 205 | "" 206 | ] 207 | }, 208 | "execution_count": 58, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "%%time\n", 215 | "\n", 216 | "#model = CatBoost({'iterations':50})\n", 217 | "model=CatBoostClassifier(\n", 218 | " od_type='Iter'\n", 219 | "#iterations=5000,\n", 220 | "#custom_loss=['Accuracy']\n", 221 | ")\n", 222 | "model.fit(\n", 223 | " X_train,y_train,eval_set=(X_validation, y_validation),\n", 224 | "\n", 225 | " verbose=50)\n", 226 | "\n", 227 | "# # load data into DMatrix object\n", 228 | "# dtrain = xgb.DMatrix(train_features, train_labels)\n", 229 | "# # train model\n", 230 | "# bst = xgb.train({}, dtrain, 20)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 69, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "# Export the model to a file\n", 254 | "fname = 'catboost_census_model.onnx'\n", 255 | "model.save_model(fname, format='onnx')\n", 256 | "\n", 257 | "# Upload the model to GCS\n", 258 | "bucket = storage.Client().bucket(BUCKET_ID)\n", 259 | "blob = bucket.blob('{}/{}'.format(\n", 260 | " 
datetime.datetime.now().strftime('census/catboost_model_dir/catboost_census_%Y%m%d_%H%M%S'),\n", 261 | " fname))\n", 262 | "blob.upload_from_filename(fname)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 66, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "gs://mchrestkha-demo-env-ml-examples/census/catboost_census_20200525_212707/:\n", 275 | "gs://mchrestkha-demo-env-ml-examples/census/catboost_census_20200525_212707/\n", 276 | "\n", 277 | "gs://mchrestkha-demo-env-ml-examples/census/catboost_census_20200525_212852/:\n", 278 | "gs://mchrestkha-demo-env-ml-examples/census/catboost_census_20200525_212852/\n", 279 | "\n", 280 | "gs://mchrestkha-demo-env-ml-examples/census/catboost_census_20200525_213004/:\n", 281 | "gs://mchrestkha-demo-env-ml-examples/census/catboost_census_20200525_213004/\n", 282 | "\n", 283 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_020526/:\n", 284 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_020526/model.bst\n", 285 | "\n", 286 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_021023/:\n", 287 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_021023/model.bst\n", 288 | "\n", 289 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_023122/:\n", 290 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_023122/model.bst\n", 291 | "\n", 292 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_job_dir/:\n", 293 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_job_dir/packages/\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "!gsutil ls gs://$BUCKET_ID/census/*" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [] 307 | } 308 | ], 309 | "metadata": { 310 | "kernelspec": { 311 | "display_name": "Python 3", 312 | "language": "python", 313 | "name": "python3" 314 | }, 315 | "language_info": { 316 | "codemirror_mode": { 317 | "name": "ipython", 318 | "version": 3 319 | }, 320 | "file_extension": ".py", 321 | "mimetype": "text/x-python", 322 | "name": "python", 323 | "nbconvert_exporter": "python", 324 | "pygments_lexer": "ipython3", 325 | "version": "3.7.6" 326 | } 327 | }, 328 | "nbformat": 4, 329 | "nbformat_minor": 4 330 | } 331 | -------------------------------------------------------------------------------- /census/catboost/gcp_ai_platform/scripts/train-cloud.sh: -------------------------------------------------------------------------------- 1 | echo "Submitting an AI Platform job..." 
2 | 
3 | 
4 | PROJECT_ID="mchrestkha-demo-env"
5 | BUCKET_ID="mchrestkha-demo-env-ml-examples"
6 | JOB_NAME=catboost_census_training_$(date +"%Y%m%d_%H%M%S")
7 | JOB_DIR=gs://$BUCKET_ID/census/catboost_job_dir
8 | TRAINING_PACKAGE_PATH="../trainer/"
9 | MAIN_TRAINER_MODULE=trainer.train
10 | REGION=us-west1
11 | RUNTIME_VERSION=2.1
12 | PYTHON_VERSION=3.7
13 | SCALE_TIER=BASIC
14 | 
15 | # Submit the packaged trainer module to AI Platform Training on a single BASIC-tier worker.
16 | gcloud ai-platform jobs submit training $JOB_NAME \
17 |     --job-dir $JOB_DIR \
18 |     --package-path $TRAINING_PACKAGE_PATH \
19 |     --module-name $MAIN_TRAINER_MODULE \
20 |     --region $REGION \
21 |     --runtime-version=$RUNTIME_VERSION \
22 |     --python-version=$PYTHON_VERSION \
23 |     --scale-tier $SCALE_TIER
24 | 
25 | 
--------------------------------------------------------------------------------
/census/catboost/gcp_ai_platform/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages
2 | from setuptools import setup
3 | 
4 | REQUIRED_PACKAGES = ['catboost']
5 | 
6 | setup(
7 |     name='trainer',
8 |     version='0.1',
9 |     install_requires=REQUIRED_PACKAGES,
10 |     packages=find_packages(),
11 |     include_package_data=True,
12 |     description='My training application package.'
13 | )
--------------------------------------------------------------------------------
/census/catboost/gcp_ai_platform/trainer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mchrestkha/machine_learning_examples/9382b3426a4423d720df588bb510be98bb77599d/census/catboost/gcp_ai_platform/trainer/__init__.py
--------------------------------------------------------------------------------
/census/catboost/gcp_ai_platform/trainer/train.py:
--------------------------------------------------------------------------------
1 | # Google Cloud libraries
2 | from google.cloud import storage
3 | 
4 | 
5 | # System libraries
6 | import datetime
7 | import subprocess
8 | 
9 | # Data libraries
10 | import pandas as pd
11 | import numpy as np
12 | 
13 | # ML libraries
14 | from sklearn.model_selection import train_test_split
15 | from sklearn.metrics import accuracy_score
16 | from sklearn.preprocessing import LabelEncoder
17 | 
18 | import xgboost as xgb  # only used by the commented-out XGBoost baseline below
19 | from catboost import CatBoost, CatBoostClassifier, Pool, cv
20 | 
21 | 
22 | from catboost.utils import get_gpu_device_count
23 | print('I see %i GPU devices' % get_gpu_device_count())
24 | 
25 | 
26 | # Fill in your Cloud Storage bucket name
27 | BUCKET_ID = "mchrestkha-demo-env-ml-examples"
28 | 
29 | census_data_filename = 'adult.data.csv'
30 | 
31 | # Public bucket holding the census data
32 | bucket = storage.Client().bucket('cloud-samples-data')
33 | 
34 | # Path to the data inside the public bucket
35 | data_dir = 'ai-platform/census/data/'
36 | 
37 | # Download the data
38 | blob = bucket.blob(''.join([data_dir, census_data_filename]))
39 | blob.download_to_filename(census_data_filename)
40 | 
41 | # these are the column labels from the census data files
42 | COLUMNS = (
43 |     'age',
44 |     'workclass',
45 |     'fnlwgt',
46 |     'education',
47 |     'education-num',
48 |     'marital-status',
49 |     'occupation',
50 |     'relationship',
51 |     'race',
52 |     'sex',
53 |     'capital-gain',
54 |     'capital-loss',
55 |     'hours-per-week',
56 |     'native-country',
57 |     'income-level'
58 | )
59 | # categorical columns contain data that need to be turned into numerical values before being used by CatBoost
60 | CATEGORICAL_COLUMNS = (
61 | 'workclass', 62 | 'education', 63 | 'marital-status', 64 | 'occupation', 65 | 'relationship', 66 | 'race', 67 | 'sex', 68 | 'native-country' 69 | ) 70 | 71 | # Load the training census dataset 72 | with open(census_data_filename, 'r') as train_data: 73 | raw_training_data = pd.read_csv(train_data, header=None, names=COLUMNS) 74 | # remove column we are trying to predict ('income-level') from features list 75 | X = raw_training_data.drop('income-level', axis=1) 76 | # create training labels list 77 | #train_labels = (raw_training_data['income-level'] == ' >50K') 78 | y = raw_training_data['income-level'] 79 | 80 | # Since the census data set has categorical features, we need to convert 81 | # them to numerical values. 82 | # convert data in categorical columns to numerical values 83 | X_enc=X 84 | encoders = {col:LabelEncoder() for col in CATEGORICAL_COLUMNS} 85 | for col in CATEGORICAL_COLUMNS: 86 | X_enc[col] = encoders[col].fit_transform(X[col]) 87 | 88 | 89 | y_enc=LabelEncoder().fit_transform(y) 90 | 91 | X_train, X_validation, y_train, y_validation = train_test_split(X_enc, y_enc, train_size=0.75, random_state=42) 92 | 93 | 94 | #model = CatBoost({'iterations':50}) 95 | model=CatBoostClassifier( 96 | od_type='Iter' 97 | #iterations=5000, 98 | #custom_loss=['Accuracy'] 99 | ) 100 | model.fit( 101 | X_train,y_train,eval_set=(X_validation, y_validation), 102 | 103 | verbose=50) 104 | 105 | # # load data into DMatrix object 106 | # dtrain = xgb.DMatrix(train_features, train_labels) 107 | # # train model 108 | # bst = xgb.train({}, dtrain, 20) 109 | 110 | 111 | # Export the model to a file 112 | fname = 'catboost_census_model.onnx' 113 | model.save_model(fname, format='onnx') 114 | 115 | # Upload the model to GCS 116 | bucket = storage.Client().bucket(BUCKET_ID) 117 | blob = bucket.blob('{}/{}'.format( 118 | datetime.datetime.now().strftime('census/catboost_model_dir/catboost_census_%Y%m%d_%H%M%S'), 119 | fname)) 120 | blob.upload_from_filename(fname) -------------------------------------------------------------------------------- /census/xgboost/gcp_ai_platform/notebooks/xgboost_census_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import datetime" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 3, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import xgboost as xgb\n", 19 | "import pandas as pd\n", 20 | "from sklearn.preprocessing import LabelEncoder\n", 21 | "import subprocess\n", 22 | "from google.cloud import storage\n", 23 | "\n", 24 | "# Fill in your Cloud Storage bucket name\n", 25 | "BUCKET_ID = \"mchrestkha-demo-env-ml-examples\"\n", 26 | "\n", 27 | "census_data_filename = 'adult.data.csv'\n", 28 | "\n", 29 | "# Public bucket holding the census data\n", 30 | "bucket = storage.Client().bucket('cloud-samples-data')\n", 31 | "\n", 32 | "# Path to the data inside the public bucket\n", 33 | "data_dir = 'ai-platform/census/data/'\n", 34 | "\n", 35 | "# Download the data\n", 36 | "blob = bucket.blob(''.join([data_dir, census_data_filename]))\n", 37 | "blob.download_to_filename(census_data_filename)\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# these are the column labels from the census data files\n", 47 | "COLUMNS = (\n", 48 | " 'age',\n", 49 | " 
'workclass',\n", 50 | " 'fnlwgt',\n", 51 | " 'education',\n", 52 | " 'education-num',\n", 53 | " 'marital-status',\n", 54 | " 'occupation',\n", 55 | " 'relationship',\n", 56 | " 'race',\n", 57 | " 'sex',\n", 58 | " 'capital-gain',\n", 59 | " 'capital-loss',\n", 60 | " 'hours-per-week',\n", 61 | " 'native-country',\n", 62 | " 'income-level'\n", 63 | ")\n", 64 | "# categorical columns contain data that need to be turned into numerical values before being used by XGBoost\n", 65 | "CATEGORICAL_COLUMNS = (\n", 66 | " 'workclass',\n", 67 | " 'education',\n", 68 | " 'marital-status',\n", 69 | " 'occupation',\n", 70 | " 'relationship',\n", 71 | " 'race',\n", 72 | " 'sex',\n", 73 | " 'native-country'\n", 74 | ")\n", 75 | "\n", 76 | "# Load the training census dataset\n", 77 | "with open(census_data_filename, 'r') as train_data:\n", 78 | " raw_training_data = pd.read_csv(train_data, header=None, names=COLUMNS)\n", 79 | " \n", 80 | "# remove column we are trying to predict ('income-level') from features list\n", 81 | "train_features = raw_training_data.drop('income-level', axis=1)\n", 82 | "# create training labels list\n", 83 | "train_labels = (raw_training_data['income-level'] == ' >50K')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/plain": [ 94 | "(32561, 15)" 95 | ] 96 | }, 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "raw_training_data.shape" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 6, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# Since the census data set has categorical features, we need to convert\n", 113 | "# them to numerical values.\n", 114 | "# convert data in categorical columns to numerical values\n", 115 | "encoders = {col:LabelEncoder() for col in CATEGORICAL_COLUMNS}\n", 116 | "for col in CATEGORICAL_COLUMNS:\n", 117 | " train_features[col] = encoders[col].fit_transform(train_features[col])" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 7, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "CPU times: user 11.2 s, sys: 32 ms, total: 11.2 s\n", 130 | "Wall time: 2.83 s\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "%%time\n", 136 | "# load data into DMatrix object\n", 137 | "dtrain = xgb.DMatrix(train_features, train_labels)\n", 138 | "# train model\n", 139 | "bst = xgb.train({\"verbosity\": 0}, dtrain, 200)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 14, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# Export the model to a file\n", 149 | "model = 'model.bst'\n", 150 | "bst.save_model(model)\n", 151 | "\n", 152 | "# Upload the model to GCS\n", 153 | "bucket = storage.Client().bucket(BUCKET_ID)\n", 154 | "blob = bucket.blob('{}/{}'.format(\n", 155 | " datetime.datetime.now().strftime('census/xgboost_model_dir/xgboost_census_%Y%m%d_%H%M%S'),\n", 156 | " model))\n", 157 | "blob.upload_from_filename(model)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 16, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "gs://mchrestkha-demo-env-ml-examples/census/census_20200525_020425/:\n", 170 | "gs://mchrestkha-demo-env-ml-examples/census/census_20200525_020425/model.bst\n", 171 | "\n", 172 
| "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_020526/:\n", 173 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_020526/model.bst\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "!gsutil ls gs://$BUCKET_ID/census/*" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [] 187 | } 188 | ], 189 | "metadata": { 190 | "kernelspec": { 191 | "display_name": "Python 3", 192 | "language": "python", 193 | "name": "python3" 194 | }, 195 | "language_info": { 196 | "codemirror_mode": { 197 | "name": "ipython", 198 | "version": 3 199 | }, 200 | "file_extension": ".py", 201 | "mimetype": "text/x-python", 202 | "name": "python", 203 | "nbconvert_exporter": "python", 204 | "pygments_lexer": "ipython3", 205 | "version": "3.7.6" 206 | } 207 | }, 208 | "nbformat": 4, 209 | "nbformat_minor": 4 210 | } 211 | -------------------------------------------------------------------------------- /census/xgboost/gcp_ai_platform/scripts/train-cloud.sh: -------------------------------------------------------------------------------- 1 | echo "Submitting an AI Platform job..." 2 | 3 | PROJECT_ID="mchrestkha-demo-env" 4 | BUCKET_ID="mchrestkha-demo-env-ml-examples" 5 | JOB_NAME=xgboost_census_training_$(date +"%Y%m%d_%H%M%S") 6 | JOB_DIR=gs://$BUCKET_ID/census/xgboost_job_dir 7 | TRAINING_PACKAGE_PATH="../trainer/" 8 | MAIN_TRAINER_MODULE=trainer.train 9 | REGION=us-west1 10 | RUNTIME_VERSION=2.1 11 | PYTHON_VERSION=3.7 12 | SCALE_TIER=BASIC 13 | 14 | gcloud ai-platform jobs submit training $JOB_NAME \ 15 | --job-dir $JOB_DIR \ 16 | --package-path $TRAINING_PACKAGE_PATH \ 17 | --module-name $MAIN_TRAINER_MODULE \ 18 | --region $REGION \ 19 | --runtime-version=$RUNTIME_VERSION \ 20 | --python-version=$PYTHON_VERSION \ 21 | --scale-tier $SCALE_TIER 22 | 23 | -------------------------------------------------------------------------------- /census/xgboost/gcp_ai_platform/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mchrestkha/machine_learning_examples/9382b3426a4423d720df588bb510be98bb77599d/census/xgboost/gcp_ai_platform/trainer/__init__.py -------------------------------------------------------------------------------- /census/xgboost/gcp_ai_platform/trainer/train.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import xgboost as xgb 3 | import pandas as pd 4 | from sklearn.preprocessing import LabelEncoder 5 | import subprocess 6 | from google.cloud import storage 7 | 8 | # Fill in your Cloud Storage bucket name 9 | BUCKET_ID = "mchrestkha-demo-env-ml-examples" 10 | 11 | census_data_filename = 'adult.data.csv' 12 | 13 | # Public bucket holding the census data 14 | bucket = storage.Client().bucket('cloud-samples-data') 15 | 16 | # Path to the data inside the public bucket 17 | data_dir = 'ai-platform/census/data/' 18 | 19 | # Download the data 20 | blob = bucket.blob(''.join([data_dir, census_data_filename])) 21 | blob.download_to_filename(census_data_filename) 22 | 23 | # these are the column labels from the census data files 24 | COLUMNS = ( 25 | 'age', 26 | 'workclass', 27 | 'fnlwgt', 28 | 'education', 29 | 'education-num', 30 | 'marital-status', 31 | 'occupation', 32 | 'relationship', 33 | 'race', 34 | 'sex', 35 | 'capital-gain', 36 | 'capital-loss', 37 | 'hours-per-week', 38 | 'native-country', 39 | 'income-level' 40 | ) 41 | # 
categorical columns contain data that need to be turned into numerical values before being used by XGBoost
42 | CATEGORICAL_COLUMNS = (
43 |     'workclass',
44 |     'education',
45 |     'marital-status',
46 |     'occupation',
47 |     'relationship',
48 |     'race',
49 |     'sex',
50 |     'native-country'
51 | )
52 | 
53 | # Load the training census dataset
54 | with open(census_data_filename, 'r') as train_data:
55 |     raw_training_data = pd.read_csv(train_data, header=None, names=COLUMNS)
56 | # remove column we are trying to predict ('income-level') from features list
57 | train_features = raw_training_data.drop('income-level', axis=1)
58 | # create training labels list
59 | train_labels = (raw_training_data['income-level'] == ' >50K')
60 | 
61 | 
62 | # Since the census data set has categorical features, we need to convert
63 | # them to numerical values.
64 | # convert data in categorical columns to numerical values
65 | encoders = {col: LabelEncoder() for col in CATEGORICAL_COLUMNS}
66 | for col in CATEGORICAL_COLUMNS:
67 |     train_features[col] = encoders[col].fit_transform(train_features[col])
68 | 
69 | 
70 | # load data into DMatrix object
71 | dtrain = xgb.DMatrix(train_features, train_labels)
72 | # train model
73 | bst = xgb.train({}, dtrain, 20)
74 | 
75 | 
76 | # Export the model to a file
77 | model = 'model.bst'
78 | bst.save_model(model)
79 | 
80 | # Upload the model to GCS
81 | bucket = storage.Client().bucket(BUCKET_ID)
82 | blob = bucket.blob('{}/{}'.format(
83 |     datetime.datetime.now().strftime('census/xgboost_model_dir/xgboost_census_%Y%m%d_%H%M%S'),
84 |     model))
85 | blob.upload_from_filename(model)
--------------------------------------------------------------------------------
/fannie_mae_loans/rapids_xgboost/notebooks/dask_rapids.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Resources Used"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "* Dataset\n",
15 | " - https://docs.rapids.ai/datasets/mortgage-data\n",
16 | " - https://capmrkt.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html\n",
17 | "* RAPIDS + Dask Documentation\n",
18 | " - https://docs.rapids.ai/api/cudf/stable/10min.html\n",
19 | " - https://docs.dask.org/en/latest/dataframe-best-practices.html\n",
20 | " - https://docs.dask.org/en/latest/setup/single-distributed.html#localcluster\n",
21 | " - https://distributed.dask.org/en/latest/memory.html\n",
22 | " - https://dask-cuda.readthedocs.io/en/latest/specializations.html\n",
23 | "* Other examples with this dataset\n",
24 | " - https://www.dataquest.io/blog/data-science-portfolio-machine-learning/\n",
25 | " - https://github.com/dhananjaymehta/FannieMae_LoanForeclosure\n",
26 | " - https://degravek.github.io/project-pages/project1/2016/11/12/New-Notebook/\n",
27 | " - https://riskspan.com/hands-on-machine-learning-predicting-loan-delinquency/\n",
28 | " \n",
29 | " \n",
30 | " \n",
31 | "\n",
32 | "\n",
33 | "\n",
34 | "\n"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "## Create conda environment with the following libraries\n",
42 | "```\n",
43 | "conda create -n rapids-0.17 -c rapidsai -c nvidia -c conda-forge -c defaults rapids-blazing=0.17 python=3.7 cudatoolkit=11.0 matplotlib=3.3.3 gcsfs=0.7.1\n",
44 | "```\n",
45 | "\n",
46 | "Once you have created the conda environment, open a Jupyter kernel associated with it."
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "## Check Environment"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 1,
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "name": "stdout",
63 | "output_type": "stream",
64 | "text": [
65 | "Thu Jan 14 03:17:44 2021 \n",
66 | "+-----------------------------------------------------------------------------+\n",
67 | "| NVIDIA-SMI 450.51.06 Driver Version: 450.51.06 CUDA Version: 11.0 |\n",
68 | "|-------------------------------+----------------------+----------------------+\n",
69 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
70 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
71 | "| | | MIG M. |\n",
72 | "|===============================+======================+======================|\n",
73 | "| 0 A100-SXM4-40GB On | 00000000:00:04.0 Off | 0 |\n",
74 | "| N/A 32C P0 54W / 400W | 0MiB / 40537MiB | 0% Default |\n",
75 | "| | | Disabled |\n",
76 | "+-------------------------------+----------------------+----------------------+\n",
77 | "| 1 A100-SXM4-40GB On | 00000000:00:05.0 Off | 0 |\n",
78 | "| N/A 33C P0 52W / 400W | 0MiB / 40537MiB | 0% Default |\n",
79 | "| | | Disabled |\n",
80 | "+-------------------------------+----------------------+----------------------+\n",
81 | " \n",
82 | "+-----------------------------------------------------------------------------+\n",
83 | "| Processes: |\n",
84 | "| GPU GI CI PID Type Process name GPU Memory |\n",
85 | "| ID ID Usage |\n",
86 | "|=============================================================================|\n",
87 | "| No running processes found |\n",
88 | "+-----------------------------------------------------------------------------+\n",
89 | "nvcc: NVIDIA (R) Cuda compiler driver\n",
90 | "Copyright (c) 2005-2020 NVIDIA Corporation\n",
91 | "Built on Thu_Jun_11_22:26:38_PDT_2020\n",
92 | "Cuda compilation tools, release 11.0, V11.0.194\n",
93 | "Build cuda_11.0_bu.TC445_37.28540450_0\n"
94 | ]
95 | }
96 | ],
97 | "source": [
98 | "%%bash\n",
99 | "nvidia-smi\n",
100 | "nvcc --version"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "import numpy as np; print('numpy Version:', np.__version__)\n",
110 | "import pandas as pd; print('pandas Version:', pd.__version__)\n",
111 | "import xgboost as xgb; print('XGBoost Version:', xgb.__version__)\n",
112 | "import cudf; print('cudf Version:', cudf.__version__)\n",
113 | "import cuml; print('cuml Version:', cuml.__version__)\n",
114 | "import gcsfs; print('gcsfs Version:', gcsfs.__version__)\n",
115 | "import time\n",
116 | "import dask_cudf; print('dask_cudf Version:', dask_cudf.__version__)\n",
117 | "import dask; print('dask Version:', dask.__version__)\n",
118 | "import dask.dataframe as dask_df\n",
119 | "import glob\n",
120 | "import matplotlib; print('matplotlib Version:', matplotlib.__version__)\n",
121 | "from dask.diagnostics import ProgressBar\n",
122 | "from dask.distributed import Client, progress, wait"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 3,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "from IPython.core.interactiveshell import InteractiveShell\n",
132 | "InteractiveShell.ast_node_interactivity = \"all\""
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "## Set up Dask Cluster"
140 | ]
141 | },
142 | 
{ 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# from dask.distributed import Client\n", 149 | "# client = Client()\n", 150 | "# client" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 4, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/html": [ 161 | "\n", 162 | "\n", 163 | "\n", 170 | "\n", 178 | "\n", 179 | "
\n", 164 | "

Client

\n", 165 | "\n", 169 | "
\n", 171 | "

Cluster

\n", 172 | "
    \n", 173 | "
  • Workers: 2
  • \n", 174 | "
  • Cores: 2
  • \n", 175 | "
  • Memory: 179.38 GB
  • \n", 176 | "
\n", 177 | "
" 180 | ], 181 | "text/plain": [ 182 | "" 183 | ] 184 | }, 185 | "execution_count": 4, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "import blazingsql\n", 192 | "import dask_cudf\n", 193 | "from dask.distributed import Client\n", 194 | "from dask_cuda import LocalCUDACluster\n", 195 | "\n", 196 | "cluster = LocalCUDACluster()\n", 197 | "client = Client(cluster)\n", 198 | "# bc = blazingsql.BlazingContext(dask_client=client, network_interface='lo')\n", 199 | "client" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "# client.restart()\n", 209 | "# client" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "## Define Data Schema & Data Types" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 5, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "col_acq_names = ['LoanID','Channel','SellerName','OrInterestRate','OrUnpaidPrinc','OrLoanTerm',\n", 226 | " 'OrDate','FirstPayment','OrLTV','OrCLTV','NumBorrow','DTIRat','CreditScore',\n", 227 | " 'FTHomeBuyer','LoanPurpose','PropertyType','NumUnits','OccStatus','PropertyState',\n", 228 | " 'Zip','MortInsPerc','ProductType','CoCreditScore','MortInsType','RelMortInd']\n", 229 | "col_per_names = ['LoanID','MonthRep','Servicer','CurrInterestRate','CAUPB','LoanAge','MonthsToMaturity',\n", 230 | " 'AdMonthsToMaturity','MaturityDate','MSA','CLDS','ModFlag','ZeroBalCode','ZeroBalDate',\n", 231 | " 'LastInstallDate','ForeclosureDate','DispositionDate','PPRC','AssetRecCost','MHRC',\n", 232 | " 'ATFHP','NetSaleProceeds','CreditEnhProceeds','RPMWP','OFP','NIBUPB','PFUPB','RMWPF',\n", 233 | " 'FPWA','ServicingIndicator']\n", 234 | "\n", 235 | "col_acq = ['LoanID','OrDate','OrUnpaidPrinc','Channel','SellerName','PropertyType','NumUnits','PropertyState']\n", 236 | "col_per = ['LoanID','MonthRep', 'CAUPB','CLDS','ForeclosureDate']\n", 237 | "\n", 238 | "parse_dates_acq =['OrDate','FirstPayment']\n", 239 | "parse_dates_per =['MonthRep','MaturityDate','ZeroBalDate','LastInstallDate','ForeclosureDate','DispositionDate']\n", 240 | "\n", 241 | "# dtype_acq={ \"LoanID\":\"str\",\"Channel\":\"str\",\"SellerName\":\"str\",\"OrInterestRate\":\"str\",\"OrUnpaidPrinc\":\"str\",\"OrLoanTerm\":\"str\",\"OrDate\":\"str\",\n", 242 | "# \"FirstPayment\":\"str\",\"OrLTV\":\"str\",\"OrCLTV\":\"str\", \"NumBorrow\":\"str\", \"DTIRat\":\"str\", \"CreditScore\":\"str\", \"FTHomeBuyer\":\"str\",\n", 243 | "# \"LoanPurpose\":\"str\", \"PropertyType\":\"str\", \"NumUnits\":\"str\", \"OccStatus\":\"str\", \"PropertyState\":\"str\", \"Zip\":\"str\", \"MortInsPerc\":\"str\",\n", 244 | "# \"ProductType\":\"str\", \"CoCreditScore\":\"str\", \"MortInsType\":\"str\", \"RelMortInd\":\"str\"}\n", 245 | "\n", 246 | "dtype_acq={ \"LoanID\":\"int\",\"Channel\":\"str\",\"SellerName\":\"str\",\"OrInterestRate\":\"float\",\"OrUnpaidPrinc\":\"float\",\"OrLoanTerm\":\"float\",\"OrDate\":\"str\",\n", 247 | " \"FirstPayment\":\"str\",\"OrLTV\":\"float\",\"OrCLTV\":\"float\", \"NumBorrow\":\"float\", \"DTIRat\":\"float\", \"CreditScore\":\"float\", \"FTHomeBuyer\":\"str\",\n", 248 | " \"LoanPurpose\":\"str\", \"PropertyType\":\"str\", \"NumUnits\":\"float\", \"OccStatus\":\"str\", \"PropertyState\":\"str\", \"Zip\":\"int\", \"MortInsPerc\":\"float\",\n", 249 | " \"ProductType\":\"str\", \"CoCreditScore\":\"float\", \"MortInsType\":\"float\", 
\"RelMortInd\":\"str\"}\n", 250 | "\n", 251 | "# dtype_per={\"LoanID\":\"str\",\"MonthRep\":\"str\",\"Servicer\":\"str\", \"CurrInterestRate\":\"str\", \"CAUPB\":\"str\", \"LoanAge\":\"str\",\"MonthsToMaturity\":\"str\",\n", 252 | "# \"AdMonthsToMaturity\":\"str\", \"MaturityDate\":\"str\", \"MSA\":\"str\", \"CLDS\":\"str\", \"ModFlag\":\"str\", \"ZeroBalCode\":\"str\", \"ZeroBalDate\":\"str\",\n", 253 | "# \"LastInstallDate\":\"str\", \"ForeclosureDate\":\"str\", \"DispositionDate\":\"str\", \"PPRC\":\"str\", \"AssetRecCost\":\"str\", \"MHRC\":\"str\", \"ATFHP\":\"str\",\n", 254 | "# \"NetSaleProceeds\":\"str\", \"CreditEnhProceeds\":\"str\",\"RPMWP\":\"str\",\"OFP\":\"str\",\"NIBUPB\":\"str\", \"PFUPB\":\"str\", \"RMWPF\":\"str\",\n", 255 | "# \"FPWA\":\"str\", \"ServicingIndicator\":\"str\"\n", 256 | "# }\n", 257 | "\n", 258 | "dtype_per={\"LoanID\":\"int\",\"MonthRep\":\"str\",\"Servicer\":\"str\", \"CurrInterestRate\":\"float\", \"CAUPB\":\"float\", \"LoanAge\":\"float\",\"MonthsToMaturity\":\"float\",\n", 259 | " \"AdMonthsToMaturity\":\"float\", \"MaturityDate\":\"str\", \"MSA\":\"float\", \"CLDS\":\"float\", \"ModFlag\":\"str\", \"ZeroBalCode\":\"float\", \"ZeroBalDate\":\"str\",\n", 260 | " \"LastInstallDate\":\"str\", \"ForeclosureDate\":\"str\", \"DispositionDate\":\"str\", \"PPRC\":\"float\", \"AssetRecCost\":\"float\", \"MHRC\":\"float\", \"ATFHP\":\"float\",\n", 261 | " \"NetSaleProceeds\":\"float\", \"CreditEnhProceeds\":\"float\",\"RPMWP\":\"float\",\"OFP\":\"float\",\"NIBUPB\":\"float\", \"PFUPB\":\"float\", \"RMWPF\":\"float\",\n", 262 | " \"FPWA\":\"str\", \"ServicingIndicator\":\"str\"\n", 263 | "}" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "## Data Ingestion" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 6, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "# csv_acq_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/acq/Acquisition_20*'\n", 280 | "# csv_perf_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/perf/Performance_20*'\n", 281 | "\n", 282 | "parq_acq_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/acq/parquet68'\n", 283 | "parq_per_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/perf/parquet823'" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "# %time df_acq = dask_cudf.read_csv(csv_acq_fnames, sep='|', names=col_acq_names, dtype=dtype_acq, parse_dates=parse_dates_acq)\n", 293 | "# %time df_pe = dask_cudf.read_csv(csv_perf_fnames, sep='|', names=col_per_names, dtype=dtype_per, parse_dates=parse_dates_per)\n", 294 | "\n", 295 | "%time df_acq = dask_cudf.read_parquet(parq_acq_fnames)\n", 296 | "%time df_per = dask_cudf.read_parquet(parq_per_fnames, columns=col_per)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "# %time print(\"Required Memory for df_acq:\",df_acq.memory_usage().sum().compute()/(1024**3), 'GB')\n", 306 | "# %time print(\"Required Memory for df_per:\",df_per.memory_usage().sum().compute()/(1024**3), 'GB')" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 13, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "df_acq.head()\n", 316 | "df_per.head()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": 
{}, 323 | "outputs": [], 324 | "source": [ 325 | "df_per_shape=df_per.shape[0].persist()\n", 326 | "progress(df_per_shape)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 12, 332 | "metadata": {}, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/plain": [ 337 | "1890353680" 338 | ] 339 | }, 340 | "execution_count": 12, 341 | "metadata": {}, 342 | "output_type": "execute_result" 343 | }, 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "37015214" 348 | ] 349 | }, 350 | "execution_count": 12, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "df_per_shape.compute()\n", 357 | "df_acq.shape[0].compute()" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "## Data Profiling & Data Quality Check against Summary Statistics \n", 365 | "- Data Dictionary: https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_Glossary.pdf\n", 366 | "- Sumary Statistics: https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_Stat_Summary_Primary.pdf\n", 367 | "- Sample Data: https://docs.google.com/spreadsheets/d/1nCtusAE2naZlWHFKGRsQTxxusjfZYiBLdd5SF5AEGMA/edit" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "%time df_acq_describe=df_acq.describe().compute()\n", 377 | "%time df_per_describe=df_per.describe().compute()\n", 378 | "df_acq_describe\n", 379 | "df_per_describe" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "df_acq['OrYr']=df_acq['OrDate'].str[-4:]\n", 389 | "df_acq['OrUnpaidPrinc $M']=df_acq['OrUnpaidPrinc']/1000000" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "# df_acq_describe=df_acq.describe().compute()\n", 399 | "# df_acq_nulls=df_acq.isna().sum().compute()\n", 400 | "df_acq_describe\n", 401 | "df_acq_nulls;" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "%time df_acq_summary = df_acq.groupby('OrYr',as_index=False).agg({'LoanID': 'count','OrUnpaidPrinc $M': 'sum'}).compute()\n", 411 | "df_acq_summary.rename(columns = {'LoanID': 'TotalLoans','OrUnpaidPrinc $M':'TotalOrUnpaidPrinc $M'},inplace=True)\n", 412 | "df_acq_summary['AvgOrUnpaidPrinc']=df_acq_summary['TotalOrUnpaidPrinc $M']/df_acq_summary['TotalLoans']*1000000\n", 413 | "df_acq_summary.to_pandas().sort_values(by=['OrYr']).plot.bar(x='OrYr',y='TotalLoans')\n", 414 | "df_acq_summary.to_pandas().sort_values(by=['OrYr']).plot.bar(x='OrYr',y='AvgOrUnpaidPrinc')" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 14, 420 | "metadata": {}, 421 | "outputs": [ 422 | { 423 | "data": { 424 | "text/html": [ 425 | "
\n", 426 | "\n", 439 | "\n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | "
LoanIDMonthRepCAUPBCLDSForeclosureDateDelinquentEventForeclosureEventYrRepCAUPB $M
010000736514201/01/2000<NA>0.0<NA>002000<NA>
110000736514201/01/200174319.00.0<NA>0020010.074319
210000736514201/01/200273635.480.0<NA>0020020.07363548
310000736514201/01/200372795.410.0<NA>0020030.07279541
410000736514202/01/2000<NA>0.0<NA>002000<NA>
\n", 517 | "
" 518 | ], 519 | "text/plain": [ 520 | " LoanID MonthRep CAUPB CLDS ForeclosureDate DelinquentEvent \\\n", 521 | "0 100007365142 01/01/2000 0.0 0 \n", 522 | "1 100007365142 01/01/2001 74319.0 0.0 0 \n", 523 | "2 100007365142 01/01/2002 73635.48 0.0 0 \n", 524 | "3 100007365142 01/01/2003 72795.41 0.0 0 \n", 525 | "4 100007365142 02/01/2000 0.0 0 \n", 526 | "\n", 527 | " ForeclosureEvent YrRep CAUPB $M \n", 528 | "0 0 2000 \n", 529 | "1 0 2001 0.074319 \n", 530 | "2 0 2002 0.07363548 \n", 531 | "3 0 2003 0.07279541 \n", 532 | "4 0 2000 " 533 | ] 534 | }, 535 | "execution_count": 14, 536 | "metadata": {}, 537 | "output_type": "execute_result" 538 | } 539 | ], 540 | "source": [ 541 | "df_per['DelinquentEvent']=0\n", 542 | "df_per['DelinquentEvent']=df_per['DelinquentEvent'].where(df_per['CLDS']<1,1)\n", 543 | "df_per['ForeclosureEvent']=0\n", 544 | "df_per['ForeclosureEvent']=df_per['ForeclosureEvent'].where(df_per['ForeclosureDate'].isnull()== True,1)\n", 545 | "df_per['YrRep']=df_per['MonthRep'].str[-4:]\n", 546 | "df_per['CAUPB $M']=df_per['CAUPB']/1000000\n", 547 | "df_per.head()" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "df_per_yr_summary = df_per.groupby('YrRep',as_index=False).agg({'LoanID': 'count', 'DelinquentEvent':'sum'}).persist()\n", 557 | "progress(df_per_yr_summary)" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 47, 563 | "metadata": {}, 564 | "outputs": [ 565 | { 566 | "data": { 567 | "text/plain": [ 568 | "" 569 | ] 570 | }, 571 | "execution_count": 47, 572 | "metadata": {}, 573 | "output_type": "execute_result" 574 | }, 575 | { 576 | "data": { 577 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEGCAYAAABsLkJ6AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAA090lEQVR4nO3de3wU1d348c839wQSQm4QCJCAAUQCAcNFUEQFBbygVkWfVkWrlEdR0VpL2+ei7c+n1sd6QS0WKyrWKj5qK1oUEUHwgtyEhDshBEiABAIkIeSe8/tjJ+m6bsgm2WSy2e/79ZpXdmfO7HxnWOa755w5M2KMQSmllP8JsDsApZRS9tAEoJRSfkoTgFJK+SlNAEop5ac0ASillJ8KsjuA5oiLizPJycl2h6GUUj5l06ZNx40x8a7zfSoBJCcns3HjRrvDUEopnyIiB9zN1yYgpZTyU5oAlFLKT2kCUEopP+VTfQDuVFdXk5eXR0VFhd2hKNWksLAwkpKSCA4OtjsUpXw/AeTl5REZGUlycjIiYnc4SjXKGENRURF5eXmkpKTYHY5Svt8EVFFRQWxsrJ78VYcnIsTGxmptVXUYPp8AAD35K5+h31XVkXSKBKCUv8g7eYbl24/aHYbqJDxKACIyRUR2i0i2iMxzs1xEZL61PFNERjotyxWRLBHZIiIbnebHiMgKEdlr/e3unV1qf4GBgaSnp3PeeecxfPhwnn76aerq6s66Tm5uLkOHDgVg48aN3H///e0Rqle99tprxMfHk56e3jDt2LHDq9t49tlnOXPmjFc/05Xzv0VHZozhgbe38LM3NrF6d6Hd4ahOoMkEICKBwIvAVGAIcIuIDHEpNhVItaZZwAKX5ZcYY9KNMRlO8+YBK40xqcBK671PCg8PZ8uWLWzfvp0VK1awbNkyHnvsMY/Xz8jIYP78+W0YYduZMWMGW7ZsaZiGDHH9arROWyeAmpqaNvtsb/tizzE2HThJeHAgv3o/i5KKartDUj7OkxrAaCDbGJNjjKkC3gamu5SZDiw2DuuAaBFJbOJzpwOvW69fB671POyOKyEhgYULF/LCCy9gjKG2tpZf/OIXjBo1imHDhvHnP//5B+usXr2aq666CoBHH32UO++8k4kTJ9K/f//vJYbHH3+cQYMGMWnSJG655RaeeuopACZOnNhwi4zjx49Tf7+kxra9evVqJk6cyA033MDgwYP58Y9/TP2T4TZs2MC4ceMYPnw4o0ePprS0lIsuuogtW7Y0xDF+/HgyMzMbPQYzZsxg2bJlDe9nzpzJe++91+x45s+fz+HDh7nkkku45JJLqK2tZebMmQwdOpS0tDSeeeaZhv2fO3cu48aNY+jQoaxfvx6AsrIy7rzzTkaNGsWIESP44IMPAEfN5cYbb+Tqq6/m8ssvb3Q/Vq5cyYgRI0hLS+POO++ksrISgN/+9reMGjWKoUOHMmvWrIZjN3HiRH75y18yevRoBg4cyNq1axv97OYyxvDMij30jg5n8U9HU1BSwf/8c6fXPl/5J08uA+0NHHJ6nweM8aBMb+AIYIBPRcQAfzbGLLTK9DDGHAEwxhwRkQR3GxeRWThqFfTt2/esgT724XZ2HC7xYJc8N6RXFP999XnNWqd///7U1dVRWFjIBx98QLdu3diwYQOVlZWMHz+eyy+//Kydgbt27WLVqlWUlpYyaNAg/v3f/53MzEzefvttvvvuO2pqahg5ciTnn3/+WeN45ZVX3G4b4LvvvmP79u306tWL8ePH89VXXzF69GhmzJjBkiVLGDVqFCUlJYSHh3PXXXfx2muv8e
yzz7Jnzx4qKysZNmwYmzdvZsmSJXz55ZcN2/zmm2+4+eabWbJkCdOmTaOqqoqVK1eyYMGCZsdz//338/TTT7Nq1Sri4uLYtGkT+fn5bNu2DYBTp041bLesrIyvv/6aNWvWcOedd7Jt2zYef/xxLr30UhYtWsSpU6cYPXo0kyZNaogzMzOTmJgYcnNzf3DsKioqmDlzJitXrmTgwIHcdtttLFiwgLlz5zJnzhz+67/+C4Bbb72Vjz76iKuvvhpw1CjWr1/fUAv87LPPmvi2eObzXYVszSvmievTGJUcw6wJA3jpi31MTUvk4oE/uMeXUh7xpAbg7kzl+iDhs5UZb4wZiaOZ6F4RmdCM+DDGLDTGZBhjMuLjfeeLXv+r8NNPP2Xx4sWkp6czZswYioqK2Lt371nXvfLKKwkNDSUuLo6EhAQKCgpYu3Yt1113HREREURFRXHNNdc0GcPZtj169GiSkpIICAggPT2d3Nxcdu/eTWJiIqNGjQIgKiqKoKAgbrzxRj766COqq6tZtGgRM2fObNiGaxNQeHg4U6dO5fPPP6eyspKPP/6YCRMmEB4e3ux4XPXv35+cnBzuu+8+PvnkE6KiohqW3XLLLQBMmDCBkpISTp06xaeffsoTTzxBeno6EydOpKKigoMHDwIwefJkYmJiGj12u3fvJiUlhYEDBwJw++23s2bNGgBWrVrFmDFjSEtL4/PPP2f79u0N611//fUAnH/++W73oSWMMTy9Yg99YyL40flJAMydlMo5CV2Z916mNgWpFvOkBpAH9HF6nwQc9rSMMab+b6GI/B1Hk9IaoEBEEq1f/4lAq3u1mvtLva3k5OQQGBhIQkICxhief/55rrjiiu+VOdvJITQ0tOF1YGBgQzt1Y7WGoKCghk5n52vMG9v26tWr3W7DGON2GxEREUyePJkPPviAd955p8k7soaFhTFx4kSWL1/OkiVLGk7OzY3HVffu3dm6dSvLly/nxRdf5J133mHRokVuj42IYIzhvffeY9CgQd9b9u2339KlS5ez7kN9AndVUVHBPffcw8aNG+nTpw+PPvro9455/X40tg8t8emOArYfLuGpG4cTHOj4zRYWHMj/3jCMHy34mt8v28nvrx/mlW0p/+JJDWADkCoiKSISAtwMLHUpsxS4zboaaCxQbJ3Yu4hIJICIdAEuB7Y5rXO79fp24INW7kuHcOzYMWbPns2cOXMQEa644goWLFhAdbXjV9qePXsoKytr9udOmDCBv//975SXl1NaWsqHH37YsCw5OZlNmzYB8O677zbMb+62Bw8ezOHDh9mwYQMApaWlDSexu+66i/vvv59Ro0ad9ZdzvZtvvplXX32VtWvXNpzwW3IsIiMjKS0tBRz9G3V1dfzoRz/id7/7HZs3b24ot2TJEgC+/PJLunXrRrdu3bjiiit4/vnnG07m3333XZNxOx+L3NxcsrOzAXjjjTe4+OKLG072cXFxnD59+nvHuy3U1Tna/lPiunBteq/vLRvRtzt3T+jPW+sPsWbPsTaNQ3VOTdYAjDE1IjIHWA4EAouMMdtFZLa1/CVgGTANyAbOAHdYq/cA/m79OgsC/maM+cRa9gTwjoj8FDgI3Oi1vWpn5eXlpKenU11dTVBQELfeeisPPfQQ4Dhx5ubmMnLkSIwxxMfH849//KPZ2xg5ciQzZswgPT2dfv36cdFFFzUse/jhh7npppt44403uPTSSxvmN3fbISEhLFmyhPvuu4/y8nLCw8P57LPP6Nq1K+effz5RUVHccccd31vHtQ/gT3/6E+PGjePyyy/ntttu45prriEkJKTFx2LWrFlMnTqVxMREnn32We64446G2s7vf//7hnLdu3dn3LhxlJSUNNQK/vM//5O5c+cybNgwjDEkJyfz0Ucfud3O7t27SUpKanj/zDPP8Oqrr3LjjTdSU1PDqFGjmD17NqGhodx9992kpaWRnJzc0FzWVj7ZfpRdR0t5dkY6QYE//L324KSBfLajgF+9n8Uncy8iMkzvMaQ8J41VdTuijIwM49r8sHPnTs4991ybIrLPo48+SteuXXn44YfbZXuHDx9m4sSJ7Nq1i4CAjjV+cOLEiTz11FNkZGQ0XbgD8PQ7W1tnmPLsGuqM4dMHLyYwwH0T4OaDJ7lhwdfMGNWX31+f5u1wVScgIptcLsMHdCSw8sDixYsZM2YMjz/+eIc7+XdmH2UeZm/haeZOGtjoyR9gZN/u3H1Rf95af5Av9x5vxwiVr9MagFLtzJPvbE1tHZc/s4bgwAA+fuAiAs6SAAAqqmuZNn8tldV1LH9wAl1Dff5Gv8qLOnUNwJeSmPJvnn5Xl249TM7xMh6cnNrkyR/qrwoazuHicn6/TAeIKc/4fAIICwujqKhIk4Dq8OqfBxAWFnbWcjW1dTy3ci9DEqO4fEhPjz///H7duevCFN789iBfZWtTkGqaz9cTk5KSyMvL49gxvQxOdXz1TwQ7m/e/y+dA0Rlevi3Do1//zn5++SBW7izkkXcztSlINcnnvx3BwcH6dCXVaVTV1DF/5V6GJXVj0rlu745yVmHBgfzvjcO44aVveOLjnfy/a/WqINU4n28CUqozeXdTHnkny3lw8sAWPzzm/H4x/HR8Cn9dd5CvtSlInYUmAKU6iMqaWl74fC8j+kYzsZU3ePv55YNIievCI+9lUlbpO7e8Vu1LE4BSHcQ7Gw5xuLiCh1rx679eeEggT94wjPxT5Tzx8S4vRag6G00ASnUAFdW1vLAqm1HJ3bnwnDivfOao5BjuGJfCG+sO8PU+bQpSP6QJQKkO4K31BykoqWxV2787v7hiEMmxEfxSm4KUG5oAlLJZeVUtL67axwX9Yxk3wDu//us5moKGk3eynCc/0aYg9X2aAJSy2V/XHeD4acev/7YwOiWGmeOSef2bA6zLKWqTbSjfpAlAKRuVVdbw0hf7uCg1jtEpTT9noaV+ccUg+sVG8Mi7mZyp0qYg5aAJQCkbLf7mAEVlVcyd1Da//utFhATxvzcM59DJMzz5ye423ZbyHZoAlLJJaUU1f16zj4mD4jm/X/c2397olBhuvyCZ177O1aYgBWgCUMo2r3+dy6kz1TzURm3/7jwyZRB9YxxXBWlTkPIoAYjIFBHZLSLZIjLPzXIRkfnW8kwRGemyPFBEvhORj5zmPSoi+SKyxZqmtX53lPINJRXVLFyTw6RzezAsKbrdthsREsSTNwzjQJE2BSkPEoCIBAIvAlOBIcAtIjLEpdhUINWaZgELXJY/ALi7Sfkzxph0a1rW3OCV8lWvrN1PSUUNcyeltvu2x/aPZeY4R1PQ2r16F11/5kkNYDSQbYzJMcZUAW8D013KTAcWG4d1QLSIJAKISBJwJfAXL8atlM86daaKRV/uZ8p5PRnau5stMcybOphzErry83e2cqKsypYYlP08SQC9gUNO7/OseZ6WeRZ4BKhz89lzrCajRSLithdMRGaJyEYR2aj3/FedwV/W7ud0VQ1zJ7f/r/96YcGBPHdzOqfOVPPL9zL1gUp+ypME4G5cuuu3xW0ZEbkKKDTGb
HKzfAEwAEgHjgB/dLdxY8xCY0yGMSYjPr51d0hUym4nyqp49av9XJmWyOCeUbbGcl6vbjwyZRArdhTw9oZDTa+gOh1PEkAe0MfpfRJw2MMy44FrRCQXR9PRpSLyVwBjTIExptYYUwe8jKOpSalObeGaHM5U19rS9u/OneNTuPCcOH774Q72HTttdziqnXmSADYAqSKSIiIhwM3AUpcyS4HbrKuBxgLFxpgjxphfGWOSjDHJ1nqfG2N+AlDfR2C5DtjW2p1RqiM7frqS17/OZfrwXpyTEGl3OAAEBAh/vGk4ocEBzH17C1U17lpqVWfVZAIwxtQAc4DlOK7keccYs11EZovIbKvYMiAHyMbxa/4eD7b9pIhkiUgmcAnwYEt2QClf8dLqfVTW1HL/ZR3j13+9HlFhPHH9MLLyi3nmsz12h6PakUfPBLYu0VzmMu8lp9cGuLeJz1gNrHZ6f2sz4lTKpxWWVPDGugNcNyKJ/vFd7Q7nB6YM7cnNo/rw0hf7mJAazwUDYu0OSbUDHQmsVDt4d3MelTV13H/ZOXaH0qj/vGoIybFd+Pk7Wyg+U213OKodaAJQqh1kHiomOTaCfrFd7A6lUV1Cg3h2RjqFpZX8+h9ZemmoH9AEoFQ7yMovJq0db/nQUsP7RPPg5IH8M/MI72/Otzsc1cY0ASjVxk6UVZF/qpy03vZe9++p2RcPYHRKDP/1wTYOFJXZHY5qQ5oAlGpjWfnFAKT1jrY3EA8FBgjPzEgnIECYu2QLNbV6aWhnpQlAqTaWlXcKgPN8pAYA0Ds6nMevS+O7g6d4/vNsu8NRbUQTgFJtLCu/mJS4LkSFBdsdSrNcM7wX14/ozfOf72XTgRN2h6PagCYApdpYVl4xaTbd9bO1Hpt+Hr27h/PA21sordBLQzsbTQBKtaGi05UcLq7w2QQQGRbMszPSOXyqnP/+YLvd4Sgv0wSgVBtq6ABO8s0EAHB+vxjuuzSV97/LZ+lW1/tAKl+mCUCpNpSV50gA5/XynQ5gd+679BxG9I3mN3/PIv9Uud3hKC/RBKBUG8rKL6Z/XBcifawD2FVQYADPzRhBXZ3hwSVbqK3TUcKdgSYApdqQYwSw7zb/OOsbG8Fj04eyfv8JXvpin93hKC/QBKBUGzl+upIjPtwB7M6PRvbmymGJPLNiD1sPnbI7HNVKmgCUaiP/GgHceRKAiPA/16YRHxnK3CVbKKussTsk1QqaAJRqI1l5xYjAeZ0oAQB0iwjm6ZvSyS0q43cf7bA7HNUKmgCUaiP1I4C7hnr03CWfcsGAWGZfPIC3Nxzik21H7A5HtZBHCUBEpojIbhHJFpF5bpaLiMy3lmeKyEiX5YEi8p2IfOQ0L0ZEVojIXutv99bvjlIdR1ZeMcM62a9/Zw9OGsh5vaL47Yc7qNYbxvmkJhOAiAQCLwJTgSHALSIyxKXYVCDVmmYBC1yWP4DjecLO5gErjTGpwErrvVKdwrHSSo6WVDC0EyeAkKAAHpo8kMPFFXyUqQPEfJEnNYDRQLYxJscYUwW8DUx3KTMdWGwc1gHRIpIIICJJwJXAX9ys87r1+nXg2pbtglIdzzarA3iYDzwEpjUuGZRAakJX/vxFjj5BzAd5kgB6A4ec3udZ8zwt8yzwCOBaR+xhjDkCYP1NcLdxEZklIhtFZOOxY8c8CFcp+2XWdwD7+AjgpgQECHdP6M+uo6Ws2Xvc7nBUM3mSAMTNPNdU77aMiFwFFBpjNjU7svoPMWahMSbDGJMRHx/f0o9Rql3VjwDu0gk7gF1NT+9Fj6hQ/qyDw3yOJwkgD+jj9D4JcG3wa6zMeOAaEcnF0XR0qYj81SpT4NRMlAgUNjt6pTqorPxTnb75p15oUCB3jk/h631FDfc+Ur7BkwSwAUgVkRQRCQFuBpa6lFkK3GZdDTQWKDbGHDHG/MoYk2SMSbbW+9wY8xOndW63Xt8OfNDanVGqIygsraCgpLJTdwC7umVMXyJDg/jzGq0F+JImE4AxpgaYAyzHcSXPO8aY7SIyW0RmW8WWATlANvAycI8H234CmCwie4HJ1nulfN6/OoD9JwFEhQXzb2P6sizrCAeLztgdjvKQRw2UxphlOE7yzvNecnptgHub+IzVwGqn90XAZZ6HqpRvqO8AHpLYuTuAXd0xPoVFX+3nlS9zeGz6ULvDUR7QkcBKedm2/GIGxHf1iw5gZz27hXFtem+WbDzEibIqu8NRHtAEoJSXZXbyEcBnM2tCfyqq61j8Ta7doSgPaAJQyosKSiooLPWvDmBnqT0iuWxwAou/OUB5Va3d4agmaAJQyovqL4P0pw5gVz+7eAAnyqp4d9OhpgsrW2kCUMqLsvKLCRAY0slHAJ/NqOTupPeJ5uW1+/XRkR2cJgClvKi+AzgixL86gJ2JCLMv7s/BE2f4ZNtRu8NRZ6EJQCkvyuxEzwBujclDepIS14WXvtinN4nrwDQBKOUlBSUVHCut7FSPgGypwADhrotSyMov5pucIrvDUY3QBKCUl2RqB/D3/GhkEnFdQ1i4JsfuUFQjNAEo5SUNHcCJmgAAwoIDmTkumdW7j7HzSInd4Sg3NAEo5SXb8otJTYgkPCTQ7lA6jJ+M7UdESCAvay2gQ9IEoJQXGGPIzCv22wFgjYmOCGHGqD4s3XqYw6fK7Q5HudAEoJQXFJRUcvx0JWm9/ff6/8b89MIUDLDoy/12h6JcaAJQygsy804BkOYnD4FpjqTuEVw9LJG31h+kuLza7nCUE00ASnnBtoYOYK0BuDNrwgDKqmp589sDdoeinGgCUMoLsvKLGdhDO4AbM6RXFBelxvHqV7lUVOtN4joKTQBKtZIxhqx87QBuyuyLB3CstJJ/fJdvdyjK4lECEJEpIrJbRLJFZJ6b5SIi863lmSIy0pofJiLrRWSriGwXkcec1nlURPJFZIs1TfPebinVfo6WVHD8dJWOAG7CuAGxnNcrioVrc6jTm8R1CE0mABEJBF4EpgJDgFtEZIhLsalAqjXNAhZY8yuBS40xw4F0YIr10Ph6zxhj0q3pe4+cVMpX1I8A1nsAnZ2I8LOLB5BzrIzPdhbYHY7CsxrAaCDbGJNjjKkC3gamu5SZDiw2DuuAaBFJtN6ftsoEW5OmftWpbMsvJjBAtAPYA9OG9iSpezh/1oFhHYInCaA34PxkhzxrnkdlRCRQRLYAhcAKY8y3TuXmWE1Gi0Sku7uNi8gsEdkoIhuPHTvmQbhKta+s/GJSE7oSFqwdwE0JCgzgrgtT2HTgJBtzT9gdjt/zJAGIm3muv+IbLWOMqTXGpANJwGgRGWotXwAMwNE0dAT4o7uNG2MWGmMyjDEZ8fHxHoSrVPsxxpCVV6zt/81w06g+REcEay2gA/AkAeQBfZzeJwGHm1vGGHMKWA1Msd4XWMmhDngZR1OTUj7lSHEFRWVV2v7fDBEhQdx2QTIrdhSQXXi66RVUm/EkAWwAUkUkRURCgJuBpS5llgK3WVcDjQWKjTFHRCReRKIBRCQcmATsst4nOq1/HbCtdbuiVPtr6ADWGkCz3H5BP0KDAvjLWq0F
2KnJBGCMqQHmAMuBncA7xpjtIjJbRGZbxZYBOUA2jl/z91jzE4FVIpKJI5GsMMZ8ZC17UkSyrGWXAA96a6eUai/1HcDnagdws8R2DeXGjCTe35xPYUmF3eH4LY8eXGpdornMZd5LTq8NcK+b9TKBEY185q3NilSpDqh+BLB2ADffXRf252/fHuTVr3P55ZTBdofjl3QksFItVD8CWO8A2jLJcV2YOjSRv647wOnKGrvD8UuaAJRqocPFFZwo0xHArTFrQn9KK2p4e/1Bu0PxS5oAlGqhLL0FdKsN7xPN2P4xvPLlfqpr6+wOx+9oAlCqhbLyiwkKEAb3jLQ7FJ/2s4sHcKS4gqVbXK8uV21NE4BSLZSVX6IdwF4wcWA8g3tGsuCLfdTqTeLalSYApVrAMQL4lLb/e4GIcN+lqWQXnuafWUfsDsevaAJQqgXyT5Vz8kw1Q3UEsFdMHdqTgT268vzKvVoLaEeaAJRqgSxrBPAwrQF4RUCAcP9lqewtPM0yrQW0G00ASrVAfQfwIO0A9pppQxNJTejKfK0FtBtNAEq1QFZ+MYN6agewNwUECA9M0lpAe9IEoFQz/WsEsDb/eJtzLUAfG9n2NAEo1Ux5J8s5daZaHwLfBr7XF7BNawFtTROAUs2UlW91AOsVQG1iWpqjFvDcZ1oLaGuaAJRqpqz8YoIDtQO4rQRqLaDdaAJQqpm2WR3AoUHaAdxWpqUlco72BbQ5TQBKNYMxhkx9BnCbq68F7Ck4zcfbjtodTqelCUCpZsg7WU5xuXYAt4crrVrAcyv3aC2gjXiUAERkiojsFpFsEZnnZrmIyHxreaaIjLTmh4nIehHZKiLbReQxp3ViRGSFiOy1/nb33m4p1TYyG0YAR9sbiB8IDBDuu/QcrQW0oSYTgIgEAi8CU4EhwC0iMsSl2FQg1ZpmAQus+ZXApcaY4UA6MMV6aDzAPGClMSYVWGm9V6pDq+8AHtizq92h+IWrhvViQHwX7QtoI57UAEYD2caYHGNMFfA2MN2lzHRgsXFYB0SLSKL1/rRVJtiajNM6r1uvXweubcV+KNUutuUXM7hnlHYAt5P6voDdBaV8sl1rAd7mSQLoDRxyep9nzfOojIgEisgWoBBYYYz51irTwxhzBMD6m+Bu4yIyS0Q2isjGY8eOeRCuUm2jfgSwtv+3r/pagI4L8D5PEoC4mef6r9BoGWNMrTEmHUgCRovI0OYEaIxZaIzJMMZkxMfHN2dVpbzq0AlHB7BeAdS+tBbQdjxJAHlAH6f3SYDrs9uaLGOMOQWsBqZYswpEJBHA+lvoadBK2SEz/xSgI4DtcNWwXvTXvgCv8yQBbABSRSRFREKAm4GlLmWWArdZVwONBYqNMUdEJF5EogFEJByYBOxyWud26/XtwAet2xWl2lZWfjEhgQEM7KEjgNtbYIDwwGWp7DpaynKtBXhNkwnAGFMDzAGWAzuBd4wx20VktojMtootA3KAbOBl4B5rfiKwSkQycSSSFcaYj6xlTwCTRWQvMNl6r1SHtS2/mMGJkYQE6fAZO9TXAp7TWoDXBHlSyBizDMdJ3nneS06vDXCvm/UygRGNfGYRcFlzglXKLo5nABdz1fBedofitwIDhPsvTWXuki0s336UqWmJdofk8/SnjFIeOHjiDCUVNfoISJtdPbwX/eO0FuAtmgCU8kD9CGC9BNRe9VcE7Tpayqc7tC+gtTQBKOWBbdoB3GHU1wKe1XEBraYJQCkPZOUXc652AHcIgQHCfZedo7UAL9Bvs1JN0BHAHc/Vw+r7ArK1FtAKmgCUasKBojOUVtToALAOJCgwgPsuO4edR0r4dEeB3eH4LE0ASjUhM187gDuiq4f1IkWvCGoVTQBKNWFbfjEhQdoB3NEEBQZw36VaC2gNTQBKNSErr5hzE6MIDtT/Lh3NNcMdtYD5K/fiGI+qmkO/0UqdRV2dYVt+MWm9o+wORblRXwvYobWAFtEEoNRZHDhxhtLKGn0EZAd2zfBeJMdG8NxnWgtoLk0ASp3Fmj2OhxCl6RVAHZajFpCqtYAW0ASgVCNOV9bw/OfZjEruzuCe2gHckU1P11pAS2gCUKoRC9fkcPx0Jb+edi4i7h56pzoK51rAx9t0dLCnNAEo5UZBSQUvr8nhqmGJjOjb3e5wlAemp/dicM9IHv6/razff8LucHyCJgCl3Hj60z3U1NXxyBWD7Q5FeSgoMIDFPx1NYrcwZr66nm9ziuwOqcPTBKCUi11HS3hn0yFuvyCZvrERdoejmiEhMoy3Zo2lV3Q4d7y2QZNAEzxKACIyRUR2i0i2iMxzs1xEZL61PFNERlrz+4jIKhHZKSLbReQBp3UeFZF8EdliTdO8t1tKtdzvl+0iMjSIOZeeY3coqgUSIsP4291j6BUdzsxXN7BOk0CjmkwAIhIIvAhMBYYAt4jIEJdiU4FUa5oFLLDm1wA/N8acC4wF7nVZ9xljTLo1fe+Rk0rZYc2eY3yx5xj3X5ZKdESI3eGoFkqIDOOtu8fSu3s4d7y6gW/2aRJwx5MawGgg2xiTY4ypAt4GpruUmQ4sNg7rgGgRSTTGHDHGbAYwxpTieKh8by/Gr5TX1NYZ/mfZTvrEhHPrBf3sDke1UnxkKG/dPZak7uHc+ZomAXc8SQC9gUNO7/P44Um8yTIikozjAfHfOs2eYzUZLRIRt5daiMgsEdkoIhuPHTvmQbhKtcz7m/PYdbSUR64YTGhQoN3hKC+Ijwzlb1YSuOO19ZoEXHiSANxdAO060uKsZUSkK/AeMNcYU2LNXgAMANKBI8Af3W3cGLPQGJNhjMmIj4/3IFylmq+8qpanPt3N8D7RXDUs0e5wlBfFR4by1qyx9I2J4I7X1vP1vuN2h9RheJIA8oA+Tu+TgMOelhGRYBwn/zeNMe/XFzDGFBhjao0xdcDLOJqalLLFK1/mUFBSyW900FenFNfVURPoF9OFO1/bwNfZmgTAswSwAUgVkRQRCQFuBpa6lFkK3GZdDTQWKDbGHBHH/6RXgJ3GmKedVxAR559Z1wHbWrwXSrXCsdJKFqzex+VDejA6JcbucFQbcSSBMY4k8PoGvtIk0HQCMMbUAHOA5Tg6cd8xxmwXkdkiMtsqtgzIAbJx/Jq/x5o/HrgVuNTN5Z5PikiWiGQClwAPem2vlGqG51buobKmjnlTddBXZxdrJYHkWEdNwN+TgPjSjZMyMjLMxo0b7Q5DdSLZhae54tk1/HhMX347fajd4ah2UnS6kh//5Vv2Hy/jldtHcWFqnN0htSkR2WSMyXCdryOBlV974uNdhAcH8sBlqXaHotpRrNUnkBLXhZ++voG1e/3zCkNNAMpvrcsp4rOdBdxzyQBiu4baHY5qZzFdQhqSwF2vb2x49oM/0QSg/FKdNeirV7cw7hyfYnc4yib1SaB/fFfuWux/SUATgPJLH2YeJjOvmIevGERYsA768mcxXUL4211jGGAlgS/8KAloAlB+p6K6lic/2c15vaK4Nl3vTKKgu5UEzonvyt2LN7J6d6HdIbULTQDK77z+dS75p8r5zbRzCQjQQV/
KoXuXEN68awypCV2Z9cYm3t+c1+kfL6kJQPmVk2VVvLAqm0sGxTPunM596Z9qvvokMKx3Nx56Zys/fX0jR4rL7Q6rzWgCUH5l/ud7Kaus4VfTzrU7FNVBRUeEsORnF/AfV57L1/uOc/nTa3hr/cFOWRvQBKD8Ru7xMt745gAzRvVhYI9Iu8NRHVhggHDXRf1ZPncCQ3t341fvZ/Hjv3zLwaIzdofmVZoAlN94cvkuQoICeHDSQLtDUT6iX2wX3rxrDP9zXRqZecVc8ewaFn25n9q6zlEb0ASg/MKmAydYlnWUWRP6kxAVZnc4yocEBAj/NqYvnz44gbH9Y/jtRzu48aWvyS4stTu0VtMEoDo9YwyP/3MnCZGhzJrQ3+5wlI/qFR3OopmjePqm4eQcL2Pac1/y4qpsqmvr7A6txTQBqE7v421H2XzwFA9NHkhESJDd4SgfJiJcPzKJFQ9ezKQhCfzv8t1c++JXbD9cbHdoLaIJQHVqVTV1/OGTXQzqEcmNGX2aXkEpD8RHhvKnH5/PSz8ZSUFJJdNf+Iqnlu+msqbW7tCaRROA6tT+uu4AB4rOMG/aYAJ10JfysilDE/nsoQlck96LF1Zlc9X8L9l88KTdYXlM68Oq0your2b+53u58Jw4Jg7U50mrthEdEcLTN6Vz9fBe/Ob9LH604GvuHJ/Cw5cPIjyk6ftMGWMorayhsKSCo8WVHC2poKCkgqPFFRwtqXDML6ng+VtGev2JdZoAVKf1p1XZFJdX86tpg/U5v6rNXTIogeUPTuAPn+zilS/3s2JHAb+/Po2UuC6Ok7p1Qnec1Cs5Wmyd6EsqOFP1w6ajbuHB9IwKIyEqlIE9IokM8/7p2qMngonIFOA5IBD4izHmCZflYi2fBpwBZhpjNotIH2Ax0BOoAxYaY56z1okBlgDJQC5wkzHmrHUnfSKYasrpyho25J7g25wTLPpqP1cP68Ufbxpud1jKz3yzr4h572dywM3AseBAISEyjJ7dwugZFUaPqDB6dgulR/1r668ntQdPNfZEsCZTiogEAi8Ck4E8YIOILDXG7HAqNhVItaYxwALrbw3wcysZRAKbRGSFte48YKUx5gkRmWe9/2Wr9lL5ndKKajbmnmRdThHrcorYdriE2jpDcKCQ0S+GR6YMsjtE5YcuGBDLJw9M4L3NeQSINJzge0aF0T0ipMPchNCTOsVoINsYkwMgIm8D0wHnBDAdWGwc1Yl1IhItIonGmCPAEQBjTKmI7AR6W+tOByZa678OrEYTgGpCcXk1G3NPsC6niG/3n2BbfjF1xvGrKr1PNPdMHMCYlFhG9ovWSz6VrcJDAvnJ2H52h3FWnvwP6Q0ccnqfh+PXfVNlemOd/AFEJBkYAXxrzephJQiMMUdEJMHdxkVkFjALoG/fvh6EqzqT4jPVrG844Rex/XAJxkBIYADpfaOZc2kqY1NiGNG3u1erzEr5A08SgLu6imvHwVnLiEhX4D1grjGmxPPwwBizEFgIjj6A5qyrfENtnaG4vJqTZ6o4WVZFYWllQ7POzqPWCT8ogJF9o3ngslTGpMQyom+0PslLqVbyJAHkAc4jaJKAw56WEZFgHCf/N40x7zuVKahvJhKRRMA/HsHTyVXX1nHqzL9O5ifPVHHye++rfzC/uLwa12sRQoMCOL9fdx6cNJAxKTEM76MnfKW8zZMEsAFIFZEUIB+4Gfg3lzJLgTlW/8AYoNg6sQvwCrDTGPO0m3VuB56w/n7Q8t1QdimrrGHt3uN8trOAL/Yc41hpZaNlw4MD6R4RTPcuIXSPCKF39wi6RwQTHRFCjNP8mC4hpPboSmiQnvCVaktNJgBjTI2IzAGW47gMdJExZruIzLaWvwQsw3EJaDaOy0DvsFYfD9wKZInIFmver40xy3Cc+N8RkZ8CB4EbvbZXqk0VlFSwcmchK3Yc5at9RVTV1BEVFsTEQQkMiO9KTBfrpN4lhOiIYGKsE7v+gleqY/FoHEBHoeMA7GGMYdfRUj7bUcBnOwvYmue48VWfmHAmn9uTyUN6kJHcneBAvbOIUh1Ri8cBKP9UVVPH+v0n+GxnASt2FJB/qhwRSO8TzS+uGMTkIT1ITeiqI2yV8mGaAFSD4vJqVu8u5LOdhazeXUhpRQ2hQQFclBrH/ZedwyWDE0iI1IepKNVZaALwY6cra9ieX8zWvFOs3n2M9ftPUFNniOsawrShiUwa0oMLz4nT6+uV6qQ0AfiJM1U1bD9cQlZeMVn5xWTmnSLneFnD5ZfnJHTl7gn9mTykB+lJ0R1mqLpSqu1oAuiEyqtq2XGkhKy8U2TmF7Mtv5jswtPUP8e6R1Qoab27cc3w3qQlRTG0dzdt2lHKD2kCaGfGGI6VVnK6soaggAACA4WgACEwwPlvQMP7pn6JV1TXsvNICVn5xQ2/7vcWnqbWOtvHdQ1lWFI3pg5NJK13N9KSutFDH4qulEITQJswxlBYWsn+42UcKCpj//EzHCgqI7fI8dfdvb8bEyB8LyE4J4xAEQpLK6mxTvaxXUJIS+rG5CE9Gk72PaPC9EodpZRbmgBaqK7O5SRfVMaB42fILSrjQNEZyqv/dZIPChD6xkSQHNeFsf1jSI7tQrfwYGrrDDV1ddTUGcfrWutvnaHWef73lv9rfnWtoUdUKMOSoklL6kavbnqyV0p5ThNAM6zaXcjb6w+Se/wMB06UUVFd17AsONA6ycd2YdyAOFLiIugX24Xk2C70ig4jSAdJKaU6GE0AHtp5pITZb2yie0QIQ3tHcVFqHP3iupAcG2Gd5MP1oeNKKZ+iCcADpytruPfNzUSFB/PhfRcSHxlqd0hKKdVq2i7RBGMMv34/i9yiMp6/ZYSe/JVSnYYmgCa8tf4QS7ce5qHJAxnbP9bucJRSyms0AZzF9sPFPPrhdiYMjOeeiefYHY5SSnmVJoBGlFZUc++bm+keEcwzNw3XWyMopTod7QR2wxjDvPezOHSynLfuHktsV233V0p1PloDcOOv3x7kn5lH+PnlAxmdEmN3OEop1SY8SgAiMkVEdotItojMc7NcRGS+tTxTREY6LVskIoUiss1lnUdFJF9EtljTtNbvTuttyy/mdx/u4JJB8cyeMMDucJRSqs00mQBEJBB4EZgKDAFuEZEhLsWmAqnWNAtY4LTsNWBKIx//jDEm3ZqWNTN2ryupqOaeNzcT2zWEP96Uru3+SqlOzZMawGgg2xiTY4ypAt4GpruUmQ4sNg7rgGgRSQQwxqwBTngz6LZgjOGX72aSf6qc528ZQUyXELtDUkqpNuVJAugNHHJ6n2fNa24Zd+ZYTUaLRKS7uwIiMktENorIxmPHjnnwkS3z+te5fLztKI9cMYiMZG33V0p1fp4kAHftIKYFZVwtAAYA6cAR4I/uChljFhpjMowxGfHx8U18ZMtsPXSKx5ft5LLBCdx9Uf822YZSSnU0niSAPKCP0/sk4HALynyPMabAGFNrjKkDXsbR1NTuisurufdvm0mIDOOPer2/UsqPeJIANgCpIpIiIiHAzcBSlzJLgdusq4HGAsXGmCNn+9D6Pg
LLdcC2xsq2FWMMj7y7laPFFTz/byOIjtB2f6WU/2hyIJgxpkZE5gDLgUBgkTFmu4jMtpa/BCwDpgHZwBngjvr1ReQtYCIQJyJ5wH8bY14BnhSRdBxNRbnAz7y3W55Z9FUuy7cX8B9XnsvIvm67IJRSqtMSY5pqqu84MjIyzMaNG73yWd8dPMmNL33DJYMTWHjr+fokLaVUpyUim4wxGa7z/XIk8KkzVcz523f07BbGUzcM15O/Usov+d29gIwxPPx/WyksreDd2ePoFhFsd0hKKWULv6sB/GXtfj7bWcivp53L8D7RdoejlFK28asEsOnASf7wyS6mnNeTmeOS7Q5HKaVs5TcJ4GRZFff9bTOJ0WH84YZh2u6vlPJ7ftEHUFdneOidLRw/XcV7/z6ObuHa7q+UUn5RA1i4NodVu4/xH1edS1pSN7vDUUqpDsEvEkBitzBuPD+JW8f2szsUpZTqMPyiCWh6em+mp3tyc1KllPIfflEDUEop9UOaAJRSyk9pAlBKKT+lCUAppfyUJgCllPJTmgCUUspPaQJQSik/pQlAKaX8lE89EUxEjgEHWrh6HHDci+G0B4257flavKAxtxdfi/ls8fYzxsS7zvSpBNAaIrLR3SPROjKNue35WrygMbcXX4u5JfFqE5BSSvkpTQBKKeWn/CkBLLQ7gBbQmNuer8ULGnN78bWYmx2v3/QBKKWU+j5/qgEopZRyoglAKaX8lM8mABHpIyKrRGSniGwXkQes+TEiskJE9lp/uzut8ysRyRaR3SJyhdP880Uky1o2X/SJ8Q28fJxXW/O2WFOCHfvU0TT3GItIrFX+tIi84PJZ+l1uhJePc+f4LhtjfHICEoGR1utIYA8wBHgSmGfNnwf8wXo9BNgKhAIpwD4g0Fq2HrgAEOBjYKrd+9dRJi8f59VAht371NGmFhzjLsCFwGzgBZfP0u9y+xznTvFd9tkagDHmiDFms/W6FNgJ9AamA69bxV4HrrVeTwfeNsZUGmP2A9nAaBFJBKKMMd8Yx7/sYqd1/J63jnO7Bu1jmnuMjTFlxpgvgQrnz9Hv8tl56zh3Jj6bAJyJSDIwAvgW6GGMOQKOf3CgvmrWGzjktFqeNa+39dp1vnLRyuNc71Wryvyf2jzxQx4e48bod9lDrTzO9Xz+u+zzCUBEugLvAXONMSVnK+pmnjnLfOXEC8cZ4MfGmDTgImu61btR+rZmHONGP8LNPP0uu/DCcYZO8l326QQgIsE4/iHfNMa8b80usKrC9VXiQmt+HtDHafUk4LA1P8nNfGXx0nHGGJNv/S0F/oY2DTVo5jFujH6Xm+Cl49xpvss+mwCsKtcrwE5jzNNOi5YCt1uvbwc+cJp/s4iEikgKkAqst6p8pSIy1vrM25zW8XveOs4iEiQicdZnBgNXAdvaYx86uhYcY7f0u3x23jrOneq7bHcvdEsnHL3zBsgEtljTNCAWWAnstf7GOK3zGxxXpezG6eoIIAPHP+A+4AWsEdI6ee8447iiYpP1OduB57CuDvL3qYXHOBc4AZzG8ct/iDVfv8ttfJw703dZbwWhlFJ+ymebgJRSSrWOJgCllPJTmgCUUspPaQJQSik/pQlAKaX8lCYApZyIw5ciMtVp3k0i8ombsrnWnTczReQLEenXvtEq1Tp6GahSLkRkKPB/OO4VE4jjevEpxph91nLBcduFHBx3hDwuIo8BvYwxd9sTtVLNpzUApVwYY7YBHwK/BP4bx101a637yP8J2Mz3b3cB8A3WjddEJF5E3hORDdY03pr/qIi8ISKfW/ee12ShbBVkdwBKdVCP4TjRV+EYXZsIDALuMMbcA+ByA8gpwD+s188BzxhjvhSRvsBy4Fxr2TBgLI7RpN+JyD+NMXq/HmULTQBKuWGMKRORJcBpY0yldbI/YIxZ51J0lYj0wHEDsf+w5k0ChjgliCgRibRef2CMKQfKRWQVjpuI/aMNd0WpRmkTkFKNq7OmemVuylwC9MNxT5jfWvMCgAuMMenW1Ns47hoJP7w9s3bCKdtoAlCqlaxf9HOB20QkBvgUmFO/XETSnYpPF5EwEYkFJgIb2i9Spb5PE4BSXmAct2J+C7gXuB/IsC4P3YHjmbL11gP/BNYBv9P2f2UnvQxUqXYiIo/i6FN4yu5YlAKtASillN/SGoBSSvkprQEopZSf0gSglFJ+ShOAUkr5KU0ASinlpzQBKKWUn/r/2tqBlHVulpEAAAAASUVORK5CYII=\n", 578 | "text/plain": [ 579 | "
" 580 | ] 581 | }, 582 | "metadata": { 583 | "needs_background": "light" 584 | }, 585 | "output_type": "display_data" 586 | } 587 | ], 588 | "source": [ 589 | "df_per_yr_summary=df_per_yr_summary.compute()\n", 590 | "df_per_yr_summary['DelinquencyEventsperLoan']=df_per_yr_summary['DelinquentEvent']/df_per_yr_summary['LoanID']\n", 591 | "df_per_yr_summary.to_pandas().sort_values(by=['YrRep']).plot.line(x='YrRep',y='DelinquencyEventsperLoan')" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "## Creating modeling dataset with label & features" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "metadata": {}, 604 | "source": [ 605 | "Aggregate to one record per loan & flagging for delinquency event at least once historically" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": 48, 611 | "metadata": {}, 612 | "outputs": [ 613 | { 614 | "data": { 615 | "application/vnd.jupyter.widget-view+json": { 616 | "model_id": "c5119883ea8b475ea25b1a4607516b8c", 617 | "version_major": 2, 618 | "version_minor": 0 619 | }, 620 | "text/plain": [ 621 | "VBox()" 622 | ] 623 | }, 624 | "metadata": {}, 625 | "output_type": "display_data" 626 | } 627 | ], 628 | "source": [ 629 | "# This takes ~ 12-15 min with 2 A100 GPUs on 1.8B rows\n", 630 | "df_per_loan = df_per.groupby('LoanID',as_index=False).agg({'DelinquentEvent':'sum'}).persist()\n", 631 | "progress(df_per_loan)" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 49, 637 | "metadata": {}, 638 | "outputs": [ 639 | { 640 | "data": { 641 | "text/html": [ 642 | "
\n", 643 | "\n", 656 | "\n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | "
LoanIDDelinquentEvent
01856480463450
19401609929790
23382534808200
33271800327370
42870960473570
\n", 692 | "
" 693 | ], 694 | "text/plain": [ 695 | " LoanID DelinquentEvent\n", 696 | "0 185648046345 0\n", 697 | "1 940160992979 0\n", 698 | "2 338253480820 0\n", 699 | "3 327180032737 0\n", 700 | "4 287096047357 0" 701 | ] 702 | }, 703 | "execution_count": 49, 704 | "metadata": {}, 705 | "output_type": "execute_result" 706 | } 707 | ], 708 | "source": [ 709 | "df_per_loan.head()" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 51, 715 | "metadata": {}, 716 | "outputs": [ 717 | { 718 | "data": { 719 | "text/html": [ 720 | "
\n", 721 | "\n", 734 | "\n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | "
LoanIDDelinquentEventDelinquentFlag
673684285405511
11858243020041151
24229906838180261
3338113715052911
3576984695385071
\n", 776 | "
" 777 | ], 778 | "text/plain": [ 779 | " LoanID DelinquentEvent DelinquentFlag\n", 780 | "6 736842854055 1 1\n", 781 | "11 858243020041 15 1\n", 782 | "24 229906838180 26 1\n", 783 | "33 381137150529 1 1\n", 784 | "35 769846953850 7 1" 785 | ] 786 | }, 787 | "execution_count": 51, 788 | "metadata": {}, 789 | "output_type": "execute_result" 790 | } 791 | ], 792 | "source": [ 793 | "df_per_loan['DelinquentFlag']=0\n", 794 | "df_per_loan['DelinquentFlag']=df_per_loan['DelinquentFlag'].where(df_per_loan['DelinquentEvent']<1,1)\n", 795 | "df_per_loan[df_per_loan['DelinquentFlag']> 0].head()" 796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": 52, 801 | "metadata": {}, 802 | "outputs": [ 803 | { 804 | "data": { 805 | "text/html": [ 806 | "
\n", 807 | "\n", 820 | "\n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | "
LoanIDChannelSellerNameOrInterestRateOrUnpaidPrincOrLoanTermOrDateFirstPaymentOrLTVOrCLTV...OccStatusPropertyStateZipMortInsPercProductTypeCoCreditScoreMortInsTypeRelMortIndDelinquentEventDelinquentFlag
0183242426281RJPMORGAN CHASE BANK, NA8.375137000.0360.001/200003/200097.0<NA>...POR97430.0FRM<NA>1.0N00
1183243108339RWELLS FARGO BANK, N.A.8.000145000.0360.011/199901/200080.0<NA>...PIL604<NA>FRM<NA><NA>Y11
2183245218791RSUNTRUST MORTGAGE INC.8.000100000.0180.001/200003/200033.0<NA>...PMD210<NA>FRM772.0<NA>N00
3183249720870RUSAA FEDERAL SAVINGS BANK8.000212000.0360.001/200003/200095.0<NA>...PPA15130.0FRM695.01.0N00
4183251824837ROTHER8.37556000.0360.002/200004/200048.0<NA>...PLA700<NA>FRM<NA><NA>N00
\n", 970 | "

5 rows × 27 columns

\n", 971 | "
" 972 | ], 973 | "text/plain": [ 974 | " LoanID Channel SellerName OrInterestRate \\\n", 975 | "0 183242426281 R JPMORGAN CHASE BANK, NA 8.375 \n", 976 | "1 183243108339 R WELLS FARGO BANK, N.A. 8.000 \n", 977 | "2 183245218791 R SUNTRUST MORTGAGE INC. 8.000 \n", 978 | "3 183249720870 R USAA FEDERAL SAVINGS BANK 8.000 \n", 979 | "4 183251824837 R OTHER 8.375 \n", 980 | "\n", 981 | " OrUnpaidPrinc OrLoanTerm OrDate FirstPayment OrLTV OrCLTV ... \\\n", 982 | "0 137000.0 360.0 01/2000 03/2000 97.0 ... \n", 983 | "1 145000.0 360.0 11/1999 01/2000 80.0 ... \n", 984 | "2 100000.0 180.0 01/2000 03/2000 33.0 ... \n", 985 | "3 212000.0 360.0 01/2000 03/2000 95.0 ... \n", 986 | "4 56000.0 360.0 02/2000 04/2000 48.0 ... \n", 987 | "\n", 988 | " OccStatus PropertyState Zip MortInsPerc ProductType CoCreditScore \\\n", 989 | "0 P OR 974 30.0 FRM \n", 990 | "1 P IL 604 FRM \n", 991 | "2 P MD 210 FRM 772.0 \n", 992 | "3 P PA 151 30.0 FRM 695.0 \n", 993 | "4 P LA 700 FRM \n", 994 | "\n", 995 | " MortInsType RelMortInd DelinquentEvent DelinquentFlag \n", 996 | "0 1.0 N 0 0 \n", 997 | "1 Y 1 1 \n", 998 | "2 N 0 0 \n", 999 | "3 1.0 N 0 0 \n", 1000 | "4 N 0 0 \n", 1001 | "\n", 1002 | "[5 rows x 27 columns]" 1003 | ] 1004 | }, 1005 | "execution_count": 52, 1006 | "metadata": {}, 1007 | "output_type": "execute_result" 1008 | } 1009 | ], 1010 | "source": [ 1011 | "joined=df_acq.merge(df_per_loan,on=['LoanID'],how='left')\n", 1012 | "joined.head()" 1013 | ] 1014 | }, 1015 | { 1016 | "cell_type": "code", 1017 | "execution_count": 90, 1018 | "metadata": {}, 1019 | "outputs": [], 1020 | "source": [ 1021 | "label=['DelinquentFlag']\n", 1022 | "cat_features=['Channel','OccStatus','FTHomeBuyer','LoanPurpose','PropertyType','ProductType','RelMortInd']\n", 1023 | "num_features=['OrInterestRate','OrUnpaidPrinc','OrLoanTerm','OrLTV','OrCLTV','CreditScore']\n", 1024 | "modeling_dataset=joined_categorized[cat_features + num_features + label]" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": 91, 1030 | "metadata": {}, 1031 | "outputs": [], 1032 | "source": [ 1033 | "modeling_dataset.dtypes" 1034 | ] 1035 | }, 1036 | { 1037 | "cell_type": "code", 1038 | "execution_count": 99, 1039 | "metadata": {}, 1040 | "outputs": [], 1041 | "source": [ 1042 | "modeling_dataset=dask_df.get_dummies(modeling_dataset)" 1043 | ] 1044 | }, 1045 | { 1046 | "cell_type": "markdown", 1047 | "metadata": {}, 1048 | "source": [ 1049 | "## Training a model" 1050 | ] 1051 | }, 1052 | { 1053 | "cell_type": "code", 1054 | "execution_count": 101, 1055 | "metadata": {}, 1056 | "outputs": [], 1057 | "source": [ 1058 | "X = modeling_dataset[modeling_dataset.columns.difference(['DelinquentFlag'])]\n", 1059 | "y = modeling_dataset['DelinquentFlag']" 1060 | ] 1061 | }, 1062 | { 1063 | "cell_type": "code", 1064 | "execution_count": 102, 1065 | "metadata": {}, 1066 | "outputs": [], 1067 | "source": [ 1068 | "dtrain=xgb.dask.DaskDeviceQuantileDMatrix(client, X,y)" 1069 | ] 1070 | }, 1071 | { 1072 | "cell_type": "code", 1073 | "execution_count": 104, 1074 | "metadata": {}, 1075 | "outputs": [], 1076 | "source": [ 1077 | "param = {\n", 1078 | " 'max_depth': 8,\n", 1079 | " 'objective': 'reg:squarederror',\n", 1080 | " 'tree_method': 'gpu_hist'\n", 1081 | " }\n", 1082 | "model = xgb.dask.train(client,param, dtrain,num_boost_round=100)" 1083 | ] 1084 | }, 1085 | { 1086 | "cell_type": "code", 1087 | "execution_count": 110, 1088 | "metadata": {}, 1089 | "outputs": [], 1090 | "source": [ 1091 | "## TODO --- Add in metrics, feature importance, 
& evaluation plots" 1092 | ] 1093 | }, 1094 | { 1095 | "cell_type": "markdown", 1096 | "metadata": {}, 1097 | "source": [ 1098 | "## Appendix" 1099 | ] 1100 | } 1101 | ], 1102 | "metadata": { 1103 | "environment": { 1104 | "name": "common-cu110.m61", 1105 | "type": "gcloud", 1106 | "uri": "gcr.io/deeplearning-platform-release/base-cu110:m61" 1107 | }, 1108 | "kernelspec": { 1109 | "display_name": "Python [conda env:rapids-0.17]", 1110 | "language": "python", 1111 | "name": "conda-env-rapids-0.17-py" 1112 | }, 1113 | "language_info": { 1114 | "codemirror_mode": { 1115 | "name": "ipython", 1116 | "version": 3 1117 | }, 1118 | "file_extension": ".py", 1119 | "mimetype": "text/x-python", 1120 | "name": "python", 1121 | "nbconvert_exporter": "python", 1122 | "pygments_lexer": "ipython3", 1123 | "version": "3.7.8" 1124 | } 1125 | }, 1126 | "nbformat": 4, 1127 | "nbformat_minor": 4 1128 | } 1129 | -------------------------------------------------------------------------------- /higgs/rapids_xgboost/notebooks/a100_higgs_rapids_xgboost.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#!conda create -n rapids-0.16 -c rapidsai -c nvidia -c conda-forge -c defaults rapids=0.16 python=3.7 cudatoolkit=11.0" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%%bash\n", 17 | "nvidia-smi\n", 18 | "nvcc --version" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import numpy as np; print('numpy Version:', np.__version__)\n", 28 | "import pandas as pd; print('pandas Version:', pd.__version__)\n", 29 | "import xgboost as xgb; print('XGBoost Version:', xgb.__version__)\n", 30 | "import cudf; print('cudf Version:', cudf.__version__)\n", 31 | "import cuml; print('cudf Version:', cuml.__version__)\n", 32 | "import gcsfs; print('gcsfs Version:', gcsfs.__version__)\n", 33 | "import time\n", 34 | "import dask_cudf; print('dask_cudf Version:', gcsfs.__version__)\n", 35 | "import dask; print('dask Version:', gcsfs.__version__)\n", 36 | "import dask.dataframe as dask_df" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Download HIGGs dataset & unzip\n", 44 | "https://archive.ics.uci.edu/ml/datasets/HIGGS" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# %%bash\n", 54 | "# wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz -P /home/jupyter/\n", 55 | "# gzip -d /home/jupyter/HIGGS.csv.gz /home/jupyter/\n", 56 | "# ls -lh /home/jupyter/" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "colnames = ['label'] + ['feature-%02d' % i for i in range(1, 29)]\n", 66 | "#filname = '/home/jupyter/HIGGS.csv'\n", 67 | "filname = 'gs://mchrestkha-github-ml-examples/higgs/HIGGS.csv'" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Pandas" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "start_time = time.time()\n", 84 | "df=pd.read_csv(filname, header=None, names=colnames)\n", 85 | "print(\"[INFO]: ------ Data Ingestion is 
completed in {} seconds ---\".format((time.time() - start_time)))\n", 86 | "start_time = time.time()\n", 87 | "X = df[df.columns.difference(['label'])]\n", 88 | "y = df['label']\n", 89 | "dtrain=xgb.DMatrix(X,y)\n", 90 | "print(\"[INFO]: ------ DMatrix is completed in {} seconds ---\".format((time.time() - start_time)))\n", 91 | "\n", 92 | "\n", 93 | "start_time = time.time()\n", 94 | "param = {\n", 95 | " 'max_depth': 8,\n", 96 | " 'objective': 'reg:squarederror',\n", 97 | " 'tree_method': 'hist'\n", 98 | " }\n", 99 | "bst = xgb.train(param, dtrain,num_boost_round=100)\n", 100 | "print(\"[INFO]: ------ Training is completed in {} seconds ---\".format((time.time() - start_time)))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## cuDF" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "start_time = time.time()\n", 117 | "df=cudf.read_csv(filname, header=None, names=colnames)\n", 118 | "print(\"[INFO]: ------ Data Ingestion is completed in {} seconds ---\".format((time.time() - start_time)))\n", 119 | "start_time = time.time()\n", 120 | "X = df[df.columns.difference(['label'])]\n", 121 | "y = df['label']\n", 122 | "dtrain=xgb.DMatrix(X,y)\n", 123 | "print(\"[INFO]: ------ DMatrix is completed in {} seconds ---\".format((time.time() - start_time)))\n", 124 | "\n", 125 | "start_time = time.time()\n", 126 | "param = {\n", 127 | " 'max_depth': 8,\n", 128 | " 'objective': 'reg:squarederror',\n", 129 | " 'tree_method': 'gpu_hist'\n", 130 | " }\n", 131 | "bst = xgb.train(param, dtrain,num_boost_round=100)\n", 132 | "print(\"[INFO]: ------ Training is completed in {} seconds ---\".format((time.time() - start_time)))" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## Dask" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "from dask.distributed import Client, LocalCluster\n", 149 | "cluster = LocalCluster()\n", 150 | "# num_workders=2\n", 151 | "# threads_per_worker=12\n", 152 | "# cluster = LocalCluster(n_workers=num_workders, threads_per_worker=threads_per_worker)\n", 153 | "\n", 154 | "client = Client(cluster)\n", 155 | "client" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "start_time = time.time()\n", 165 | "df=dask_df.read_csv(filname, header=None, names=colnames)\n", 166 | "df=df.persist()\n", 167 | "print(\"[INFO]: ------ Data Ingestion is completed in {} seconds ---\".format((time.time() - start_time)))\n", 168 | "# start_time = time.time()\n", 169 | "X = df[df.columns.difference(['label'])]\n", 170 | "y = df['label']\n", 171 | "dtrain=xgb.dask.DaskDMatrix(client,X,y)\n", 172 | "\n", 173 | "del df\n", 174 | "del X\n", 175 | "del y\n", 176 | "\n", 177 | "print(\"[INFO]: ------ DMatrix is completed in {} seconds ---\".format((time.time() - start_time)))\n", 178 | "\n", 179 | "start_time = time.time()\n", 180 | "param = {\n", 181 | " 'max_depth': 8,\n", 182 | " 'objective': 'reg:squarederror',\n", 183 | " 'tree_method': 'hist'\n", 184 | " }\n", 185 | "bst = xgb.dask.train(client,param, dtrain,num_boost_round=100)\n", 186 | "print(\"[INFO]: ------ Training is completed in {} seconds ---\".format((time.time() - start_time)))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | 
"metadata": {}, 192 | "source": [ 193 | "## Dask_cuDF" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "from dask_cuda import LocalCUDACluster\n", 203 | "from dask.distributed import Client\n", 204 | "# Create a Dask Cluster with one worker per GPU\n", 205 | "num_workders=2\n", 206 | "threads_per_worker=12\n", 207 | "cluster = LocalCUDACluster(n_workers=num_workders, threads_per_worker=threads_per_worker)\n", 208 | "#cluster = LocalCUDACluster()\n", 209 | "\n", 210 | "client = Client(cluster)\n", 211 | "client" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "start_time = time.time()\n", 221 | "df=dask_cudf.read_csv(filname, header=None, names=colnames)\n", 222 | "df=df.persist()\n", 223 | "print(\"[INFO]: ------ Data Ingestion is completed in {} seconds ---\".format((time.time() - start_time)))\n", 224 | "# start_time = time.time()\n", 225 | "X = df[df.columns.difference(['label'])]\n", 226 | "y = df['label']\n", 227 | "#dtrain=xgb.dask.DaskDMatrix(client,X,y)\n", 228 | "dtrain=xgb.dask.DaskDeviceQuantileDMatrix(client, X,y)\n", 229 | "\n", 230 | "del df\n", 231 | "del X\n", 232 | "del y\n", 233 | "\n", 234 | "print(\"[INFO]: ------ DMatrix is completed in {} seconds ---\".format((time.time() - start_time)))\n", 235 | "\n", 236 | "start_time = time.time()\n", 237 | "param = {\n", 238 | " 'max_depth': 8,\n", 239 | " 'objective': 'reg:squarederror',\n", 240 | " 'tree_method': 'gpu_hist'\n", 241 | " }\n", 242 | "bst = xgb.dask.train(client,param, dtrain,num_boost_round=100)\n", 243 | "print(\"[INFO]: ------ Training is completed in {} seconds ---\".format((time.time() - start_time)))" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "from dask_cuda import LocalCUDACluster\n", 253 | "from dask.distributed import Client\n", 254 | "# Create a Dask Cluster with one worker per GPU\n", 255 | "# num_workders=2\n", 256 | "# threads_per_worker=12\n", 257 | "# cluster = LocalCUDACluster(n_workers=num_workders, threads_per_worker=threads_per_worker)\n", 258 | "cluster = LocalCUDACluster()\n", 259 | "\n", 260 | "client = Client(cluster)\n", 261 | "client\n", 262 | "\n", 263 | "start_time = time.time()\n", 264 | "df=dask_cudf.read_csv(filname, header=None, names=colnames)\n", 265 | "df=df.persist()\n", 266 | "print(\"[INFO]: ------ Data Ingestion is completed in {} seconds ---\".format((time.time() - start_time)))\n", 267 | "# start_time = time.time()\n", 268 | "X = df[df.columns.difference(['label'])]\n", 269 | "y = df['label']\n", 270 | "#dtrain=xgb.dask.DaskDMatrix(client,X,y)\n", 271 | "dtrain=xgb.dask.DaskDeviceQuantileDMatrix(client, X,y)\n", 272 | "\n", 273 | "del df\n", 274 | "del X\n", 275 | "del y\n", 276 | "\n", 277 | "print(\"[INFO]: ------ DMatrix is completed in {} seconds ---\".format((time.time() - start_time)))\n", 278 | "\n", 279 | "start_time = time.time()\n", 280 | "param = {\n", 281 | " 'max_depth': 8,\n", 282 | " 'objective': 'reg:squarederror',\n", 283 | " 'tree_method': 'gpu_hist'\n", 284 | " }\n", 285 | "bst = xgb.dask.train(client,param, dtrain,num_boost_round=100)\n", 286 | "print(\"[INFO]: ------ Training is completed in {} seconds ---\".format((time.time() - start_time)))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 
293 | "outputs": [], 294 | "source": [] 295 | } 296 | ], 297 | "metadata": { 298 | "environment": { 299 | "name": "common-cu110.m59", 300 | "type": "gcloud", 301 | "uri": "gcr.io/deeplearning-platform-release/base-cu110:m59" 302 | }, 303 | "kernelspec": { 304 | "display_name": "Python [conda env:rapids-0.16]", 305 | "language": "python", 306 | "name": "conda-env-rapids-0.16-py" 307 | }, 308 | "language_info": { 309 | "codemirror_mode": { 310 | "name": "ipython", 311 | "version": 3 312 | }, 313 | "file_extension": ".py", 314 | "mimetype": "text/x-python", 315 | "name": "python", 316 | "nbconvert_exporter": "python", 317 | "pygments_lexer": "ipython3", 318 | "version": "3.7.9" 319 | } 320 | }, 321 | "nbformat": 4, 322 | "nbformat_minor": 4 323 | } 324 | -------------------------------------------------------------------------------- /higgs/rapids_xgboost/notebooks/t4_higgs_rapids_xgboost.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "- Using the pre-build RAPIDS image on Google Cloud's AI Platform Notebooks with a T4 GPU, 8vCPUs, 30GB RAM\n", 8 | "- https://cloud.google.com/ai-platform/notebooks/docs/images#deciding\n", 9 | "- This should provide CUDA 10.0, rapids 0.12" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Sun Dec 6 06:43:44 2020 \n", 22 | "+-----------------------------------------------------------------------------+\n", 23 | "| NVIDIA-SMI 450.51.06 Driver Version: 450.51.06 CUDA Version: 11.0 |\n", 24 | "|-------------------------------+----------------------+----------------------+\n", 25 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 26 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 27 | "| | | MIG M. 
|\n", 28 | "|===============================+======================+======================|\n", 29 | "| 0 A100-SXM4-40GB Off | 00000000:00:04.0 Off | 0 |\n", 30 | "| N/A 40C P0 55W / 400W | 0MiB / 40537MiB | 0% Default |\n", 31 | "| | | Disabled |\n", 32 | "+-------------------------------+----------------------+----------------------+\n", 33 | " \n", 34 | "+-----------------------------------------------------------------------------+\n", 35 | "| Processes: |\n", 36 | "| GPU GI CI PID Type Process name GPU Memory |\n", 37 | "| ID ID Usage |\n", 38 | "|=============================================================================|\n", 39 | "| No running processes found |\n", 40 | "+-----------------------------------------------------------------------------+\n", 41 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 42 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 43 | "Built on Thu_Jun_11_22:26:38_PDT_2020\n", 44 | "Cuda compilation tools, release 11.0, V11.0.194\n", 45 | "Build cuda_11.0_bu.TC445_37.28540450_0\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "%%bash\n", 51 | "nvidia-smi\n", 52 | "nvcc --version" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "numpy Version: 1.18.5\n", 65 | "pandas Version: 1.1.4\n" 66 | ] 67 | }, 68 | { 69 | "ename": "ModuleNotFoundError", 70 | "evalue": "No module named 'xgboost'", 71 | "output_type": "error", 72 | "traceback": [ 73 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 74 | "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", 75 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'numpy Version:'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'pandas Version:'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mxgboost\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mxgb\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'XGBoost Version:'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxgb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcudf\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'cudf Version:'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcudf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcuml\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'cudf Version:'\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mcuml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 76 | "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'xgboost'" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "import numpy as np; print('numpy Version:', np.__version__)\n", 82 | "import pandas as pd; print('pandas Version:', pd.__version__)\n", 83 | "import xgboost as xgb; print('XGBoost Version:', xgb.__version__)\n", 84 | "import cudf; print('cudf Version:', cudf.__version__)\n", 85 | "import cuml; print('cudf Version:', cuml.__version__)\n", 86 | "import time" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Download HIGGs dataset & unzip\n", 94 | "https://archive.ics.uci.edu/ml/datasets/HIGGS" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "# %%bash\n", 104 | "# wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz -P /home/jupyter/" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Single Node with CPUs (PANDAS + XGBoost) or single GPU (RAPIDS-cuDF + XGBoost)\n", 112 | "- XGBoost w/ RAPIDS examples https://rapids.ai/xgboost.html\n", 113 | "\n", 114 | "### Expected CPUs numbers\n", 115 | "[INFO]: ------ Data Ingestion is completed in 104.7611632347107 seconds --- \n", 116 | "TOD0: Add Data transformation steps \n", 117 | "[INFO]: ------ Training is completed in 30.218074321746826 seconds ---\n", 118 | "\n", 119 | "#### Expected GPU numbers\n", 120 | "[INFO]: ------ Data Ingestion is completed in 18.212464094161987 seconds --- \n", 121 | "TOD0: Add Data transformation steps \n", 122 | "[INFO]: ------ Training is completed in 5.825598955154419 seconds ---" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "def xgboost_fun(gpu_cpu, tree_method):\n", 132 | " colnames = ['label'] + ['feature-%02d' % i for i in range(1, 29)]\n", 133 | " start_time = time.time()\n", 134 | " if gpu_cpu=='cpu':\n", 135 | " df=pd.read_csv('/home/jupyter/HIGGS.csv', header=None, names=colnames)\n", 136 | " else: \n", 137 | " df=cudf.read_csv('/home/jupyter/HIGGS.csv', header=None, names=colnames)\n", 138 | " print(\"[INFO]: ------ Data Ingestion is completed in {} seconds ---\".format((time.time() - start_time)))\n", 139 | "\n", 140 | " X = df[df.columns.difference(['label'])]\n", 141 | " y = df['label']\n", 142 | " dtrain=xgb.DMatrix(X,y)\n", 143 | " param = {\n", 144 | " 'max_depth': 8,\n", 145 | " 'objective': 'reg:squarederror',\n", 146 | " 'tree_method': tree_method\n", 147 | " }\n", 148 | "\n", 149 | " start_time = time.time()\n", 150 | " bst = xgb.train(param, dtrain)\n", 151 | " print(\"[INFO]: ------ Training is completed in {} seconds ---\".format((time.time() - start_time)))\n", 152 | " return bst" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "bst=xgboost_fun('gpu','gpu_hist')\n", 162 | "#bst=xgboost_fun('cpu','hist')" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## TODO: Single Node with multiple GPUS (Dask + RAPIDS) --- Scales to 4 T4s, 8 V100s, or 16 A100s on GCP\n" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | 
"metadata": {}, 176 | "outputs": [], 177 | "source": [] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "## TODO: Multi-Node with multiple GPUS (Dask + RAPIDS) Scales to 64+ GPUs" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [] 192 | } 193 | ], 194 | "metadata": { 195 | "environment": { 196 | "name": "common-cu110.m59", 197 | "type": "gcloud", 198 | "uri": "gcr.io/deeplearning-platform-release/base-cu110:m59" 199 | }, 200 | "kernelspec": { 201 | "display_name": "Python 3", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.7.8" 216 | } 217 | }, 218 | "nbformat": 4, 219 | "nbformat_minor": 4 220 | } 221 | -------------------------------------------------------------------------------- /mlflow-vertex/mlflow-databricks-vertex-deployment.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","source":["#install gcloud\n%sh\ncurl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-356.0.0-linux-x86_64.tar.gz\ntar -xf google-cloud-sdk-356.0.0-linux-x86_64.tar.gz\nyes | ./google-cloud-sdk/install.sh"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"bc96dfc5-dfda-4b15-9f70-f788b918fa22"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["#add gcloud directory to PATH env var\nimport os\nprint(os.environ['PATH'])\npath='/databricks/driver/google-cloud-sdk/bin'\nos.environ[\"PATH\"] += os.pathsep + os.path.join(path)\nprint(os.environ['PATH'])"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"c349a27e-e5bd-4314-9b40-84090b861015"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["#check if gcloud works with new PATH env variable\n%sh\n/databricks/driver/google-cloud-sdk/bin/gcloud version\ngcloud version"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"ed500289-107a-47b7-8a41-04117f85d1a7"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["import logging\nlogger = spark._jvm.org.apache.log4j\nlogging.getLogger(\"py4j.java_gateway\").setLevel(logging.ERROR)\n\nimport mlflow\nfrom mlflow.deployments import get_deploy_client\nprint(mlflow.__version__)\nclient=mlflow.deployments.get_deploy_client(\"google_cloud\")\n\nimport mlflow\nimport mlflow.sklearn\nimport 
pandas as pd\nimport matplotlib.pyplot as plt\n \nfrom numpy import savetxt\n \nfrom sklearn.model_selection import train_test_split\nfrom sklearn.datasets import load_diabetes\n \nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.metrics import mean_squared_error"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"5619fb3d-69ef-424a-9cd7-7778ea6b26c4"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["db = load_diabetes()\nX = db.data\ny = db.target\nX_train, X_test, y_train, y_test = train_test_split(X, y)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"8681e901-5f78-4648-a3da-869e380253b2"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["# Enable autolog()\n# mlflow.sklearn.autolog() requires mlflow 1.11.0 or above.\nmlflow.sklearn.autolog()\n \n# With autolog() enabled, all model parameters, a model score, and the fitted model are automatically logged. \nwith mlflow.start_run():\n \n # Set the model parameters. \n n_estimators = 100\n max_depth = 6\n max_features = 3\n \n # Create and train model.\n rf = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth, max_features = max_features)\n rf.fit(X_train, y_train)\n \n # Use the model to make predictions on the test dataset.\n predictions = rf.predict(X_test)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"761dee39-b293-42e0-b576-39dbe8052d0e"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["#mchrestkha-sklearnVersion 1\nmodel_name = \"mchrestkha-test-3\"\nmodel_version = 1\nmodel_uri=f\"models:/{model_name}/{model_version}\"\nprint(model_uri)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"eaf6dea4-6991-4881-a365-127bf6d10a7c"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["deployment = client.create_deployment(\n name=\"mlflow_on_gcp_test_3\",\n model_uri=model_uri)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"28b56b42-6db4-4105-8b3d-543370ff4ca4"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
INFO:google_cloud_mlflow._mlflow_model_gcp_deployment_utils:Project not set. Using project-aa-258321 as project\nINFO:google_cloud_mlflow._mlflow_model_gcp_deployment_utils:Destination image URI not set. Building and uploading image to gcr.io/project-aa-258321/mlflow/mlflow_on_gcp\nINFO:google_cloud_mlflow._mlflow_model_gcp_deployment_utils:Building image. This can take up to 20 minutes\n2021/09/10 06:13:41 INFO mlflow.models.cli: Selected backend for flavor 'python_function'\nINFO:google_cloud_mlflow._mlflow_models_docker_utils_patch:Building docker image with name gcr.io/project-aa-258321/mlflow/mlflow_on_gcp\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
INFO:google_cloud_mlflow._mlflow_model_gcp_deployment_utils:Project not set. Using project-aa-258321 as project\nINFO:google_cloud_mlflow._mlflow_model_gcp_deployment_utils:Destination image URI not set. Building and uploading image to gcr.io/project-aa-258321/mlflow/mlflow_on_gcp\nINFO:google_cloud_mlflow._mlflow_model_gcp_deployment_utils:Building image. This can take up to 20 minutes\n2021/09/10 06:13:41 INFO mlflow.models.cli: Selected backend for flavor 'python_function'\nINFO:google_cloud_mlflow._mlflow_models_docker_utils_patch:Building docker image with name gcr.io/project-aa-258321/mlflow/mlflow_on_gcp\n
"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"mlflow-test","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{},"notebookOrigID":1678309511451670}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /mlflow-vertex/mlflow-oss-vertex-deployment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "8ae80b7c", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# import packages\n", 11 | "import mlflow\n", 12 | "from mlflow.deployments import get_deploy_client\n", 13 | "from sklearn.model_selection import train_test_split\n", 14 | "from sklearn.datasets import load_diabetes \n", 15 | "from sklearn.ensemble import RandomForestRegressor" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "e3e58a35", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Start local MLFlow server on CLI with SQLLite DB for model registry\n", 26 | "# mlflow server \\\n", 27 | "# --backend-store-uri sqlite:///mlflow.db \\\n", 28 | "# --default-artifact-root ./artifacts \\\n", 29 | "# --host 0.0.0.0" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "c2f78b68", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "# set mlflow registry and tracking URIs\n", 40 | "mlflow.set_tracking_uri(\"http://localhost:5000\")\n", 41 | "mlflow.set_registry_uri(\"http://localhost:5000\")\n", 42 | " \n", 43 | "mr_uri = mlflow.get_registry_uri()\n", 44 | "print(\"Current registry uri: {}\".format(mr_uri))\n", 45 | "tracking_uri = mlflow.get_tracking_uri()\n", 46 | "print(\"Current tracking uri: {}\".format(tracking_uri))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "id": "c6005dd9", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# load dataset\n", 57 | "db = load_diabetes()\n", 58 | "X = db.data\n", 59 | "y = db.target\n", 60 | "X_train, X_test, y_train, y_test = train_test_split(X, y)\n", 61 | " \n", 62 | "# Enable autolog()\n", 63 | "# mlflow.sklearn.autolog() requires mlflow 1.11.0 or above.\n", 64 | "mlflow.sklearn.autolog()\n", 65 | " \n", 66 | "# With autolog() enabled, all model parameters, a model score, and the fitted model are automatically logged. \n", 67 | "with mlflow.start_run() as run: \n", 68 | " # Set the model parameters. 
\n", 69 | " n_estimators = 100\n", 70 | " max_depth = 6\n", 71 | " max_features = 3\n", 72 | " # Create and train model.\n", 73 | " rf = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth, max_features = max_features)\n", 74 | " rf.fit(X_train, y_train)\n", 75 | " # Use the model to make predictions on the test dataset.\n", 76 | " predictions = rf.predict(X_test)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "ab2fe324", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# log model\n", 87 | "model_name = \"mchrestkha-sklearn\"\n", 88 | "mlflow.sklearn.log_model(rf, model_name, registered_model_name=model_name)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "efdf6e25", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "client = mlflow.tracking.MlflowClient()\n", 99 | "client.list_registered_models()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "b3f57857", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# deploy model to Vertex AI\n", 110 | "client = mlflow.deployments.get_deploy_client(\"google_cloud\")\n", 111 | " \n", 112 | "model_version = 1\n", 113 | "model_uri=f\"models:/{model_name}/{model_version}\"\n", 114 | "print(model_uri)\n", 115 | " \n", 116 | "deployment = client.create_deployment(\n", 117 | " name=\"mlflow_on_gcp\",\n", 118 | " model_uri=model_uri)" 119 | ] 120 | } 121 | ], 122 | "metadata": { 123 | "environment": { 124 | "name": "common-cpu.m78", 125 | "type": "gcloud", 126 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m78" 127 | }, 128 | "kernelspec": { 129 | "display_name": "Python 3", 130 | "language": "python", 131 | "name": "python3" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.7.10" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 5 148 | } 149 | -------------------------------------------------------------------------------- /tuning_llms/tuning_dialogsum.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "bf8ec3e0-c954-48a8-95a5-0e5c2d320841", 6 | "metadata": {}, 7 | "source": [ 8 | "## Library Imports" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 200, 14 | "id": "b25bef4f-3262-4d94-879e-36a0c50eaf82", 15 | "metadata": { 16 | "tags": [] 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "from datasets import load_dataset\n", 21 | "import random\n", 22 | "import time\n", 23 | "import json\n", 24 | "import utils\n", 25 | "import mercury as mr\n", 26 | "import openai\n", 27 | "import os\n", 28 | "import pandas as pd\n", 29 | "pd.set_option('display.float_format', '{:.10f}'.format)\n", 30 | "from google.cloud.exceptions import NotFound\n", 31 | "import os\n", 32 | "from tqdm import tqdm\n", 33 | "import time\n", 34 | "\n", 35 | "#Vertex AI libraries\n", 36 | "import vertexai\n", 37 | "from vertexai.preview.generative_models import GenerativeModel, GenerationConfig, Part\n", 38 | "from vertexai.preview.tuning import sft\n", 39 | "from vertexai.preview.generative_models import GenerativeModel, GenerationConfig, Part\n", 40 | "from vertexai.evaluation import EvalTask, 
MetricPromptTemplateExamples, PointwiseMetric, PairwiseMetric\n", 41 | "\n", 42 | "#OpenAI library\n", 43 | "from openai import OpenAI\n", 44 | "client = OpenAI()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "9e4899ea-e1c1-4802-84f2-c5143b32f837", 50 | "metadata": {}, 51 | "source": [ 52 | "## Loading Dataset From HuggingFace" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 201, 58 | "id": "d12921cc-be7f-49e2-8383-d5d43dd4dda6", 59 | "metadata": { 60 | "tags": [] 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "dataset = load_dataset(\"knkarthick/dialogsum\")" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "5377f9ce-2948-41e9-9d29-e05cd632e023", 70 | "metadata": {}, 71 | "source": [ 72 | "## Creating different sized tuning Datasets" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 202, 78 | "id": "f608e323-3600-47c9-8db2-2e3f5216ebdb", 79 | "metadata": { 80 | "tags": [] 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "# full size datasets\n", 85 | "train12460=dataset[\"train\"].to_list()\n", 86 | "valid500 =dataset[\"validation\"].to_list()\n", 87 | "test1500 =dataset[\"test\"].to_list()\n", 88 | "\n", 89 | "base_instruction=\"Summarize the following dialogue: \"\n", 90 | "for item in test1500: \n", 91 | " item[\"dialogue\"] = base_instruction + item[\"dialogue\"]\n", 92 | "\n", 93 | "# smaller datasets for rapid testing\n", 94 | "train2000=train12460[:2000]\n", 95 | "test100=test1500[:100]\n", 96 | "test250=test1500[:250]\n", 97 | "test10=test1500[:10]" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "bda92d07-c449-4316-ad64-6c746ba33367", 103 | "metadata": { 104 | "tags": [] 105 | }, 106 | "source": [ 107 | "## Data Formatting for Tuning API" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "f4466d7e-e982-4281-bb75-c9aa9b774091", 114 | "metadata": { 115 | "tags": [] 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "#Prepare data for Gemini 1.5 Tuning\n", 120 | "# Define a base prompt for zero-shot summarization \n", 121 | "base_instruction=\"Summarize the following dialogue: \"\n", 122 | "utils.format_tuning_dataset(train2000, valid500, base_instruction, \"dialogsum_train2000_inst\",\"dialogsum_valid500_inst\")\n", 123 | "utils.format_tuning_dataset(train12460, valid500, base_instruction, \"dialogsum_train12460_inst\",\"dialogsum_valid500_inst\")\n", 124 | "\n", 125 | "base_instruction=\"\"\n", 126 | "utils.format_tuning_dataset(train2000, valid500, base_instruction, \"dialogsum_train2000_no_inst\",\"dialogsum_valid500_no_inst\")\n", 127 | "utils.format_tuning_dataset(train12460, valid500, base_instruction, \"dialogsum_train12460_no_inst\",\"dialogsum_valid500_no_inst\")" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "837758af-e2c5-47b6-b105-9453beeacd2b", 134 | "metadata": { 135 | "tags": [] 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "utils.delete_and_upload(\"dialogsum_train12460_inst.jsonl\")\n", 140 | "utils.delete_and_upload(\"dialogsum_train2000_inst.jsonl\")\n", 141 | "utils.delete_and_upload(\"dialogsum_valid500_inst.jsonl\")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "6871cb58-bf7f-49c2-8612-f48c9a0d8d15", 147 | "metadata": {}, 148 | "source": [ 149 | "## Submit Tuning Job" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "debeed0e-a4f6-4956-94cf-148103819d2c", 156 | "metadata": { 157 | "tags": [] 
158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "model=\"gemini-1.5-flash-001\"\n", 162 | "utils.tune_gemini(\"gs://mchrestkha-sample-data/dialogsum/dialogsum_train2000_inst.jsonl\", \"gs://mchrestkha-sample-data/dialogsum/dialogsum_valid500_inst.jsonl\", model, \"dialogsum_2000_inst\")\n", 163 | "utils.tune_gemini(\"gs://mchrestkha-sample-data/dialogsum/dialogsum_train12460_inst.jsonl\", \"gs://mchrestkha-sample-data/dialogsum/dialogsum_valid500_inst.jsonl\", model, \"dialogsum_124600_inst\")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "id": "2de09063-2454-41d0-9963-3fac8b79966f", 169 | "metadata": { 170 | "tags": [] 171 | }, 172 | "source": [ 173 | "## OpenAI Tuning" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "id": "708ebb59-e764-453f-bd76-59a2083ff81e", 180 | "metadata": { 181 | "tags": [] 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "#Prepare data for OpenAI Tuning\n", 186 | "# Define a base prompt for zero-shot summarization \n", 187 | "system_prompt=\"Summarize the following dialogue: \"\n", 188 | "\n", 189 | "# Initialize lists to store messages for training and validation\n", 190 | "train_messages = []\n", 191 | "validation_messages = []\n", 192 | "train = train2000\n", 193 | "valid = valid500\n", 194 | "\n", 195 | "# Iterate over training data and create messages for each dialogue-summary pair\n", 196 | "for d in train:\n", 197 | " prompts = []\n", 198 | " prompts.append({\"role\": \"system\", \"content\": system_prompt})\n", 199 | " prompts.append({\"role\": \"user\", \"content\": d[\"dialogue\"]})\n", 200 | " prompts.append({\"role\": \"assistant\", \"content\": d[\"summary\"]})\n", 201 | " train_messages.append({'messages': prompts})\n", 202 | "\n", 203 | "# Iterate over validation data and create messages similarly\n", 204 | "for d in valid:\n", 205 | " prompts = []\n", 206 | " prompts.append({\"role\": \"system\", \"content\": system_prompt})\n", 207 | " prompts.append({\"role\": \"user\", \"content\": d[\"dialogue\"]})\n", 208 | " prompts.append({\"role\": \"assistant\", \"content\": d[\"summary\"]})\n", 209 | " validation_messages.append({'messages': prompts})\n", 210 | "\n", 211 | " # Print lengths of message lists and an example training message\n", 212 | "len(train_messages), len(validation_messages), train_messages[2]" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "id": "bff2af86-efab-4389-ae6b-e9a31e21e5f8", 219 | "metadata": { 220 | "tags": [] 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "# Save to JSON locally\n", 225 | "utils.dicts_to_jsonl(train_messages, \"openai_dialogsum_train2000\", False)\n", 226 | "utils.dicts_to_jsonl(validation_messages, \"openai_dialogsum_valid500\", False)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "id": "7ac72e15-9afe-48ce-847e-255d26dfea90", 233 | "metadata": { 234 | "tags": [] 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "# Register & Uplaod Files to OpenAI Storage\n", 239 | "client.files.create(\n", 240 | " file=open(\"openai_dialogsum_train2000.jsonl\", \"rb\"),\n", 241 | " purpose=\"fine-tune\"\n", 242 | ")\n", 243 | "\n", 244 | "client.files.create(\n", 245 | " file=open(\"openai_dialogsum_valid500.jsonl\", \"rb\"),\n", 246 | " purpose=\"fine-tune\"\n", 247 | ")" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "id": "a90c953a-df68-402a-8ecd-89da6d2d885c", 254 | "metadata": { 255 | "tags": [] 256 | }, 257 | 
"outputs": [], 258 | "source": [ 259 | "#Submit Tuning Job\n", 260 | "client.fine_tuning.jobs.create(\n", 261 | " training_file=\"file-KxJuvj5sQ3kLQoI7f6X8S9PE\", \n", 262 | " validation_file=\"file-QGvOkG9PtiZJ7y0L1JmNbzDE\",\n", 263 | " model=\"gpt-4o-mini-2024-07-18\"\n", 264 | ")" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "id": "9935f0a5-b915-4a9f-b2ec-da49e4c986f3", 270 | "metadata": {}, 271 | "source": [ 272 | "## Running Predictions on Test Data\n", 273 | "### For X test examples takes Y min to generate predictions" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 203, 279 | "id": "2d98e745-253a-44b5-aef7-c8d763ee7f03", 280 | "metadata": { 281 | "tags": [] 282 | }, 283 | "outputs": [ 284 | { 285 | "data": { 286 | "text/html": [ 287 | "\n", 288 | " \n", 289 | " \n", 290 | " \n", 337 | " \n", 338 | " \n", 339 | " tune\n", 340 | " View Tuning Job\n", 341 | " \n", 342 | " \n", 343 | " \n", 357 | " " 358 | ], 359 | "text/plain": [ 360 | "" 361 | ] 362 | }, 363 | "metadata": {}, 364 | "output_type": "display_data" 365 | }, 366 | { 367 | "name": "stderr", 368 | "output_type": "stream", 369 | "text": [ 370 | "Processing: 100%|██████████| 250/250 [13:44<00:00, 3.30s/row]\n" 371 | ] 372 | } 373 | ], 374 | "source": [ 375 | "gemini_text = []\n", 376 | "openai_text = []\n", 377 | "gemini_tuned_text = []\n", 378 | "openai_tuned_text = []\n", 379 | "\n", 380 | "tuning_job = sft.SupervisedTuningJob(\"projects/642508009780/locations/us-central1/tuningJobs/2137747369456828416\")\n", 381 | "tuned_model = GenerativeModel(tuning_job.tuned_model_endpoint_name)\n", 382 | "model = GenerativeModel(\"gemini-1.5-flash-001\")\n", 383 | "client = OpenAI()\n", 384 | "#test=test10\n", 385 | "#test=test1500\n", 386 | "test=test250\n", 387 | "\n", 388 | "for row in tqdm(test, desc=\"Processing\", unit=\"row\"):\n", 389 | " try:\n", 390 | " gemini_response = model.generate_content(contents=row[\"dialogue\"])\n", 391 | " gemini_text.append(gemini_response.text)\n", 392 | " except (ValueError, AttributeError): # Catch broader potential errors\n", 393 | " gemini_text.append(\"Blocked\")\n", 394 | " \n", 395 | " try:\n", 396 | " gemini_tuned_response = tuned_model.generate_content(contents=row[\"dialogue\"])\n", 397 | " gemini_tuned_text.append(gemini_tuned_response.text)\n", 398 | " except (ValueError, AttributeError): # Catch broader potential errors\n", 399 | " gemini_tuned_text.append(\"Blocked\")\n", 400 | "\n", 401 | " try:\n", 402 | " openai_response = client.chat.completions.create(\n", 403 | " model=\"gpt-4o-mini-2024-07-18\",\n", 404 | " messages=[{\"role\": \"user\", \"content\": row[\"dialogue\"]}]\n", 405 | " )\n", 406 | " openai_text.append(openai_response.choices[0].message.content)\n", 407 | " except (ValueError, AttributeError): \n", 408 | " openai_text.append(\"Blocked\")\n", 409 | "\n", 410 | " try:\n", 411 | " openai_tuned_response = client.chat.completions.create(\n", 412 | " model=\"ft:gpt-4o-mini-2024-07-18:personal::A3WwHRrJ\",\n", 413 | " messages=[{\"role\": \"user\", \"content\": row[\"dialogue\"]}]\n", 414 | " )\n", 415 | " openai_tuned_text.append(openai_tuned_response.choices[0].message.content)\n", 416 | " except (ValueError, AttributeError): \n", 417 | " openai_tuned_text.append(\"Blocked\")\n", 418 | "\n", 419 | "# Directly create the final DataFrame with responses included\n", 420 | "df_final = pd.DataFrame(test)\n", 421 | "df_final[\"gemini_response\"] = gemini_text\n", 422 | "df_final[\"openai_response\"] = openai_text\n", 423 | 
"df_final[\"gemini_tuned_response\"] = gemini_tuned_text\n", 424 | "df_final[\"openai_tuned_response\"] = openai_tuned_text" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 207, 430 | "id": "303a0d29-1348-4f39-9a9e-6db0da569f5d", 431 | "metadata": { 432 | "tags": [] 433 | }, 434 | "outputs": [], 435 | "source": [ 436 | "df_final[\"summary_response\"]=df_final[\"summary\"]\n", 437 | "df_test_predictions_final=df_final\n", 438 | "df_test_predictions_final.to_csv('df_test_predictions_final.csv', index=False) " 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "id": "5c15c342-214f-4c05-a19d-61af00f31089", 444 | "metadata": {}, 445 | "source": [ 446 | "## Running Computation & Model Pointwise Evals" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 210, 452 | "id": "e13fe0c0-faff-4734-8939-2b2b004e7f95", 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "#Define a pointwise custom summarization quality metric \n", 457 | "pointwise_custom_summary_metric_prompt = \"\"\"\n", 458 | "# Instruction\n", 459 | "You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models.\n", 460 | "We will provide you with the user input and an AI-generated response.\n", 461 | "You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.\n", 462 | "You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step-by-step explanations for your rating, and only choose ratings from the Rating Rubric. **Explicitly include the word count of the response as the first step in your explanation**, and ensure it aligns with the criteria.\n", 463 | "\n", 464 | "# Evaluation\n", 465 | "## Metric Definition\n", 466 | "You will be assessing summarization quality, which measures the overall ability to summarize text. The context to be summarized are provided in the user prompt. The response should be shorter than the text in the context. The response should not contain information that is not present in the context.\n", 467 | "\n", 468 | "## Criteria\n", 469 | "Less than 50 words: The response contains less than 50 words. Use the following formula to count the words in the response: `=COUNTA(SPLIT(response, \" \"))`\n", 470 | "Groundedness: The response contains information included only in the context. The response does not reference any outside information.\n", 471 | "Observer Perspective: The response is written from an observer perspective.\n", 472 | "\n", 473 | "## Rating Rubric\n", 474 | "5: (Very good). The summary is less than 50 words, is grounded and is written as an observer.\n", 475 | "4: (Good). The summary is less than 50 words and is grounded. \n", 476 | "3: (Ok). The summary is more than 50 words but mostly grounded\n", 477 | "2: (Bad). The summary is more than 50 words and not grounded.\n", 478 | "1: (Very bad). The summary is more than 50 words and not grounded.\n", 479 | "\n", 480 | "## Evaluation Steps\n", 481 | "STEP 1: Assess the response in aspects of word count, groundedness, and observer perspective according to the criteria.  
**Use the provided formula to determine the EXACT word count**\n", 482 | "STEP 2: Score based on the rubric.\n", 483 | "\n", 484 | "# User Inputs and AI-generated Response\n", 485 | "## User Inputs\n", 486 | "\n", 487 | "### Prompt\n", 488 | "{prompt}\n", 489 | "\n", 490 | "## AI-generated Response\n", 491 | "{response}\n", 492 | "\n", 493 | "\"\"\"\n", 494 | "\n", 495 | "pointwise_custom_summary_metric = PointwiseMetric(\n", 496 | " metric=\"custom_point_summary_metric\",\n", 497 | " metric_prompt_template=pointwise_custom_summary_metric_prompt,\n", 498 | ")" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "id": "11c2afb2-b127-4cb7-86f6-b90d402bb5cd", 505 | "metadata": { 506 | "tags": [] 507 | }, 508 | "outputs": [ 509 | { 510 | "name": "stdout", 511 | "output_type": "stream", 512 | "text": [ 513 | "Computing metrics with a total of 750 Vertex online evaluation service requests.\n" 514 | ] 515 | }, 516 | { 517 | "name": "stderr", 518 | "output_type": "stream", 519 | "text": [ 520 | "100%|██████████| 750/750 [50:03<00:00, 4.00s/it]\n" 521 | ] 522 | }, 523 | { 524 | "name": "stdout", 525 | "output_type": "stream", 526 | "text": [ 527 | "All 750 metric requests are successfully computed.\n", 528 | "Evaluation Took:3003.383097610007 seconds\n", 529 | "Computing metrics with a total of 750 Vertex online evaluation service requests.\n" 530 | ] 531 | }, 532 | { 533 | "name": "stderr", 534 | "output_type": "stream", 535 | "text": [ 536 | "100%|██████████| 750/750 [50:00<00:00, 4.00s/it]\n" 537 | ] 538 | }, 539 | { 540 | "name": "stdout", 541 | "output_type": "stream", 542 | "text": [ 543 | "All 750 metric requests are successfully computed.\n", 544 | "Evaluation Took:3000.8083779989975 seconds\n", 545 | "Computing metrics with a total of 750 Vertex online evaluation service requests.\n" 546 | ] 547 | }, 548 | { 549 | "name": "stderr", 550 | "output_type": "stream", 551 | "text": [ 552 | "100%|██████████| 750/750 [50:01<00:00, 4.00s/it] \n" 553 | ] 554 | }, 555 | { 556 | "name": "stdout", 557 | "output_type": "stream", 558 | "text": [ 559 | "All 750 metric requests are successfully computed.\n", 560 | "Evaluation Took:3001.101887780009 seconds\n", 561 | "Computing metrics with a total of 750 Vertex online evaluation service requests.\n" 562 | ] 563 | }, 564 | { 565 | "name": "stderr", 566 | "output_type": "stream", 567 | "text": [ 568 | " 64%|██████▍ | 483/750 [32:12<19:18, 4.34s/it] " 569 | ] 570 | } 571 | ], 572 | "source": [ 573 | "def run_eval(dataset, col_prompt,col_response,col_reference):\n", 574 | " eval_dataset_comp=dataset[[col_prompt,col_response,col_reference]]\n", 575 | " #print(eval_dataset_comp)\n", 576 | " eval_dataset_comp = eval_dataset_comp.rename(columns={col_prompt: 'prompt', col_response: 'response', col_reference: 'reference'})\n", 577 | " #print(eval_dataset_comp)\n", 578 | " eval_task = EvalTask(\n", 579 | " dataset=eval_dataset_comp, \n", 580 | " metrics=[\"rouge_l_sum\",MetricPromptTemplateExamples.Pointwise.SUMMARIZATION_QUALITY, pointwise_custom_summary_metric],\n", 581 | " )\n", 582 | " eval_result = eval_task.evaluate().summary_metrics\n", 583 | " eval_result_df = pd.DataFrame(eval_result, index=[col_response]).rename_axis('model').reset_index()\n", 584 | " return eval_result_df\n", 585 | "\n", 586 | "# Evaluate different models\n", 587 | "results = [\n", 588 | "run_eval(df_final, \"dialogue\", \"gemini_response\", \"summary\"),\n", 589 | "run_eval(df_final, \"dialogue\", \"gemini_tuned_response\", \"summary\"),\n", 590 | 
"run_eval(df_final, \"dialogue\", \"summary_response\", \"summary\"),\n", 591 | "run_eval(df_final, \"dialogue\", \"openai_response\", \"summary\"),\n", 592 | "run_eval(df_final, \"dialogue\", \"openai_tuned_response\", \"summary\"),\n", 593 | "]\n", 594 | "\n", 595 | "# Combine results\n", 596 | "combined_comp_point_eval_result = pd.concat(results, ignore_index=True)" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "id": "08f5978f-5bc6-4146-871a-f4805211afcb", 603 | "metadata": { 604 | "tags": [] 605 | }, 606 | "outputs": [], 607 | "source": [ 608 | "combined_comp_point_eval_result\n", 609 | "#combined_comp_point_eval_result.to_csv('combined_comp_point_eval_result.csv', index=False) " 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "id": "0a1a3ab3-d9e5-4ca3-b9b5-90c394343282", 615 | "metadata": { 616 | "tags": [] 617 | }, 618 | "source": [ 619 | "## Running Pairwise (AutoSxS) Model Evals" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "id": "f31a1ac5-5f71-4a38-8eb9-e3cbfb02055f", 626 | "metadata": { 627 | "tags": [] 628 | }, 629 | "outputs": [], 630 | "source": [ 631 | "#Define a pointwise custom summarization quality metric \n", 632 | "pairwise_custom_summary_metric_prompt = \"\"\"\n", 633 | "# Instruction\n", 634 | "You are an expert evaluator. Your task is to evaluate the quality of the responses generated by two AI models. We will provide you with the user input and a pair of AI-generated responses (Response A and Response B).\n", 635 | "You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.\n", 636 | "You will first judge responses individually, following the Rating Rubric and Evaluation Steps.\n", 637 | "Then you will give step-by-step explanations for your judgement, compare results to declare the winner based on the Rating Rubric and Evaluation Steps.\n", 638 | "# Evaluation\n", 639 | "## Metric Definition\n", 640 | "You will be assessing summarization quality, which measures the overall ability to summarize text. The context to be summarized are provided in the user prompt. The response should be shorter than the text in the context. The response should not contain information that is not present in the context.\n", 641 | "\n", 642 | "## Criteria\n", 643 | "Less than 50 words: The response contains less than 50 words. Use the following formula to count the words in the response: `=COUNTA(SPLIT(response, \" \"))`\n", 644 | "Groundedness: The response contains information included only in the context. 
The response does not reference any outside information.\n", 645 | "Observer Perspective: The response is written from an observer perspective.\n", 646 | "\n", 647 | "## Rating Rubric\n", 648 | "\"A\": Response A summarizes the given context as per the criteria better than response B.\n", 649 | "\"SAME\": Response A and B summarizes the given context equally well as per the criteria.\n", 650 | "\"B\": Response B summarizes the given context as per the criteria better than response A.\n", 651 | "\n", 652 | "## Evaluation Steps\n", 653 | "STEP 1: Analyze Response A based on the summarization quality criteria: Determine how well Response A fulfills the user requirements, is less than 50 words, is grounded and is written as an observer, and provide assessment according to the criterion.\n", 654 | "STEP 2: Analyze Response B based on the summarization quality criteria: Determine how well Response A fulfills the user requirements, is less than 50 words, is grounded and is written as an observer, and provide assessment according to the criterion.\n", 655 | "STEP 3: Compare the overall performance of Response A and Response B based on your analyses and assessment.\n", 656 | "STEP 4: Output your preference of \"A\", \"SAME\" or \"B\" to the pairwise_choice field according to the Rating Rubric.\n", 657 | "STEP 5: Output your assessment reasoning in the explanation field.\n", 658 | "\n", 659 | "\n", 660 | "# User Inputs and AI-generated Responses\n", 661 | "## User Inputs\n", 662 | "\n", 663 | "### Prompt\n", 664 | "{prompt}\n", 665 | "\n", 666 | "## AI-generated Responses\n", 667 | "### Response A\n", 668 | "{baseline_model_response}\n", 669 | "\n", 670 | "### Response B\n", 671 | "{response}\n", 672 | "\n", 673 | "\"\"\"\n", 674 | "\n", 675 | "pairwise_custom_summary_metric = PairwiseMetric(\n", 676 | " metric=\"custom_pairwise_summary_metric\",\n", 677 | " metric_prompt_template=pairwise_custom_summary_metric_prompt,\n", 678 | ")" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": null, 684 | "id": "ed176375-c29e-4fff-9136-635036f3c868", 685 | "metadata": { 686 | "tags": [] 687 | }, 688 | "outputs": [], 689 | "source": [ 690 | "eval_dataset_pair = df_final[['dialogue', 'gemini_tuned_response', 'openai_tuned_response']].rename(columns={\n", 691 | " 'dialogue': 'prompt', \n", 692 | " 'gemini_tuned_response': 'response', \n", 693 | " 'openai_tuned_response': 'baseline_model_response'\n", 694 | "})" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "id": "ccd0678e-ce6e-4246-9809-56400edb9124", 701 | "metadata": { 702 | "tags": [] 703 | }, 704 | "outputs": [], 705 | "source": [ 706 | "eval_task = EvalTask(\n", 707 | " dataset=eval_dataset_pair, \n", 708 | " metrics=[MetricPromptTemplateExamples.Pairwise.SUMMARIZATION_QUALITY, pairwise_custom_summary_metric],\n", 709 | " )\n", 710 | "eval_result = eval_task.evaluate()" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": null, 716 | "id": "ce80cee6-7548-47de-af2f-08d384a89b10", 717 | "metadata": { 718 | "tags": [] 719 | }, 720 | "outputs": [], 721 | "source": [ 722 | "combined_pair_eval_result=eval_result.summary_metrics\n", 723 | "#combined_pair_eval_result.to_csv('combined_pair_eval_result.csv', index=False) " 724 | ] 725 | } 726 | ], 727 | "metadata": { 728 | "environment": { 729 | "kernel": "conda-root-py", 730 | "name": "workbench-notebooks.m113", 731 | "type": "gcloud", 732 | "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m113" 733 | }, 734 | 
"kernelspec": { 735 | "display_name": "Python 3 (ipykernel) (Local)", 736 | "language": "python", 737 | "name": "conda-root-py" 738 | }, 739 | "language_info": { 740 | "codemirror_mode": { 741 | "name": "ipython", 742 | "version": 3 743 | }, 744 | "file_extension": ".py", 745 | "mimetype": "text/x-python", 746 | "name": "python", 747 | "nbconvert_exporter": "python", 748 | "pygments_lexer": "ipython3", 749 | "version": "3.10.13" 750 | } 751 | }, 752 | "nbformat": 4, 753 | "nbformat_minor": 5 754 | } 755 | -------------------------------------------------------------------------------- /tuning_llms/tuning_legalbench.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "c7087b40-d110-48d3-8e8e-852109bab99b", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "#!pip install datasets vertexai mercury" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "e4e282f3-c9ac-4ca0-9039-9b20bed1134d", 19 | "metadata": { 20 | "tags": [] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "#! gcloud auth list" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "9f441437-a6da-4f55-afca-9a57923f10f4", 31 | "metadata": { 32 | "tags": [] 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "from datasets import load_dataset\n", 37 | "import random\n", 38 | "import time\n", 39 | "import vertexai\n", 40 | "from vertexai.preview.tuning import sft\n", 41 | "import json\n", 42 | "import utils\n", 43 | "import mercury as mr" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "7c70aae2-6b21-4dbd-addf-75f09bf9b702", 50 | "metadata": { 51 | "tags": [] 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "# Load the data\n", 56 | "dataset = load_dataset(\"nguha/legalbench\", \"contract_nli_explicit_identification\")\n", 57 | "\n", 58 | "# Merge and shuffle\n", 59 | "data = dataset[\"train\"].to_list() + dataset[\"test\"].to_list() # Convert to lists before concatenating\n", 60 | "random.shuffle(data)\n", 61 | "\n", 62 | "# Add new index\n", 63 | "for idx, d in enumerate(data):\n", 64 | " d[\"new_index\"] = idx" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "7ef891b9-ba74-4fe1-97e8-c35bf13acd48", 71 | "metadata": { 72 | "tags": [] 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "len(data)\n", 77 | "mr.JSON(data)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "613f6fae-bd36-4311-9680-32a7c279eb4b", 84 | "metadata": { 85 | "tags": [] 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "base_prompt_zero_shot = \"Identify if the clause provides that all Confidential Information shall be expressly identified by the Disclosing Party. 
Answer with only `Yes` or `No`\"" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "c8fef0c7-b094-4da7-8b80-3b0ff8ef4ea8", 96 | "metadata": { 97 | "tags": [] 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "n_train = 30\n", 102 | "n_test = len(data) - n_train" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "97da2b67-fc37-44cf-ba9a-f352eab8f4cd", 109 | "metadata": { 110 | "tags": [] 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "train_messages = []\n", 115 | "test_messages = []\n", 116 | "\n", 117 | "for d in data:\n", 118 | " prompts=[]\n", 119 | " prompts = [{\"role\": \"system\", \"parts\": [{\"text\": base_prompt_zero_shot}]}]\n", 120 | " prompts.append({\"role\": \"user\", \"parts\": [{\"text\": d[\"text\"]}]})\n", 121 | " prompts.append({\"role\": \"model\", \"parts\": [{\"text\": d[\"answer\"]}]}) \n", 122 | "\n", 123 | " if int(d[\"new_index\"]) < n_train:\n", 124 | " #train_messages.append({'messages': prompts})\n", 125 | " train_messages.append({'contents': prompts})\n", 126 | "\n", 127 | " else:\n", 128 | " #test_messages.append({'messages': prompts})\n", 129 | " test_messages.append({'contents': prompts})\n", 130 | "\n", 131 | "len(train_messages), len(test_messages), n_test, train_messages[5]" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "id": "3b765a73-a769-413f-b9b1-81a7f040afbb", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "for d in data:\n", 142 | " tuningdataset=[]\n", 143 | " tuningdataset = [{\"role\": \"system\", \"parts\": [{\"text\": system_instructions}]}]\n", 144 | " tuningdataset.append({\"role\": \"user\", \"parts\": [{\"text\": d[\"text\"]}]})\n", 145 | " tuningdataset.append({\"role\": \"model\", \"parts\": [{\"text\": d[\"answer\"]}]}) \n", 146 | " tuningdataset.append({'contents': prompts})" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "231130e7-a60c-44f3-a93b-45dd8d986611", 153 | "metadata": { 154 | "tags": [] 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "utils.dicts_to_jsonl(train_messages, \"train_contents\", False)\n", 159 | "utils.dicts_to_jsonl(test_messages, \"test_contents\", False)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "id": "0e280382-bda1-4400-965e-b675e9c970ac", 166 | "metadata": { 167 | "tags": [] 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "#upload_blob(bucket_name, source_file_name, destination_blob_name)\n", 172 | "#delete_blob(bucket_name, blob_name):\n", 173 | "utils.delete_blob(\"mchrestkha-sample-data\",\"legalbench/contract_nli_explicit_identification/train_contents.jsonl\")\n", 174 | "utils.delete_blob(\"mchrestkha-sample-data\",\"legalbench/contract_nli_explicit_identification/test_contents.jsonl\")\n", 175 | "utils.upload_blob(\"mchrestkha-sample-data\",\"train_contents.jsonl\",\"legalbench/contract_nli_explicit_identification/train_contents.jsonl\")\n", 176 | "utils.upload_blob(\"mchrestkha-sample-data\",\"test_contents.jsonl\",\"legalbench/contract_nli_explicit_identification/test_contents.jsonl\")" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "id": "7e9e0977-ca96-4a8b-a686-4a00e6527fe0", 183 | "metadata": { 184 | "tags": [] 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "vertexai.init(project=\"mchrestkha-sandbox\", location=\"us-central1\")\n", 189 | "\n", 190 | "sft_tuning_job = sft.train(\n", 191 | 
" source_model=\"gemini-1.5-pro-001\",\n", 192 | " train_dataset=\"gs://mchrestkha-sample-data/legalbench/contract_nli_explicit_identification/train_contents.jsonl\",\n", 193 | " #train_dataset=\"gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl\",\n", 194 | " # The following parameters are optional\n", 195 | " validation_dataset=\"gs://mchrestkha-sample-data/legalbench/contract_nli_explicit_identification/test_contents.jsonl\",\n", 196 | " epochs=5,\n", 197 | " adapter_size=4,\n", 198 | " learning_rate_multiplier=1.0,\n", 199 | " tuned_model_display_name=\"1.5_flash_testing\",\n", 200 | ")\n", 201 | "\n", 202 | "# Polling for job completion\n", 203 | "while not sft_tuning_job.has_ended:\n", 204 | " time.sleep(60)\n", 205 | " sft_tuning_job.refresh()\n", 206 | "\n", 207 | "print(sft_tuning_job.tuned_model_name)\n", 208 | "print(sft_tuning_job.tuned_model_endpoint_name)\n", 209 | "print(sft_tuning_job.experiment)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "id": "57740801-8130-4332-bf6f-ef586716d708", 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "id": "070036f6-3351-4c29-9fec-2d830ea2283f", 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "id": "ed88e8e3-7887-4a0a-86ed-5f0a6083d4fb", 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix" 236 | ] 237 | } 238 | ], 239 | "metadata": { 240 | "environment": { 241 | "kernel": "conda-root-py", 242 | "name": "workbench-notebooks.m113", 243 | "type": "gcloud", 244 | "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m113" 245 | }, 246 | "kernelspec": { 247 | "display_name": "Python 3 (ipykernel) (Local)", 248 | "language": "python", 249 | "name": "conda-root-py" 250 | }, 251 | "language_info": { 252 | "codemirror_mode": { 253 | "name": "ipython", 254 | "version": 3 255 | }, 256 | "file_extension": ".py", 257 | "mimetype": "text/x-python", 258 | "name": "python", 259 | "nbconvert_exporter": "python", 260 | "pygments_lexer": "ipython3", 261 | "version": "3.10.13" 262 | } 263 | }, 264 | "nbformat": 4, 265 | "nbformat_minor": 5 266 | } 267 | -------------------------------------------------------------------------------- /tuning_llms/utils.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | from google.cloud import storage 4 | from datasets import load_dataset 5 | import random 6 | import time 7 | import vertexai 8 | from vertexai.preview.tuning import sft 9 | from vertexai.evaluation import EvalTask, MetricPromptTemplateExamples 10 | import json 11 | import utils 12 | import mercury as mr 13 | import openai 14 | import os 15 | import pandas as pd 16 | from vertexai.preview.generative_models import GenerativeModel, GenerationConfig, Part 17 | from google.cloud.exceptions import NotFound 18 | 19 | def format_tuning_dataset(train_list, valid_list, base_instruction, train_filename, valid_filename): 20 | 21 | # Initialize lists to store messages for training and validation 22 | train_messages = [] 23 | validation_messages = [] 24 | 25 | # Iterate over training data and create messages for each dialogue-summary pair 26 | for d in train_list: 27 | prompts=[] 28 | prompts.append({"role": 
"user", "parts": [{"text": base_instruction + d["dialogue"]}]}) 29 | prompts.append({"role": "model", "parts": [{"text": d["summary"]}]}) 30 | train_messages.append({'contents': prompts}) 31 | 32 | # Iterate over validation data and create messages similarly 33 | for d in valid_list: 34 | prompts=[] 35 | prompts.append({"role": "user", "parts": [{"text": base_instruction + d["dialogue"]}]}) 36 | prompts.append({"role": "model", "parts": [{"text": d["summary"]}]}) 37 | validation_messages.append({'contents': prompts}) 38 | 39 | # Save to JSON locally 40 | dicts_to_jsonl(train_messages, train_filename, False) 41 | dicts_to_jsonl(validation_messages, valid_filename, False) 42 | 43 | # Print lengths of message lists and an example training message 44 | len(train_messages), len(validation_messages), train_messages[3] 45 | 46 | # Delete & Overwrite files to upload to GCS 47 | def delete_and_upload(filename): 48 | try: 49 | delete_blob("mchrestkha-sample-data",f"dialogsum/{filename}") 50 | except NotFound: 51 | pass 52 | upload_blob("mchrestkha-sample-data",filename,f"dialogsum/{filename}") 53 | 54 | 55 | #Submit Tuning Job 56 | def tune_gemini(train_file, valid_file, model, model_name): 57 | timestr = time.strftime("%Y%m%d-%H%M%S") 58 | model_name=model_name+timestr 59 | vertexai.init(project="mchrestkha-sandbox", location="us-central1") 60 | 61 | sft_tuning_job = sft.train( 62 | source_model=model, 63 | train_dataset=train_file, 64 | # The following parameters are optional 65 | validation_dataset=valid_file, 66 | epochs=5, 67 | adapter_size=4, 68 | learning_rate_multiplier=1.0, 69 | tuned_model_display_name=model_name, 70 | ) 71 | 72 | 73 | def upload_blob(bucket_name, source_file_name, destination_blob_name): 74 | """Uploads a file to the bucket.""" 75 | # The ID of your GCS bucket 76 | # bucket_name = "your-bucket-name" 77 | # The path to your file to upload 78 | # source_file_name = "local/path/to/file" 79 | # The ID of your GCS object 80 | # destination_blob_name = "storage-object-name" 81 | 82 | storage_client = storage.Client() 83 | bucket = storage_client.bucket(bucket_name) 84 | blob = bucket.blob(destination_blob_name) 85 | 86 | # Optional: set a generation-match precondition to avoid potential race conditions 87 | # and data corruptions. The request to upload is aborted if the object's 88 | # generation number does not match your precondition. For a destination 89 | # object that does not yet exist, set the if_generation_match precondition to 0. 90 | # If the destination object already exists in your bucket, set instead a 91 | # generation-match precondition using its generation number. 92 | generation_match_precondition = 0 93 | 94 | blob.upload_from_filename(source_file_name, if_generation_match=generation_match_precondition) 95 | 96 | print( 97 | f"File {source_file_name} uploaded to {destination_blob_name}." 98 | ) 99 | 100 | def delete_blob(bucket_name, blob_name): 101 | """Deletes a blob from the bucket.""" 102 | # bucket_name = "your-bucket-name" 103 | # blob_name = "your-object-name" 104 | 105 | storage_client = storage.Client() 106 | 107 | bucket = storage_client.bucket(bucket_name) 108 | blob = bucket.blob(blob_name) 109 | generation_match_precondition = None 110 | 111 | # Optional: set a generation-match precondition to avoid potential race conditions 112 | # and data corruptions. The request to delete is aborted if the object's 113 | # generation number does not match your precondition. 114 | blob.reload() # Fetch blob metadata to use in generation_match_precondition. 
110 |     generation_match_precondition = blob.generation
111 | 
112 |     blob.delete(if_generation_match=generation_match_precondition)
113 | 
114 |     print(f"Blob {blob_name} deleted.")
115 | 
116 | 
117 | def dicts_to_jsonl(data_list: list, filename: str, compress: bool = True) -> None:
118 |     """
119 |     Saves a list of dicts into a JSONL file.
120 | 
121 |     :param data_list: (list) list of dicts to be stored,
122 |     :param filename: (str) path to the output file. If the .jsonl suffix is not given, the
123 |         method appends it to the filename.
124 |     :param compress: (bool) should the file be compressed into a gzip archive?
125 |     """
126 | 
127 |     sjsonl = '.jsonl'
128 |     sgz = '.gz'
129 | 
130 |     # Check filename
131 |     if not filename.endswith(sjsonl):
132 |         filename = filename + sjsonl
133 | 
134 |     # Save data
135 |     if compress:
136 |         filename = filename + sgz
137 |         with gzip.open(filename, 'w') as compressed:
138 |             for ddict in data_list:
139 |                 jout = json.dumps(ddict) + '\n'
140 |                 jout = jout.encode('utf-8')
141 |                 compressed.write(jout)
142 |     else:
143 |         with open(filename, 'w') as out:
144 |             for ddict in data_list:
145 |                 jout = json.dumps(ddict) + '\n'
146 |                 out.write(jout)
--------------------------------------------------------------------------------
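Usage note: the helpers in `utils.py` are designed to be chained together from the tuning notebooks: format raw records into Gemini-style JSONL, replace any stale copies in GCS, then submit and poll a supervised tuning job. The sketch below is a minimal, hypothetical illustration of that flow, not part of the repository — the toy `train_list`/`valid_list` records and the `base_instruction` string are stand-ins, it assumes access to the `mchrestkha-sample-data` bucket and `mchrestkha-sandbox` project hard-coded in `utils.py`, and it relies on `tune_gemini` returning the job handle:

```python
import time

import utils

# Assumed record shape: dicts with "dialogue" and "summary" keys (e.g. DialogSum rows).
# At least four training records are needed because format_tuning_dataset prints train_messages[3].
train_list = [{"dialogue": "A: Hi. B: Hello.", "summary": "A greets B."}] * 4
valid_list = [{"dialogue": "A: Bye. B: See you.", "summary": "They say goodbye."}] * 4

# 1. Write Gemini-format JSONL files locally (dicts_to_jsonl appends the .jsonl suffix).
utils.format_tuning_dataset(
    train_list,
    valid_list,
    base_instruction="Summarize the following conversation:\n\n",
    train_filename="train_contents",
    valid_filename="valid_contents",
)

# 2. Replace any existing copies under dialogsum/ in the hard-coded GCS bucket.
utils.delete_and_upload("train_contents.jsonl")
utils.delete_and_upload("valid_contents.jsonl")

# 3. Submit the tuning job and poll it, mirroring the loop in tuning_legalbench.ipynb.
job = utils.tune_gemini(
    train_file="gs://mchrestkha-sample-data/dialogsum/train_contents.jsonl",
    valid_file="gs://mchrestkha-sample-data/dialogsum/valid_contents.jsonl",
    model="gemini-1.5-pro-001",
    model_name="dialogsum_tuning_",
)
while not job.has_ended:
    time.sleep(60)
    job.refresh()
print(job.tuned_model_endpoint_name)
```

Real jobs run far longer than one 60-second poll interval, and `tune_gemini` stamps the display name with the submission time, so repeated runs stay distinguishable in the Vertex AI console.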