├── .gitignore ├── LICENSE ├── README.md ├── catsdogs ├── cicd-example │ ├── cloudbuild.yaml │ └── test.py └── tensorflow │ ├── data │ ├── datagcs.csv │ └── datalocal.csv │ ├── notebooks │ ├── data_prep.ipynb │ └── model_training_and_deployment_local.ipynb │ └── tfcloud │ ├── model_training_tfcloud.ipynb │ ├── requirements.txt │ └── run_tfcloud.py ├── census ├── catboost │ └── gcp_ai_platform │ │ ├── notebooks │ │ └── catboost_census_notebook.ipynb │ │ ├── scripts │ │ └── train-cloud.sh │ │ ├── setup.py │ │ └── trainer │ │ ├── __init__.py │ │ └── train.py └── xgboost │ └── gcp_ai_platform │ ├── notebooks │ └── xgboost_census_notebook.ipynb │ ├── scripts │ └── train-cloud.sh │ └── trainer │ ├── __init__.py │ └── train.py ├── fannie_mae_loans └── rapids_xgboost │ └── notebooks │ └── dask_rapids.ipynb ├── higgs └── rapids_xgboost │ └── notebooks │ ├── a100_higgs_rapids_xgboost.ipynb │ └── t4_higgs_rapids_xgboost.ipynb ├── mlflow-vertex ├── mlflow-databricks-vertex-deployment.ipynb └── mlflow-oss-vertex-deployment.ipynb └── tuning_llms ├── tuning_dialogsum.ipynb ├── tuning_legalbench.ipynb └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
95 | __pypackages__/
96 | 
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Mikhail Chrestkha
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Machine Learning Examples
2 | ## Organized by:
3 | ```markdown
4 | -dataset (e.g. Iris, MNIST, ImageNet)
5 | ---framework (e.g.
TensorFlow, XGBoost) 6 | -----notebooks 7 | -----scripts 8 | -------------------------------------------------------------------------------- /catsdogs/cicd-example/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /catsdogs/cicd-example/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tensorflow import keras 3 | from tensorflow.keras import layers 4 | 5 | # Model / data parameters 6 | num_classes = 10 7 | input_shape = (28, 28, 1) 8 | 9 | # the data, split between train and test sets 10 | (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data() 11 | 12 | # Scale images to the [0, 1] range 13 | x_train = x_train.astype("float32") / 255 14 | x_test = x_test.astype("float32") / 255 15 | # Make sure images have shape (28, 28, 1) 16 | x_train = np.expand_dims(x_train, -1) 17 | x_test = np.expand_dims(x_test, -1) 18 | print("x_train shape:", x_train.shape) 19 | print(x_train.shape[0], "train samples") 20 | print(x_test.shape[0], "test samples") 21 | 22 | 23 | # convert class vectors to binary class matrices 24 | y_train = keras.utils.to_categorical(y_train, num_classes) 25 | y_test = keras.utils.to_categorical(y_test, num_classes) 26 | 27 | model = keras.Sequential( 28 | [ 29 | keras.Input(shape=input_shape), 30 | layers.Conv2D(32, kernel_size=(3, 3), activation="relu"), 31 | layers.MaxPooling2D(pool_size=(2, 2)), 32 | layers.Conv2D(64, kernel_size=(3, 3), activation="relu"), 33 | layers.MaxPooling2D(pool_size=(2, 2)), 34 | layers.Flatten(), 35 | layers.Dropout(0.5), 36 | layers.Dense(num_classes, activation="softmax"), 37 | ] 38 | ) 39 | 40 | model.summary() 41 | 42 | batch_size = 128 43 | epochs = 2 44 | 45 | model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]) 46 | 47 | model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1) 48 | 49 | score = model.evaluate(x_test, y_test, verbose=0) 50 | print("Test loss:", score[0]) 51 | print("Test accuracy:", score[1]) 52 | -------------------------------------------------------------------------------- /catsdogs/tensorflow/notebooks/data_prep.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Libraries and datasets required" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "original dataset from https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip \\\n", 15 | "Images: 3000 (2000 Training, 1000 Validation) " 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 96, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "0.10.4\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "import os\n", 33 | "import pandas as pd\n", 34 | "import tfrecorder\n", 35 | "import wandb \n", 36 | "print(wandb.__version__)\n", 37 | "import tensorflow as tf\n", 38 | "import time" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 97, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "os.system('wandb login 3a6710e811d34207ea03768ba12e7ea6c8a9fefd')\n", 48 | "os.environ['WANDB_NOTEBOOK_NAME'] = 'data_prep.ipynb'" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": 
{}, 54 | "source": [ 55 | "### GCS Fuse to be able to use os utilities on GCS without copying data" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 98, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "#!gcsfuse --implicit-dirs mchrestkha-demo-env-ml-examples /home/jupyter/gcs/ \n", 65 | "#!fusermount -u /home/jupyter/gcs/" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### Collect all image URIs" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 108, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/html": [ 83 | "\n", 84 | " Tracking run with wandb version 0.10.4
\n", 85 | " Syncing run set_up_data_20201005_053719 to Weights & Biases (Documentation).
\n", 86 | " Project page: https://wandb.ai/mchrestkha/cats-dogs-keras
\n", 87 | " Run page: https://wandb.ai/mchrestkha/cats-dogs-keras/runs/596flvgm
\n", 88 | " Run data is saved locally in wandb/run-20201005_053719-596flvgm

\n", 89 | " " 90 | ], 91 | "text/plain": [ 92 | "" 93 | ] 94 | }, 95 | "metadata": {}, 96 | "output_type": "display_data" 97 | } 98 | ], 99 | "source": [ 100 | "RUN_NAME=time.strftime(\"set_up_data_%Y%m%d_%H%M%S\")\n", 101 | "run = wandb.init(project='cats-dogs-keras', job_type='data', name=RUN_NAME)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 115, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "wandb.log({\"cat_examples\": wandb.Image(\"/home/jupyter/gcs/catsdogs/train/cats/cat.1.jpg\", caption=\"Cat1\")})\n", 111 | "wandb.log({\"cat_examples\": wandb.Image(\"/home/jupyter/gcs/catsdogs/train/cats/cat.2.jpg\", caption=\"Cat2\")})\n", 112 | "wandb.log({\"dog_examples\": wandb.Image(\"/home/jupyter/gcs/catsdogs/train/dogs/dog.1.jpg\", caption=\"Dog1\")})\n", 113 | "wandb.log({\"dog_examples\": wandb.Image(\"/home/jupyter/gcs/catsdogs/train/dogs/dog.2.jpg\", caption=\"Dog2\")})" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "myDir='/home/jupyter/gcs/catsdogs/'\n", 123 | "format='.jpg'\n", 124 | "fileList = []\n", 125 | "for root, dirs, files in os.walk(myDir, topdown=False):\n", 126 | " for name in files:\n", 127 | " if name.endswith(format):\n", 128 | " fullName = os.path.join(root, name)\n", 129 | " fileList.append(fullName)\n", 130 | " \n", 131 | "fileList[:10]" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "df=pd.DataFrame(fileList)\n", 141 | "df.columns = ['image_uri']\n", 142 | "df.head()" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "df.loc[df['image_uri'].str.contains('train'), 'split'] = 'TRAIN'\n", 152 | "df.loc[df['image_uri'].str.contains('validation'), 'split'] = 'VALIDATION'\n", 153 | "df.loc[df['image_uri'].str.contains('|'.join(['train/cats', 'validation/cats'])), 'label'] = 'cats'\n", 154 | "df.loc[df['image_uri'].str.contains('|'.join(['train/dogs', 'validation/dogs'])), 'label'] = 'dogs'\n", 155 | "df = df[['split', 'image_uri', 'label']]\n", 156 | "df.head()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "dflocal=df.copy()\n", 166 | "dfgcs=df.copy()\n", 167 | "dfgcs['image_uri'] = dfgcs['image_uri'].str.replace('/home/jupyter/gcs','gs://mchrestkha-demo-env-ml-examples')\n", 168 | "dflocal.to_csv('../data/datalocal1.csv', index=False)\n", 169 | "dfgcs.to_csv('../data/datagcs1.csv', index=False)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "artifact = wandb.Artifact(name='training_images', type='dataset')\n", 179 | "artifact.add_reference('gs://mchrestkha-demo-env-ml-examples/catsdogs/train/')\n", 180 | "run.log_artifact(artifact)\n", 181 | "artifact = wandb.Artifact(name='validation_images', type='dataset')\n", 182 | "artifact.add_reference('gs://mchrestkha-demo-env-ml-examples/catsdogs/validation/')\n", 183 | "run.log_artifact(artifact)\n", 184 | "artifact = wandb.Artifact(name='image_uris_csv', type='dataset')\n", 185 | "artifact.add_file('../data/datagcs.csv')\n", 186 | "run.log_artifact(artifact)\n", 187 | "run.finish()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 
193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "RUN_NAME=time.strftime(\"generate_tfrecords_%Y%m%d_%H%M%S\")\n", 197 | "run = wandb.init(project='cats-dogs-keras',job_type='data', name=RUN_NAME)\n", 198 | "artifact = run.use_artifact('training_images:latest')\n", 199 | "artifact = run.use_artifact('validation_images:latest')\n", 200 | "artifact = run.use_artifact('image_uris_csv:latest')" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "FILENAME='../data/datagcs.csv'\n", 210 | "TFRECORD_OUTPUT='gs://mchrestkha-demo-env-ml-examples/catsdogs/tfrecords'\n", 211 | "PROJECT='mchrestkha-demo-env'\n", 212 | "REGION='us-west1'\n", 213 | "TFRECORDER_WHEEL='/home/jupyter/tfrecorder_wheel/tfrecorder-0.1.2-py3-none-any.whl'\n", 214 | "\n", 215 | "\n", 216 | "dfgcs = pd.read_csv(FILENAME)\n", 217 | "dfgcs.tensorflow.to_tfr(\n", 218 | " output_dir=TFRECORD_OUTPUT,\n", 219 | " runner='DataflowRunner',\n", 220 | " project=PROJECT,\n", 221 | " region=REGION,\n", 222 | " tfrecorder_wheel=TFRECORDER_WHEEL)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "dfgcs" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "artifact = wandb.Artifact(name='tfrecords', type='dataset')\n", 241 | "artifact.add_reference('gs://mchrestkha-demo-env-ml-examples/catsdogs/tfrecords/')\n", 242 | "run.log_artifact(artifact)\n", 243 | "run.finish()" 244 | ] 245 | } 246 | ], 247 | "metadata": { 248 | "environment": { 249 | "name": "tf2-2-3-gpu.2-3.m55", 250 | "type": "gcloud", 251 | "uri": "gcr.io/deeplearning-platform-release/tf2-2-3-gpu.2-3:m55" 252 | }, 253 | "kernelspec": { 254 | "display_name": "mchrestkha-env", 255 | "language": "python", 256 | "name": "mchrestkha-env" 257 | }, 258 | "language_info": { 259 | "codemirror_mode": { 260 | "name": "ipython", 261 | "version": 3 262 | }, 263 | "file_extension": ".py", 264 | "mimetype": "text/x-python", 265 | "name": "python", 266 | "nbconvert_exporter": "python", 267 | "pygments_lexer": "ipython3", 268 | "version": "3.7.8" 269 | } 270 | }, 271 | "nbformat": 4, 272 | "nbformat_minor": 4 273 | } 274 | -------------------------------------------------------------------------------- /catsdogs/tensorflow/notebooks/model_training_and_deployment_local.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import tensorflow as tf\n", 10 | "#import matplotlib.pyplot as plt\n", 11 | "#import IPython.display as display\n", 12 | "from tensorflow.keras.optimizers import RMSprop\n", 13 | "import tensorflow_cloud as tfc\n", 14 | "import time\n", 15 | "import wandb\n", 16 | "from wandb.keras import WandbCallback\n", 17 | "import os" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "os.system('wandb login 3a6710e811d34207ea03768ba12e7ea6c8a9fefd')\n", 27 | "os.environ['WANDB_NOTEBOOK_NAME'] = 'model_training_and_deployment_local.ipynb'" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | 
"RUN_NAME=time.strftime(\"train_%Y%m%d_%H%M%S\")\n", 37 | "run = wandb.init(project='cats-dogs-keras',job_type='train', name=RUN_NAME)\n", 38 | "artifact = run.use_artifact('tfrecords:latest')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "AUTOTUNE = tf.data.experimental.AUTOTUNE\n", 48 | "GCS_PATH = \"gs://mchrestkha-demo-env-ml-examples/catsdogs/tfrecords/tfrecorder-20200930-193548-to-tfr\"\n", 49 | "BATCH_SIZE = 5\n", 50 | "IMAGE_SIZE = [150, 150]" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "TRAINING_FILENAMES=tf.io.gfile.glob(GCS_PATH + \"/train*.tfrecord.gz\")\n", 60 | "VALID_FILENAMES=tf.io.gfile.glob(GCS_PATH + \"/validation*.tfrecord.gz\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "print(\"Train TFRecord Files:\", len(TRAINING_FILENAMES))\n", 70 | "print(\"Validation TFRecord Files:\", len(VALID_FILENAMES))" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "def read_tfrecord(example):\n", 80 | " tfr_format = {\n", 81 | " \"image\": tf.io.FixedLenFeature([], tf.string),\n", 82 | " \"image_channels\": tf.io.FixedLenFeature([], tf.int64),\n", 83 | " \"image_height\": tf.io.FixedLenFeature([], tf.int64),\n", 84 | " \"image_name\": tf.io.FixedLenFeature([], tf.string),\n", 85 | " \"image_width\": tf.io.FixedLenFeature([], tf.int64),\n", 86 | " \"label\": tf.io.FixedLenFeature([], tf.int64),\n", 87 | " \"split\": tf.io.FixedLenFeature([], tf.string),\n", 88 | " }\n", 89 | " image_features= tf.io.parse_single_example(example, tfr_format)\n", 90 | " image_channels=image_features['image_channels']\n", 91 | " image_width=image_features['image_width']\n", 92 | " image_height=image_features['image_height']\n", 93 | " label=image_features['label']\n", 94 | " image_b64_bytes=image_features['image']\n", 95 | " image_decoded=tf.io.decode_base64(image_b64_bytes)\n", 96 | " image_raw = tf.io.decode_raw(image_decoded, out_type=tf.uint8)\n", 97 | " image = tf.reshape(image_raw, tf.stack([image_height, image_width, image_channels]))\n", 98 | " image_resized = tf.cast(tf.image.resize(image, size=[*IMAGE_SIZE]),tf.uint8)\n", 99 | " return image_resized, label" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "def get_dataset(filenames):\n", 109 | " dataset = tf.data.TFRecordDataset(filenames=filenames, compression_type='GZIP') \n", 110 | " dataset = dataset.map(read_tfrecord)\n", 111 | " dataset = dataset.shuffle(200)\n", 112 | " dataset = dataset.batch(BATCH_SIZE)\n", 113 | " return dataset" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "train_dataset = get_dataset(TRAINING_FILENAMES)\n", 123 | "valid_dataset = get_dataset(VALID_FILENAMES)\n", 124 | "# image_batch, label_batch = next(iter(train_dataset))\n", 125 | "# image_batch[0].numpy()\n", 126 | "# for n in range(2):\n", 127 | "# plt.imshow(image_batch[n]) " 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "model = tf.keras.models.Sequential([\n", 137 | " 
tf.keras.layers.Conv2D(16, (3,3), activation='relu', input_shape=(150, 150, 3)),\n", 138 | " tf.keras.layers.MaxPooling2D(2, 2),\n", 139 | " tf.keras.layers.Conv2D(32, (3,3), activation='relu'),\n", 140 | " tf.keras.layers.MaxPooling2D(2,2),\n", 141 | " tf.keras.layers.Conv2D(64, (3,3), activation='relu'),\n", 142 | " tf.keras.layers.MaxPooling2D(2,2),\n", 143 | " tf.keras.layers.Conv2D(64, (3,3), activation='relu'),\n", 144 | " tf.keras.layers.MaxPooling2D(2,2),\n", 145 | " tf.keras.layers.Flatten(),\n", 146 | " tf.keras.layers.Dense(256, activation='relu'),\n", 147 | " tf.keras.layers.Dense(1, activation='sigmoid')\n", 148 | "])\n", 149 | "\n", 150 | "model.summary()\n", 151 | "model.compile(loss='binary_crossentropy',\n", 152 | " optimizer=RMSprop(lr=1e-4),\n", 153 | " metrics=['accuracy'])" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "model.fit(\n", 163 | " train_dataset,\n", 164 | " epochs=10,\n", 165 | " validation_data=valid_dataset,\n", 166 | " verbose=2,\n", 167 | " callbacks=[WandbCallback()]\n", 168 | ")" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "MODEL_PATH=time.strftime(\"gs://mchrestkha-demo-env-ml-examples/catsdogs/models/model_%Y%m%d_%H%M%S\")\n", 178 | "model.save(MODEL_PATH)\n", 179 | "\n", 180 | "\n", 181 | "artifact = wandb.Artifact(name='model', type='model')\n", 182 | "artifact.add_reference(MODEL_PATH)\n", 183 | "run.log_artifact(artifact)\n", 184 | "run.finish()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "os.environ['MODEL_PATH'] = MODEL_PATH #to be later used gcloud bash script\n", 194 | "print(os.environ['MODEL_PATH'])" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "%%bash\n", 204 | "MODEL_VERSION=\"v1\"\n", 205 | "MODEL_NAME=\"cats_dogs_classifier3\"\n", 206 | "REGION=\"us-central1\"\n", 207 | "\n", 208 | "gcloud ai-platform models create $MODEL_NAME \\\n", 209 | " --regions $REGION\n", 210 | "\n", 211 | "gcloud ai-platform versions create $MODEL_VERSION \\\n", 212 | " --model $MODEL_NAME \\\n", 213 | " --runtime-version 2.2 \\\n", 214 | " --python-version 3.7 \\\n", 215 | " --framework tensorflow \\\n", 216 | " --origin $MODEL_PATH" 217 | ] 218 | } 219 | ], 220 | "metadata": { 221 | "environment": { 222 | "name": "tf2-2-3-gpu.2-3.m55", 223 | "type": "gcloud", 224 | "uri": "gcr.io/deeplearning-platform-release/tf2-2-3-gpu.2-3:m55" 225 | }, 226 | "kernelspec": { 227 | "display_name": "mchrestkha-env", 228 | "language": "python", 229 | "name": "mchrestkha-env" 230 | }, 231 | "language_info": { 232 | "codemirror_mode": { 233 | "name": "ipython", 234 | "version": 3 235 | }, 236 | "file_extension": ".py", 237 | "mimetype": "text/x-python", 238 | "name": "python", 239 | "nbconvert_exporter": "python", 240 | "pygments_lexer": "ipython3", 241 | "version": "3.7.8" 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 4 246 | } 247 | -------------------------------------------------------------------------------- /catsdogs/tensorflow/tfcloud/model_training_tfcloud.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": 
{}, 7 | "outputs": [], 8 | "source": [ 9 | "import tensorflow as tf\n", 10 | "from tensorflow.keras.optimizers import RMSprop\n", 11 | "import tensorflow_cloud as tfc\n", 12 | "import time\n", 13 | "import os" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "AUTOTUNE = tf.data.experimental.AUTOTUNE\n", 23 | "GCS_PATH = \"gs://mchrestkha-demo-env-ml-examples/catsdogs/tfrecords/tfrecorder-20200930-193548-to-tfr\"\n", 24 | "BATCH_SIZE = 5\n", 25 | "IMAGE_SIZE = [150, 150]" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "TRAINING_FILENAMES=tf.io.gfile.glob(GCS_PATH + \"/train*.tfrecord.gz\")\n", 35 | "VALID_FILENAMES=tf.io.gfile.glob(GCS_PATH + \"/validation*.tfrecord.gz\")" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "print(\"Train TFRecord Files:\", len(TRAINING_FILENAMES))\n", 45 | "print(\"Validation TFRecord Files:\", len(VALID_FILENAMES))" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "def read_tfrecord(example):\n", 55 | " tfr_format = {\n", 56 | " \"image\": tf.io.FixedLenFeature([], tf.string),\n", 57 | " \"image_channels\": tf.io.FixedLenFeature([], tf.int64),\n", 58 | " \"image_height\": tf.io.FixedLenFeature([], tf.int64),\n", 59 | " \"image_name\": tf.io.FixedLenFeature([], tf.string),\n", 60 | " \"image_width\": tf.io.FixedLenFeature([], tf.int64),\n", 61 | " \"label\": tf.io.FixedLenFeature([], tf.int64),\n", 62 | " \"split\": tf.io.FixedLenFeature([], tf.string),\n", 63 | " }\n", 64 | " image_features= tf.io.parse_single_example(example, tfr_format)\n", 65 | " image_channels=image_features['image_channels']\n", 66 | " image_width=image_features['image_width']\n", 67 | " image_height=image_features['image_height']\n", 68 | " label=image_features['label']\n", 69 | " image_b64_bytes=image_features['image']\n", 70 | " image_decoded=tf.io.decode_base64(image_b64_bytes)\n", 71 | " image_raw = tf.io.decode_raw(image_decoded, out_type=tf.uint8)\n", 72 | " image = tf.reshape(image_raw, tf.stack([image_height, image_width, image_channels]))\n", 73 | " image_resized = tf.cast(tf.image.resize(image, size=[*IMAGE_SIZE]),tf.uint8)\n", 74 | " return image_resized, label" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "def get_dataset(filenames):\n", 84 | " dataset = tf.data.TFRecordDataset(filenames=filenames, compression_type='GZIP') \n", 85 | " dataset = dataset.map(read_tfrecord)\n", 86 | " dataset = dataset.shuffle(200)\n", 87 | " dataset = dataset.batch(BATCH_SIZE)\n", 88 | " return dataset" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "train_dataset = get_dataset(TRAINING_FILENAMES)\n", 98 | "valid_dataset = get_dataset(VALID_FILENAMES)\n", 99 | "# image_batch, label_batch = next(iter(train_dataset))\n", 100 | "# image_batch[0].numpy()\n", 101 | "# for n in range(2):\n", 102 | "# plt.imshow(image_batch[n]) " 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "model = tf.keras.models.Sequential([\n", 112 | " 
tf.keras.layers.Conv2D(16, (3,3), activation='relu', input_shape=(150, 150, 3)),\n", 113 | " tf.keras.layers.MaxPooling2D(2, 2),\n", 114 | " tf.keras.layers.Conv2D(32, (3,3), activation='relu'),\n", 115 | " tf.keras.layers.MaxPooling2D(2,2),\n", 116 | " tf.keras.layers.Conv2D(64, (3,3), activation='relu'),\n", 117 | " tf.keras.layers.MaxPooling2D(2,2),\n", 118 | " tf.keras.layers.Conv2D(64, (3,3), activation='relu'),\n", 119 | " tf.keras.layers.MaxPooling2D(2,2),\n", 120 | " tf.keras.layers.Flatten(),\n", 121 | " tf.keras.layers.Dense(256, activation='relu'),\n", 122 | " tf.keras.layers.Dense(1, activation='sigmoid')\n", 123 | "])\n", 124 | "\n", 125 | "#model.summary()\n", 126 | "model.compile(loss='binary_crossentropy',\n", 127 | " optimizer=RMSprop(lr=1e-4),\n", 128 | " metrics=['accuracy'])" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "model.fit(\n", 138 | " train_dataset,\n", 139 | " epochs=10,\n", 140 | " validation_data=valid_dataset,\n", 141 | " verbose=2)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "model.save(time.strftime(\"gs://mchrestkha-demo-env-ml-examples/catsdogs/models/model_%Y%m%d_%H%M%S\"))" 151 | ] 152 | } 153 | ], 154 | "metadata": { 155 | "environment": { 156 | "name": "tf2-2-3-gpu.2-3.m55", 157 | "type": "gcloud", 158 | "uri": "gcr.io/deeplearning-platform-release/tf2-2-3-gpu.2-3:m55" 159 | }, 160 | "kernelspec": { 161 | "display_name": "mchrestkha-env", 162 | "language": "python", 163 | "name": "mchrestkha-env" 164 | }, 165 | "language_info": { 166 | "codemirror_mode": { 167 | "name": "ipython", 168 | "version": 3 169 | }, 170 | "file_extension": ".py", 171 | "mimetype": "text/x-python", 172 | "name": "python", 173 | "nbconvert_exporter": "python", 174 | "pygments_lexer": "ipython3", 175 | "version": "3.7.8" 176 | } 177 | }, 178 | "nbformat": 4, 179 | "nbformat_minor": 4 180 | } 181 | -------------------------------------------------------------------------------- /catsdogs/tensorflow/tfcloud/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==2.12.1 2 | tensorflow-cloud==0.1.7 3 | wandb==0.10.4 4 | 5 | -------------------------------------------------------------------------------- /catsdogs/tensorflow/tfcloud/run_tfcloud.py: -------------------------------------------------------------------------------- 1 | import tensorflow_cloud as tfc 2 | tfc.run(entry_point='model_training_tfcloud.ipynb', 3 | # chief_config=tfc.COMMON_MACHINE_CONFIGS['T4_4X'], 4 | requirements_txt='requirements.txt') -------------------------------------------------------------------------------- /census/catboost/gcp_ai_platform/notebooks/catboost_census_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Used https://github.com/GoogleCloudPlatform/cloudml-samples/blob/master/xgboost/notebooks/census_training/train.py as a starting point and adjusted to CatBoost" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 37, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "#Google Cloud Libraries\n", 17 | "from google.cloud import storage\n", 18 | "\n", 19 | "\n", 20 | "#System Libraries\n", 21 | "import datetime\n", 22 | "import subprocess\n", 23 | "\n", 
24 | "#Data Libraries\n", 25 | "import pandas as pd\n", 26 | "import numpy as np\n", 27 | "\n", 28 | "#ML Libraries\n", 29 | "from sklearn.model_selection import train_test_split\n", 30 | "from sklearn.metrics import accuracy_score\n", 31 | "from sklearn.preprocessing import LabelEncoder\n", 32 | "from sklearn.model_selection import train_test_split\n", 33 | "import xgboost as xgb\n", 34 | "from catboost import CatBoostClassifier, Pool, cv\n", 35 | "from catboost import CatBoost, Pool\n", 36 | "\n" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 50, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "I see 1 GPU devices\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "from catboost.utils import get_gpu_device_count\n", 54 | "print('I see %i GPU devices' % get_gpu_device_count())" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 61, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# Fill in your Cloud Storage bucket name\n", 64 | "BUCKET_ID = \"mchrestkha-demo-env-ml-examples\"\n", 65 | "\n", 66 | "census_data_filename = 'adult.data.csv'\n", 67 | "\n", 68 | "# Public bucket holding the census data\n", 69 | "bucket = storage.Client().bucket('cloud-samples-data')\n", 70 | "\n", 71 | "# Path to the data inside the public bucket\n", 72 | "data_dir = 'ai-platform/census/data/'\n", 73 | "\n", 74 | "# Download the data\n", 75 | "blob = bucket.blob(''.join([data_dir, census_data_filename]))\n", 76 | "blob.download_to_filename(census_data_filename)\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 38, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# these are the column labels from the census data files\n", 86 | "COLUMNS = (\n", 87 | " 'age',\n", 88 | " 'workclass',\n", 89 | " 'fnlwgt',\n", 90 | " 'education',\n", 91 | " 'education-num',\n", 92 | " 'marital-status',\n", 93 | " 'occupation',\n", 94 | " 'relationship',\n", 95 | " 'race',\n", 96 | " 'sex',\n", 97 | " 'capital-gain',\n", 98 | " 'capital-loss',\n", 99 | " 'hours-per-week',\n", 100 | " 'native-country',\n", 101 | " 'income-level'\n", 102 | ")\n", 103 | "# categorical columns contain data that need to be turned into numerical values before being used by XGBoost\n", 104 | "CATEGORICAL_COLUMNS = (\n", 105 | " 'workclass',\n", 106 | " 'education',\n", 107 | " 'marital-status',\n", 108 | " 'occupation',\n", 109 | " 'relationship',\n", 110 | " 'race',\n", 111 | " 'sex',\n", 112 | " 'native-country'\n", 113 | ")\n", 114 | "\n", 115 | "# Load the training census dataset\n", 116 | "with open(census_data_filename, 'r') as train_data:\n", 117 | " raw_training_data = pd.read_csv(train_data, header=None, names=COLUMNS)\n", 118 | "# remove column we are trying to predict ('income-level') from features list\n", 119 | "X = raw_training_data.drop('income-level', axis=1)\n", 120 | "# create training labels list\n", 121 | "#train_labels = (raw_training_data['income-level'] == ' >50K')\n", 122 | "y = raw_training_data['income-level']" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 39, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# Since the census data set has categorical features, we need to convert\n", 139 | "# them to numerical values.\n", 140 | "# convert data in categorical columns to numerical 
values\n", 141 | "X_enc=X\n", 142 | "encoders = {col:LabelEncoder() for col in CATEGORICAL_COLUMNS}\n", 143 | "for col in CATEGORICAL_COLUMNS:\n", 144 | " X_enc[col] = encoders[col].fit_transform(X[col])\n", 145 | " " 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 40, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "y_enc=LabelEncoder().fit_transform(y)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 43, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "X_train, X_validation, y_train, y_validation = train_test_split(X_enc, y_enc, train_size=0.75, random_state=42)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "print(type(y))\n", 173 | "print(type(y_enc))" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 58, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "Learning rate set to 0.069772\n", 186 | "0:\tlearn: 0.6282687\ttest: 0.6273059\tbest: 0.6273059 (0)\ttotal: 11.3ms\tremaining: 11.2s\n", 187 | "50:\tlearn: 0.3021165\ttest: 0.3008721\tbest: 0.3008721 (50)\ttotal: 530ms\tremaining: 9.87s\n", 188 | "100:\tlearn: 0.2857407\ttest: 0.2886646\tbest: 0.2886646 (100)\ttotal: 1.03s\tremaining: 9.14s\n", 189 | "150:\tlearn: 0.2748276\ttest: 0.2825841\tbest: 0.2825841 (150)\ttotal: 1.53s\tremaining: 8.59s\n", 190 | "200:\tlearn: 0.2660846\ttest: 0.2787806\tbest: 0.2787806 (200)\ttotal: 2.02s\tremaining: 8.04s\n", 191 | "250:\tlearn: 0.2594067\ttest: 0.2771832\tbest: 0.2771832 (250)\ttotal: 2.52s\tremaining: 7.52s\n", 192 | "Stopped by overfitting detector (20 iterations wait)\n", 193 | "\n", 194 | "bestTest = 0.2770424728\n", 195 | "bestIteration = 257\n", 196 | "\n", 197 | "Shrink model to first 258 iterations.\n", 198 | "CPU times: user 9.63 s, sys: 788 ms, total: 10.4 s\n", 199 | "Wall time: 2.85 s\n" 200 | ] 201 | }, 202 | { 203 | "data": { 204 | "text/plain": [ 205 | "" 206 | ] 207 | }, 208 | "execution_count": 58, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "%%time\n", 215 | "\n", 216 | "#model = CatBoost({'iterations':50})\n", 217 | "model=CatBoostClassifier(\n", 218 | " od_type='Iter'\n", 219 | "#iterations=5000,\n", 220 | "#custom_loss=['Accuracy']\n", 221 | ")\n", 222 | "model.fit(\n", 223 | " X_train,y_train,eval_set=(X_validation, y_validation),\n", 224 | "\n", 225 | " verbose=50)\n", 226 | "\n", 227 | "# # load data into DMatrix object\n", 228 | "# dtrain = xgb.DMatrix(train_features, train_labels)\n", 229 | "# # train model\n", 230 | "# bst = xgb.train({}, dtrain, 20)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 69, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "# Export the model to a file\n", 254 | "fname = 'catboost_census_model.onnx'\n", 255 | "model.save_model(fname, format='onnx')\n", 256 | "\n", 257 | "# Upload the model to GCS\n", 258 | "bucket = storage.Client().bucket(BUCKET_ID)\n", 259 | "blob = bucket.blob('{}/{}'.format(\n", 260 | " 
datetime.datetime.now().strftime('census/catboost_model_dir/catboost_census_%Y%m%d_%H%M%S'),\n", 261 | " fname))\n", 262 | "blob.upload_from_filename(fname)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 66, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "gs://mchrestkha-demo-env-ml-examples/census/catboost_census_20200525_212707/:\n", 275 | "gs://mchrestkha-demo-env-ml-examples/census/catboost_census_20200525_212707/\n", 276 | "\n", 277 | "gs://mchrestkha-demo-env-ml-examples/census/catboost_census_20200525_212852/:\n", 278 | "gs://mchrestkha-demo-env-ml-examples/census/catboost_census_20200525_212852/\n", 279 | "\n", 280 | "gs://mchrestkha-demo-env-ml-examples/census/catboost_census_20200525_213004/:\n", 281 | "gs://mchrestkha-demo-env-ml-examples/census/catboost_census_20200525_213004/\n", 282 | "\n", 283 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_020526/:\n", 284 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_020526/model.bst\n", 285 | "\n", 286 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_021023/:\n", 287 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_021023/model.bst\n", 288 | "\n", 289 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_023122/:\n", 290 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_023122/model.bst\n", 291 | "\n", 292 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_job_dir/:\n", 293 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_job_dir/packages/\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "!gsutil ls gs://$BUCKET_ID/census/*" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [] 307 | } 308 | ], 309 | "metadata": { 310 | "kernelspec": { 311 | "display_name": "Python 3", 312 | "language": "python", 313 | "name": "python3" 314 | }, 315 | "language_info": { 316 | "codemirror_mode": { 317 | "name": "ipython", 318 | "version": 3 319 | }, 320 | "file_extension": ".py", 321 | "mimetype": "text/x-python", 322 | "name": "python", 323 | "nbconvert_exporter": "python", 324 | "pygments_lexer": "ipython3", 325 | "version": "3.7.6" 326 | } 327 | }, 328 | "nbformat": 4, 329 | "nbformat_minor": 4 330 | } 331 | -------------------------------------------------------------------------------- /census/catboost/gcp_ai_platform/scripts/train-cloud.sh: -------------------------------------------------------------------------------- 1 | echo "Submitting an AI Platform job..." 
2 | 
3 | 
4 | PROJECT_ID="mchrestkha-demo-env"
5 | BUCKET_ID="mchrestkha-demo-env-ml-examples"
6 | JOB_NAME=catboost_census_training_$(date +"%Y%m%d_%H%M%S")
7 | JOB_DIR=gs://$BUCKET_ID/census/catboost_job_dir
8 | TRAINING_PACKAGE_PATH="../trainer/"
9 | MAIN_TRAINER_MODULE=trainer.train
10 | REGION=us-west1
11 | RUNTIME_VERSION=2.1
12 | PYTHON_VERSION=3.7
13 | SCALE_TIER=BASIC
14 | 
15 | # Submit the packaged trainer module to AI Platform Training on a single BASIC-tier worker.
16 | gcloud ai-platform jobs submit training $JOB_NAME \
17 |     --job-dir $JOB_DIR \
18 |     --package-path $TRAINING_PACKAGE_PATH \
19 |     --module-name $MAIN_TRAINER_MODULE \
20 |     --region $REGION \
21 |     --runtime-version=$RUNTIME_VERSION \
22 |     --python-version=$PYTHON_VERSION \
23 |     --scale-tier $SCALE_TIER
24 | 
25 | 
--------------------------------------------------------------------------------
/census/catboost/gcp_ai_platform/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages
2 | from setuptools import setup
3 | 
4 | REQUIRED_PACKAGES = ['catboost']
5 | 
6 | setup(
7 |     name='trainer',
8 |     version='0.1',
9 |     install_requires=REQUIRED_PACKAGES,
10 |     packages=find_packages(),
11 |     include_package_data=True,
12 |     description='My training application package.'
13 | )
--------------------------------------------------------------------------------
/census/catboost/gcp_ai_platform/trainer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mchrestkha/machine_learning_examples/9382b3426a4423d720df588bb510be98bb77599d/census/catboost/gcp_ai_platform/trainer/__init__.py
--------------------------------------------------------------------------------
/census/catboost/gcp_ai_platform/trainer/train.py:
--------------------------------------------------------------------------------
1 | # Google Cloud libraries
2 | from google.cloud import storage
3 | 
4 | 
5 | # System libraries
6 | import datetime
7 | import subprocess
8 | 
9 | # Data libraries
10 | import pandas as pd
11 | import numpy as np
12 | 
13 | # ML libraries
14 | from sklearn.model_selection import train_test_split
15 | from sklearn.metrics import accuracy_score
16 | from sklearn.preprocessing import LabelEncoder
17 | 
18 | import xgboost as xgb  # only used by the commented-out XGBoost baseline below
19 | from catboost import CatBoost, CatBoostClassifier, Pool, cv
20 | 
21 | 
22 | from catboost.utils import get_gpu_device_count
23 | print('I see %i GPU devices' % get_gpu_device_count())
24 | 
25 | 
26 | # Fill in your Cloud Storage bucket name
27 | BUCKET_ID = "mchrestkha-demo-env-ml-examples"
28 | 
29 | census_data_filename = 'adult.data.csv'
30 | 
31 | # Public bucket holding the census data
32 | bucket = storage.Client().bucket('cloud-samples-data')
33 | 
34 | # Path to the data inside the public bucket
35 | data_dir = 'ai-platform/census/data/'
36 | 
37 | # Download the data
38 | blob = bucket.blob(''.join([data_dir, census_data_filename]))
39 | blob.download_to_filename(census_data_filename)
40 | 
41 | # these are the column labels from the census data files
42 | COLUMNS = (
43 |     'age',
44 |     'workclass',
45 |     'fnlwgt',
46 |     'education',
47 |     'education-num',
48 |     'marital-status',
49 |     'occupation',
50 |     'relationship',
51 |     'race',
52 |     'sex',
53 |     'capital-gain',
54 |     'capital-loss',
55 |     'hours-per-week',
56 |     'native-country',
57 |     'income-level'
58 | )
59 | # categorical columns contain data that need to be turned into numerical values before being used by CatBoost
60 | CATEGORICAL_COLUMNS = (
61 | 'workclass', 62 | 'education', 63 | 'marital-status', 64 | 'occupation', 65 | 'relationship', 66 | 'race', 67 | 'sex', 68 | 'native-country' 69 | ) 70 | 71 | # Load the training census dataset 72 | with open(census_data_filename, 'r') as train_data: 73 | raw_training_data = pd.read_csv(train_data, header=None, names=COLUMNS) 74 | # remove column we are trying to predict ('income-level') from features list 75 | X = raw_training_data.drop('income-level', axis=1) 76 | # create training labels list 77 | #train_labels = (raw_training_data['income-level'] == ' >50K') 78 | y = raw_training_data['income-level'] 79 | 80 | # Since the census data set has categorical features, we need to convert 81 | # them to numerical values. 82 | # convert data in categorical columns to numerical values 83 | X_enc=X 84 | encoders = {col:LabelEncoder() for col in CATEGORICAL_COLUMNS} 85 | for col in CATEGORICAL_COLUMNS: 86 | X_enc[col] = encoders[col].fit_transform(X[col]) 87 | 88 | 89 | y_enc=LabelEncoder().fit_transform(y) 90 | 91 | X_train, X_validation, y_train, y_validation = train_test_split(X_enc, y_enc, train_size=0.75, random_state=42) 92 | 93 | 94 | #model = CatBoost({'iterations':50}) 95 | model=CatBoostClassifier( 96 | od_type='Iter' 97 | #iterations=5000, 98 | #custom_loss=['Accuracy'] 99 | ) 100 | model.fit( 101 | X_train,y_train,eval_set=(X_validation, y_validation), 102 | 103 | verbose=50) 104 | 105 | # # load data into DMatrix object 106 | # dtrain = xgb.DMatrix(train_features, train_labels) 107 | # # train model 108 | # bst = xgb.train({}, dtrain, 20) 109 | 110 | 111 | # Export the model to a file 112 | fname = 'catboost_census_model.onnx' 113 | model.save_model(fname, format='onnx') 114 | 115 | # Upload the model to GCS 116 | bucket = storage.Client().bucket(BUCKET_ID) 117 | blob = bucket.blob('{}/{}'.format( 118 | datetime.datetime.now().strftime('census/catboost_model_dir/catboost_census_%Y%m%d_%H%M%S'), 119 | fname)) 120 | blob.upload_from_filename(fname) -------------------------------------------------------------------------------- /census/xgboost/gcp_ai_platform/notebooks/xgboost_census_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import datetime" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 3, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import xgboost as xgb\n", 19 | "import pandas as pd\n", 20 | "from sklearn.preprocessing import LabelEncoder\n", 21 | "import subprocess\n", 22 | "from google.cloud import storage\n", 23 | "\n", 24 | "# Fill in your Cloud Storage bucket name\n", 25 | "BUCKET_ID = \"mchrestkha-demo-env-ml-examples\"\n", 26 | "\n", 27 | "census_data_filename = 'adult.data.csv'\n", 28 | "\n", 29 | "# Public bucket holding the census data\n", 30 | "bucket = storage.Client().bucket('cloud-samples-data')\n", 31 | "\n", 32 | "# Path to the data inside the public bucket\n", 33 | "data_dir = 'ai-platform/census/data/'\n", 34 | "\n", 35 | "# Download the data\n", 36 | "blob = bucket.blob(''.join([data_dir, census_data_filename]))\n", 37 | "blob.download_to_filename(census_data_filename)\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# these are the column labels from the census data files\n", 47 | "COLUMNS = (\n", 48 | " 'age',\n", 49 | " 
'workclass',\n", 50 | " 'fnlwgt',\n", 51 | " 'education',\n", 52 | " 'education-num',\n", 53 | " 'marital-status',\n", 54 | " 'occupation',\n", 55 | " 'relationship',\n", 56 | " 'race',\n", 57 | " 'sex',\n", 58 | " 'capital-gain',\n", 59 | " 'capital-loss',\n", 60 | " 'hours-per-week',\n", 61 | " 'native-country',\n", 62 | " 'income-level'\n", 63 | ")\n", 64 | "# categorical columns contain data that need to be turned into numerical values before being used by XGBoost\n", 65 | "CATEGORICAL_COLUMNS = (\n", 66 | " 'workclass',\n", 67 | " 'education',\n", 68 | " 'marital-status',\n", 69 | " 'occupation',\n", 70 | " 'relationship',\n", 71 | " 'race',\n", 72 | " 'sex',\n", 73 | " 'native-country'\n", 74 | ")\n", 75 | "\n", 76 | "# Load the training census dataset\n", 77 | "with open(census_data_filename, 'r') as train_data:\n", 78 | " raw_training_data = pd.read_csv(train_data, header=None, names=COLUMNS)\n", 79 | " \n", 80 | "# remove column we are trying to predict ('income-level') from features list\n", 81 | "train_features = raw_training_data.drop('income-level', axis=1)\n", 82 | "# create training labels list\n", 83 | "train_labels = (raw_training_data['income-level'] == ' >50K')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/plain": [ 94 | "(32561, 15)" 95 | ] 96 | }, 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "raw_training_data.shape" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 6, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# Since the census data set has categorical features, we need to convert\n", 113 | "# them to numerical values.\n", 114 | "# convert data in categorical columns to numerical values\n", 115 | "encoders = {col:LabelEncoder() for col in CATEGORICAL_COLUMNS}\n", 116 | "for col in CATEGORICAL_COLUMNS:\n", 117 | " train_features[col] = encoders[col].fit_transform(train_features[col])" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 7, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "CPU times: user 11.2 s, sys: 32 ms, total: 11.2 s\n", 130 | "Wall time: 2.83 s\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "%%time\n", 136 | "# load data into DMatrix object\n", 137 | "dtrain = xgb.DMatrix(train_features, train_labels)\n", 138 | "# train model\n", 139 | "bst = xgb.train({\"verbosity\": 0}, dtrain, 200)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 14, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# Export the model to a file\n", 149 | "model = 'model.bst'\n", 150 | "bst.save_model(model)\n", 151 | "\n", 152 | "# Upload the model to GCS\n", 153 | "bucket = storage.Client().bucket(BUCKET_ID)\n", 154 | "blob = bucket.blob('{}/{}'.format(\n", 155 | " datetime.datetime.now().strftime('census/xgboost_model_dir/xgboost_census_%Y%m%d_%H%M%S'),\n", 156 | " model))\n", 157 | "blob.upload_from_filename(model)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 16, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "gs://mchrestkha-demo-env-ml-examples/census/census_20200525_020425/:\n", 170 | "gs://mchrestkha-demo-env-ml-examples/census/census_20200525_020425/model.bst\n", 171 | "\n", 172 
| "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_020526/:\n", 173 | "gs://mchrestkha-demo-env-ml-examples/census/xgboost_census_20200525_020526/model.bst\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "!gsutil ls gs://$BUCKET_ID/census/*" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [] 187 | } 188 | ], 189 | "metadata": { 190 | "kernelspec": { 191 | "display_name": "Python 3", 192 | "language": "python", 193 | "name": "python3" 194 | }, 195 | "language_info": { 196 | "codemirror_mode": { 197 | "name": "ipython", 198 | "version": 3 199 | }, 200 | "file_extension": ".py", 201 | "mimetype": "text/x-python", 202 | "name": "python", 203 | "nbconvert_exporter": "python", 204 | "pygments_lexer": "ipython3", 205 | "version": "3.7.6" 206 | } 207 | }, 208 | "nbformat": 4, 209 | "nbformat_minor": 4 210 | } 211 | -------------------------------------------------------------------------------- /census/xgboost/gcp_ai_platform/scripts/train-cloud.sh: -------------------------------------------------------------------------------- 1 | echo "Submitting an AI Platform job..." 2 | 3 | PROJECT_ID="mchrestkha-demo-env" 4 | BUCKET_ID="mchrestkha-demo-env-ml-examples" 5 | JOB_NAME=xgboost_census_training_$(date +"%Y%m%d_%H%M%S") 6 | JOB_DIR=gs://$BUCKET_ID/census/xgboost_job_dir 7 | TRAINING_PACKAGE_PATH="../trainer/" 8 | MAIN_TRAINER_MODULE=trainer.train 9 | REGION=us-west1 10 | RUNTIME_VERSION=2.1 11 | PYTHON_VERSION=3.7 12 | SCALE_TIER=BASIC 13 | 14 | gcloud ai-platform jobs submit training $JOB_NAME \ 15 | --job-dir $JOB_DIR \ 16 | --package-path $TRAINING_PACKAGE_PATH \ 17 | --module-name $MAIN_TRAINER_MODULE \ 18 | --region $REGION \ 19 | --runtime-version=$RUNTIME_VERSION \ 20 | --python-version=$PYTHON_VERSION \ 21 | --scale-tier $SCALE_TIER 22 | 23 | -------------------------------------------------------------------------------- /census/xgboost/gcp_ai_platform/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mchrestkha/machine_learning_examples/9382b3426a4423d720df588bb510be98bb77599d/census/xgboost/gcp_ai_platform/trainer/__init__.py -------------------------------------------------------------------------------- /census/xgboost/gcp_ai_platform/trainer/train.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import xgboost as xgb 3 | import pandas as pd 4 | from sklearn.preprocessing import LabelEncoder 5 | import subprocess 6 | from google.cloud import storage 7 | 8 | # Fill in your Cloud Storage bucket name 9 | BUCKET_ID = "mchrestkha-demo-env-ml-examples" 10 | 11 | census_data_filename = 'adult.data.csv' 12 | 13 | # Public bucket holding the census data 14 | bucket = storage.Client().bucket('cloud-samples-data') 15 | 16 | # Path to the data inside the public bucket 17 | data_dir = 'ai-platform/census/data/' 18 | 19 | # Download the data 20 | blob = bucket.blob(''.join([data_dir, census_data_filename])) 21 | blob.download_to_filename(census_data_filename) 22 | 23 | # these are the column labels from the census data files 24 | COLUMNS = ( 25 | 'age', 26 | 'workclass', 27 | 'fnlwgt', 28 | 'education', 29 | 'education-num', 30 | 'marital-status', 31 | 'occupation', 32 | 'relationship', 33 | 'race', 34 | 'sex', 35 | 'capital-gain', 36 | 'capital-loss', 37 | 'hours-per-week', 38 | 'native-country', 39 | 'income-level' 40 | ) 41 | # 
categorical columns contain data that need to be turned into numerical values before being used by XGBoost
42 | CATEGORICAL_COLUMNS = (
43 |     'workclass',
44 |     'education',
45 |     'marital-status',
46 |     'occupation',
47 |     'relationship',
48 |     'race',
49 |     'sex',
50 |     'native-country'
51 | )
52 | 
53 | # Load the training census dataset
54 | with open(census_data_filename, 'r') as train_data:
55 |     raw_training_data = pd.read_csv(train_data, header=None, names=COLUMNS)
56 | # remove column we are trying to predict ('income-level') from features list
57 | train_features = raw_training_data.drop('income-level', axis=1)
58 | # create training labels list
59 | train_labels = (raw_training_data['income-level'] == ' >50K')
60 | 
61 | 
62 | # Since the census data set has categorical features, we need to convert
63 | # them to numerical values.
64 | # convert data in categorical columns to numerical values
65 | encoders = {col: LabelEncoder() for col in CATEGORICAL_COLUMNS}
66 | for col in CATEGORICAL_COLUMNS:
67 |     train_features[col] = encoders[col].fit_transform(train_features[col])
68 | 
69 | 
70 | # load data into DMatrix object
71 | dtrain = xgb.DMatrix(train_features, train_labels)
72 | # train model
73 | bst = xgb.train({}, dtrain, 20)
74 | 
75 | 
76 | # Export the model to a file
77 | model = 'model.bst'
78 | bst.save_model(model)
79 | 
80 | # Upload the model to GCS
81 | bucket = storage.Client().bucket(BUCKET_ID)
82 | blob = bucket.blob('{}/{}'.format(
83 |     datetime.datetime.now().strftime('census/xgboost_model_dir/xgboost_census_%Y%m%d_%H%M%S'),
84 |     model))
85 | blob.upload_from_filename(model)
--------------------------------------------------------------------------------
/fannie_mae_loans/rapids_xgboost/notebooks/dask_rapids.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Resources Used"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "* Dataset\n",
15 | " - https://docs.rapids.ai/datasets/mortgage-data\n",
16 | " - https://capmrkt.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html\n",
17 | "* RAPIDS + Dask Documentation\n",
18 | " - https://docs.rapids.ai/api/cudf/stable/10min.html\n",
19 | " - https://docs.dask.org/en/latest/dataframe-best-practices.html\n",
20 | " - https://docs.dask.org/en/latest/setup/single-distributed.html#localcluster\n",
21 | " - https://distributed.dask.org/en/latest/memory.html\n",
22 | " - https://dask-cuda.readthedocs.io/en/latest/specializations.html\n",
23 | "* Other examples with this dataset\n",
24 | " - https://www.dataquest.io/blog/data-science-portfolio-machine-learning/\n",
25 | " - https://github.com/dhananjaymehta/FannieMae_LoanForeclosure\n",
26 | " - https://degravek.github.io/project-pages/project1/2016/11/12/New-Notebook/\n",
27 | " - https://riskspan.com/hands-on-machine-learning-predicting-loan-delinquency/\n",
28 | " \n",
29 | " \n",
30 | " \n",
31 | "\n",
32 | "\n",
33 | "\n",
34 | "\n"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "## Create conda environment with the following libraries\n",
42 | "```\n",
43 | "conda create -n rapids-0.17 -c rapidsai -c nvidia -c conda-forge -c defaults rapids-blazing=0.17 python=3.7 cudatoolkit=11.0 matplotlib=3.3.3 gcsfs=0.7.1\n",
44 | "```\n",
45 | "\n",
46 | "Once you have created the conda environment, open a Jupyter kernel associated with it."
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "## Check Environment"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 1,
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "name": "stdout",
63 | "output_type": "stream",
64 | "text": [
65 | "Thu Jan 14 03:17:44 2021 \n",
66 | "+-----------------------------------------------------------------------------+\n",
67 | "| NVIDIA-SMI 450.51.06 Driver Version: 450.51.06 CUDA Version: 11.0 |\n",
68 | "|-------------------------------+----------------------+----------------------+\n",
69 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
70 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
71 | "| | | MIG M. |\n",
72 | "|===============================+======================+======================|\n",
73 | "| 0 A100-SXM4-40GB On | 00000000:00:04.0 Off | 0 |\n",
74 | "| N/A 32C P0 54W / 400W | 0MiB / 40537MiB | 0% Default |\n",
75 | "| | | Disabled |\n",
76 | "+-------------------------------+----------------------+----------------------+\n",
77 | "| 1 A100-SXM4-40GB On | 00000000:00:05.0 Off | 0 |\n",
78 | "| N/A 33C P0 52W / 400W | 0MiB / 40537MiB | 0% Default |\n",
79 | "| | | Disabled |\n",
80 | "+-------------------------------+----------------------+----------------------+\n",
81 | " \n",
82 | "+-----------------------------------------------------------------------------+\n",
83 | "| Processes: |\n",
84 | "| GPU GI CI PID Type Process name GPU Memory |\n",
85 | "| ID ID Usage |\n",
86 | "|=============================================================================|\n",
87 | "| No running processes found |\n",
88 | "+-----------------------------------------------------------------------------+\n",
89 | "nvcc: NVIDIA (R) Cuda compiler driver\n",
90 | "Copyright (c) 2005-2020 NVIDIA Corporation\n",
91 | "Built on Thu_Jun_11_22:26:38_PDT_2020\n",
92 | "Cuda compilation tools, release 11.0, V11.0.194\n",
93 | "Build cuda_11.0_bu.TC445_37.28540450_0\n"
94 | ]
95 | }
96 | ],
97 | "source": [
98 | "%%bash\n",
99 | "nvidia-smi\n",
100 | "nvcc --version"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "import numpy as np; print('numpy Version:', np.__version__)\n",
110 | "import pandas as pd; print('pandas Version:', pd.__version__)\n",
111 | "import xgboost as xgb; print('XGBoost Version:', xgb.__version__)\n",
112 | "import cudf; print('cudf Version:', cudf.__version__)\n",
113 | "import cuml; print('cuml Version:', cuml.__version__)\n",
114 | "import gcsfs; print('gcsfs Version:', gcsfs.__version__)\n",
115 | "import time\n",
116 | "import dask_cudf; print('dask_cudf Version:', dask_cudf.__version__)\n",
117 | "import dask; print('dask Version:', dask.__version__)\n",
118 | "import dask.dataframe as dask_df\n",
119 | "import glob\n",
120 | "import matplotlib; print('matplotlib Version:', matplotlib.__version__)\n",
121 | "from dask.diagnostics import ProgressBar\n",
122 | "from dask.distributed import Client, progress, wait"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 3,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "from IPython.core.interactiveshell import InteractiveShell\n",
132 | "InteractiveShell.ast_node_interactivity = \"all\""
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "## Set up Dask Cluster"
140 | ]
141 | },
142 | 
{ 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# from dask.distributed import Client\n", 149 | "# client = Client()\n", 150 | "# client" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 4, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/html": [ 161 | "\n", 162 | "\n", 163 | "\n", 170 | "\n", 178 | "\n", 179 | "
\n", 164 | "

Client

\n", 165 | "\n", 169 | "
\n", 171 | "

Cluster

\n", 172 | "
    \n", 173 | "
  • Workers: 2
  • \n", 174 | "
  • Cores: 2
  • \n", 175 | "
  • Memory: 179.38 GB
  • \n", 176 | "
\n", 177 | "
" 180 | ], 181 | "text/plain": [ 182 | "" 183 | ] 184 | }, 185 | "execution_count": 4, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "import blazingsql\n", 192 | "import dask_cudf\n", 193 | "from dask.distributed import Client\n", 194 | "from dask_cuda import LocalCUDACluster\n", 195 | "\n", 196 | "cluster = LocalCUDACluster()\n", 197 | "client = Client(cluster)\n", 198 | "# bc = blazingsql.BlazingContext(dask_client=client, network_interface='lo')\n", 199 | "client" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "# client.restart()\n", 209 | "# client" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "## Define Data Schema & Data Types" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 5, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "col_acq_names = ['LoanID','Channel','SellerName','OrInterestRate','OrUnpaidPrinc','OrLoanTerm',\n", 226 | " 'OrDate','FirstPayment','OrLTV','OrCLTV','NumBorrow','DTIRat','CreditScore',\n", 227 | " 'FTHomeBuyer','LoanPurpose','PropertyType','NumUnits','OccStatus','PropertyState',\n", 228 | " 'Zip','MortInsPerc','ProductType','CoCreditScore','MortInsType','RelMortInd']\n", 229 | "col_per_names = ['LoanID','MonthRep','Servicer','CurrInterestRate','CAUPB','LoanAge','MonthsToMaturity',\n", 230 | " 'AdMonthsToMaturity','MaturityDate','MSA','CLDS','ModFlag','ZeroBalCode','ZeroBalDate',\n", 231 | " 'LastInstallDate','ForeclosureDate','DispositionDate','PPRC','AssetRecCost','MHRC',\n", 232 | " 'ATFHP','NetSaleProceeds','CreditEnhProceeds','RPMWP','OFP','NIBUPB','PFUPB','RMWPF',\n", 233 | " 'FPWA','ServicingIndicator']\n", 234 | "\n", 235 | "col_acq = ['LoanID','OrDate','OrUnpaidPrinc','Channel','SellerName','PropertyType','NumUnits','PropertyState']\n", 236 | "col_per = ['LoanID','MonthRep', 'CAUPB','CLDS','ForeclosureDate']\n", 237 | "\n", 238 | "parse_dates_acq =['OrDate','FirstPayment']\n", 239 | "parse_dates_per =['MonthRep','MaturityDate','ZeroBalDate','LastInstallDate','ForeclosureDate','DispositionDate']\n", 240 | "\n", 241 | "# dtype_acq={ \"LoanID\":\"str\",\"Channel\":\"str\",\"SellerName\":\"str\",\"OrInterestRate\":\"str\",\"OrUnpaidPrinc\":\"str\",\"OrLoanTerm\":\"str\",\"OrDate\":\"str\",\n", 242 | "# \"FirstPayment\":\"str\",\"OrLTV\":\"str\",\"OrCLTV\":\"str\", \"NumBorrow\":\"str\", \"DTIRat\":\"str\", \"CreditScore\":\"str\", \"FTHomeBuyer\":\"str\",\n", 243 | "# \"LoanPurpose\":\"str\", \"PropertyType\":\"str\", \"NumUnits\":\"str\", \"OccStatus\":\"str\", \"PropertyState\":\"str\", \"Zip\":\"str\", \"MortInsPerc\":\"str\",\n", 244 | "# \"ProductType\":\"str\", \"CoCreditScore\":\"str\", \"MortInsType\":\"str\", \"RelMortInd\":\"str\"}\n", 245 | "\n", 246 | "dtype_acq={ \"LoanID\":\"int\",\"Channel\":\"str\",\"SellerName\":\"str\",\"OrInterestRate\":\"float\",\"OrUnpaidPrinc\":\"float\",\"OrLoanTerm\":\"float\",\"OrDate\":\"str\",\n", 247 | " \"FirstPayment\":\"str\",\"OrLTV\":\"float\",\"OrCLTV\":\"float\", \"NumBorrow\":\"float\", \"DTIRat\":\"float\", \"CreditScore\":\"float\", \"FTHomeBuyer\":\"str\",\n", 248 | " \"LoanPurpose\":\"str\", \"PropertyType\":\"str\", \"NumUnits\":\"float\", \"OccStatus\":\"str\", \"PropertyState\":\"str\", \"Zip\":\"int\", \"MortInsPerc\":\"float\",\n", 249 | " \"ProductType\":\"str\", \"CoCreditScore\":\"float\", \"MortInsType\":\"float\", 
\"RelMortInd\":\"str\"}\n", 250 | "\n", 251 | "# dtype_per={\"LoanID\":\"str\",\"MonthRep\":\"str\",\"Servicer\":\"str\", \"CurrInterestRate\":\"str\", \"CAUPB\":\"str\", \"LoanAge\":\"str\",\"MonthsToMaturity\":\"str\",\n", 252 | "# \"AdMonthsToMaturity\":\"str\", \"MaturityDate\":\"str\", \"MSA\":\"str\", \"CLDS\":\"str\", \"ModFlag\":\"str\", \"ZeroBalCode\":\"str\", \"ZeroBalDate\":\"str\",\n", 253 | "# \"LastInstallDate\":\"str\", \"ForeclosureDate\":\"str\", \"DispositionDate\":\"str\", \"PPRC\":\"str\", \"AssetRecCost\":\"str\", \"MHRC\":\"str\", \"ATFHP\":\"str\",\n", 254 | "# \"NetSaleProceeds\":\"str\", \"CreditEnhProceeds\":\"str\",\"RPMWP\":\"str\",\"OFP\":\"str\",\"NIBUPB\":\"str\", \"PFUPB\":\"str\", \"RMWPF\":\"str\",\n", 255 | "# \"FPWA\":\"str\", \"ServicingIndicator\":\"str\"\n", 256 | "# }\n", 257 | "\n", 258 | "dtype_per={\"LoanID\":\"int\",\"MonthRep\":\"str\",\"Servicer\":\"str\", \"CurrInterestRate\":\"float\", \"CAUPB\":\"float\", \"LoanAge\":\"float\",\"MonthsToMaturity\":\"float\",\n", 259 | " \"AdMonthsToMaturity\":\"float\", \"MaturityDate\":\"str\", \"MSA\":\"float\", \"CLDS\":\"float\", \"ModFlag\":\"str\", \"ZeroBalCode\":\"float\", \"ZeroBalDate\":\"str\",\n", 260 | " \"LastInstallDate\":\"str\", \"ForeclosureDate\":\"str\", \"DispositionDate\":\"str\", \"PPRC\":\"float\", \"AssetRecCost\":\"float\", \"MHRC\":\"float\", \"ATFHP\":\"float\",\n", 261 | " \"NetSaleProceeds\":\"float\", \"CreditEnhProceeds\":\"float\",\"RPMWP\":\"float\",\"OFP\":\"float\",\"NIBUPB\":\"float\", \"PFUPB\":\"float\", \"RMWPF\":\"float\",\n", 262 | " \"FPWA\":\"str\", \"ServicingIndicator\":\"str\"\n", 263 | "}" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "## Data Ingestion" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 6, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "# csv_acq_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/acq/Acquisition_20*'\n", 280 | "# csv_perf_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/perf/Performance_20*'\n", 281 | "\n", 282 | "parq_acq_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/acq/parquet68'\n", 283 | "parq_per_fnames='gs://mchrestkha-github-ml-examples/fannie_mae_loans/perf/parquet823'" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "# %time df_acq = dask_cudf.read_csv(csv_acq_fnames, sep='|', names=col_acq_names, dtype=dtype_acq, parse_dates=parse_dates_acq)\n", 293 | "# %time df_pe = dask_cudf.read_csv(csv_perf_fnames, sep='|', names=col_per_names, dtype=dtype_per, parse_dates=parse_dates_per)\n", 294 | "\n", 295 | "%time df_acq = dask_cudf.read_parquet(parq_acq_fnames)\n", 296 | "%time df_per = dask_cudf.read_parquet(parq_per_fnames, columns=col_per)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "# %time print(\"Required Memory for df_acq:\",df_acq.memory_usage().sum().compute()/(1024**3), 'GB')\n", 306 | "# %time print(\"Required Memory for df_per:\",df_per.memory_usage().sum().compute()/(1024**3), 'GB')" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 13, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "df_acq.head()\n", 316 | "df_per.head()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": 
{}, 323 | "outputs": [], 324 | "source": [ 325 | "df_per_shape=df_per.shape[0].persist()\n", 326 | "progress(df_per_shape)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 12, 332 | "metadata": {}, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/plain": [ 337 | "1890353680" 338 | ] 339 | }, 340 | "execution_count": 12, 341 | "metadata": {}, 342 | "output_type": "execute_result" 343 | }, 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "37015214" 348 | ] 349 | }, 350 | "execution_count": 12, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "df_per_shape.compute()\n", 357 | "df_acq.shape[0].compute()" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "## Data Profiling & Data Quality Check against Summary Statistics \n", 365 | "- Data Dictionary: https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_Glossary.pdf\n", 366 | "- Sumary Statistics: https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_Stat_Summary_Primary.pdf\n", 367 | "- Sample Data: https://docs.google.com/spreadsheets/d/1nCtusAE2naZlWHFKGRsQTxxusjfZYiBLdd5SF5AEGMA/edit" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "%time df_acq_describe=df_acq.describe().compute()\n", 377 | "%time df_per_describe=df_per.describe().compute()\n", 378 | "df_acq_describe\n", 379 | "df_per_describe" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "df_acq['OrYr']=df_acq['OrDate'].str[-4:]\n", 389 | "df_acq['OrUnpaidPrinc $M']=df_acq['OrUnpaidPrinc']/1000000" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "# df_acq_describe=df_acq.describe().compute()\n", 399 | "# df_acq_nulls=df_acq.isna().sum().compute()\n", 400 | "df_acq_describe\n", 401 | "df_acq_nulls;" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "%time df_acq_summary = df_acq.groupby('OrYr',as_index=False).agg({'LoanID': 'count','OrUnpaidPrinc $M': 'sum'}).compute()\n", 411 | "df_acq_summary.rename(columns = {'LoanID': 'TotalLoans','OrUnpaidPrinc $M':'TotalOrUnpaidPrinc $M'},inplace=True)\n", 412 | "df_acq_summary['AvgOrUnpaidPrinc']=df_acq_summary['TotalOrUnpaidPrinc $M']/df_acq_summary['TotalLoans']*1000000\n", 413 | "df_acq_summary.to_pandas().sort_values(by=['OrYr']).plot.bar(x='OrYr',y='TotalLoans')\n", 414 | "df_acq_summary.to_pandas().sort_values(by=['OrYr']).plot.bar(x='OrYr',y='AvgOrUnpaidPrinc')" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 14, 420 | "metadata": {}, 421 | "outputs": [ 422 | { 423 | "data": { 424 | "text/html": [ 425 | "
\n", 426 | "\n", 439 | "\n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | "
LoanIDMonthRepCAUPBCLDSForeclosureDateDelinquentEventForeclosureEventYrRepCAUPB $M
010000736514201/01/2000<NA>0.0<NA>002000<NA>
110000736514201/01/200174319.00.0<NA>0020010.074319
210000736514201/01/200273635.480.0<NA>0020020.07363548
310000736514201/01/200372795.410.0<NA>0020030.07279541
410000736514202/01/2000<NA>0.0<NA>002000<NA>
\n", 517 | "
" 518 | ], 519 | "text/plain": [ 520 | " LoanID MonthRep CAUPB CLDS ForeclosureDate DelinquentEvent \\\n", 521 | "0 100007365142 01/01/2000 0.0 0 \n", 522 | "1 100007365142 01/01/2001 74319.0 0.0 0 \n", 523 | "2 100007365142 01/01/2002 73635.48 0.0 0 \n", 524 | "3 100007365142 01/01/2003 72795.41 0.0 0 \n", 525 | "4 100007365142 02/01/2000 0.0 0 \n", 526 | "\n", 527 | " ForeclosureEvent YrRep CAUPB $M \n", 528 | "0 0 2000 \n", 529 | "1 0 2001 0.074319 \n", 530 | "2 0 2002 0.07363548 \n", 531 | "3 0 2003 0.07279541 \n", 532 | "4 0 2000 " 533 | ] 534 | }, 535 | "execution_count": 14, 536 | "metadata": {}, 537 | "output_type": "execute_result" 538 | } 539 | ], 540 | "source": [ 541 | "df_per['DelinquentEvent']=0\n", 542 | "df_per['DelinquentEvent']=df_per['DelinquentEvent'].where(df_per['CLDS']<1,1)\n", 543 | "df_per['ForeclosureEvent']=0\n", 544 | "df_per['ForeclosureEvent']=df_per['ForeclosureEvent'].where(df_per['ForeclosureDate'].isnull()== True,1)\n", 545 | "df_per['YrRep']=df_per['MonthRep'].str[-4:]\n", 546 | "df_per['CAUPB $M']=df_per['CAUPB']/1000000\n", 547 | "df_per.head()" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "df_per_yr_summary = df_per.groupby('YrRep',as_index=False).agg({'LoanID': 'count', 'DelinquentEvent':'sum'}).persist()\n", 557 | "progress(df_per_yr_summary)" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 47, 563 | "metadata": {}, 564 | "outputs": [ 565 | { 566 | "data": { 567 | "text/plain": [ 568 | "" 569 | ] 570 | }, 571 | "execution_count": 47, 572 | "metadata": {}, 573 | "output_type": "execute_result" 574 | }, 575 | { 576 | "data": { 577 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEGCAYAAABsLkJ6AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAA090lEQVR4nO3de3wU1d348c839wQSQm4QCJCAAUQCAcNFUEQFBbygVkWfVkWrlEdR0VpL2+ei7c+n1sd6QS0WKyrWKj5qK1oUEUHwgtyEhDshBEiABAIkIeSe8/tjJ+m6bsgm2WSy2e/79ZpXdmfO7HxnWOa755w5M2KMQSmllP8JsDsApZRS9tAEoJRSfkoTgFJK+SlNAEop5ac0ASillJ8KsjuA5oiLizPJycl2h6GUUj5l06ZNx40x8a7zfSoBJCcns3HjRrvDUEopnyIiB9zN1yYgpZTyU5oAlFLKT2kCUEopP+VTfQDuVFdXk5eXR0VFhd2hKNWksLAwkpKSCA4OtjsUpXw/AeTl5REZGUlycjIiYnc4SjXKGENRURF5eXmkpKTYHY5Svt8EVFFRQWxsrJ78VYcnIsTGxmptVXUYPp8AAD35K5+h31XVkXSKBKCUv8g7eYbl24/aHYbqJDxKACIyRUR2i0i2iMxzs1xEZL61PFNERjotyxWRLBHZIiIbnebHiMgKEdlr/e3unV1qf4GBgaSnp3PeeecxfPhwnn76aerq6s66Tm5uLkOHDgVg48aN3H///e0Rqle99tprxMfHk56e3jDt2LHDq9t49tlnOXPmjFc/05Xzv0VHZozhgbe38LM3NrF6d6Hd4ahOoMkEICKBwIvAVGAIcIuIDHEpNhVItaZZwAKX5ZcYY9KNMRlO8+YBK40xqcBK671PCg8PZ8uWLWzfvp0VK1awbNkyHnvsMY/Xz8jIYP78+W0YYduZMWMGW7ZsaZiGDHH9arROWyeAmpqaNvtsb/tizzE2HThJeHAgv3o/i5KKartDUj7OkxrAaCDbGJNjjKkC3gamu5SZDiw2DuuAaBFJbOJzpwOvW69fB671POyOKyEhgYULF/LCCy9gjKG2tpZf/OIXjBo1imHDhvHnP//5B+usXr2aq666CoBHH32UO++8k4kTJ9K/f//vJYbHH3+cQYMGMWnSJG655RaeeuopACZOnNhwi4zjx49Tf7+kxra9evVqJk6cyA033MDgwYP58Y9/TP2T4TZs2MC4ceMYPnw4o0ePprS0lIsuuogtW7Y0xDF+/HgyMzMbPQYzZsxg2bJlDe9nzpzJe++91+x45s+fz+HDh7nkkku45JJLqK2tZebMmQwdOpS0tDSeeeaZhv2fO3cu48aNY+jQoaxfvx6AsrIy7rzzTkaNGsWIESP44IMPAEfN5cYbb+Tqq6/m8ssvb3Q/Vq5cyYgRI0hLS+POO++ksrISgN/+9reMGjWKoUOHMmvWrIZjN3HiRH75y18yevRoBg4cyNq1axv97OYyxvDMij30jg5n8U9HU1BSwf/8c6fXPl/5J08uA+0NHHJ6nweM8aBMb+AIYIBPRcQAfzbGLLTK9DDGHAEwxhwRkQR3GxeRWThqFfTt2/esgT724XZ2HC7xYJc8N6RXFP999XnNWqd///7U1dVRWFjIBx98QLdu3diwYQOVlZWMHz+eyy+//Kydgbt27WLVqlWUlpYyaNAg/v3f/53MzEzefvttvvvuO2pqahg5ciTnn3/+WeN45ZVX3G4b4LvvvmP79u306tWL8ePH89VXXzF69GhmzJjBkiVLGDVqFCUlJYSHh3PXXXfx2muv8e
yzz7Jnzx4qKysZNmwYmzdvZsmSJXz55ZcN2/zmm2+4+eabWbJkCdOmTaOqqoqVK1eyYMGCZsdz//338/TTT7Nq1Sri4uLYtGkT+fn5bNu2DYBTp041bLesrIyvv/6aNWvWcOedd7Jt2zYef/xxLr30UhYtWsSpU6cYPXo0kyZNaogzMzOTmJgYcnNzf3DsKioqmDlzJitXrmTgwIHcdtttLFiwgLlz5zJnzhz+67/+C4Bbb72Vjz76iKuvvhpw1CjWr1/fUAv87LPPmvi2eObzXYVszSvmievTGJUcw6wJA3jpi31MTUvk4oE/uMeXUh7xpAbg7kzl+iDhs5UZb4wZiaOZ6F4RmdCM+DDGLDTGZBhjMuLjfeeLXv+r8NNPP2Xx4sWkp6czZswYioqK2Lt371nXvfLKKwkNDSUuLo6EhAQKCgpYu3Yt1113HREREURFRXHNNdc0GcPZtj169GiSkpIICAggPT2d3Nxcdu/eTWJiIqNGjQIgKiqKoKAgbrzxRj766COqq6tZtGgRM2fObNiGaxNQeHg4U6dO5fPPP6eyspKPP/6YCRMmEB4e3ux4XPXv35+cnBzuu+8+PvnkE6KiohqW3XLLLQBMmDCBkpISTp06xaeffsoTTzxBeno6EydOpKKigoMHDwIwefJkYmJiGj12u3fvJiUlhYEDBwJw++23s2bNGgBWrVrFmDFjSEtL4/PPP2f79u0N611//fUAnH/++W73oSWMMTy9Yg99YyL40flJAMydlMo5CV2Z916mNgWpFvOkBpAH9HF6nwQc9rSMMab+b6GI/B1Hk9IaoEBEEq1f/4lAq3u1mvtLva3k5OQQGBhIQkICxhief/55rrjiiu+VOdvJITQ0tOF1YGBgQzt1Y7WGoKCghk5n52vMG9v26tWr3W7DGON2GxEREUyePJkPPviAd955p8k7soaFhTFx4kSWL1/OkiVLGk7OzY3HVffu3dm6dSvLly/nxRdf5J133mHRokVuj42IYIzhvffeY9CgQd9b9u2339KlS5ez7kN9AndVUVHBPffcw8aNG+nTpw+PPvro9455/X40tg8t8emOArYfLuGpG4cTHOj4zRYWHMj/3jCMHy34mt8v28nvrx/mlW0p/+JJDWADkCoiKSISAtwMLHUpsxS4zboaaCxQbJ3Yu4hIJICIdAEuB7Y5rXO79fp24INW7kuHcOzYMWbPns2cOXMQEa644goWLFhAdbXjV9qePXsoKytr9udOmDCBv//975SXl1NaWsqHH37YsCw5OZlNmzYB8O677zbMb+62Bw8ezOHDh9mwYQMApaWlDSexu+66i/vvv59Ro0ad9ZdzvZtvvplXX32VtWvXNpzwW3IsIiMjKS0tBRz9G3V1dfzoRz/id7/7HZs3b24ot2TJEgC+/PJLunXrRrdu3bjiiit4/vnnG07m3333XZNxOx+L3NxcsrOzAXjjjTe4+OKLG072cXFxnD59+nvHuy3U1Tna/lPiunBteq/vLRvRtzt3T+jPW+sPsWbPsTaNQ3VOTdYAjDE1IjIHWA4EAouMMdtFZLa1/CVgGTANyAbOAHdYq/cA/m79OgsC/maM+cRa9gTwjoj8FDgI3Oi1vWpn5eXlpKenU11dTVBQELfeeisPPfQQ4Dhx5ubmMnLkSIwxxMfH849//KPZ2xg5ciQzZswgPT2dfv36cdFFFzUse/jhh7npppt44403uPTSSxvmN3fbISEhLFmyhPvuu4/y8nLCw8P57LPP6Nq1K+effz5RUVHccccd31vHtQ/gT3/6E+PGjePyyy/ntttu45prriEkJKTFx2LWrFlMnTqVxMREnn32We64446G2s7vf//7hnLdu3dn3LhxlJSUNNQK/vM//5O5c+cybNgwjDEkJyfz0Ucfud3O7t27SUpKanj/zDPP8Oqrr3LjjTdSU1PDqFGjmD17NqGhodx9992kpaWRnJzc0FzWVj7ZfpRdR0t5dkY6QYE//L324KSBfLajgF+9n8Uncy8iMkzvMaQ8J41VdTuijIwM49r8sHPnTs4991ybIrLPo48+SteuXXn44YfbZXuHDx9m4sSJ7Nq1i4CAjjV+cOLEiTz11FNkZGQ0XbgD8PQ7W1tnmPLsGuqM4dMHLyYwwH0T4OaDJ7lhwdfMGNWX31+f5u1wVScgIptcLsMHdCSw8sDixYsZM2YMjz/+eIc7+XdmH2UeZm/haeZOGtjoyR9gZN/u3H1Rf95af5Av9x5vxwiVr9MagFLtzJPvbE1tHZc/s4bgwAA+fuAiAs6SAAAqqmuZNn8tldV1LH9wAl1Dff5Gv8qLOnUNwJeSmPJvnn5Xl249TM7xMh6cnNrkyR/qrwoazuHicn6/TAeIKc/4fAIICwujqKhIk4Dq8OqfBxAWFnbWcjW1dTy3ci9DEqO4fEhPjz///H7duevCFN789iBfZWtTkGqaz9cTk5KSyMvL49gxvQxOdXz1TwQ7m/e/y+dA0Rlevi3Do1//zn5++SBW7izkkXcztSlINcnnvx3BwcH6dCXVaVTV1DF/5V6GJXVj0rlu745yVmHBgfzvjcO44aVveOLjnfy/a/WqINU4n28CUqozeXdTHnkny3lw8sAWPzzm/H4x/HR8Cn9dd5CvtSlInYUmAKU6iMqaWl74fC8j+kYzsZU3ePv55YNIievCI+9lUlbpO7e8Vu1LE4BSHcQ7Gw5xuLiCh1rx679eeEggT94wjPxT5Tzx8S4vRag6G00ASnUAFdW1vLAqm1HJ3bnwnDivfOao5BjuGJfCG+sO8PU+bQpSP6QJQKkO4K31BykoqWxV2787v7hiEMmxEfxSm4KUG5oAlLJZeVUtL67axwX9Yxk3wDu//us5moKGk3eynCc/0aYg9X2aAJSy2V/XHeD4acev/7YwOiWGmeOSef2bA6zLKWqTbSjfpAlAKRuVVdbw0hf7uCg1jtEpTT9noaV+ccUg+sVG8Mi7mZyp0qYg5aAJQCkbLf7mAEVlVcyd1Da//utFhATxvzcM59DJMzz5ye423ZbyHZoAlLJJaUU1f16zj4mD4jm/X/c2397olBhuvyCZ177O1aYgBWgCUMo2r3+dy6kz1TzURm3/7jwyZRB9YxxXBWlTkPIoAYjIFBHZLSLZIjLPzXIRkfnW8kwRGemyPFBEvhORj5zmPSoi+SKyxZqmtX53lPINJRXVLFyTw6RzezAsKbrdthsREsSTNwzjQJE2BSkPEoCIBAIvAlOBIcAtIjLEpdhUINWaZgELXJY/ALi7Sfkzxph0a1rW3OCV8lWvrN1PSUUNcyeltvu2x/aPZeY4R1PQ2r16F11/5kkNYDSQbYzJMcZUAW8D013KTAcWG4d1QLSIJAKISBJwJfAXL8atlM86daaKRV/uZ8p5PRnau5stMcybOphzErry83e2cqKsypYYlP08SQC9gUNO7/OseZ6WeRZ4BKhz89lzrCajRSLithdMRGaJyEYR2aj3/FedwV/W7ud0VQ1zJ7f/r/96YcGBPHdzOqfOVPPL9zL1gUp+ypME4G5cuuu3xW0ZEbkKKDTGb
HKzfAEwAEgHjgB/dLdxY8xCY0yGMSYjPr51d0hUym4nyqp49av9XJmWyOCeUbbGcl6vbjwyZRArdhTw9oZDTa+gOh1PEkAe0MfpfRJw2MMy44FrRCQXR9PRpSLyVwBjTIExptYYUwe8jKOpSalObeGaHM5U19rS9u/OneNTuPCcOH774Q72HTttdziqnXmSADYAqSKSIiIhwM3AUpcyS4HbrKuBxgLFxpgjxphfGWOSjDHJ1nqfG2N+AlDfR2C5DtjW2p1RqiM7frqS17/OZfrwXpyTEGl3OAAEBAh/vGk4ocEBzH17C1U17lpqVWfVZAIwxtQAc4DlOK7keccYs11EZovIbKvYMiAHyMbxa/4eD7b9pIhkiUgmcAnwYEt2QClf8dLqfVTW1HL/ZR3j13+9HlFhPHH9MLLyi3nmsz12h6PakUfPBLYu0VzmMu8lp9cGuLeJz1gNrHZ6f2sz4lTKpxWWVPDGugNcNyKJ/vFd7Q7nB6YM7cnNo/rw0hf7mJAazwUDYu0OSbUDHQmsVDt4d3MelTV13H/ZOXaH0qj/vGoIybFd+Pk7Wyg+U213OKodaAJQqh1kHiomOTaCfrFd7A6lUV1Cg3h2RjqFpZX8+h9ZemmoH9AEoFQ7yMovJq0db/nQUsP7RPPg5IH8M/MI72/Otzsc1cY0ASjVxk6UVZF/qpy03vZe9++p2RcPYHRKDP/1wTYOFJXZHY5qQ5oAlGpjWfnFAKT1jrY3EA8FBgjPzEgnIECYu2QLNbV6aWhnpQlAqTaWlXcKgPN8pAYA0Ds6nMevS+O7g6d4/vNsu8NRbUQTgFJtLCu/mJS4LkSFBdsdSrNcM7wX14/ozfOf72XTgRN2h6PagCYApdpYVl4xaTbd9bO1Hpt+Hr27h/PA21sordBLQzsbTQBKtaGi05UcLq7w2QQQGRbMszPSOXyqnP/+YLvd4Sgv0wSgVBtq6ABO8s0EAHB+vxjuuzSV97/LZ+lW1/tAKl+mCUCpNpSV50gA5/XynQ5gd+679BxG9I3mN3/PIv9Uud3hKC/RBKBUG8rKL6Z/XBcifawD2FVQYADPzRhBXZ3hwSVbqK3TUcKdgSYApdqQYwSw7zb/OOsbG8Fj04eyfv8JXvpin93hKC/QBKBUGzl+upIjPtwB7M6PRvbmymGJPLNiD1sPnbI7HNVKmgCUaiP/GgHceRKAiPA/16YRHxnK3CVbKKussTsk1QqaAJRqI1l5xYjAeZ0oAQB0iwjm6ZvSyS0q43cf7bA7HNUKmgCUaiP1I4C7hnr03CWfcsGAWGZfPIC3Nxzik21H7A5HtZBHCUBEpojIbhHJFpF5bpaLiMy3lmeKyEiX5YEi8p2IfOQ0L0ZEVojIXutv99bvjlIdR1ZeMcM62a9/Zw9OGsh5vaL47Yc7qNYbxvmkJhOAiAQCLwJTgSHALSIyxKXYVCDVmmYBC1yWP4DjecLO5gErjTGpwErrvVKdwrHSSo6WVDC0EyeAkKAAHpo8kMPFFXyUqQPEfJEnNYDRQLYxJscYUwW8DUx3KTMdWGwc1gHRIpIIICJJwJXAX9ys87r1+nXg2pbtglIdzzarA3iYDzwEpjUuGZRAakJX/vxFjj5BzAd5kgB6A4ec3udZ8zwt8yzwCOBaR+xhjDkCYP1NcLdxEZklIhtFZOOxY8c8CFcp+2XWdwD7+AjgpgQECHdP6M+uo6Ws2Xvc7nBUM3mSAMTNPNdU77aMiFwFFBpjNjU7svoPMWahMSbDGJMRHx/f0o9Rql3VjwDu0gk7gF1NT+9Fj6hQ/qyDw3yOJwkgD+jj9D4JcG3wa6zMeOAaEcnF0XR0qYj81SpT4NRMlAgUNjt6pTqorPxTnb75p15oUCB3jk/h631FDfc+Ur7BkwSwAUgVkRQRCQFuBpa6lFkK3GZdDTQWKDbGHDHG/MoYk2SMSbbW+9wY8xOndW63Xt8OfNDanVGqIygsraCgpLJTdwC7umVMXyJDg/jzGq0F+JImE4AxpgaYAyzHcSXPO8aY7SIyW0RmW8WWATlANvAycI8H234CmCwie4HJ1nulfN6/OoD9JwFEhQXzb2P6sizrCAeLztgdjvKQRw2UxphlOE7yzvNecnptgHub+IzVwGqn90XAZZ6HqpRvqO8AHpLYuTuAXd0xPoVFX+3nlS9zeGz6ULvDUR7QkcBKedm2/GIGxHf1iw5gZz27hXFtem+WbDzEibIqu8NRHtAEoJSXZXbyEcBnM2tCfyqq61j8Ta7doSgPaAJQyosKSiooLPWvDmBnqT0iuWxwAou/OUB5Va3d4agmaAJQyovqL4P0pw5gVz+7eAAnyqp4d9OhpgsrW2kCUMqLsvKLCRAY0slHAJ/NqOTupPeJ5uW1+/XRkR2cJgClvKi+AzgixL86gJ2JCLMv7s/BE2f4ZNtRu8NRZ6EJQCkvyuxEzwBujclDepIS14WXvtinN4nrwDQBKOUlBSUVHCut7FSPgGypwADhrotSyMov5pucIrvDUY3QBKCUl2RqB/D3/GhkEnFdQ1i4JsfuUFQjNAEo5SUNHcCJmgAAwoIDmTkumdW7j7HzSInd4Sg3NAEo5SXb8otJTYgkPCTQ7lA6jJ+M7UdESCAvay2gQ9IEoJQXGGPIzCv22wFgjYmOCGHGqD4s3XqYw6fK7Q5HudAEoJQXFJRUcvx0JWm9/ff6/8b89MIUDLDoy/12h6JcaAJQygsy804BkOYnD4FpjqTuEVw9LJG31h+kuLza7nCUE00ASnnBtoYOYK0BuDNrwgDKqmp589sDdoeinGgCUMoLsvKLGdhDO4AbM6RXFBelxvHqV7lUVOtN4joKTQBKtZIxhqx87QBuyuyLB3CstJJ/fJdvdyjK4lECEJEpIrJbRLJFZJ6b5SIi863lmSIy0pofJiLrRWSriGwXkcec1nlURPJFZIs1TfPebinVfo6WVHD8dJWOAG7CuAGxnNcrioVrc6jTm8R1CE0mABEJBF4EpgJDgFtEZIhLsalAqjXNAhZY8yuBS40xw4F0YIr10Ph6zxhj0q3pe4+cVMpX1I8A1nsAnZ2I8LOLB5BzrIzPdhbYHY7CsxrAaCDbGJNjjKkC3gamu5SZDiw2DuuAaBFJtN6ftsoEW5OmftWpbMsvJjBAtAPYA9OG9iSpezh/1oFhHYInCaA34PxkhzxrnkdlRCRQRLYAhcAKY8y3TuXmWE1Gi0Sku7uNi8gsEdkoIhuPHTvmQbhKta+s/GJSE7oSFqwdwE0JCgzgrgtT2HTgJBtzT9gdjt/zJAGIm3muv+IbLWOMqTXGpANJwGgRGWotXwAMwNE0dAT4o7uNG2MWGmMyjDEZ8fHxHoSrVPsxxpCVV6zt/81w06g+REcEay2gA/AkAeQBfZzeJwGHm1vGGHMKWA1Msd4XWMmhDngZR1OTUj7lSHEFRWVV2v7fDBEhQdx2QTIrdhSQXXi66RVUm/EkAWwAUkUkRURCgJuBpS5llgK3WVcDjQWKjTFHRCReRKIBRCQcmATsst4nOq1/HbCtdbuiVPtr6ADWGkCz3H5BP0KDAvjLWq0F
2KnJBGCMqQHmAMuBncA7xpjtIjJbRGZbxZYBOUA2jl/z91jzE4FVIpKJI5GsMMZ8ZC17UkSyrGWXAA96a6eUai/1HcDnagdws8R2DeXGjCTe35xPYUmF3eH4LY8eXGpdornMZd5LTq8NcK+b9TKBEY185q3NilSpDqh+BLB2ADffXRf252/fHuTVr3P55ZTBdofjl3QksFItVD8CWO8A2jLJcV2YOjSRv647wOnKGrvD8UuaAJRqocPFFZwo0xHArTFrQn9KK2p4e/1Bu0PxS5oAlGqhLL0FdKsN7xPN2P4xvPLlfqpr6+wOx+9oAlCqhbLyiwkKEAb3jLQ7FJ/2s4sHcKS4gqVbXK8uV21NE4BSLZSVX6IdwF4wcWA8g3tGsuCLfdTqTeLalSYApVrAMQL4lLb/e4GIcN+lqWQXnuafWUfsDsevaAJQqgXyT5Vz8kw1Q3UEsFdMHdqTgT268vzKvVoLaEeaAJRqgSxrBPAwrQF4RUCAcP9lqewtPM0yrQW0G00ASrVAfQfwIO0A9pppQxNJTejKfK0FtBtNAEq1QFZ+MYN6agewNwUECA9M0lpAe9IEoFQz/WsEsDb/eJtzLUAfG9n2NAEo1Ux5J8s5daZaHwLfBr7XF7BNawFtTROAUs2UlW91AOsVQG1iWpqjFvDcZ1oLaGuaAJRqpqz8YoIDtQO4rQRqLaDdaAJQqpm2WR3AoUHaAdxWpqUlco72BbQ5TQBKNYMxhkx9BnCbq68F7Ck4zcfbjtodTqelCUCpZsg7WU5xuXYAt4crrVrAcyv3aC2gjXiUAERkiojsFpFsEZnnZrmIyHxreaaIjLTmh4nIehHZKiLbReQxp3ViRGSFiOy1/nb33m4p1TYyG0YAR9sbiB8IDBDuu/QcrQW0oSYTgIgEAi8CU4EhwC0iMsSl2FQg1ZpmAQus+ZXApcaY4UA6MMV6aDzAPGClMSYVWGm9V6pDq+8AHtizq92h+IWrhvViQHwX7QtoI57UAEYD2caYHGNMFfA2MN2lzHRgsXFYB0SLSKL1/rRVJtiajNM6r1uvXweubcV+KNUutuUXM7hnlHYAt5P6voDdBaV8sl1rAd7mSQLoDRxyep9nzfOojIgEisgWoBBYYYz51irTwxhzBMD6m+Bu4yIyS0Q2isjGY8eOeRCuUm2jfgSwtv+3r/pagI4L8D5PEoC4mef6r9BoGWNMrTEmHUgCRovI0OYEaIxZaIzJMMZkxMfHN2dVpbzq0AlHB7BeAdS+tBbQdjxJAHlAH6f3SYDrs9uaLGOMOQWsBqZYswpEJBHA+lvoadBK2SEz/xSgI4DtcNWwXvTXvgCv8yQBbABSRSRFREKAm4GlLmWWArdZVwONBYqNMUdEJF5EogFEJByYBOxyWud26/XtwAet2xWl2lZWfjEhgQEM7KEjgNtbYIDwwGWp7DpaynKtBXhNkwnAGFMDzAGWAzuBd4wx20VktojMtootA3KAbOBl4B5rfiKwSkQycSSSFcaYj6xlTwCTRWQvMNl6r1SHtS2/mMGJkYQE6fAZO9TXAp7TWoDXBHlSyBizDMdJ3nneS06vDXCvm/UygRGNfGYRcFlzglXKLo5nABdz1fBedofitwIDhPsvTWXuki0s336UqWmJdofk8/SnjFIeOHjiDCUVNfoISJtdPbwX/eO0FuAtmgCU8kD9CGC9BNRe9VcE7Tpayqc7tC+gtTQBKOWBbdoB3GHU1wKe1XEBraYJQCkPZOUXc652AHcIgQHCfZedo7UAL9Bvs1JN0BHAHc/Vw+r7ArK1FtAKmgCUasKBojOUVtToALAOJCgwgPsuO4edR0r4dEeB3eH4LE0ASjUhM187gDuiq4f1IkWvCGoVTQBKNWFbfjEhQdoB3NEEBQZw36VaC2gNTQBKNSErr5hzE6MIDtT/Lh3NNcMdtYD5K/fiGI+qmkO/0UqdRV2dYVt+MWm9o+wORblRXwvYobWAFtEEoNRZHDhxhtLKGn0EZAd2zfBeJMdG8NxnWgtoLk0ASp3Fmj2OhxCl6RVAHZajFpCqtYAW0ASgVCNOV9bw/OfZjEruzuCe2gHckU1P11pAS2gCUKoRC9fkcPx0Jb+edi4i7h56pzoK51rAx9t0dLCnNAEo5UZBSQUvr8nhqmGJjOjb3e5wlAemp/dicM9IHv6/razff8LucHyCJgCl3Hj60z3U1NXxyBWD7Q5FeSgoMIDFPx1NYrcwZr66nm9ziuwOqcPTBKCUi11HS3hn0yFuvyCZvrERdoejmiEhMoy3Zo2lV3Q4d7y2QZNAEzxKACIyRUR2i0i2iMxzs1xEZL61PFNERlrz+4jIKhHZKSLbReQBp3UeFZF8EdliTdO8t1tKtdzvl+0iMjSIOZeeY3coqgUSIsP4291j6BUdzsxXN7BOk0CjmkwAIhIIvAhMBYYAt4jIEJdiU4FUa5oFLLDm1wA/N8acC4wF7nVZ9xljTLo1fe+Rk0rZYc2eY3yx5xj3X5ZKdESI3eGoFkqIDOOtu8fSu3s4d7y6gW/2aRJwx5MawGgg2xiTY4ypAt4GpruUmQ4sNg7rgGgRSTTGHDHGbAYwxpTieKh8by/Gr5TX1NYZ/mfZTvrEhHPrBf3sDke1UnxkKG/dPZak7uHc+ZomAXc8SQC9gUNO7/P44Um8yTIikozjAfHfOs2eYzUZLRIRt5daiMgsEdkoIhuPHTvmQbhKtcz7m/PYdbSUR64YTGhQoN3hKC+Ijwzlb1YSuOO19ZoEXHiSANxdAO060uKsZUSkK/AeMNcYU2LNXgAMANKBI8Af3W3cGLPQGJNhjMmIj4/3IFylmq+8qpanPt3N8D7RXDUs0e5wlBfFR4by1qyx9I2J4I7X1vP1vuN2h9RheJIA8oA+Tu+TgMOelhGRYBwn/zeNMe/XFzDGFBhjao0xdcDLOJqalLLFK1/mUFBSyW900FenFNfVURPoF9OFO1/bwNfZmgTAswSwAUgVkRQRCQFuBpa6lFkK3GZdDTQWKDbGHBHH/6RXgJ3GmKedVxAR559Z1wHbWrwXSrXCsdJKFqzex+VDejA6JcbucFQbcSSBMY4k8PoGvtIk0HQCMMbUAHOA5Tg6cd8xxmwXkdkiMtsqtgzIAbJx/Jq/x5o/HrgVuNTN5Z5PikiWiGQClwAPem2vlGqG51buobKmjnlTddBXZxdrJYHkWEdNwN+TgPjSjZMyMjLMxo0b7Q5DdSLZhae54tk1/HhMX347fajd4ah2UnS6kh//5Vv2Hy/jldtHcWFqnN0htSkR2WSMyXCdryOBlV974uNdhAcH8sBlqXaHotpRrNUnkBLXhZ++voG1e/3zCkNNAMpvrcsp4rOdBdxzyQBiu4baHY5qZzFdQhqSwF2vb2x49oM/0QSg/FKdNeirV7cw7hyfYnc4yib1SaB/fFfuWux/SUATgPJLH2YeJjOvmIevGERYsA768mcxXUL4211jGGAlgS/8KAloAlB+p6K6lic/2c15vaK4Nl3vTKKgu5UEzonvyt2LN7J6d6HdIbULTQDK77z+dS75p8r5zbRzCQjQQV/
KoXuXEN68awypCV2Z9cYm3t+c1+kfL6kJQPmVk2VVvLAqm0sGxTPunM596Z9qvvokMKx3Nx56Zys/fX0jR4rL7Q6rzWgCUH5l/ud7Kaus4VfTzrU7FNVBRUeEsORnF/AfV57L1/uOc/nTa3hr/cFOWRvQBKD8Ru7xMt745gAzRvVhYI9Iu8NRHVhggHDXRf1ZPncCQ3t341fvZ/Hjv3zLwaIzdofmVZoAlN94cvkuQoICeHDSQLtDUT6iX2wX3rxrDP9zXRqZecVc8ewaFn25n9q6zlEb0ASg/MKmAydYlnWUWRP6kxAVZnc4yocEBAj/NqYvnz44gbH9Y/jtRzu48aWvyS4stTu0VtMEoDo9YwyP/3MnCZGhzJrQ3+5wlI/qFR3OopmjePqm4eQcL2Pac1/y4qpsqmvr7A6txTQBqE7v421H2XzwFA9NHkhESJDd4SgfJiJcPzKJFQ9ezKQhCfzv8t1c++JXbD9cbHdoLaIJQHVqVTV1/OGTXQzqEcmNGX2aXkEpD8RHhvKnH5/PSz8ZSUFJJdNf+Iqnlu+msqbW7tCaRROA6tT+uu4AB4rOMG/aYAJ10JfysilDE/nsoQlck96LF1Zlc9X8L9l88KTdYXlM68Oq0your2b+53u58Jw4Jg7U50mrthEdEcLTN6Vz9fBe/Ob9LH604GvuHJ/Cw5cPIjyk6ftMGWMorayhsKSCo8WVHC2poKCkgqPFFRwtqXDML6ng+VtGev2JdZoAVKf1p1XZFJdX86tpg/U5v6rNXTIogeUPTuAPn+zilS/3s2JHAb+/Po2UuC6Ok7p1Qnec1Cs5Wmyd6EsqOFP1w6ajbuHB9IwKIyEqlIE9IokM8/7p2qMngonIFOA5IBD4izHmCZflYi2fBpwBZhpjNotIH2Ax0BOoAxYaY56z1okBlgDJQC5wkzHmrHUnfSKYasrpyho25J7g25wTLPpqP1cP68Ufbxpud1jKz3yzr4h572dywM3AseBAISEyjJ7dwugZFUaPqDB6dgulR/1r668ntQdPNfZEsCZTiogEAi8Ck4E8YIOILDXG7HAqNhVItaYxwALrbw3wcysZRAKbRGSFte48YKUx5gkRmWe9/2Wr9lL5ndKKajbmnmRdThHrcorYdriE2jpDcKCQ0S+GR6YMsjtE5YcuGBDLJw9M4L3NeQSINJzge0aF0T0ipMPchNCTOsVoINsYkwMgIm8D0wHnBDAdWGwc1Yl1IhItIonGmCPAEQBjTKmI7AR6W+tOByZa678OrEYTgGpCcXk1G3NPsC6niG/3n2BbfjF1xvGrKr1PNPdMHMCYlFhG9ovWSz6VrcJDAvnJ2H52h3FWnvwP6Q0ccnqfh+PXfVNlemOd/AFEJBkYAXxrzephJQiMMUdEJMHdxkVkFjALoG/fvh6EqzqT4jPVrG844Rex/XAJxkBIYADpfaOZc2kqY1NiGNG3u1erzEr5A08SgLu6imvHwVnLiEhX4D1grjGmxPPwwBizEFgIjj6A5qyrfENtnaG4vJqTZ6o4WVZFYWllQ7POzqPWCT8ogJF9o3ngslTGpMQyom+0PslLqVbyJAHkAc4jaJKAw56WEZFgHCf/N40x7zuVKahvJhKRRMA/HsHTyVXX1nHqzL9O5ifPVHHye++rfzC/uLwa12sRQoMCOL9fdx6cNJAxKTEM76MnfKW8zZMEsAFIFZEUIB+4Gfg3lzJLgTlW/8AYoNg6sQvwCrDTGPO0m3VuB56w/n7Q8t1QdimrrGHt3uN8trOAL/Yc41hpZaNlw4MD6R4RTPcuIXSPCKF39wi6RwQTHRFCjNP8mC4hpPboSmiQnvCVaktNJgBjTI2IzAGW47gMdJExZruIzLaWvwQsw3EJaDaOy0DvsFYfD9wKZInIFmver40xy3Cc+N8RkZ8CB4EbvbZXqk0VlFSwcmchK3Yc5at9RVTV1BEVFsTEQQkMiO9KTBfrpN4lhOiIYGKsE7v+gleqY/FoHEBHoeMA7GGMYdfRUj7bUcBnOwvYmue48VWfmHAmn9uTyUN6kJHcneBAvbOIUh1Ri8cBKP9UVVPH+v0n+GxnASt2FJB/qhwRSO8TzS+uGMTkIT1ITeiqI2yV8mGaAFSD4vJqVu8u5LOdhazeXUhpRQ2hQQFclBrH/ZedwyWDE0iI1IepKNVZaALwY6cra9ieX8zWvFOs3n2M9ftPUFNniOsawrShiUwa0oMLz4nT6+uV6qQ0AfiJM1U1bD9cQlZeMVn5xWTmnSLneFnD5ZfnJHTl7gn9mTykB+lJ0R1mqLpSqu1oAuiEyqtq2XGkhKy8U2TmF7Mtv5jswtPUP8e6R1Qoab27cc3w3qQlRTG0dzdt2lHKD2kCaGfGGI6VVnK6soaggAACA4WgACEwwPlvQMP7pn6JV1TXsvNICVn5xQ2/7vcWnqbWOtvHdQ1lWFI3pg5NJK13N9KSutFDH4qulEITQJswxlBYWsn+42UcKCpj//EzHCgqI7fI8dfdvb8bEyB8LyE4J4xAEQpLK6mxTvaxXUJIS+rG5CE9Gk72PaPC9EodpZRbmgBaqK7O5SRfVMaB42fILSrjQNEZyqv/dZIPChD6xkSQHNeFsf1jSI7tQrfwYGrrDDV1ddTUGcfrWutvnaHWef73lv9rfnWtoUdUKMOSoklL6kavbnqyV0p5ThNAM6zaXcjb6w+Se/wMB06UUVFd17AsONA6ycd2YdyAOFLiIugX24Xk2C70ig4jSAdJKaU6GE0AHtp5pITZb2yie0QIQ3tHcVFqHP3iupAcG2Gd5MP1oeNKKZ+iCcADpytruPfNzUSFB/PhfRcSHxlqd0hKKdVq2i7RBGMMv34/i9yiMp6/ZYSe/JVSnYYmgCa8tf4QS7ce5qHJAxnbP9bucJRSyms0AZzF9sPFPPrhdiYMjOeeiefYHY5SSnmVJoBGlFZUc++bm+keEcwzNw3XWyMopTod7QR2wxjDvPezOHSynLfuHktsV233V0p1PloDcOOv3x7kn5lH+PnlAxmdEmN3OEop1SY8SgAiMkVEdotItojMc7NcRGS+tTxTREY6LVskIoUiss1lnUdFJF9EtljTtNbvTuttyy/mdx/u4JJB8cyeMMDucJRSqs00mQBEJBB4EZgKDAFuEZEhLsWmAqnWNAtY4LTsNWBKIx//jDEm3ZqWNTN2ryupqOaeNzcT2zWEP96Uru3+SqlOzZMawGgg2xiTY4ypAt4GpruUmQ4sNg7rgGgRSQQwxqwBTngz6LZgjOGX72aSf6qc528ZQUyXELtDUkqpNuVJAugNHHJ6n2fNa24Zd+ZYTUaLRKS7uwIiMktENorIxmPHjnnwkS3z+te5fLztKI9cMYiMZG33V0p1fp4kAHftIKYFZVwtAAYA6cAR4I/uChljFhpjMowxGfHx8U18ZMtsPXSKx5ft5LLBCdx9Uf822YZSSnU0niSAPKCP0/sk4HALynyPMabAGFNrjKkDXsbR1NTuisurufdvm0mIDOOPer2/UsqPeJIANgCpIpIiIiHAzcBSlzJLgdusq4HGAsXGmCNn+9D6Pg
LLdcC2xsq2FWMMj7y7laPFFTz/byOIjtB2f6WU/2hyIJgxpkZE5gDLgUBgkTFmu4jMtpa/BCwDpgHZwBngjvr1ReQtYCIQJyJ5wH8bY14BnhSRdBxNRbnAz7y3W55Z9FUuy7cX8B9XnsvIvm67IJRSqtMSY5pqqu84MjIyzMaNG73yWd8dPMmNL33DJYMTWHjr+fokLaVUpyUim4wxGa7z/XIk8KkzVcz523f07BbGUzcM15O/Usov+d29gIwxPPx/WyksreDd2ePoFhFsd0hKKWULv6sB/GXtfj7bWcivp53L8D7RdoejlFK28asEsOnASf7wyS6mnNeTmeOS7Q5HKaVs5TcJ4GRZFff9bTOJ0WH84YZh2u6vlPJ7ftEHUFdneOidLRw/XcV7/z6ObuHa7q+UUn5RA1i4NodVu4/xH1edS1pSN7vDUUqpDsEvEkBitzBuPD+JW8f2szsUpZTqMPyiCWh6em+mp3tyc1KllPIfflEDUEop9UOaAJRSyk9pAlBKKT+lCUAppfyUJgCllPJTmgCUUspPaQJQSik/pQlAKaX8lE89EUxEjgEHWrh6HHDci+G0B4257flavKAxtxdfi/ls8fYzxsS7zvSpBNAaIrLR3SPROjKNue35WrygMbcXX4u5JfFqE5BSSvkpTQBKKeWn/CkBLLQ7gBbQmNuer8ULGnN78bWYmx2v3/QBKKWU+j5/qgEopZRyoglAKaX8lM8mABHpIyKrRGSniGwXkQes+TEiskJE9lp/uzut8ysRyRaR3SJyhdP880Uky1o2X/SJ8Q28fJxXW/O2WFOCHfvU0TT3GItIrFX+tIi84PJZ+l1uhJePc+f4LhtjfHICEoGR1utIYA8wBHgSmGfNnwf8wXo9BNgKhAIpwD4g0Fq2HrgAEOBjYKrd+9dRJi8f59VAht371NGmFhzjLsCFwGzgBZfP0u9y+xznTvFd9tkagDHmiDFms/W6FNgJ9AamA69bxV4HrrVeTwfeNsZUGmP2A9nAaBFJBKKMMd8Yx7/sYqd1/J63jnO7Bu1jmnuMjTFlxpgvgQrnz9Hv8tl56zh3Jj6bAJyJSDIwAvgW6GGMOQKOf3CgvmrWGzjktFqeNa+39dp1vnLRyuNc71Wryvyf2jzxQx4e48bod9lDrTzO9Xz+u+zzCUBEugLvAXONMSVnK+pmnjnLfOXEC8cZ4MfGmDTgImu61btR+rZmHONGP8LNPP0uu/DCcYZO8l326QQgIsE4/iHfNMa8b80usKrC9VXiQmt+HtDHafUk4LA1P8nNfGXx0nHGGJNv/S0F/oY2DTVo5jFujH6Xm+Cl49xpvss+mwCsKtcrwE5jzNNOi5YCt1uvbwc+cJp/s4iEikgKkAqst6p8pSIy1vrM25zW8XveOs4iEiQicdZnBgNXAdvaYx86uhYcY7f0u3x23jrOneq7bHcvdEsnHL3zBsgEtljTNCAWWAnstf7GOK3zGxxXpezG6eoIIAPHP+A+4AWsEdI6ee8447iiYpP1OduB57CuDvL3qYXHOBc4AZzG8ct/iDVfv8ttfJw703dZbwWhlFJ+ymebgJRSSrWOJgCllPJTmgCUUspPaQJQSik/pQlAKaX8lCYApZyIw5ciMtVp3k0i8ombsrnWnTczReQLEenXvtEq1Tp6GahSLkRkKPB/OO4VE4jjevEpxph91nLBcduFHBx3hDwuIo8BvYwxd9sTtVLNpzUApVwYY7YBHwK/BP4bx101a637yP8J2Mz3b3cB8A3WjddEJF5E3hORDdY03pr/qIi8ISKfW/ee12ShbBVkdwBKdVCP4TjRV+EYXZsIDALuMMbcA+ByA8gpwD+s188BzxhjvhSRvsBy4Fxr2TBgLI7RpN+JyD+NMXq/HmULTQBKuWGMKRORJcBpY0yldbI/YIxZ51J0lYj0wHEDsf+w5k0ChjgliCgRibRef2CMKQfKRWQVjpuI/aMNd0WpRmkTkFKNq7OmemVuylwC9MNxT5jfWvMCgAuMMenW1Ns47hoJP7w9s3bCKdtoAlCqlaxf9HOB20QkBvgUmFO/XETSnYpPF5EwEYkFJgIb2i9Spb5PE4BSXmAct2J+C7gXuB/IsC4P3YHjmbL11gP/BNYBv9P2f2UnvQxUqXYiIo/i6FN4yu5YlAKtASillN/SGoBSSvkprQEopZSf0gSglFJ+ShOAUkr5KU0ASinlpzQBKKWUn/r/2tqBlHVulpEAAAAASUVORK5CYII=\n", 578 | "text/plain": [ 579 | "
" 580 | ] 581 | }, 582 | "metadata": { 583 | "needs_background": "light" 584 | }, 585 | "output_type": "display_data" 586 | } 587 | ], 588 | "source": [ 589 | "df_per_yr_summary=df_per_yr_summary.compute()\n", 590 | "df_per_yr_summary['DelinquencyEventsperLoan']=df_per_yr_summary['DelinquentEvent']/df_per_yr_summary['LoanID']\n", 591 | "df_per_yr_summary.to_pandas().sort_values(by=['YrRep']).plot.line(x='YrRep',y='DelinquencyEventsperLoan')" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "## Creating modeling dataset with label & features" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "metadata": {}, 604 | "source": [ 605 | "Aggregate to one record per loan & flagging for delinquency event at least once historically" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": 48, 611 | "metadata": {}, 612 | "outputs": [ 613 | { 614 | "data": { 615 | "application/vnd.jupyter.widget-view+json": { 616 | "model_id": "c5119883ea8b475ea25b1a4607516b8c", 617 | "version_major": 2, 618 | "version_minor": 0 619 | }, 620 | "text/plain": [ 621 | "VBox()" 622 | ] 623 | }, 624 | "metadata": {}, 625 | "output_type": "display_data" 626 | } 627 | ], 628 | "source": [ 629 | "# This takes ~ 12-15 min with 2 A100 GPUs on 1.8B rows\n", 630 | "df_per_loan = df_per.groupby('LoanID',as_index=False).agg({'DelinquentEvent':'sum'}).persist()\n", 631 | "progress(df_per_loan)" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 49, 637 | "metadata": {}, 638 | "outputs": [ 639 | { 640 | "data": { 641 | "text/html": [ 642 | "
\n", 643 | "\n", 656 | "\n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | "
LoanIDDelinquentEvent
01856480463450
19401609929790
23382534808200
33271800327370
42870960473570
\n", 692 | "
" 693 | ], 694 | "text/plain": [ 695 | " LoanID DelinquentEvent\n", 696 | "0 185648046345 0\n", 697 | "1 940160992979 0\n", 698 | "2 338253480820 0\n", 699 | "3 327180032737 0\n", 700 | "4 287096047357 0" 701 | ] 702 | }, 703 | "execution_count": 49, 704 | "metadata": {}, 705 | "output_type": "execute_result" 706 | } 707 | ], 708 | "source": [ 709 | "df_per_loan.head()" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 51, 715 | "metadata": {}, 716 | "outputs": [ 717 | { 718 | "data": { 719 | "text/html": [ 720 | "
\n", 721 | "\n", 734 | "\n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | "
LoanIDDelinquentEventDelinquentFlag
673684285405511
11858243020041151
24229906838180261
3338113715052911
3576984695385071
\n", 776 | "
" 777 | ], 778 | "text/plain": [ 779 | " LoanID DelinquentEvent DelinquentFlag\n", 780 | "6 736842854055 1 1\n", 781 | "11 858243020041 15 1\n", 782 | "24 229906838180 26 1\n", 783 | "33 381137150529 1 1\n", 784 | "35 769846953850 7 1" 785 | ] 786 | }, 787 | "execution_count": 51, 788 | "metadata": {}, 789 | "output_type": "execute_result" 790 | } 791 | ], 792 | "source": [ 793 | "df_per_loan['DelinquentFlag']=0\n", 794 | "df_per_loan['DelinquentFlag']=df_per_loan['DelinquentFlag'].where(df_per_loan['DelinquentEvent']<1,1)\n", 795 | "df_per_loan[df_per_loan['DelinquentFlag']> 0].head()" 796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": 52, 801 | "metadata": {}, 802 | "outputs": [ 803 | { 804 | "data": { 805 | "text/html": [ 806 | "
\n", 807 | "\n", 820 | "\n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | "
LoanIDChannelSellerNameOrInterestRateOrUnpaidPrincOrLoanTermOrDateFirstPaymentOrLTVOrCLTV...OccStatusPropertyStateZipMortInsPercProductTypeCoCreditScoreMortInsTypeRelMortIndDelinquentEventDelinquentFlag
0183242426281RJPMORGAN CHASE BANK, NA8.375137000.0360.001/200003/200097.0<NA>...POR97430.0FRM<NA>1.0N00
1183243108339RWELLS FARGO BANK, N.A.8.000145000.0360.011/199901/200080.0<NA>...PIL604<NA>FRM<NA><NA>Y11
2183245218791RSUNTRUST MORTGAGE INC.8.000100000.0180.001/200003/200033.0<NA>...PMD210<NA>FRM772.0<NA>N00
3183249720870RUSAA FEDERAL SAVINGS BANK8.000212000.0360.001/200003/200095.0<NA>...PPA15130.0FRM695.01.0N00
4183251824837ROTHER8.37556000.0360.002/200004/200048.0<NA>...PLA700<NA>FRM<NA><NA>N00
\n", 970 | "

5 rows × 27 columns

\n", 971 | "
" 972 | ], 973 | "text/plain": [ 974 | " LoanID Channel SellerName OrInterestRate \\\n", 975 | "0 183242426281 R JPMORGAN CHASE BANK, NA 8.375 \n", 976 | "1 183243108339 R WELLS FARGO BANK, N.A. 8.000 \n", 977 | "2 183245218791 R SUNTRUST MORTGAGE INC. 8.000 \n", 978 | "3 183249720870 R USAA FEDERAL SAVINGS BANK 8.000 \n", 979 | "4 183251824837 R OTHER 8.375 \n", 980 | "\n", 981 | " OrUnpaidPrinc OrLoanTerm OrDate FirstPayment OrLTV OrCLTV ... \\\n", 982 | "0 137000.0 360.0 01/2000 03/2000 97.0 ... \n", 983 | "1 145000.0 360.0 11/1999 01/2000 80.0 ... \n", 984 | "2 100000.0 180.0 01/2000 03/2000 33.0 ... \n", 985 | "3 212000.0 360.0 01/2000 03/2000 95.0 ... \n", 986 | "4 56000.0 360.0 02/2000 04/2000 48.0 ... \n", 987 | "\n", 988 | " OccStatus PropertyState Zip MortInsPerc ProductType CoCreditScore \\\n", 989 | "0 P OR 974 30.0 FRM \n", 990 | "1 P IL 604 FRM \n", 991 | "2 P MD 210 FRM 772.0 \n", 992 | "3 P PA 151 30.0 FRM 695.0 \n", 993 | "4 P LA 700 FRM \n", 994 | "\n", 995 | " MortInsType RelMortInd DelinquentEvent DelinquentFlag \n", 996 | "0 1.0 N 0 0 \n", 997 | "1 Y 1 1 \n", 998 | "2 N 0 0 \n", 999 | "3 1.0 N 0 0 \n", 1000 | "4 N 0 0 \n", 1001 | "\n", 1002 | "[5 rows x 27 columns]" 1003 | ] 1004 | }, 1005 | "execution_count": 52, 1006 | "metadata": {}, 1007 | "output_type": "execute_result" 1008 | } 1009 | ], 1010 | "source": [ 1011 | "joined=df_acq.merge(df_per_loan,on=['LoanID'],how='left')\n", 1012 | "joined.head()" 1013 | ] 1014 | }, 1015 | { 1016 | "cell_type": "code", 1017 | "execution_count": 90, 1018 | "metadata": {}, 1019 | "outputs": [], 1020 | "source": [ 1021 | "label=['DelinquentFlag']\n", 1022 | "cat_features=['Channel','OccStatus','FTHomeBuyer','LoanPurpose','PropertyType','ProductType','RelMortInd']\n", 1023 | "num_features=['OrInterestRate','OrUnpaidPrinc','OrLoanTerm','OrLTV','OrCLTV','CreditScore']\n", 1024 | "modeling_dataset=joined_categorized[cat_features + num_features + label]" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": 91, 1030 | "metadata": {}, 1031 | "outputs": [], 1032 | "source": [ 1033 | "modeling_dataset.dtypes" 1034 | ] 1035 | }, 1036 | { 1037 | "cell_type": "code", 1038 | "execution_count": 99, 1039 | "metadata": {}, 1040 | "outputs": [], 1041 | "source": [ 1042 | "modeling_dataset=dask_df.get_dummies(modeling_dataset)" 1043 | ] 1044 | }, 1045 | { 1046 | "cell_type": "markdown", 1047 | "metadata": {}, 1048 | "source": [ 1049 | "## Training a model" 1050 | ] 1051 | }, 1052 | { 1053 | "cell_type": "code", 1054 | "execution_count": 101, 1055 | "metadata": {}, 1056 | "outputs": [], 1057 | "source": [ 1058 | "X = modeling_dataset[modeling_dataset.columns.difference(['DelinquentFlag'])]\n", 1059 | "y = modeling_dataset['DelinquentFlag']" 1060 | ] 1061 | }, 1062 | { 1063 | "cell_type": "code", 1064 | "execution_count": 102, 1065 | "metadata": {}, 1066 | "outputs": [], 1067 | "source": [ 1068 | "dtrain=xgb.dask.DaskDeviceQuantileDMatrix(client, X,y)" 1069 | ] 1070 | }, 1071 | { 1072 | "cell_type": "code", 1073 | "execution_count": 104, 1074 | "metadata": {}, 1075 | "outputs": [], 1076 | "source": [ 1077 | "param = {\n", 1078 | " 'max_depth': 8,\n", 1079 | " 'objective': 'reg:squarederror',\n", 1080 | " 'tree_method': 'gpu_hist'\n", 1081 | " }\n", 1082 | "model = xgb.dask.train(client,param, dtrain,num_boost_round=100)" 1083 | ] 1084 | }, 1085 | { 1086 | "cell_type": "code", 1087 | "execution_count": 110, 1088 | "metadata": {}, 1089 | "outputs": [], 1090 | "source": [ 1091 | "## TODO --- Add in metrics, feature importance, 
& evaluation plots" 1092 | ] 1093 | }, 1094 | { 1095 | "cell_type": "markdown", 1096 | "metadata": {}, 1097 | "source": [ 1098 | "## Appendix" 1099 | ] 1100 | } 1101 | ], 1102 | "metadata": { 1103 | "environment": { 1104 | "name": "common-cu110.m61", 1105 | "type": "gcloud", 1106 | "uri": "gcr.io/deeplearning-platform-release/base-cu110:m61" 1107 | }, 1108 | "kernelspec": { 1109 | "display_name": "Python [conda env:rapids-0.17]", 1110 | "language": "python", 1111 | "name": "conda-env-rapids-0.17-py" 1112 | }, 1113 | "language_info": { 1114 | "codemirror_mode": { 1115 | "name": "ipython", 1116 | "version": 3 1117 | }, 1118 | "file_extension": ".py", 1119 | "mimetype": "text/x-python", 1120 | "name": "python", 1121 | "nbconvert_exporter": "python", 1122 | "pygments_lexer": "ipython3", 1123 | "version": "3.7.8" 1124 | } 1125 | }, 1126 | "nbformat": 4, 1127 | "nbformat_minor": 4 1128 | } 1129 | -------------------------------------------------------------------------------- /higgs/rapids_xgboost/notebooks/a100_higgs_rapids_xgboost.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#!conda create -n rapids-0.16 -c rapidsai -c nvidia -c conda-forge -c defaults rapids=0.16 python=3.7 cudatoolkit=11.0" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%%bash\n", 17 | "nvidia-smi\n", 18 | "nvcc --version" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import numpy as np; print('numpy Version:', np.__version__)\n", 28 | "import pandas as pd; print('pandas Version:', pd.__version__)\n", 29 | "import xgboost as xgb; print('XGBoost Version:', xgb.__version__)\n", 30 | "import cudf; print('cudf Version:', cudf.__version__)\n", 31 | "import cuml; print('cudf Version:', cuml.__version__)\n", 32 | "import gcsfs; print('gcsfs Version:', gcsfs.__version__)\n", 33 | "import time\n", 34 | "import dask_cudf; print('dask_cudf Version:', gcsfs.__version__)\n", 35 | "import dask; print('dask Version:', gcsfs.__version__)\n", 36 | "import dask.dataframe as dask_df" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Download HIGGs dataset & unzip\n", 44 | "https://archive.ics.uci.edu/ml/datasets/HIGGS" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# %%bash\n", 54 | "# wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz -P /home/jupyter/\n", 55 | "# gzip -d /home/jupyter/HIGGS.csv.gz /home/jupyter/\n", 56 | "# ls -lh /home/jupyter/" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "colnames = ['label'] + ['feature-%02d' % i for i in range(1, 29)]\n", 66 | "#filname = '/home/jupyter/HIGGS.csv'\n", 67 | "filname = 'gs://mchrestkha-github-ml-examples/higgs/HIGGS.csv'" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Pandas" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "start_time = time.time()\n", 84 | "df=pd.read_csv(filname, header=None, names=colnames)\n", 85 | "print(\"[INFO]: ------ Data Ingestion is 
completed in {} seconds ---\".format((time.time() - start_time)))\n", 86 | "start_time = time.time()\n", 87 | "X = df[df.columns.difference(['label'])]\n", 88 | "y = df['label']\n", 89 | "dtrain=xgb.DMatrix(X,y)\n", 90 | "print(\"[INFO]: ------ DMatrix is completed in {} seconds ---\".format((time.time() - start_time)))\n", 91 | "\n", 92 | "\n", 93 | "start_time = time.time()\n", 94 | "param = {\n", 95 | " 'max_depth': 8,\n", 96 | " 'objective': 'reg:squarederror',\n", 97 | " 'tree_method': 'hist'\n", 98 | " }\n", 99 | "bst = xgb.train(param, dtrain,num_boost_round=100)\n", 100 | "print(\"[INFO]: ------ Training is completed in {} seconds ---\".format((time.time() - start_time)))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## cuDF" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "start_time = time.time()\n", 117 | "df=cudf.read_csv(filname, header=None, names=colnames)\n", 118 | "print(\"[INFO]: ------ Data Ingestion is completed in {} seconds ---\".format((time.time() - start_time)))\n", 119 | "start_time = time.time()\n", 120 | "X = df[df.columns.difference(['label'])]\n", 121 | "y = df['label']\n", 122 | "dtrain=xgb.DMatrix(X,y)\n", 123 | "print(\"[INFO]: ------ DMatrix is completed in {} seconds ---\".format((time.time() - start_time)))\n", 124 | "\n", 125 | "start_time = time.time()\n", 126 | "param = {\n", 127 | " 'max_depth': 8,\n", 128 | " 'objective': 'reg:squarederror',\n", 129 | " 'tree_method': 'gpu_hist'\n", 130 | " }\n", 131 | "bst = xgb.train(param, dtrain,num_boost_round=100)\n", 132 | "print(\"[INFO]: ------ Training is completed in {} seconds ---\".format((time.time() - start_time)))" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## Dask" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "from dask.distributed import Client, LocalCluster\n", 149 | "cluster = LocalCluster()\n", 150 | "# num_workders=2\n", 151 | "# threads_per_worker=12\n", 152 | "# cluster = LocalCluster(n_workers=num_workders, threads_per_worker=threads_per_worker)\n", 153 | "\n", 154 | "client = Client(cluster)\n", 155 | "client" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "start_time = time.time()\n", 165 | "df=dask_df.read_csv(filname, header=None, names=colnames)\n", 166 | "df=df.persist()\n", 167 | "print(\"[INFO]: ------ Data Ingestion is completed in {} seconds ---\".format((time.time() - start_time)))\n", 168 | "# start_time = time.time()\n", 169 | "X = df[df.columns.difference(['label'])]\n", 170 | "y = df['label']\n", 171 | "dtrain=xgb.dask.DaskDMatrix(client,X,y)\n", 172 | "\n", 173 | "del df\n", 174 | "del X\n", 175 | "del y\n", 176 | "\n", 177 | "print(\"[INFO]: ------ DMatrix is completed in {} seconds ---\".format((time.time() - start_time)))\n", 178 | "\n", 179 | "start_time = time.time()\n", 180 | "param = {\n", 181 | " 'max_depth': 8,\n", 182 | " 'objective': 'reg:squarederror',\n", 183 | " 'tree_method': 'hist'\n", 184 | " }\n", 185 | "bst = xgb.dask.train(client,param, dtrain,num_boost_round=100)\n", 186 | "print(\"[INFO]: ------ Training is completed in {} seconds ---\".format((time.time() - start_time)))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | 
"metadata": {}, 192 | "source": [ 193 | "## Dask_cuDF" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "from dask_cuda import LocalCUDACluster\n", 203 | "from dask.distributed import Client\n", 204 | "# Create a Dask Cluster with one worker per GPU\n", 205 | "num_workders=2\n", 206 | "threads_per_worker=12\n", 207 | "cluster = LocalCUDACluster(n_workers=num_workders, threads_per_worker=threads_per_worker)\n", 208 | "#cluster = LocalCUDACluster()\n", 209 | "\n", 210 | "client = Client(cluster)\n", 211 | "client" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "start_time = time.time()\n", 221 | "df=dask_cudf.read_csv(filname, header=None, names=colnames)\n", 222 | "df=df.persist()\n", 223 | "print(\"[INFO]: ------ Data Ingestion is completed in {} seconds ---\".format((time.time() - start_time)))\n", 224 | "# start_time = time.time()\n", 225 | "X = df[df.columns.difference(['label'])]\n", 226 | "y = df['label']\n", 227 | "#dtrain=xgb.dask.DaskDMatrix(client,X,y)\n", 228 | "dtrain=xgb.dask.DaskDeviceQuantileDMatrix(client, X,y)\n", 229 | "\n", 230 | "del df\n", 231 | "del X\n", 232 | "del y\n", 233 | "\n", 234 | "print(\"[INFO]: ------ DMatrix is completed in {} seconds ---\".format((time.time() - start_time)))\n", 235 | "\n", 236 | "start_time = time.time()\n", 237 | "param = {\n", 238 | " 'max_depth': 8,\n", 239 | " 'objective': 'reg:squarederror',\n", 240 | " 'tree_method': 'gpu_hist'\n", 241 | " }\n", 242 | "bst = xgb.dask.train(client,param, dtrain,num_boost_round=100)\n", 243 | "print(\"[INFO]: ------ Training is completed in {} seconds ---\".format((time.time() - start_time)))" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "from dask_cuda import LocalCUDACluster\n", 253 | "from dask.distributed import Client\n", 254 | "# Create a Dask Cluster with one worker per GPU\n", 255 | "# num_workders=2\n", 256 | "# threads_per_worker=12\n", 257 | "# cluster = LocalCUDACluster(n_workers=num_workders, threads_per_worker=threads_per_worker)\n", 258 | "cluster = LocalCUDACluster()\n", 259 | "\n", 260 | "client = Client(cluster)\n", 261 | "client\n", 262 | "\n", 263 | "start_time = time.time()\n", 264 | "df=dask_cudf.read_csv(filname, header=None, names=colnames)\n", 265 | "df=df.persist()\n", 266 | "print(\"[INFO]: ------ Data Ingestion is completed in {} seconds ---\".format((time.time() - start_time)))\n", 267 | "# start_time = time.time()\n", 268 | "X = df[df.columns.difference(['label'])]\n", 269 | "y = df['label']\n", 270 | "#dtrain=xgb.dask.DaskDMatrix(client,X,y)\n", 271 | "dtrain=xgb.dask.DaskDeviceQuantileDMatrix(client, X,y)\n", 272 | "\n", 273 | "del df\n", 274 | "del X\n", 275 | "del y\n", 276 | "\n", 277 | "print(\"[INFO]: ------ DMatrix is completed in {} seconds ---\".format((time.time() - start_time)))\n", 278 | "\n", 279 | "start_time = time.time()\n", 280 | "param = {\n", 281 | " 'max_depth': 8,\n", 282 | " 'objective': 'reg:squarederror',\n", 283 | " 'tree_method': 'gpu_hist'\n", 284 | " }\n", 285 | "bst = xgb.dask.train(client,param, dtrain,num_boost_round=100)\n", 286 | "print(\"[INFO]: ------ Training is completed in {} seconds ---\".format((time.time() - start_time)))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 
293 | "outputs": [], 294 | "source": [] 295 | } 296 | ], 297 | "metadata": { 298 | "environment": { 299 | "name": "common-cu110.m59", 300 | "type": "gcloud", 301 | "uri": "gcr.io/deeplearning-platform-release/base-cu110:m59" 302 | }, 303 | "kernelspec": { 304 | "display_name": "Python [conda env:rapids-0.16]", 305 | "language": "python", 306 | "name": "conda-env-rapids-0.16-py" 307 | }, 308 | "language_info": { 309 | "codemirror_mode": { 310 | "name": "ipython", 311 | "version": 3 312 | }, 313 | "file_extension": ".py", 314 | "mimetype": "text/x-python", 315 | "name": "python", 316 | "nbconvert_exporter": "python", 317 | "pygments_lexer": "ipython3", 318 | "version": "3.7.9" 319 | } 320 | }, 321 | "nbformat": 4, 322 | "nbformat_minor": 4 323 | } 324 | -------------------------------------------------------------------------------- /higgs/rapids_xgboost/notebooks/t4_higgs_rapids_xgboost.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "- Using the pre-build RAPIDS image on Google Cloud's AI Platform Notebooks with a T4 GPU, 8vCPUs, 30GB RAM\n", 8 | "- https://cloud.google.com/ai-platform/notebooks/docs/images#deciding\n", 9 | "- This should provide CUDA 10.0, rapids 0.12" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Sun Dec 6 06:43:44 2020 \n", 22 | "+-----------------------------------------------------------------------------+\n", 23 | "| NVIDIA-SMI 450.51.06 Driver Version: 450.51.06 CUDA Version: 11.0 |\n", 24 | "|-------------------------------+----------------------+----------------------+\n", 25 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 26 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 27 | "| | | MIG M. 
|\n", 28 | "|===============================+======================+======================|\n", 29 | "| 0 A100-SXM4-40GB Off | 00000000:00:04.0 Off | 0 |\n", 30 | "| N/A 40C P0 55W / 400W | 0MiB / 40537MiB | 0% Default |\n", 31 | "| | | Disabled |\n", 32 | "+-------------------------------+----------------------+----------------------+\n", 33 | " \n", 34 | "+-----------------------------------------------------------------------------+\n", 35 | "| Processes: |\n", 36 | "| GPU GI CI PID Type Process name GPU Memory |\n", 37 | "| ID ID Usage |\n", 38 | "|=============================================================================|\n", 39 | "| No running processes found |\n", 40 | "+-----------------------------------------------------------------------------+\n", 41 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 42 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 43 | "Built on Thu_Jun_11_22:26:38_PDT_2020\n", 44 | "Cuda compilation tools, release 11.0, V11.0.194\n", 45 | "Build cuda_11.0_bu.TC445_37.28540450_0\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "%%bash\n", 51 | "nvidia-smi\n", 52 | "nvcc --version" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "numpy Version: 1.18.5\n", 65 | "pandas Version: 1.1.4\n" 66 | ] 67 | }, 68 | { 69 | "ename": "ModuleNotFoundError", 70 | "evalue": "No module named 'xgboost'", 71 | "output_type": "error", 72 | "traceback": [ 73 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 74 | "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", 75 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'numpy Version:'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'pandas Version:'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mxgboost\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mxgb\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'XGBoost Version:'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxgb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcudf\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'cudf Version:'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcudf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcuml\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'cudf Version:'\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mcuml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 76 | "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'xgboost'" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "import numpy as np; print('numpy Version:', np.__version__)\n", 82 | "import pandas as pd; print('pandas Version:', pd.__version__)\n", 83 | "import xgboost as xgb; print('XGBoost Version:', xgb.__version__)\n", 84 | "import cudf; print('cudf Version:', cudf.__version__)\n", 85 | "import cuml; print('cudf Version:', cuml.__version__)\n", 86 | "import time" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Download HIGGs dataset & unzip\n", 94 | "https://archive.ics.uci.edu/ml/datasets/HIGGS" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "# %%bash\n", 104 | "# wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz -P /home/jupyter/" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Single Node with CPUs (PANDAS + XGBoost) or single GPU (RAPIDS-cuDF + XGBoost)\n", 112 | "- XGBoost w/ RAPIDS examples https://rapids.ai/xgboost.html\n", 113 | "\n", 114 | "### Expected CPUs numbers\n", 115 | "[INFO]: ------ Data Ingestion is completed in 104.7611632347107 seconds --- \n", 116 | "TOD0: Add Data transformation steps \n", 117 | "[INFO]: ------ Training is completed in 30.218074321746826 seconds ---\n", 118 | "\n", 119 | "#### Expected GPU numbers\n", 120 | "[INFO]: ------ Data Ingestion is completed in 18.212464094161987 seconds --- \n", 121 | "TOD0: Add Data transformation steps \n", 122 | "[INFO]: ------ Training is completed in 5.825598955154419 seconds ---" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "def xgboost_fun(gpu_cpu, tree_method):\n", 132 | " colnames = ['label'] + ['feature-%02d' % i for i in range(1, 29)]\n", 133 | " start_time = time.time()\n", 134 | " if gpu_cpu=='cpu':\n", 135 | " df=pd.read_csv('/home/jupyter/HIGGS.csv', header=None, names=colnames)\n", 136 | " else: \n", 137 | " df=cudf.read_csv('/home/jupyter/HIGGS.csv', header=None, names=colnames)\n", 138 | " print(\"[INFO]: ------ Data Ingestion is completed in {} seconds ---\".format((time.time() - start_time)))\n", 139 | "\n", 140 | " X = df[df.columns.difference(['label'])]\n", 141 | " y = df['label']\n", 142 | " dtrain=xgb.DMatrix(X,y)\n", 143 | " param = {\n", 144 | " 'max_depth': 8,\n", 145 | " 'objective': 'reg:squarederror',\n", 146 | " 'tree_method': tree_method\n", 147 | " }\n", 148 | "\n", 149 | " start_time = time.time()\n", 150 | " bst = xgb.train(param, dtrain)\n", 151 | " print(\"[INFO]: ------ Training is completed in {} seconds ---\".format((time.time() - start_time)))\n", 152 | " return bst" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "bst=xgboost_fun('gpu','gpu_hist')\n", 162 | "#bst=xgboost_fun('cpu','hist')" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## TODO: Single Node with multiple GPUS (Dask + RAPIDS) --- Scales to 4 T4s, 8 V100s, or 16 A100s on GCP\n" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | 
"metadata": {}, 176 | "outputs": [], 177 | "source": [] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "## TODO: Multi-Node with multiple GPUS (Dask + RAPIDS) Scales to 64+ GPUs" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [] 192 | } 193 | ], 194 | "metadata": { 195 | "environment": { 196 | "name": "common-cu110.m59", 197 | "type": "gcloud", 198 | "uri": "gcr.io/deeplearning-platform-release/base-cu110:m59" 199 | }, 200 | "kernelspec": { 201 | "display_name": "Python 3", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.7.8" 216 | } 217 | }, 218 | "nbformat": 4, 219 | "nbformat_minor": 4 220 | } 221 | -------------------------------------------------------------------------------- /mlflow-vertex/mlflow-databricks-vertex-deployment.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","source":["#install gcloud\n%sh\ncurl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-356.0.0-linux-x86_64.tar.gz\ntar -xf google-cloud-sdk-356.0.0-linux-x86_64.tar.gz\nyes | ./google-cloud-sdk/install.sh"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"bc96dfc5-dfda-4b15-9f70-f788b918fa22"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["#add gcloud directory to PATH env var\nimport os\nprint(os.environ['PATH'])\npath='/databricks/driver/google-cloud-sdk/bin'\nos.environ[\"PATH\"] += os.pathsep + os.path.join(path)\nprint(os.environ['PATH'])"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"c349a27e-e5bd-4314-9b40-84090b861015"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["#check if gcloud works with new PATH env variable\n%sh\n/databricks/driver/google-cloud-sdk/bin/gcloud version\ngcloud version"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"ed500289-107a-47b7-8a41-04117f85d1a7"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["import logging\nlogger = spark._jvm.org.apache.log4j\nlogging.getLogger(\"py4j.java_gateway\").setLevel(logging.ERROR)\n\nimport mlflow\nfrom mlflow.deployments import get_deploy_client\nprint(mlflow.__version__)\nclient=mlflow.deployments.get_deploy_client(\"google_cloud\")\n\nimport mlflow\nimport mlflow.sklearn\nimport 
pandas as pd\nimport matplotlib.pyplot as plt\n \nfrom numpy import savetxt\n \nfrom sklearn.model_selection import train_test_split\nfrom sklearn.datasets import load_diabetes\n \nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.metrics import mean_squared_error"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"5619fb3d-69ef-424a-9cd7-7778ea6b26c4"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["db = load_diabetes()\nX = db.data\ny = db.target\nX_train, X_test, y_train, y_test = train_test_split(X, y)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"8681e901-5f78-4648-a3da-869e380253b2"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["# Enable autolog()\n# mlflow.sklearn.autolog() requires mlflow 1.11.0 or above.\nmlflow.sklearn.autolog()\n \n# With autolog() enabled, all model parameters, a model score, and the fitted model are automatically logged. \nwith mlflow.start_run():\n \n # Set the model parameters. \n n_estimators = 100\n max_depth = 6\n max_features = 3\n \n # Create and train model.\n rf = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth, max_features = max_features)\n rf.fit(X_train, y_train)\n \n # Use the model to make predictions on the test dataset.\n predictions = rf.predict(X_test)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"761dee39-b293-42e0-b576-39dbe8052d0e"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["#mchrestkha-sklearnVersion 1\nmodel_name = \"mchrestkha-test-3\"\nmodel_version = 1\nmodel_uri=f\"models:/{model_name}/{model_version}\"\nprint(model_uri)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"eaf6dea4-6991-4881-a365-127bf6d10a7c"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0},{"cell_type":"code","source":["deployment = client.create_deployment(\n name=\"mlflow_on_gcp_test_3\",\n model_uri=model_uri)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"28b56b42-6db4-4105-8b3d-543370ff4ca4"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"
INFO:google_cloud_mlflow._mlflow_model_gcp_deployment_utils:Project not set. Using project-aa-258321 as project\nINFO:google_cloud_mlflow._mlflow_model_gcp_deployment_utils:Destination image URI not set. Building and uploading image to gcr.io/project-aa-258321/mlflow/mlflow_on_gcp\nINFO:google_cloud_mlflow._mlflow_model_gcp_deployment_utils:Building image. This can take up to 20 minutes\n2021/09/10 06:13:41 INFO mlflow.models.cli: Selected backend for flavor 'python_function'\nINFO:google_cloud_mlflow._mlflow_models_docker_utils_patch:Building docker image with name gcr.io/project-aa-258321/mlflow/mlflow_on_gcp\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n
INFO:google_cloud_mlflow._mlflow_model_gcp_deployment_utils:Project not set. Using project-aa-258321 as project\nINFO:google_cloud_mlflow._mlflow_model_gcp_deployment_utils:Destination image URI not set. Building and uploading image to gcr.io/project-aa-258321/mlflow/mlflow_on_gcp\nINFO:google_cloud_mlflow._mlflow_model_gcp_deployment_utils:Building image. This can take up to 20 minutes\n2021/09/10 06:13:41 INFO mlflow.models.cli: Selected backend for flavor 'python_function'\nINFO:google_cloud_mlflow._mlflow_models_docker_utils_patch:Building docker image with name gcr.io/project-aa-258321/mlflow/mlflow_on_gcp\n
"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"mlflow-test","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{},"notebookOrigID":1678309511451670}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /mlflow-vertex/mlflow-oss-vertex-deployment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "8ae80b7c", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# import packages\n", 11 | "import mlflow\n", 12 | "from mlflow.deployments import get_deploy_client\n", 13 | "from sklearn.model_selection import train_test_split\n", 14 | "from sklearn.datasets import load_diabetes \n", 15 | "from sklearn.ensemble import RandomForestRegressor" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "e3e58a35", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Start local MLFlow server on CLI with SQLLite DB for model registry\n", 26 | "# mlflow server \\\n", 27 | "# --backend-store-uri sqlite:///mlflow.db \\\n", 28 | "# --default-artifact-root ./artifacts \\\n", 29 | "# --host 0.0.0.0" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "c2f78b68", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "# set mlflow registry and tracking URIs\n", 40 | "mlflow.set_tracking_uri(\"http://localhost:5000\")\n", 41 | "mlflow.set_registry_uri(\"http://localhost:5000\")\n", 42 | " \n", 43 | "mr_uri = mlflow.get_registry_uri()\n", 44 | "print(\"Current registry uri: {}\".format(mr_uri))\n", 45 | "tracking_uri = mlflow.get_tracking_uri()\n", 46 | "print(\"Current tracking uri: {}\".format(tracking_uri))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "id": "c6005dd9", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# load dataset\n", 57 | "db = load_diabetes()\n", 58 | "X = db.data\n", 59 | "y = db.target\n", 60 | "X_train, X_test, y_train, y_test = train_test_split(X, y)\n", 61 | " \n", 62 | "# Enable autolog()\n", 63 | "# mlflow.sklearn.autolog() requires mlflow 1.11.0 or above.\n", 64 | "mlflow.sklearn.autolog()\n", 65 | " \n", 66 | "# With autolog() enabled, all model parameters, a model score, and the fitted model are automatically logged. \n", 67 | "with mlflow.start_run() as run: \n", 68 | " # Set the model parameters. 
\n", 69 | " n_estimators = 100\n", 70 | " max_depth = 6\n", 71 | " max_features = 3\n", 72 | " # Create and train model.\n", 73 | " rf = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth, max_features = max_features)\n", 74 | " rf.fit(X_train, y_train)\n", 75 | " # Use the model to make predictions on the test dataset.\n", 76 | " predictions = rf.predict(X_test)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "ab2fe324", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# log model\n", 87 | "model_name = \"mchrestkha-sklearn\"\n", 88 | "mlflow.sklearn.log_model(rf, model_name, registered_model_name=model_name)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "efdf6e25", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "client = mlflow.tracking.MlflowClient()\n", 99 | "client.list_registered_models()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "b3f57857", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# deploy model to Vertex AI\n", 110 | "client = mlflow.deployments.get_deploy_client(\"google_cloud\")\n", 111 | " \n", 112 | "model_version = 1\n", 113 | "model_uri=f\"models:/{model_name}/{model_version}\"\n", 114 | "print(model_uri)\n", 115 | " \n", 116 | "deployment = client.create_deployment(\n", 117 | " name=\"mlflow_on_gcp\",\n", 118 | " model_uri=model_uri)" 119 | ] 120 | } 121 | ], 122 | "metadata": { 123 | "environment": { 124 | "name": "common-cpu.m78", 125 | "type": "gcloud", 126 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m78" 127 | }, 128 | "kernelspec": { 129 | "display_name": "Python 3", 130 | "language": "python", 131 | "name": "python3" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.7.10" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 5 148 | } 149 | -------------------------------------------------------------------------------- /tuning_llms/tuning_dialogsum.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "bf8ec3e0-c954-48a8-95a5-0e5c2d320841", 6 | "metadata": {}, 7 | "source": [ 8 | "## Library Imports" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 200, 14 | "id": "b25bef4f-3262-4d94-879e-36a0c50eaf82", 15 | "metadata": { 16 | "tags": [] 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "from datasets import load_dataset\n", 21 | "import random\n", 22 | "import time\n", 23 | "import json\n", 24 | "import utils\n", 25 | "import mercury as mr\n", 26 | "import openai\n", 27 | "import os\n", 28 | "import pandas as pd\n", 29 | "pd.set_option('display.float_format', '{:.10f}'.format)\n", 30 | "from google.cloud.exceptions import NotFound\n", 31 | "import os\n", 32 | "from tqdm import tqdm\n", 33 | "import time\n", 34 | "\n", 35 | "#Vertex AI libraries\n", 36 | "import vertexai\n", 37 | "from vertexai.preview.generative_models import GenerativeModel, GenerationConfig, Part\n", 38 | "from vertexai.preview.tuning import sft\n", 39 | "from vertexai.preview.generative_models import GenerativeModel, GenerationConfig, Part\n", 40 | "from vertexai.evaluation import EvalTask, 
MetricPromptTemplateExamples, PointwiseMetric, PairwiseMetric\n", 41 | "\n", 42 | "#OpenAI library\n", 43 | "from openai import OpenAI\n", 44 | "client = OpenAI()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "9e4899ea-e1c1-4802-84f2-c5143b32f837", 50 | "metadata": {}, 51 | "source": [ 52 | "## Loading Dataset From HuggingFace" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 201, 58 | "id": "d12921cc-be7f-49e2-8383-d5d43dd4dda6", 59 | "metadata": { 60 | "tags": [] 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "dataset = load_dataset(\"knkarthick/dialogsum\")" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "5377f9ce-2948-41e9-9d29-e05cd632e023", 70 | "metadata": {}, 71 | "source": [ 72 | "## Creating different sized tuning Datasets" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 202, 78 | "id": "f608e323-3600-47c9-8db2-2e3f5216ebdb", 79 | "metadata": { 80 | "tags": [] 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "# full size datasets\n", 85 | "train12460=dataset[\"train\"].to_list()\n", 86 | "valid500 =dataset[\"validation\"].to_list()\n", 87 | "test1500 =dataset[\"test\"].to_list()\n", 88 | "\n", 89 | "base_instruction=\"Summarize the following dialogue: \"\n", 90 | "for item in test1500: \n", 91 | " item[\"dialogue\"] = base_instruction + item[\"dialogue\"]\n", 92 | "\n", 93 | "# smaller datasets for rapid testing\n", 94 | "train2000=train12460[:2000]\n", 95 | "test100=test1500[:100]\n", 96 | "test250=test1500[:250]\n", 97 | "test10=test1500[:10]" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "bda92d07-c449-4316-ad64-6c746ba33367", 103 | "metadata": { 104 | "tags": [] 105 | }, 106 | "source": [ 107 | "## Data Formatting for Tuning API" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "f4466d7e-e982-4281-bb75-c9aa9b774091", 114 | "metadata": { 115 | "tags": [] 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "#Prepare data for Gemini 1.5 Tuning\n", 120 | "# Define a base prompt for zero-shot summarization \n", 121 | "base_instruction=\"Summarize the following dialogue: \"\n", 122 | "utils.format_tuning_dataset(train2000, valid500, base_instruction, \"dialogsum_train2000_inst\",\"dialogsum_valid500_inst\")\n", 123 | "utils.format_tuning_dataset(train12460, valid500, base_instruction, \"dialogsum_train12460_inst\",\"dialogsum_valid500_inst\")\n", 124 | "\n", 125 | "base_instruction=\"\"\n", 126 | "utils.format_tuning_dataset(train2000, valid500, base_instruction, \"dialogsum_train2000_no_inst\",\"dialogsum_valid500_no_inst\")\n", 127 | "utils.format_tuning_dataset(train12460, valid500, base_instruction, \"dialogsum_train12460_no_inst\",\"dialogsum_valid500_no_inst\")" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "837758af-e2c5-47b6-b105-9453beeacd2b", 134 | "metadata": { 135 | "tags": [] 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "utils.delete_and_upload(\"dialogsum_train12460_inst.jsonl\")\n", 140 | "utils.delete_and_upload(\"dialogsum_train2000_inst.jsonl\")\n", 141 | "utils.delete_and_upload(\"dialogsum_valid500_inst.jsonl\")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "6871cb58-bf7f-49c2-8612-f48c9a0d8d15", 147 | "metadata": {}, 148 | "source": [ 149 | "## Submit Tuning Job" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "debeed0e-a4f6-4956-94cf-148103819d2c", 156 | "metadata": { 157 | "tags": [] 
158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "model=\"gemini-1.5-flash-001\"\n", 162 | "utils.tune_gemini(\"gs://mchrestkha-sample-data/dialogsum/dialogsum_train2000_inst.jsonl\", \"gs://mchrestkha-sample-data/dialogsum/dialogsum_valid500_inst.jsonl\", model, \"dialogsum_2000_inst\")\n", 163 | "utils.tune_gemini(\"gs://mchrestkha-sample-data/dialogsum/dialogsum_train12460_inst.jsonl\", \"gs://mchrestkha-sample-data/dialogsum/dialogsum_valid500_inst.jsonl\", model, \"dialogsum_124600_inst\")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "id": "2de09063-2454-41d0-9963-3fac8b79966f", 169 | "metadata": { 170 | "tags": [] 171 | }, 172 | "source": [ 173 | "## OpenAI Tuning" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "id": "708ebb59-e764-453f-bd76-59a2083ff81e", 180 | "metadata": { 181 | "tags": [] 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "#Prepare data for OpenAI Tuning\n", 186 | "# Define a base prompt for zero-shot summarization \n", 187 | "system_prompt=\"Summarize the following dialogue: \"\n", 188 | "\n", 189 | "# Initialize lists to store messages for training and validation\n", 190 | "train_messages = []\n", 191 | "validation_messages = []\n", 192 | "train = train2000\n", 193 | "valid = valid500\n", 194 | "\n", 195 | "# Iterate over training data and create messages for each dialogue-summary pair\n", 196 | "for d in train:\n", 197 | " prompts = []\n", 198 | " prompts.append({\"role\": \"system\", \"content\": system_prompt})\n", 199 | " prompts.append({\"role\": \"user\", \"content\": d[\"dialogue\"]})\n", 200 | " prompts.append({\"role\": \"assistant\", \"content\": d[\"summary\"]})\n", 201 | " train_messages.append({'messages': prompts})\n", 202 | "\n", 203 | "# Iterate over validation data and create messages similarly\n", 204 | "for d in valid:\n", 205 | " prompts = []\n", 206 | " prompts.append({\"role\": \"system\", \"content\": system_prompt})\n", 207 | " prompts.append({\"role\": \"user\", \"content\": d[\"dialogue\"]})\n", 208 | " prompts.append({\"role\": \"assistant\", \"content\": d[\"summary\"]})\n", 209 | " validation_messages.append({'messages': prompts})\n", 210 | "\n", 211 | " # Print lengths of message lists and an example training message\n", 212 | "len(train_messages), len(validation_messages), train_messages[2]" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "id": "bff2af86-efab-4389-ae6b-e9a31e21e5f8", 219 | "metadata": { 220 | "tags": [] 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "# Save to JSON locally\n", 225 | "utils.dicts_to_jsonl(train_messages, \"openai_dialogsum_train2000\", False)\n", 226 | "utils.dicts_to_jsonl(validation_messages, \"openai_dialogsum_valid500\", False)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "id": "7ac72e15-9afe-48ce-847e-255d26dfea90", 233 | "metadata": { 234 | "tags": [] 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "# Register & Uplaod Files to OpenAI Storage\n", 239 | "client.files.create(\n", 240 | " file=open(\"openai_dialogsum_train2000.jsonl\", \"rb\"),\n", 241 | " purpose=\"fine-tune\"\n", 242 | ")\n", 243 | "\n", 244 | "client.files.create(\n", 245 | " file=open(\"openai_dialogsum_valid500.jsonl\", \"rb\"),\n", 246 | " purpose=\"fine-tune\"\n", 247 | ")" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "id": "a90c953a-df68-402a-8ecd-89da6d2d885c", 254 | "metadata": { 255 | "tags": [] 256 | }, 257 | 
"outputs": [], 258 | "source": [ 259 | "#Submit Tuning Job\n", 260 | "client.fine_tuning.jobs.create(\n", 261 | " training_file=\"file-KxJuvj5sQ3kLQoI7f6X8S9PE\", \n", 262 | " validation_file=\"file-QGvOkG9PtiZJ7y0L1JmNbzDE\",\n", 263 | " model=\"gpt-4o-mini-2024-07-18\"\n", 264 | ")" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "id": "9935f0a5-b915-4a9f-b2ec-da49e4c986f3", 270 | "metadata": {}, 271 | "source": [ 272 | "## Running Predictions on Test Data\n", 273 | "### For X test examples takes Y min to generate predictions" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 203, 279 | "id": "2d98e745-253a-44b5-aef7-c8d763ee7f03", 280 | "metadata": { 281 | "tags": [] 282 | }, 283 | "outputs": [ 284 | { 285 | "data": { 286 | "text/html": [ 287 | "\n", 288 | " \n", 289 | " \n", 290 | " \n", 337 | " \n", 338 | " \n", 339 | " tune\n", 340 | " View Tuning Job\n", 341 | " \n", 342 | " \n", 343 | " \n", 357 | " " 358 | ], 359 | "text/plain": [ 360 | "" 361 | ] 362 | }, 363 | "metadata": {}, 364 | "output_type": "display_data" 365 | }, 366 | { 367 | "name": "stderr", 368 | "output_type": "stream", 369 | "text": [ 370 | "Processing: 100%|██████████| 250/250 [13:44<00:00, 3.30s/row]\n" 371 | ] 372 | } 373 | ], 374 | "source": [ 375 | "gemini_text = []\n", 376 | "openai_text = []\n", 377 | "gemini_tuned_text = []\n", 378 | "openai_tuned_text = []\n", 379 | "\n", 380 | "tuning_job = sft.SupervisedTuningJob(\"projects/642508009780/locations/us-central1/tuningJobs/2137747369456828416\")\n", 381 | "tuned_model = GenerativeModel(tuning_job.tuned_model_endpoint_name)\n", 382 | "model = GenerativeModel(\"gemini-1.5-flash-001\")\n", 383 | "client = OpenAI()\n", 384 | "#test=test10\n", 385 | "#test=test1500\n", 386 | "test=test250\n", 387 | "\n", 388 | "for row in tqdm(test, desc=\"Processing\", unit=\"row\"):\n", 389 | " try:\n", 390 | " gemini_response = model.generate_content(contents=row[\"dialogue\"])\n", 391 | " gemini_text.append(gemini_response.text)\n", 392 | " except (ValueError, AttributeError): # Catch broader potential errors\n", 393 | " gemini_text.append(\"Blocked\")\n", 394 | " \n", 395 | " try:\n", 396 | " gemini_tuned_response = tuned_model.generate_content(contents=row[\"dialogue\"])\n", 397 | " gemini_tuned_text.append(gemini_tuned_response.text)\n", 398 | " except (ValueError, AttributeError): # Catch broader potential errors\n", 399 | " gemini_tuned_text.append(\"Blocked\")\n", 400 | "\n", 401 | " try:\n", 402 | " openai_response = client.chat.completions.create(\n", 403 | " model=\"gpt-4o-mini-2024-07-18\",\n", 404 | " messages=[{\"role\": \"user\", \"content\": row[\"dialogue\"]}]\n", 405 | " )\n", 406 | " openai_text.append(openai_response.choices[0].message.content)\n", 407 | " except (ValueError, AttributeError): \n", 408 | " openai_text.append(\"Blocked\")\n", 409 | "\n", 410 | " try:\n", 411 | " openai_tuned_response = client.chat.completions.create(\n", 412 | " model=\"ft:gpt-4o-mini-2024-07-18:personal::A3WwHRrJ\",\n", 413 | " messages=[{\"role\": \"user\", \"content\": row[\"dialogue\"]}]\n", 414 | " )\n", 415 | " openai_tuned_text.append(openai_tuned_response.choices[0].message.content)\n", 416 | " except (ValueError, AttributeError): \n", 417 | " openai_tuned_text.append(\"Blocked\")\n", 418 | "\n", 419 | "# Directly create the final DataFrame with responses included\n", 420 | "df_final = pd.DataFrame(test)\n", 421 | "df_final[\"gemini_response\"] = gemini_text\n", 422 | "df_final[\"openai_response\"] = openai_text\n", 423 | 
"df_final[\"gemini_tuned_response\"] = gemini_tuned_text\n", 424 | "df_final[\"openai_tuned_response\"] = openai_tuned_text" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 207, 430 | "id": "303a0d29-1348-4f39-9a9e-6db0da569f5d", 431 | "metadata": { 432 | "tags": [] 433 | }, 434 | "outputs": [], 435 | "source": [ 436 | "df_final[\"summary_response\"]=df_final[\"summary\"]\n", 437 | "df_test_predictions_final=df_final\n", 438 | "df_test_predictions_final.to_csv('df_test_predictions_final.csv', index=False) " 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "id": "5c15c342-214f-4c05-a19d-61af00f31089", 444 | "metadata": {}, 445 | "source": [ 446 | "## Running Computation & Model Pointwise Evals" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 210, 452 | "id": "e13fe0c0-faff-4734-8939-2b2b004e7f95", 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "#Define a pointwise custom summarization quality metric \n", 457 | "pointwise_custom_summary_metric_prompt = \"\"\"\n", 458 | "# Instruction\n", 459 | "You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models.\n", 460 | "We will provide you with the user input and an AI-generated response.\n", 461 | "You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.\n", 462 | "You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step-by-step explanations for your rating, and only choose ratings from the Rating Rubric. **Explicitly include the word count of the response as the first step in your explanation**, and ensure it aligns with the criteria.\n", 463 | "\n", 464 | "# Evaluation\n", 465 | "## Metric Definition\n", 466 | "You will be assessing summarization quality, which measures the overall ability to summarize text. The context to be summarized are provided in the user prompt. The response should be shorter than the text in the context. The response should not contain information that is not present in the context.\n", 467 | "\n", 468 | "## Criteria\n", 469 | "Less than 50 words: The response contains less than 50 words. Use the following formula to count the words in the response: `=COUNTA(SPLIT(response, \" \"))`\n", 470 | "Groundedness: The response contains information included only in the context. The response does not reference any outside information.\n", 471 | "Observer Perspective: The response is written from an observer perspective.\n", 472 | "\n", 473 | "## Rating Rubric\n", 474 | "5: (Very good). The summary is less than 50 words, is grounded and is written as an observer.\n", 475 | "4: (Good). The summary is less than 50 words and is grounded. \n", 476 | "3: (Ok). The summary is more than 50 words but mostly grounded\n", 477 | "2: (Bad). The summary is more than 50 words and not grounded.\n", 478 | "1: (Very bad). The summary is more than 50 words and not grounded.\n", 479 | "\n", 480 | "## Evaluation Steps\n", 481 | "STEP 1: Assess the response in aspects of word count, groundedness, and observer perspective according to the criteria.  
**Use the provided formula to determine the EXACT word count**\n", 482 | "STEP 2: Score based on the rubric.\n", 483 | "\n", 484 | "# User Inputs and AI-generated Response\n", 485 | "## User Inputs\n", 486 | "\n", 487 | "### Prompt\n", 488 | "{prompt}\n", 489 | "\n", 490 | "## AI-generated Response\n", 491 | "{response}\n", 492 | "\n", 493 | "\"\"\"\n", 494 | "\n", 495 | "pointwise_custom_summary_metric = PointwiseMetric(\n", 496 | " metric=\"custom_point_summary_metric\",\n", 497 | " metric_prompt_template=pointwise_custom_summary_metric_prompt,\n", 498 | ")" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "id": "11c2afb2-b127-4cb7-86f6-b90d402bb5cd", 505 | "metadata": { 506 | "tags": [] 507 | }, 508 | "outputs": [ 509 | { 510 | "name": "stdout", 511 | "output_type": "stream", 512 | "text": [ 513 | "Computing metrics with a total of 750 Vertex online evaluation service requests.\n" 514 | ] 515 | }, 516 | { 517 | "name": "stderr", 518 | "output_type": "stream", 519 | "text": [ 520 | "100%|██████████| 750/750 [50:03<00:00, 4.00s/it]\n" 521 | ] 522 | }, 523 | { 524 | "name": "stdout", 525 | "output_type": "stream", 526 | "text": [ 527 | "All 750 metric requests are successfully computed.\n", 528 | "Evaluation Took:3003.383097610007 seconds\n", 529 | "Computing metrics with a total of 750 Vertex online evaluation service requests.\n" 530 | ] 531 | }, 532 | { 533 | "name": "stderr", 534 | "output_type": "stream", 535 | "text": [ 536 | "100%|██████████| 750/750 [50:00<00:00, 4.00s/it]\n" 537 | ] 538 | }, 539 | { 540 | "name": "stdout", 541 | "output_type": "stream", 542 | "text": [ 543 | "All 750 metric requests are successfully computed.\n", 544 | "Evaluation Took:3000.8083779989975 seconds\n", 545 | "Computing metrics with a total of 750 Vertex online evaluation service requests.\n" 546 | ] 547 | }, 548 | { 549 | "name": "stderr", 550 | "output_type": "stream", 551 | "text": [ 552 | "100%|██████████| 750/750 [50:01<00:00, 4.00s/it] \n" 553 | ] 554 | }, 555 | { 556 | "name": "stdout", 557 | "output_type": "stream", 558 | "text": [ 559 | "All 750 metric requests are successfully computed.\n", 560 | "Evaluation Took:3001.101887780009 seconds\n", 561 | "Computing metrics with a total of 750 Vertex online evaluation service requests.\n" 562 | ] 563 | }, 564 | { 565 | "name": "stderr", 566 | "output_type": "stream", 567 | "text": [ 568 | " 64%|██████▍ | 483/750 [32:12<19:18, 4.34s/it] " 569 | ] 570 | } 571 | ], 572 | "source": [ 573 | "def run_eval(dataset, col_prompt,col_response,col_reference):\n", 574 | " eval_dataset_comp=dataset[[col_prompt,col_response,col_reference]]\n", 575 | " #print(eval_dataset_comp)\n", 576 | " eval_dataset_comp = eval_dataset_comp.rename(columns={col_prompt: 'prompt', col_response: 'response', col_reference: 'reference'})\n", 577 | " #print(eval_dataset_comp)\n", 578 | " eval_task = EvalTask(\n", 579 | " dataset=eval_dataset_comp, \n", 580 | " metrics=[\"rouge_l_sum\",MetricPromptTemplateExamples.Pointwise.SUMMARIZATION_QUALITY, pointwise_custom_summary_metric],\n", 581 | " )\n", 582 | " eval_result = eval_task.evaluate().summary_metrics\n", 583 | " eval_result_df = pd.DataFrame(eval_result, index=[col_response]).rename_axis('model').reset_index()\n", 584 | " return eval_result_df\n", 585 | "\n", 586 | "# Evaluate different models\n", 587 | "results = [\n", 588 | "run_eval(df_final, \"dialogue\", \"gemini_response\", \"summary\"),\n", 589 | "run_eval(df_final, \"dialogue\", \"gemini_tuned_response\", \"summary\"),\n", 590 | 
"run_eval(df_final, \"dialogue\", \"summary_response\", \"summary\"),\n", 591 | "run_eval(df_final, \"dialogue\", \"openai_response\", \"summary\"),\n", 592 | "run_eval(df_final, \"dialogue\", \"openai_tuned_response\", \"summary\"),\n", 593 | "]\n", 594 | "\n", 595 | "# Combine results\n", 596 | "combined_comp_point_eval_result = pd.concat(results, ignore_index=True)" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "id": "08f5978f-5bc6-4146-871a-f4805211afcb", 603 | "metadata": { 604 | "tags": [] 605 | }, 606 | "outputs": [], 607 | "source": [ 608 | "combined_comp_point_eval_result\n", 609 | "#combined_comp_point_eval_result.to_csv('combined_comp_point_eval_result.csv', index=False) " 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "id": "0a1a3ab3-d9e5-4ca3-b9b5-90c394343282", 615 | "metadata": { 616 | "tags": [] 617 | }, 618 | "source": [ 619 | "## Running Pairwise (AutoSxS) Model Evals" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "id": "f31a1ac5-5f71-4a38-8eb9-e3cbfb02055f", 626 | "metadata": { 627 | "tags": [] 628 | }, 629 | "outputs": [], 630 | "source": [ 631 | "#Define a pointwise custom summarization quality metric \n", 632 | "pairwise_custom_summary_metric_prompt = \"\"\"\n", 633 | "# Instruction\n", 634 | "You are an expert evaluator. Your task is to evaluate the quality of the responses generated by two AI models. We will provide you with the user input and a pair of AI-generated responses (Response A and Response B).\n", 635 | "You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.\n", 636 | "You will first judge responses individually, following the Rating Rubric and Evaluation Steps.\n", 637 | "Then you will give step-by-step explanations for your judgement, compare results to declare the winner based on the Rating Rubric and Evaluation Steps.\n", 638 | "# Evaluation\n", 639 | "## Metric Definition\n", 640 | "You will be assessing summarization quality, which measures the overall ability to summarize text. The context to be summarized are provided in the user prompt. The response should be shorter than the text in the context. The response should not contain information that is not present in the context.\n", 641 | "\n", 642 | "## Criteria\n", 643 | "Less than 50 words: The response contains less than 50 words. Use the following formula to count the words in the response: `=COUNTA(SPLIT(response, \" \"))`\n", 644 | "Groundedness: The response contains information included only in the context. 
The response does not reference any outside information.\n", 645 | "Observer Perspective: The response is written from an observer perspective.\n", 646 | "\n", 647 | "## Rating Rubric\n", 648 | "\"A\": Response A summarizes the given context as per the criteria better than response B.\n", 649 | "\"SAME\": Response A and B summarizes the given context equally well as per the criteria.\n", 650 | "\"B\": Response B summarizes the given context as per the criteria better than response A.\n", 651 | "\n", 652 | "## Evaluation Steps\n", 653 | "STEP 1: Analyze Response A based on the summarization quality criteria: Determine how well Response A fulfills the user requirements, is less than 50 words, is grounded and is written as an observer, and provide assessment according to the criterion.\n", 654 | "STEP 2: Analyze Response B based on the summarization quality criteria: Determine how well Response A fulfills the user requirements, is less than 50 words, is grounded and is written as an observer, and provide assessment according to the criterion.\n", 655 | "STEP 3: Compare the overall performance of Response A and Response B based on your analyses and assessment.\n", 656 | "STEP 4: Output your preference of \"A\", \"SAME\" or \"B\" to the pairwise_choice field according to the Rating Rubric.\n", 657 | "STEP 5: Output your assessment reasoning in the explanation field.\n", 658 | "\n", 659 | "\n", 660 | "# User Inputs and AI-generated Responses\n", 661 | "## User Inputs\n", 662 | "\n", 663 | "### Prompt\n", 664 | "{prompt}\n", 665 | "\n", 666 | "## AI-generated Responses\n", 667 | "### Response A\n", 668 | "{baseline_model_response}\n", 669 | "\n", 670 | "### Response B\n", 671 | "{response}\n", 672 | "\n", 673 | "\"\"\"\n", 674 | "\n", 675 | "pairwise_custom_summary_metric = PairwiseMetric(\n", 676 | " metric=\"custom_pairwise_summary_metric\",\n", 677 | " metric_prompt_template=pairwise_custom_summary_metric_prompt,\n", 678 | ")" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": null, 684 | "id": "ed176375-c29e-4fff-9136-635036f3c868", 685 | "metadata": { 686 | "tags": [] 687 | }, 688 | "outputs": [], 689 | "source": [ 690 | "eval_dataset_pair = df_final[['dialogue', 'gemini_tuned_response', 'openai_tuned_response']].rename(columns={\n", 691 | " 'dialogue': 'prompt', \n", 692 | " 'gemini_tuned_response': 'response', \n", 693 | " 'openai_tuned_response': 'baseline_model_response'\n", 694 | "})" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "id": "ccd0678e-ce6e-4246-9809-56400edb9124", 701 | "metadata": { 702 | "tags": [] 703 | }, 704 | "outputs": [], 705 | "source": [ 706 | "eval_task = EvalTask(\n", 707 | " dataset=eval_dataset_pair, \n", 708 | " metrics=[MetricPromptTemplateExamples.Pairwise.SUMMARIZATION_QUALITY, pairwise_custom_summary_metric],\n", 709 | " )\n", 710 | "eval_result = eval_task.evaluate()" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": null, 716 | "id": "ce80cee6-7548-47de-af2f-08d384a89b10", 717 | "metadata": { 718 | "tags": [] 719 | }, 720 | "outputs": [], 721 | "source": [ 722 | "combined_pair_eval_result=eval_result.summary_metrics\n", 723 | "#combined_pair_eval_result.to_csv('combined_pair_eval_result.csv', index=False) " 724 | ] 725 | } 726 | ], 727 | "metadata": { 728 | "environment": { 729 | "kernel": "conda-root-py", 730 | "name": "workbench-notebooks.m113", 731 | "type": "gcloud", 732 | "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m113" 733 | }, 734 | 
"kernelspec": { 735 | "display_name": "Python 3 (ipykernel) (Local)", 736 | "language": "python", 737 | "name": "conda-root-py" 738 | }, 739 | "language_info": { 740 | "codemirror_mode": { 741 | "name": "ipython", 742 | "version": 3 743 | }, 744 | "file_extension": ".py", 745 | "mimetype": "text/x-python", 746 | "name": "python", 747 | "nbconvert_exporter": "python", 748 | "pygments_lexer": "ipython3", 749 | "version": "3.10.13" 750 | } 751 | }, 752 | "nbformat": 4, 753 | "nbformat_minor": 5 754 | } 755 | -------------------------------------------------------------------------------- /tuning_llms/tuning_legalbench.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "c7087b40-d110-48d3-8e8e-852109bab99b", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "#!pip install datasets vertexai mercury" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "e4e282f3-c9ac-4ca0-9039-9b20bed1134d", 19 | "metadata": { 20 | "tags": [] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "#! gcloud auth list" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "9f441437-a6da-4f55-afca-9a57923f10f4", 31 | "metadata": { 32 | "tags": [] 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "from datasets import load_dataset\n", 37 | "import random\n", 38 | "import time\n", 39 | "import vertexai\n", 40 | "from vertexai.preview.tuning import sft\n", 41 | "import json\n", 42 | "import utils\n", 43 | "import mercury as mr" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "7c70aae2-6b21-4dbd-addf-75f09bf9b702", 50 | "metadata": { 51 | "tags": [] 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "# Load the data\n", 56 | "dataset = load_dataset(\"nguha/legalbench\", \"contract_nli_explicit_identification\")\n", 57 | "\n", 58 | "# Merge and shuffle\n", 59 | "data = dataset[\"train\"].to_list() + dataset[\"test\"].to_list() # Convert to lists before concatenating\n", 60 | "random.shuffle(data)\n", 61 | "\n", 62 | "# Add new index\n", 63 | "for idx, d in enumerate(data):\n", 64 | " d[\"new_index\"] = idx" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "7ef891b9-ba74-4fe1-97e8-c35bf13acd48", 71 | "metadata": { 72 | "tags": [] 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "len(data)\n", 77 | "mr.JSON(data)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "613f6fae-bd36-4311-9680-32a7c279eb4b", 84 | "metadata": { 85 | "tags": [] 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "base_prompt_zero_shot = \"Identify if the clause provides that all Confidential Information shall be expressly identified by the Disclosing Party. 
Answer with only `Yes` or `No`\"" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "c8fef0c7-b094-4da7-8b80-3b0ff8ef4ea8", 96 | "metadata": { 97 | "tags": [] 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "n_train = 30\n", 102 | "n_test = len(data) - n_train" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "97da2b67-fc37-44cf-ba9a-f352eab8f4cd", 109 | "metadata": { 110 | "tags": [] 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "train_messages = []\n", 115 | "test_messages = []\n", 116 | "\n", 117 | "for d in data:\n", 118 | " prompts=[]\n", 119 | " prompts = [{\"role\": \"system\", \"parts\": [{\"text\": base_prompt_zero_shot}]}]\n", 120 | " prompts.append({\"role\": \"user\", \"parts\": [{\"text\": d[\"text\"]}]})\n", 121 | " prompts.append({\"role\": \"model\", \"parts\": [{\"text\": d[\"answer\"]}]}) \n", 122 | "\n", 123 | " if int(d[\"new_index\"]) < n_train:\n", 124 | " #train_messages.append({'messages': prompts})\n", 125 | " train_messages.append({'contents': prompts})\n", 126 | "\n", 127 | " else:\n", 128 | " #test_messages.append({'messages': prompts})\n", 129 | " test_messages.append({'contents': prompts})\n", 130 | "\n", 131 | "len(train_messages), len(test_messages), n_test, train_messages[5]" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "id": "3b765a73-a769-413f-b9b1-81a7f040afbb", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "for d in data:\n", 142 | " tuningdataset=[]\n", 143 | " tuningdataset = [{\"role\": \"system\", \"parts\": [{\"text\": system_instructions}]}]\n", 144 | " tuningdataset.append({\"role\": \"user\", \"parts\": [{\"text\": d[\"text\"]}]})\n", 145 | " tuningdataset.append({\"role\": \"model\", \"parts\": [{\"text\": d[\"answer\"]}]}) \n", 146 | " tuningdataset.append({'contents': prompts})" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "231130e7-a60c-44f3-a93b-45dd8d986611", 153 | "metadata": { 154 | "tags": [] 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "utils.dicts_to_jsonl(train_messages, \"train_contents\", False)\n", 159 | "utils.dicts_to_jsonl(test_messages, \"test_contents\", False)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "id": "0e280382-bda1-4400-965e-b675e9c970ac", 166 | "metadata": { 167 | "tags": [] 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "#upload_blob(bucket_name, source_file_name, destination_blob_name)\n", 172 | "#delete_blob(bucket_name, blob_name):\n", 173 | "utils.delete_blob(\"mchrestkha-sample-data\",\"legalbench/contract_nli_explicit_identification/train_contents.jsonl\")\n", 174 | "utils.delete_blob(\"mchrestkha-sample-data\",\"legalbench/contract_nli_explicit_identification/test_contents.jsonl\")\n", 175 | "utils.upload_blob(\"mchrestkha-sample-data\",\"train_contents.jsonl\",\"legalbench/contract_nli_explicit_identification/train_contents.jsonl\")\n", 176 | "utils.upload_blob(\"mchrestkha-sample-data\",\"test_contents.jsonl\",\"legalbench/contract_nli_explicit_identification/test_contents.jsonl\")" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "id": "7e9e0977-ca96-4a8b-a686-4a00e6527fe0", 183 | "metadata": { 184 | "tags": [] 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "vertexai.init(project=\"mchrestkha-sandbox\", location=\"us-central1\")\n", 189 | "\n", 190 | "sft_tuning_job = sft.train(\n", 191 | 
" source_model=\"gemini-1.5-pro-001\",\n", 192 | " train_dataset=\"gs://mchrestkha-sample-data/legalbench/contract_nli_explicit_identification/train_contents.jsonl\",\n", 193 | " #train_dataset=\"gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl\",\n", 194 | " # The following parameters are optional\n", 195 | " validation_dataset=\"gs://mchrestkha-sample-data/legalbench/contract_nli_explicit_identification/test_contents.jsonl\",\n", 196 | " epochs=5,\n", 197 | " adapter_size=4,\n", 198 | " learning_rate_multiplier=1.0,\n", 199 | " tuned_model_display_name=\"1.5_flash_testing\",\n", 200 | ")\n", 201 | "\n", 202 | "# Polling for job completion\n", 203 | "while not sft_tuning_job.has_ended:\n", 204 | " time.sleep(60)\n", 205 | " sft_tuning_job.refresh()\n", 206 | "\n", 207 | "print(sft_tuning_job.tuned_model_name)\n", 208 | "print(sft_tuning_job.tuned_model_endpoint_name)\n", 209 | "print(sft_tuning_job.experiment)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "id": "57740801-8130-4332-bf6f-ef586716d708", 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "id": "070036f6-3351-4c29-9fec-2d830ea2283f", 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "id": "ed88e8e3-7887-4a0a-86ed-5f0a6083d4fb", 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix" 236 | ] 237 | } 238 | ], 239 | "metadata": { 240 | "environment": { 241 | "kernel": "conda-root-py", 242 | "name": "workbench-notebooks.m113", 243 | "type": "gcloud", 244 | "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m113" 245 | }, 246 | "kernelspec": { 247 | "display_name": "Python 3 (ipykernel) (Local)", 248 | "language": "python", 249 | "name": "conda-root-py" 250 | }, 251 | "language_info": { 252 | "codemirror_mode": { 253 | "name": "ipython", 254 | "version": 3 255 | }, 256 | "file_extension": ".py", 257 | "mimetype": "text/x-python", 258 | "name": "python", 259 | "nbconvert_exporter": "python", 260 | "pygments_lexer": "ipython3", 261 | "version": "3.10.13" 262 | } 263 | }, 264 | "nbformat": 4, 265 | "nbformat_minor": 5 266 | } 267 | -------------------------------------------------------------------------------- /tuning_llms/utils.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | from google.cloud import storage 4 | from datasets import load_dataset 5 | import random 6 | import time 7 | import vertexai 8 | from vertexai.preview.tuning import sft 9 | from vertexai.evaluation import EvalTask, MetricPromptTemplateExamples 10 | import json 11 | import utils 12 | import mercury as mr 13 | import openai 14 | import os 15 | import pandas as pd 16 | from vertexai.preview.generative_models import GenerativeModel, GenerationConfig, Part 17 | from google.cloud.exceptions import NotFound 18 | 19 | def format_tuning_dataset(train_list, valid_list, base_instruction, train_filename, valid_filename): 20 | 21 | # Initialize lists to store messages for training and validation 22 | train_messages = [] 23 | validation_messages = [] 24 | 25 | # Iterate over training data and create messages for each dialogue-summary pair 26 | for d in train_list: 27 | prompts=[] 28 | prompts.append({"role": 
"user", "parts": [{"text": base_instruction + d["dialogue"]}]}) 29 | prompts.append({"role": "model", "parts": [{"text": d["summary"]}]}) 30 | train_messages.append({'contents': prompts}) 31 | 32 | # Iterate over validation data and create messages similarly 33 | for d in valid_list: 34 | prompts=[] 35 | prompts.append({"role": "user", "parts": [{"text": base_instruction + d["dialogue"]}]}) 36 | prompts.append({"role": "model", "parts": [{"text": d["summary"]}]}) 37 | validation_messages.append({'contents': prompts}) 38 | 39 | # Save to JSON locally 40 | dicts_to_jsonl(train_messages, train_filename, False) 41 | dicts_to_jsonl(validation_messages, valid_filename, False) 42 | 43 | # Print lengths of message lists and an example training message 44 | len(train_messages), len(validation_messages), train_messages[3] 45 | 46 | # Delete & Overwrite files to upload to GCS 47 | def delete_and_upload(filename): 48 | try: 49 | delete_blob("mchrestkha-sample-data",f"dialogsum/{filename}") 50 | except NotFound: 51 | pass 52 | upload_blob("mchrestkha-sample-data",filename,f"dialogsum/{filename}") 53 | 54 | 55 | #Submit Tuning Job 56 | def tune_gemini(train_file, valid_file, model, model_name): 57 | timestr = time.strftime("%Y%m%d-%H%M%S") 58 | model_name=model_name+timestr 59 | vertexai.init(project="mchrestkha-sandbox", location="us-central1") 60 | 61 | sft_tuning_job = sft.train( 62 | source_model=model, 63 | train_dataset=train_file, 64 | # The following parameters are optional 65 | validation_dataset=valid_file, 66 | epochs=5, 67 | adapter_size=4, 68 | learning_rate_multiplier=1.0, 69 | tuned_model_display_name=model_name, 70 | ) 71 | 72 | 73 | def upload_blob(bucket_name, source_file_name, destination_blob_name): 74 | """Uploads a file to the bucket.""" 75 | # The ID of your GCS bucket 76 | # bucket_name = "your-bucket-name" 77 | # The path to your file to upload 78 | # source_file_name = "local/path/to/file" 79 | # The ID of your GCS object 80 | # destination_blob_name = "storage-object-name" 81 | 82 | storage_client = storage.Client() 83 | bucket = storage_client.bucket(bucket_name) 84 | blob = bucket.blob(destination_blob_name) 85 | 86 | # Optional: set a generation-match precondition to avoid potential race conditions 87 | # and data corruptions. The request to upload is aborted if the object's 88 | # generation number does not match your precondition. For a destination 89 | # object that does not yet exist, set the if_generation_match precondition to 0. 90 | # If the destination object already exists in your bucket, set instead a 91 | # generation-match precondition using its generation number. 92 | generation_match_precondition = 0 93 | 94 | blob.upload_from_filename(source_file_name, if_generation_match=generation_match_precondition) 95 | 96 | print( 97 | f"File {source_file_name} uploaded to {destination_blob_name}." 98 | ) 99 | 100 | def delete_blob(bucket_name, blob_name): 101 | """Deletes a blob from the bucket.""" 102 | # bucket_name = "your-bucket-name" 103 | # blob_name = "your-object-name" 104 | 105 | storage_client = storage.Client() 106 | 107 | bucket = storage_client.bucket(bucket_name) 108 | blob = bucket.blob(blob_name) 109 | generation_match_precondition = None 110 | 111 | # Optional: set a generation-match precondition to avoid potential race conditions 112 | # and data corruptions. The request to delete is aborted if the object's 113 | # generation number does not match your precondition. 114 | blob.reload() # Fetch blob metadata to use in generation_match_precondition. 
110 |     generation_match_precondition = blob.generation
111 | 
112 |     blob.delete(if_generation_match=generation_match_precondition)
113 | 
114 |     print(f"Blob {blob_name} deleted.")
115 | 
116 | 
117 | def dicts_to_jsonl(data_list: list, filename: str, compress: bool = True) -> None:
118 |     """
119 |     Saves a list of dicts into a JSONL file.
120 | 
121 |     :param data_list: (list) list of dicts to be stored,
122 |     :param filename: (str) path to the output file. If the .jsonl suffix is not given, the
123 |         method appends it to the filename.
124 |     :param compress: (bool) should the file be compressed into a gzip archive?
125 |     """
126 | 
127 |     sjsonl = '.jsonl'
128 |     sgz = '.gz'
129 | 
130 |     # Check filename
131 |     if not filename.endswith(sjsonl):
132 |         filename = filename + sjsonl
133 | 
134 |     # Save data
135 |     if compress:
136 |         filename = filename + sgz
137 |         with gzip.open(filename, 'w') as compressed:
138 |             for ddict in data_list:
139 |                 jout = json.dumps(ddict) + '\n'
140 |                 jout = jout.encode('utf-8')
141 |                 compressed.write(jout)
142 |     else:
143 |         with open(filename, 'w') as out:
144 |             for ddict in data_list:
145 |                 jout = json.dumps(ddict) + '\n'
146 |                 out.write(jout)
--------------------------------------------------------------------------------
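Usage note: the helpers in `utils.py` are designed to be chained together from the tuning notebooks: format raw records into Gemini-style JSONL, replace any stale copies in GCS, then submit and poll a supervised tuning job. The sketch below is a minimal, hypothetical illustration of that flow, not part of the repository — the toy `train_list`/`valid_list` records and the `base_instruction` string are stand-ins, it assumes access to the `mchrestkha-sample-data` bucket and `mchrestkha-sandbox` project hard-coded in `utils.py`, and it relies on `tune_gemini` returning the job handle:

```python
import time

import utils

# Assumed record shape: dicts with "dialogue" and "summary" keys (e.g. DialogSum rows).
# At least four training records are needed because format_tuning_dataset prints train_messages[3].
train_list = [{"dialogue": "A: Hi. B: Hello.", "summary": "A greets B."}] * 4
valid_list = [{"dialogue": "A: Bye. B: See you.", "summary": "They say goodbye."}] * 4

# 1. Write Gemini-format JSONL files locally (dicts_to_jsonl appends the .jsonl suffix).
utils.format_tuning_dataset(
    train_list,
    valid_list,
    base_instruction="Summarize the following conversation:\n\n",
    train_filename="train_contents",
    valid_filename="valid_contents",
)

# 2. Replace any existing copies under dialogsum/ in the hard-coded GCS bucket.
utils.delete_and_upload("train_contents.jsonl")
utils.delete_and_upload("valid_contents.jsonl")

# 3. Submit the tuning job and poll it, mirroring the loop in tuning_legalbench.ipynb.
job = utils.tune_gemini(
    train_file="gs://mchrestkha-sample-data/dialogsum/train_contents.jsonl",
    valid_file="gs://mchrestkha-sample-data/dialogsum/valid_contents.jsonl",
    model="gemini-1.5-pro-001",
    model_name="dialogsum_tuning_",
)
while not job.has_ended:
    time.sleep(60)
    job.refresh()
print(job.tuned_model_endpoint_name)
```

Real jobs run far longer than one 60-second poll interval, and `tune_gemini` stamps the display name with the submission time, so repeated runs stay distinguishable in the Vertex AI console.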