├── .gitignore
├── Dockerfile
├── README.md
├── assets
│   ├── setup_01.png
│   ├── setup_02.png
│   ├── slides.pdf
│   └── slides_cover.png
├── notebooks
│   ├── 00_code_snipets.ipynb
│   ├── 00_prepare_dataset.ipynb
│   ├── 01_train.ipynb
│   ├── 02_inference_review.ipynb
│   ├── 02_timing.ipynb
│   ├── 03_optimizing_model.ipynb
│   ├── 04_packaging.ipynb
│   └── utils.py
├── requirements.txt
├── serving
│   ├── Dockerfile
│   ├── config.properties
│   ├── handler.py
│   ├── requirements.txt
│   └── sample_input.json
├── setup.ipynb
└── workshop_infra
    ├── cert
    │   └── .gitkeep
    ├── config.enc.yaml
    ├── config_public.yaml
    ├── docker-setup.sh
    └── setup.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 |
3 | .pth
4 | .mar
5 | logs
6 | *.gz
7 | serving/model_store/
8 | __pycache__
9 | *.zip
10 | notebooks/trainer_*/
11 | notebooks/wandb/
12 | .venv
13 | workshop_infra/cert/*
14 | workshop_infra/config.yaml
15 |
16 | !/**/.gitkeep
17 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | #FROM jupyter/scipy-notebook:python-3.10.5
2 |
3 | FROM jupyter/scipy-notebook:python-3.8.8
4 |
5 |
6 |
7 | USER root
8 |
9 |
10 | RUN apt-get update && apt-get --yes install apt-utils && \
11 | apt-get --yes install htop tmux graphviz openjdk-11-jre-headless curl && \
12 | apt-get clean;
13 |
14 |
15 | # set the user back to original setting
16 | USER $NB_UID
17 |
18 |
19 |
20 | # Install from requirements.txt file
21 | COPY --chown=${NB_UID}:${NB_GID} requirements.txt /tmp/
22 |
23 | RUN pip install --no-cache-dir --requirement /tmp/requirements.txt && \
24 | fix-permissions "${CONDA_DIR}" && \
25 | fix-permissions "/home/${NB_USER}"
26 |
27 |
28 | #COPY --chown=${NB_UID}:${NB_GID} docker-setup.sh /tmp/
29 |
30 | COPY --chown=${NB_UID}:${NB_GID} setup.ipynb /tmp/
31 |
32 | RUN papermill /tmp/setup.ipynb /tmp/setup__out.ipynb -k python3 --log-output --log-level INFO --progress-bar && \
33 | fix-permissions "${CONDA_DIR}" && \
34 | fix-permissions "/home/${NB_USER}"
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Readme
2 |
3 |
4 | ## Overview
5 |
6 |
7 | This repo contains the notebooks for the PyTorch Serving Workshop.
8 |
9 | Note: We **do not** need a GPU runtime
10 |
11 | ## Setup
12 |
13 | If you are attending a workshop session, use this custom [JupyterHub](http://hub2.np.training), which has all the dependencies already set up.
14 |
15 |
16 |
17 | Otherwise, consider using [Binder](https://mybinder.org/v2/gh/npatta01/pytorch-serving-workshop/main).
18 |
19 |
20 |
21 | ## Contents
22 |
23 | There are five main notebooks.
24 |
25 | a. `00_prepare_dataset.ipynb`
26 |
27 | Notebook that prepares the e-commerce dataset and saves it.
28 |
29 | b. `01_train.ipynb`
30 |
31 | Trains a DistilBERT model.
32 |
33 | c. `02_inference_review.ipynb`
34 |
35 | Notebook that shows how to use the Hugging Face ecosystem and how to run inference with the model trained in the previous notebook.
36 |
37 | d. `03_optimizing_model.ipynb`
38 |
39 | Notebook that shows the impact of quantization and TorchScript (a rough sketch of the recipe follows this list).
40 |
41 |
42 | e. `04_packaging.ipynb`
43 |
44 | Notebook that shows how to use TorchServe to serve models (a sample client call is sketched after this list).
45 |
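The optimization step in `03_optimizing_model.ipynb` combines dynamic quantization with TorchScript tracing. The sketch below is only a rough illustration of that recipe, not the notebook's exact code; the checkpoint name, example query, sequence length, and output path are assumptions.

```python
import torch
import transformers

# Assumption: any fine-tuned sequence-classification checkpoint works the same way.
model_name = "distilbert-base-uncased"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    model_name, torchscript=True
)
model.eval()

# Dynamic quantization: store Linear-layer weights as int8 for faster CPU inference.
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

# TorchScript: trace with example inputs so the model can be serialized and later
# loaded without the original Python class definitions.
example = tokenizer(
    "cheap nike men running shoes",
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=128,
)
traced_model = torch.jit.trace(
    quantized_model, (example["input_ids"], example["attention_mask"])
)
torch.jit.save(traced_model, "traced_model.pt")
```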
46 |
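Once `04_packaging.ipynb` has produced a model archive and TorchServe is running, the model can be queried over the inference API (default port 8080). The snippet below is a hedged sketch: the model name `pt_classifier` comes from the archiving example in the notebooks, while the payload format is an assumption; the exact request body is defined by `serving/handler.py` and `serving/sample_input.json`.

```python
import requests

# Assumption: TorchServe is running locally with the "pt_classifier" model registered.
url = "http://localhost:8080/predictions/pt_classifier"

# Send the sample payload shipped with the repo; adjust if the handler expects a
# different body format.
with open("serving/sample_input.json", "rb") as f:
    response = requests.post(url, data=f)

response.raise_for_status()
print(response.json())  # predicted product types with scores (handler-dependent)
```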
47 | ## Slides
48 |
49 | [Slides (SlideShare)](https://www.slideshare.net/nidhinpattaniyil/serving-bert-models-in-production-with-torchserve)
50 |
51 |
52 | ## Video
53 |
54 | [Workshop recording (PyData Global 2021)](https://www.youtube.com/watch?v=sDGxzkOvxqY&ab_channel=PyData)
55 |
56 |
57 | ## References
58 |
59 | [Pydata 2021 Slides](https://www.slideshare.net/nidhinpattaniyil/serving-bert-models-in-production-with-torchserve)
60 |
61 | [Pydata 2021 Conference Page](https://pydata.org/global2021/schedule/presentation/136/serving-pytorch-models-in-production/)
62 |
63 |
64 | ## Libraries
65 |
66 | This repo uses the Hugging Face `transformers` and `datasets` packages.
67 |
68 | The dataset used is [Amazon Berkeley Objects (ABO) Dataset](https://amazon-berkeley-objects.s3.amazonaws.com/index.html) created by Amazon and UC Berkeley.
69 | For more details, refer to this [paper](https://arxiv.org/abs/2110.06199).
70 |
71 |
72 | ## Contact
73 |
74 | For help or feedback, please reach out to:
75 |
76 | - [Nidhin Pattaniyil](https://www.linkedin.com/in/nidhinpattaniyil/)
77 | - [Adway Dhillon](https://www.linkedin.com/in/adwaydhillon/)
78 | - [Vishal Rathi](https://www.linkedin.com/in/vishalkumarrathi/)
79 |
--------------------------------------------------------------------------------
/assets/setup_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/pytorch-serving-workshop/56496a84f8485188e4ba8c472da192e428ad3a51/assets/setup_01.png
--------------------------------------------------------------------------------
/assets/setup_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/pytorch-serving-workshop/56496a84f8485188e4ba8c472da192e428ad3a51/assets/setup_02.png
--------------------------------------------------------------------------------
/assets/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/pytorch-serving-workshop/56496a84f8485188e4ba8c472da192e428ad3a51/assets/slides.pdf
--------------------------------------------------------------------------------
/assets/slides_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/pytorch-serving-workshop/56496a84f8485188e4ba8c472da192e428ad3a51/assets/slides_cover.png
--------------------------------------------------------------------------------
/notebooks/00_code_snipets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "parliamentary-metropolitan",
6 | "metadata": {},
7 | "source": [
8 | "# About\n",
9 | "\n",
10 | "This is an internal notebok to help create code snippets "
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "id": "american-journalist",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": []
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "id": "suspended-attendance",
25 | "metadata": {},
26 | "outputs": [],
27 | "source": []
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 1,
32 | "id": "favorite-subdivision",
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "from transformers import BertTokenizer"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "id": "provincial-electron",
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "# Bert uses WordPiece Tokenizer\n",
47 | "# splitting words either into the full forms\n",
48 | "# (e.g., one word becomes one token) or into word piece\n",
49 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 3,
55 | "id": "homeless-employment",
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "data": {
60 | "text/plain": [
61 | "['cheap', 'nike', 'men', 'running', 'shoes']"
62 | ]
63 | },
64 | "execution_count": 3,
65 | "metadata": {},
66 | "output_type": "execute_result"
67 | }
68 | ],
69 | "source": [
70 | "tokenizer.tokenize(\"cheap nike men running shoes\")"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 4,
76 | "id": "legitimate-employee",
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "data": {
81 | "text/plain": [
82 | "['che', '##p', 'nike', 'men', 'shoes', 'run', '##ing', 'under', '100', '$']"
83 | ]
84 | },
85 | "execution_count": 4,
86 | "metadata": {},
87 | "output_type": "execute_result"
88 | }
89 | ],
90 | "source": [
91 | "# chep/runing is mispelled\n",
92 | "tokenizer.tokenize(\"chep nike men shoes runing under 100$ \")"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 5,
98 | "id": "mathematical-acceptance",
99 | "metadata": {},
100 | "outputs": [
101 | {
102 | "data": {
103 | "text/plain": [
104 | "30522"
105 | ]
106 | },
107 | "execution_count": 5,
108 | "metadata": {},
109 | "output_type": "execute_result"
110 | }
111 | ],
112 | "source": [
113 | "# size of vocabulary\n",
114 | "tokenizer.vocab_size"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "id": "applied-ribbon",
121 | "metadata": {},
122 | "outputs": [],
123 | "source": []
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "id": "incoming-mentor",
128 | "metadata": {},
129 | "source": [
130 | "# Training Code"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "id": "difficult-smooth",
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "# hugging face library to load existing/custom datasets\n",
141 | "import datasets\n",
142 | "# hugging face library contains tokenizers / models \n",
143 | "import transformers"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "id": "parallel-webster",
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "# dataset contains two columns \"text/label\"\n",
154 | "raw_datasets = datasets.load_from_disk(dataset_path)"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "id": "respected-nothing",
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "# use existing distilbert tokenizer\n",
165 | "tokenizer = transformers.AutoTokenizer.from_pretrained(\"distilbert-base-uncased\" )\n",
166 | "\n",
167 | "def tokenize_function(examples):\n",
168 | " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n",
169 | "\n",
170 | "# calculate ['input_ids' , 'attention_mask']\n",
171 | "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) "
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "id": "alleged-sapphire",
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "# use pretrained distilbert model\n",
182 | "model = transformers.AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\"\n",
183 | " , num_labels=len(labels) ... )"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "id": "stunning-storage",
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "training_args = transformers.TrainingArguments(\"trainer\",num_train_epochs=5...) \n",
194 | " )\n",
195 | "trainer = transformers.Trainer(\n",
196 | " model=model, \n",
197 | " args=training_args, \n",
198 | " train_dataset=tokenized_datasets['train'], \n",
199 | " eval_dataset=tokenized_datasets['validation'],.... )"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": null,
205 | "id": "current-rebate",
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "# train on datasets/argumets passed to trainer args\n",
210 | "trainer.train()\n"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "id": "single-mother",
217 | "metadata": {},
218 | "outputs": [],
219 | "source": []
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "id": "metallic-manor",
224 | "metadata": {},
225 | "source": [
226 | "# Inference Code"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "id": "structured-mirror",
233 | "metadata": {},
234 | "outputs": [],
235 | "source": [
236 | "query = 'comfortable men sandals'"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "id": "senior-sunset",
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "# compute input id / attention mask\n",
247 | "tokenized_res = tokenizer.encode_plus(query, return_tensors=\"pt\")"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "id": "continent-sullivan",
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "# pass input to model\n",
258 | "model_res = model(**tokenized_res)\n",
259 | "# get softmax of logits\n",
260 | "logits = model_res.logits\n",
261 | "softmax_res = torch.softmax(logits, dim=1).toList()[0]"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": null,
267 | "id": "color-polls",
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "# get the label and probability sorted\n",
272 | "predictions = list ( zip (labels , softmax_res ) )\n",
273 | "predictions = sorted (predictions , key=lambda x:x[1] , reverse =True)"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "id": "minimal-marketplace",
280 | "metadata": {},
281 | "outputs": [],
282 | "source": []
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": null,
287 | "id": "false-mississippi",
288 | "metadata": {},
289 | "outputs": [],
290 | "source": []
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "id": "acoustic-ensemble",
295 | "metadata": {},
296 | "source": [
297 | "# Torch Archiving"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "id": "aggressive-alexander",
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "# name and version of the model\n",
308 | "MODEL_NAME=\"pt_classifier\"\n",
309 | "MODEL_VERSION=\"1.0\"\n",
310 | "\n",
311 | "# folder where model is saved\n",
312 | "MODEL_STORE=\"model_store\"\n",
313 | "# path of saved pytorch models\n",
314 | "MODEL_SERIALIZED_FILE=\"traced_model.pt\"\n",
315 | "# path of extra files to include\n",
316 | "MODEL_EXTRA_FILES=\"index_to_name.json,setup_config.json\"\n",
317 | "# model code\n",
318 | "MODEL_CODE=\"handler.py\"\n",
319 | "\n",
320 | "\n",
321 | "torch-model-archiver --model-name ${MODEL_NAME} \\\n",
322 | "--version ${MODEL_VERSION} \\\n",
323 | "--serialized-file ${MODEL_SERIALIZED_FILE} \\\n",
324 | "--export-path ${MODEL_STORE} \\\n",
325 | "--extra-files ${MODEL_EXTRA_FILES} \\\n",
326 | "--handler ${MODEL_CODE} \\"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": null,
332 | "id": "brilliant-crisis",
333 | "metadata": {},
334 | "outputs": [],
335 | "source": []
336 | }
337 | ],
338 | "metadata": {
339 | "environment": {
340 | "name": "rapids-gpu.0-18.m65",
341 | "type": "gcloud",
342 | "uri": "gcr.io/deeplearning-platform-release/rapids-gpu.0-18:m65"
343 | },
344 | "kernelspec": {
345 | "display_name": "Python [conda env:pytorch]",
346 | "language": "python",
347 | "name": "conda-env-pytorch-py"
348 | },
349 | "language_info": {
350 | "codemirror_mode": {
351 | "name": "ipython",
352 | "version": 3
353 | },
354 | "file_extension": ".py",
355 | "mimetype": "text/x-python",
356 | "name": "python",
357 | "nbconvert_exporter": "python",
358 | "pygments_lexer": "ipython3",
359 | "version": "3.7.10"
360 | }
361 | },
362 | "nbformat": 4,
363 | "nbformat_minor": 5
364 | }
365 |
--------------------------------------------------------------------------------
/notebooks/00_prepare_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "swedish-certificate",
6 | "metadata": {},
7 | "source": [
8 | "# Prepare Dataset"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "laden-intellectual",
14 | "metadata": {},
15 | "source": [
16 | "## About\n",
17 | "This notebook contains the code to\n",
18 | "1. download the ABO dataset \n",
19 | "2. Clean the dataset to extract title/product type\n",
20 | "3. export dataset as HuggingFace compatible dataset"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "id": "hourly-grace",
27 | "metadata": {},
28 | "outputs": [],
29 | "source": []
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "id": "systematic-midwest",
34 | "metadata": {},
35 | "source": [
36 | "## Dataset"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "id": "caring-invitation",
42 | "metadata": {},
43 | "source": [
44 | "This notebook uses the [Amazon Berkeley Objects (ABO) Dataset](https://amazon-berkeley-objects.s3.amazonaws.com/index.html) . \n",
45 | "\n",
46 | "The dataset was created in partnership with Amazon and UC Berklely .\n",
47 | "\n",
48 | "For 147,702 it contains product metadata , images and 3D models. "
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "id": "indonesian-tennis",
55 | "metadata": {
56 | "jupyter": {
57 | "outputs_hidden": true
58 | }
59 | },
60 | "outputs": [],
61 | "source": [
62 | "%%bash \n",
63 | "cd ../artifacts/dataset_raw/amazon/\n",
64 | "wget https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-listings.tar\n",
65 | "tar -xvf abo-listings.tar"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "id": "sexual-folder",
72 | "metadata": {
73 | "scrolled": true
74 | },
75 | "outputs": [],
76 | "source": [
77 | "#!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles/All_Amazon_Meta.json.gz\n",
78 | "#!wget https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-listings.tar"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "id": "opposite-island",
85 | "metadata": {},
86 | "outputs": [],
87 | "source": []
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "id": "corrected-specification",
93 | "metadata": {},
94 | "outputs": [],
95 | "source": []
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "id": "exotic-guess",
100 | "metadata": {},
101 | "source": [
102 | "## Imports"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 1,
108 | "id": "eligible-magazine",
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "import pathlib\n",
113 | "import sklearn\n",
114 | "import datasets\n",
115 | "import pandas as pd\n",
116 | "import sklearn.preprocessing\n",
117 | "import sklearn.model_selection\n",
118 | "import glob\n",
119 | "import functools"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 2,
125 | "id": "raised-myanmar",
126 | "metadata": {},
127 | "outputs": [
128 | {
129 | "name": "stdout",
130 | "output_type": "stream",
131 | "text": [
132 | "/home/jupyter/tutorials/personal/pydata_bert/notebooks\r\n"
133 | ]
134 | }
135 | ],
136 | "source": [
137 | "!pwd"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "id": "foster-engagement",
143 | "metadata": {},
144 | "source": [
145 | "## Process Dataset"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 3,
151 | "id": "aboriginal-adobe",
152 | "metadata": {},
153 | "outputs": [
154 | {
155 | "name": "stdout",
156 | "output_type": "stream",
157 | "text": [
158 | "listings_0.json.gz listings_4.json.gz\tlistings_8.json.gz listings_c.json.gz\r\n",
159 | "listings_1.json.gz listings_5.json.gz\tlistings_9.json.gz listings_d.json.gz\r\n",
160 | "listings_2.json.gz listings_6.json.gz\tlistings_a.json.gz listings_e.json.gz\r\n",
161 | "listings_3.json.gz listings_7.json.gz\tlistings_b.json.gz listings_f.json.gz\r\n"
162 | ]
163 | }
164 | ],
165 | "source": [
166 | "!ls ../artifacts/dataset_raw/amazon/listings/metadata"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 4,
172 | "id": "hollow-berry",
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "dataset_path_raw = \"../artifacts/dataset_raw/amazon/listings/metadata\""
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 5,
182 | "id": "clean-investment",
183 | "metadata": {},
184 | "outputs": [
185 | {
186 | "data": {
187 | "text/plain": [
188 | "['../artifacts/dataset_raw/amazon/listings/metadata/listings_2.json.gz',\n",
189 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_9.json.gz',\n",
190 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_0.json.gz',\n",
191 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_1.json.gz',\n",
192 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_a.json.gz',\n",
193 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_7.json.gz',\n",
194 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_5.json.gz',\n",
195 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_6.json.gz',\n",
196 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_f.json.gz',\n",
197 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_3.json.gz',\n",
198 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_b.json.gz',\n",
199 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_c.json.gz',\n",
200 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_4.json.gz',\n",
201 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_e.json.gz',\n",
202 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_8.json.gz',\n",
203 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_d.json.gz']"
204 | ]
205 | },
206 | "execution_count": 5,
207 | "metadata": {},
208 | "output_type": "execute_result"
209 | }
210 | ],
211 | "source": [
212 | "glob.glob(f'{dataset_path_raw}/*.json.gz')"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "id": "exciting-homeless",
218 | "metadata": {},
219 | "source": [
220 | "load all 16 files"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 6,
226 | "id": "conventional-calculation",
227 | "metadata": {},
228 | "outputs": [],
229 | "source": [
230 | "df_raw = pd.concat(map(functools.partial(pd.read_json, lines=True ), \n",
231 | " glob.glob(f'{dataset_path_raw}/*.json.gz') )) "
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 7,
237 | "id": "sonic-staff",
238 | "metadata": {},
239 | "outputs": [
240 | {
241 | "data": {
242 | "text/html": [
243 | "
\n",
244 | "\n",
257 | "
\n",
258 | " \n",
259 | " \n",
260 | " | \n",
261 | " brand | \n",
262 | " bullet_point | \n",
263 | " color | \n",
264 | " fabric_type | \n",
265 | " item_id | \n",
266 | " item_name | \n",
267 | " model_name | \n",
268 | " model_number | \n",
269 | " product_type | \n",
270 | " style | \n",
271 | " ... | \n",
272 | " item_keywords | \n",
273 | " material | \n",
274 | " spin_id | \n",
275 | " 3dmodel_id | \n",
276 | " color_code | \n",
277 | " model_year | \n",
278 | " pattern | \n",
279 | " product_description | \n",
280 | " finish_type | \n",
281 | " item_shape | \n",
282 | "
\n",
283 | " \n",
284 | " \n",
285 | " \n",
286 | " 0 | \n",
287 | " [{'language_tag': 'de_DE', 'value': 'Amazon Es... | \n",
288 | " [{'language_tag': 'de_DE', 'value': 'Fällt gro... | \n",
289 | " [{'language_tag': 'de_DE', 'value': 'Mehrfarbi... | \n",
290 | " [{'language_tag': 'en_GB', 'value': '100% Cott... | \n",
291 | " B07HL25ZQM | \n",
292 | " [{'language_tag': 'en_GB', 'value': 'Amazon Es... | \n",
293 | " [{'language_tag': 'en_GB', 'value': '6-Pack Bi... | \n",
294 | " [{'value': 'P_AE3131_M6'}] | \n",
295 | " [{'value': 'BABY_PRODUCT'}] | \n",
296 | " [{'language_tag': 'de_DE', 'value': '6-Pack Bi... | \n",
297 | " ... | \n",
298 | " NaN | \n",
299 | " NaN | \n",
300 | " NaN | \n",
301 | " NaN | \n",
302 | " NaN | \n",
303 | " NaN | \n",
304 | " NaN | \n",
305 | " NaN | \n",
306 | " NaN | \n",
307 | " NaN | \n",
308 | "
\n",
309 | " \n",
310 | " 1 | \n",
311 | " [{'language_tag': 'en_GB', 'value': 'AmazonBas... | \n",
312 | " [{'language_tag': 'en_GB', 'value': 'Large dry... | \n",
313 | " NaN | \n",
314 | " NaN | \n",
315 | " B0825D4F6R | \n",
316 | " [{'language_tag': 'en_GB', 'value': 'AmazonBas... | \n",
317 | " NaN | \n",
318 | " [{'value': 'AMAZ2001'}] | \n",
319 | " [{'value': 'HOME'}] | \n",
320 | " [{'language_tag': 'en_GB', 'value': 'Deluxe'}] | \n",
321 | " ... | \n",
322 | " [{'language_tag': 'en_GB', 'value': 'tower lau... | \n",
323 | " NaN | \n",
324 | " NaN | \n",
325 | " NaN | \n",
326 | " NaN | \n",
327 | " NaN | \n",
328 | " NaN | \n",
329 | " NaN | \n",
330 | " NaN | \n",
331 | " NaN | \n",
332 | "
\n",
333 | " \n",
334 | " 2 | \n",
335 | " [{'language_tag': 'en_IN', 'value': 'Amazon Br... | \n",
336 | " [{'language_tag': 'en_IN', 'value': '3D Printe... | \n",
337 | " [{'language_tag': 'en_IN', 'standardized_value... | \n",
338 | " NaN | \n",
339 | " B07TF1FCFD | \n",
340 | " [{'language_tag': 'en_IN', 'value': 'Amazon Br... | \n",
341 | " [{'language_tag': 'en_IN', 'value': 'Samsung G... | \n",
342 | " [{'value': 'gz8587-SL40668'}] | \n",
343 | " [{'value': 'CELLULAR_PHONE_CASE'}] | \n",
344 | " NaN | \n",
345 | " ... | \n",
346 | " [{'language_tag': 'en_IN', 'value': 'mobile co... | \n",
347 | " NaN | \n",
348 | " NaN | \n",
349 | " NaN | \n",
350 | " NaN | \n",
351 | " NaN | \n",
352 | " NaN | \n",
353 | " NaN | \n",
354 | " NaN | \n",
355 | " NaN | \n",
356 | "
\n",
357 | " \n",
358 | " 3 | \n",
359 | " [{'language_tag': 'en_IN', 'value': 'Amazon Br... | \n",
360 | " [{'language_tag': 'en_IN', 'value': 'Snug fit ... | \n",
361 | " [{'language_tag': 'en_IN', 'standardized_value... | \n",
362 | " NaN | \n",
363 | " B08569SRJD | \n",
364 | " [{'language_tag': 'en_IN', 'value': 'Amazon Br... | \n",
365 | " [{'language_tag': 'en_IN', 'value': 'Nokia 7.2'}] | \n",
366 | " [{'value': 'UV10845-SL40357'}] | \n",
367 | " [{'value': 'CELLULAR_PHONE_CASE'}] | \n",
368 | " NaN | \n",
369 | " ... | \n",
370 | " [{'language_tag': 'en_IN', 'value': 'Back Cove... | \n",
371 | " [{'language_tag': 'en_IN', 'value': 'Silicon'}] | \n",
372 | " NaN | \n",
373 | " NaN | \n",
374 | " NaN | \n",
375 | " NaN | \n",
376 | " NaN | \n",
377 | " NaN | \n",
378 | " NaN | \n",
379 | " NaN | \n",
380 | "
\n",
381 | " \n",
382 | " 4 | \n",
383 | " [{'language_tag': 'en_US', 'value': 'Stone & B... | \n",
384 | " [{'language_tag': 'en_US', 'value': 'With mode... | \n",
385 | " [{'language_tag': 'en_US', 'value': 'Dark Grey'}] | \n",
386 | " NaN | \n",
387 | " B07B4G5RBN | \n",
388 | " [{'language_tag': 'zh_CN', 'value': 'Stone & B... | \n",
389 | " NaN | \n",
390 | " [{'value': 'UPH10095B'}] | \n",
391 | " [{'value': 'CHAIR'}] | \n",
392 | " NaN | \n",
393 | " ... | \n",
394 | " [{'language_tag': 'en_US', 'value': 'living-ro... | \n",
395 | " [{'language_tag': 'zh_CN', 'value': '灰石色'}, {'... | \n",
396 | " 485925ed | \n",
397 | " B07B4G5RBN | \n",
398 | " [#918F8C] | \n",
399 | " NaN | \n",
400 | " NaN | \n",
401 | " NaN | \n",
402 | " NaN | \n",
403 | " NaN | \n",
404 | "
\n",
405 | " \n",
406 | "
\n",
407 | "
5 rows × 28 columns
\n",
408 | "
"
409 | ],
410 | "text/plain": [
411 | " brand \\\n",
412 | "0 [{'language_tag': 'de_DE', 'value': 'Amazon Es... \n",
413 | "1 [{'language_tag': 'en_GB', 'value': 'AmazonBas... \n",
414 | "2 [{'language_tag': 'en_IN', 'value': 'Amazon Br... \n",
415 | "3 [{'language_tag': 'en_IN', 'value': 'Amazon Br... \n",
416 | "4 [{'language_tag': 'en_US', 'value': 'Stone & B... \n",
417 | "\n",
418 | " bullet_point \\\n",
419 | "0 [{'language_tag': 'de_DE', 'value': 'Fällt gro... \n",
420 | "1 [{'language_tag': 'en_GB', 'value': 'Large dry... \n",
421 | "2 [{'language_tag': 'en_IN', 'value': '3D Printe... \n",
422 | "3 [{'language_tag': 'en_IN', 'value': 'Snug fit ... \n",
423 | "4 [{'language_tag': 'en_US', 'value': 'With mode... \n",
424 | "\n",
425 | " color \\\n",
426 | "0 [{'language_tag': 'de_DE', 'value': 'Mehrfarbi... \n",
427 | "1 NaN \n",
428 | "2 [{'language_tag': 'en_IN', 'standardized_value... \n",
429 | "3 [{'language_tag': 'en_IN', 'standardized_value... \n",
430 | "4 [{'language_tag': 'en_US', 'value': 'Dark Grey'}] \n",
431 | "\n",
432 | " fabric_type item_id \\\n",
433 | "0 [{'language_tag': 'en_GB', 'value': '100% Cott... B07HL25ZQM \n",
434 | "1 NaN B0825D4F6R \n",
435 | "2 NaN B07TF1FCFD \n",
436 | "3 NaN B08569SRJD \n",
437 | "4 NaN B07B4G5RBN \n",
438 | "\n",
439 | " item_name \\\n",
440 | "0 [{'language_tag': 'en_GB', 'value': 'Amazon Es... \n",
441 | "1 [{'language_tag': 'en_GB', 'value': 'AmazonBas... \n",
442 | "2 [{'language_tag': 'en_IN', 'value': 'Amazon Br... \n",
443 | "3 [{'language_tag': 'en_IN', 'value': 'Amazon Br... \n",
444 | "4 [{'language_tag': 'zh_CN', 'value': 'Stone & B... \n",
445 | "\n",
446 | " model_name \\\n",
447 | "0 [{'language_tag': 'en_GB', 'value': '6-Pack Bi... \n",
448 | "1 NaN \n",
449 | "2 [{'language_tag': 'en_IN', 'value': 'Samsung G... \n",
450 | "3 [{'language_tag': 'en_IN', 'value': 'Nokia 7.2'}] \n",
451 | "4 NaN \n",
452 | "\n",
453 | " model_number product_type \\\n",
454 | "0 [{'value': 'P_AE3131_M6'}] [{'value': 'BABY_PRODUCT'}] \n",
455 | "1 [{'value': 'AMAZ2001'}] [{'value': 'HOME'}] \n",
456 | "2 [{'value': 'gz8587-SL40668'}] [{'value': 'CELLULAR_PHONE_CASE'}] \n",
457 | "3 [{'value': 'UV10845-SL40357'}] [{'value': 'CELLULAR_PHONE_CASE'}] \n",
458 | "4 [{'value': 'UPH10095B'}] [{'value': 'CHAIR'}] \n",
459 | "\n",
460 | " style ... \\\n",
461 | "0 [{'language_tag': 'de_DE', 'value': '6-Pack Bi... ... \n",
462 | "1 [{'language_tag': 'en_GB', 'value': 'Deluxe'}] ... \n",
463 | "2 NaN ... \n",
464 | "3 NaN ... \n",
465 | "4 NaN ... \n",
466 | "\n",
467 | " item_keywords \\\n",
468 | "0 NaN \n",
469 | "1 [{'language_tag': 'en_GB', 'value': 'tower lau... \n",
470 | "2 [{'language_tag': 'en_IN', 'value': 'mobile co... \n",
471 | "3 [{'language_tag': 'en_IN', 'value': 'Back Cove... \n",
472 | "4 [{'language_tag': 'en_US', 'value': 'living-ro... \n",
473 | "\n",
474 | " material spin_id 3dmodel_id \\\n",
475 | "0 NaN NaN NaN \n",
476 | "1 NaN NaN NaN \n",
477 | "2 NaN NaN NaN \n",
478 | "3 [{'language_tag': 'en_IN', 'value': 'Silicon'}] NaN NaN \n",
479 | "4 [{'language_tag': 'zh_CN', 'value': '灰石色'}, {'... 485925ed B07B4G5RBN \n",
480 | "\n",
481 | " color_code model_year pattern product_description finish_type item_shape \n",
482 | "0 NaN NaN NaN NaN NaN NaN \n",
483 | "1 NaN NaN NaN NaN NaN NaN \n",
484 | "2 NaN NaN NaN NaN NaN NaN \n",
485 | "3 NaN NaN NaN NaN NaN NaN \n",
486 | "4 [#918F8C] NaN NaN NaN NaN NaN \n",
487 | "\n",
488 | "[5 rows x 28 columns]"
489 | ]
490 | },
491 | "execution_count": 7,
492 | "metadata": {},
493 | "output_type": "execute_result"
494 | }
495 | ],
496 | "source": [
497 | "df_raw.head()"
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": 8,
503 | "id": "sophisticated-obligation",
504 | "metadata": {},
505 | "outputs": [
506 | {
507 | "data": {
508 | "text/plain": [
509 | "147702"
510 | ]
511 | },
512 | "execution_count": 8,
513 | "metadata": {},
514 | "output_type": "execute_result"
515 | }
516 | ],
517 | "source": [
518 | "len(df_raw)"
519 | ]
520 | },
521 | {
522 | "cell_type": "markdown",
523 | "id": "universal-document",
524 | "metadata": {},
525 | "source": [
526 | "sample record"
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": 9,
532 | "id": "corresponding-blast",
533 | "metadata": {},
534 | "outputs": [
535 | {
536 | "data": {
537 | "text/plain": [
538 | "{'brand': [{'language_tag': 'de_DE', 'value': 'Amazon Essentials'}],\n",
539 | " 'bullet_point': [{'language_tag': 'de_DE',\n",
540 | " 'value': 'Fällt gross aus; eventuell eine Größe kleiner bestellen'}],\n",
541 | " 'color': [{'language_tag': 'de_DE', 'value': 'Mehrfarbig(Girl Fruit)'}],\n",
542 | " 'fabric_type': [{'language_tag': 'en_GB', 'value': '100% Cotton'},\n",
543 | " {'language_tag': 'de_DE', 'value': '100 % Baumwolle'}],\n",
544 | " 'item_id': 'B07HL25ZQM',\n",
545 | " 'item_name': [{'language_tag': 'en_GB',\n",
546 | " 'value': 'Amazon Essentials Bib Set of 6'},\n",
547 | " {'language_tag': 'de_DE',\n",
548 | " 'value': 'Amazon Essentials 6-Pack Bib Set, Mehrfarbig(Girl Fruit), Einheitsgröße'}],\n",
549 | " 'model_name': [{'language_tag': 'en_GB', 'value': '6-Pack Bib Set'},\n",
550 | " {'language_tag': 'de_DE', 'value': '6-Pack Bib Set'}],\n",
551 | " 'model_number': [{'value': 'P_AE3131_M6'}],\n",
552 | " 'product_type': [{'value': 'BABY_PRODUCT'}],\n",
553 | " 'style': [{'language_tag': 'de_DE', 'value': '6-Pack Bib Set'}],\n",
554 | " 'main_image_id': '718mYsQTQbL',\n",
555 | " 'country': 'DE',\n",
556 | " 'marketplace': 'Amazon',\n",
557 | " 'domain_name': 'amazon.de',\n",
558 | " 'node': [{'node_id': 3968940031,\n",
559 | " 'node_name': '/Kategorien/Ernährung & Stillen/Lätzchen'}],\n",
560 | " 'item_dimensions': nan,\n",
561 | " 'item_weight': nan,\n",
562 | " 'other_image_id': nan,\n",
563 | " 'item_keywords': nan,\n",
564 | " 'material': nan,\n",
565 | " 'spin_id': nan,\n",
566 | " '3dmodel_id': nan,\n",
567 | " 'color_code': nan,\n",
568 | " 'model_year': nan,\n",
569 | " 'pattern': nan,\n",
570 | " 'product_description': nan,\n",
571 | " 'finish_type': nan,\n",
572 | " 'item_shape': nan}"
573 | ]
574 | },
575 | "execution_count": 9,
576 | "metadata": {},
577 | "output_type": "execute_result"
578 | }
579 | ],
580 | "source": [
581 | "df_raw.iloc[0].to_dict()"
582 | ]
583 | },
584 | {
585 | "cell_type": "markdown",
586 | "id": "referenced-championship",
587 | "metadata": {},
588 | "source": [
589 | "for this project, we only need `item_name` and `brand`. \n",
590 | "We can assume and take the first value for the fields"
591 | ]
592 | },
593 | {
594 | "cell_type": "code",
595 | "execution_count": 10,
596 | "id": "trying-training",
597 | "metadata": {},
598 | "outputs": [],
599 | "source": [
600 | "def parse_property(property_record:dict,property_name:str):\n",
601 | " try:\n",
602 | " r = property_record[property_name][0]\n",
603 | " if property_name ==\"node\":\n",
604 | " return r['node_name']\n",
605 | " else:\n",
606 | " return r['value']\n",
607 | " except Exception as e:\n",
608 | " return None\n",
609 | " \n",
610 | "def cleanup_record(raw_record:dict):\n",
611 | " \n",
612 | " \n",
613 | " record= {\n",
614 | " 'brand': parse_property(raw_record,'brand')\n",
615 | " ,'item_id': raw_record['item_id']\n",
616 | " ,'item_name': parse_property(raw_record,'item_name')\n",
617 | " ,'product_type': parse_property(raw_record,'product_type')\n",
618 | " ,'node': parse_property(raw_record, 'node')\n",
619 | " , 'main_image_id': raw_record['main_image_id']\n",
620 | " ,'product_description': raw_record['product_description']\n",
621 | "\n",
622 | " \n",
623 | " }\n",
624 | " \n",
625 | " return pd.Series(record)"
626 | ]
627 | },
628 | {
629 | "cell_type": "code",
630 | "execution_count": 11,
631 | "id": "bizarre-taste",
632 | "metadata": {},
633 | "outputs": [],
634 | "source": [
635 | "df = df_raw.apply(cleanup_record,axis=1)"
636 | ]
637 | },
638 | {
639 | "cell_type": "code",
640 | "execution_count": 12,
641 | "id": "earlier-education",
642 | "metadata": {},
643 | "outputs": [
644 | {
645 | "data": {
646 | "text/html": [
647 | "\n",
648 | "\n",
661 | "
\n",
662 | " \n",
663 | " \n",
664 | " | \n",
665 | " brand | \n",
666 | " item_id | \n",
667 | " item_name | \n",
668 | " product_type | \n",
669 | " node | \n",
670 | " main_image_id | \n",
671 | " product_description | \n",
672 | "
\n",
673 | " \n",
674 | " \n",
675 | " \n",
676 | " 0 | \n",
677 | " Amazon Essentials | \n",
678 | " B07HL25ZQM | \n",
679 | " Amazon Essentials Bib Set of 6 | \n",
680 | " BABY_PRODUCT | \n",
681 | " /Kategorien/Ernährung & Stillen/Lätzchen | \n",
682 | " 718mYsQTQbL | \n",
683 | " NaN | \n",
684 | "
\n",
685 | " \n",
686 | " 1 | \n",
687 | " AmazonBasics | \n",
688 | " B0825D4F6R | \n",
689 | " AmazonBasics 3-Tier Deluxe Tower Laundry Dryin... | \n",
690 | " HOME | \n",
691 | " /Home & Garden/Home & Kitchen/Categories/Stora... | \n",
692 | " 81lg2wto16L | \n",
693 | " NaN | \n",
694 | "
\n",
695 | " \n",
696 | " 2 | \n",
697 | " Amazon Brand - Solimo | \n",
698 | " B07TF1FCFD | \n",
699 | " Amazon Brand - Solimo Designer Number Eight 3D... | \n",
700 | " CELLULAR_PHONE_CASE | \n",
701 | " /Categories/Mobiles & Accessories/Mobile Acces... | \n",
702 | " 71R4R6x-tjL | \n",
703 | " NaN | \n",
704 | "
\n",
705 | " \n",
706 | " 3 | \n",
707 | " Amazon Brand - Solimo | \n",
708 | " B08569SRJD | \n",
709 | " Amazon Brand - Solimo Designer Dark Night View... | \n",
710 | " CELLULAR_PHONE_CASE | \n",
711 | " /Categories/Mobiles & Accessories/Mobile Acces... | \n",
712 | " 71QSAxIJagL | \n",
713 | " NaN | \n",
714 | "
\n",
715 | " \n",
716 | " 4 | \n",
717 | " Stone & Beam | \n",
718 | " B07B4G5RBN | \n",
719 | " Stone & Beam Varon 过渡日床, 灰石色 | \n",
720 | " CHAIR | \n",
721 | " /Categories/Furniture/Living Room Furniture/Ch... | \n",
722 | " 91UiRD6UcHL | \n",
723 | " NaN | \n",
724 | "
\n",
725 | " \n",
726 | "
\n",
727 | "
"
728 | ],
729 | "text/plain": [
730 | " brand item_id \\\n",
731 | "0 Amazon Essentials B07HL25ZQM \n",
732 | "1 AmazonBasics B0825D4F6R \n",
733 | "2 Amazon Brand - Solimo B07TF1FCFD \n",
734 | "3 Amazon Brand - Solimo B08569SRJD \n",
735 | "4 Stone & Beam B07B4G5RBN \n",
736 | "\n",
737 | " item_name product_type \\\n",
738 | "0 Amazon Essentials Bib Set of 6 BABY_PRODUCT \n",
739 | "1 AmazonBasics 3-Tier Deluxe Tower Laundry Dryin... HOME \n",
740 | "2 Amazon Brand - Solimo Designer Number Eight 3D... CELLULAR_PHONE_CASE \n",
741 | "3 Amazon Brand - Solimo Designer Dark Night View... CELLULAR_PHONE_CASE \n",
742 | "4 Stone & Beam Varon 过渡日床, 灰石色 CHAIR \n",
743 | "\n",
744 | " node main_image_id \\\n",
745 | "0 /Kategorien/Ernährung & Stillen/Lätzchen 718mYsQTQbL \n",
746 | "1 /Home & Garden/Home & Kitchen/Categories/Stora... 81lg2wto16L \n",
747 | "2 /Categories/Mobiles & Accessories/Mobile Acces... 71R4R6x-tjL \n",
748 | "3 /Categories/Mobiles & Accessories/Mobile Acces... 71QSAxIJagL \n",
749 | "4 /Categories/Furniture/Living Room Furniture/Ch... 91UiRD6UcHL \n",
750 | "\n",
751 | " product_description \n",
752 | "0 NaN \n",
753 | "1 NaN \n",
754 | "2 NaN \n",
755 | "3 NaN \n",
756 | "4 NaN "
757 | ]
758 | },
759 | "execution_count": 12,
760 | "metadata": {},
761 | "output_type": "execute_result"
762 | }
763 | ],
764 | "source": [
765 | "df.head()"
766 | ]
767 | },
768 | {
769 | "cell_type": "code",
770 | "execution_count": 13,
771 | "id": "respiratory-horizontal",
772 | "metadata": {},
773 | "outputs": [
774 | {
775 | "data": {
776 | "text/plain": [
777 | "Index(['brand', 'item_id', 'item_name', 'product_type', 'node',\n",
778 | " 'main_image_id', 'product_description'],\n",
779 | " dtype='object')"
780 | ]
781 | },
782 | "execution_count": 13,
783 | "metadata": {},
784 | "output_type": "execute_result"
785 | }
786 | ],
787 | "source": [
788 | "df.columns"
789 | ]
790 | },
791 | {
792 | "cell_type": "code",
793 | "execution_count": 14,
794 | "id": "honey-certification",
795 | "metadata": {},
796 | "outputs": [
797 | {
798 | "data": {
799 | "text/plain": [
800 | "CELLULAR_PHONE_CASE 64853\n",
801 | "SHOES 12965\n",
802 | "GROCERY 6546\n",
803 | "HOME 5264\n",
804 | "HOME_BED_AND_BATH 3082\n",
805 | " ... \n",
806 | "SOUS_VIDE_MACHINE 1\n",
807 | "SKIN_TREATMENT_MASK 1\n",
808 | "SCULPTURE 1\n",
809 | "THICKENING_AGENT 1\n",
810 | "TERMINAL_BLOCK 1\n",
811 | "Name: product_type, Length: 576, dtype: int64"
812 | ]
813 | },
814 | "execution_count": 14,
815 | "metadata": {},
816 | "output_type": "execute_result"
817 | }
818 | ],
819 | "source": [
820 | "df['product_type'].value_counts()"
821 | ]
822 | },
823 | {
824 | "cell_type": "markdown",
825 | "id": "protective-hudson",
826 | "metadata": {},
827 | "source": [
828 | "There are some product types that don't occur frequently. \n",
829 | "We should limit our training data to include at least 50+ product types"
830 | ]
831 | },
832 | {
833 | "cell_type": "code",
834 | "execution_count": 19,
835 | "id": "gentle-florist",
836 | "metadata": {},
837 | "outputs": [],
838 | "source": [
839 | "min_product_count = 500"
840 | ]
841 | },
842 | {
843 | "cell_type": "markdown",
844 | "id": "junior-timber",
845 | "metadata": {},
846 | "source": [
847 | "compute top product types"
848 | ]
849 | },
850 | {
851 | "cell_type": "code",
852 | "execution_count": 20,
853 | "id": "taken-presence",
854 | "metadata": {},
855 | "outputs": [],
856 | "source": [
857 | "top_products = df['product_type'].value_counts().loc[lambda x: x>min_product_count].index.tolist()"
858 | ]
859 | },
860 | {
861 | "cell_type": "code",
862 | "execution_count": null,
863 | "id": "shared-gilbert",
864 | "metadata": {},
865 | "outputs": [],
866 | "source": []
867 | },
868 | {
869 | "cell_type": "code",
870 | "execution_count": 21,
871 | "id": "noble-multimedia",
872 | "metadata": {},
873 | "outputs": [
874 | {
875 | "data": {
876 | "text/plain": [
877 | "(576, 31)"
878 | ]
879 | },
880 | "execution_count": 21,
881 | "metadata": {},
882 | "output_type": "execute_result"
883 | }
884 | ],
885 | "source": [
886 | "len(df['product_type'].value_counts() ) , len (top_products)"
887 | ]
888 | },
889 | {
890 | "cell_type": "code",
891 | "execution_count": 22,
892 | "id": "certified-galaxy",
893 | "metadata": {},
894 | "outputs": [],
895 | "source": [
896 | "df_all = df [ df['product_type'].isin(top_products) ].copy()\n"
897 | ]
898 | },
899 | {
900 | "cell_type": "code",
901 | "execution_count": 23,
902 | "id": "missing-extra",
903 | "metadata": {},
904 | "outputs": [
905 | {
906 | "data": {
907 | "text/plain": [
908 | "121239"
909 | ]
910 | },
911 | "execution_count": 23,
912 | "metadata": {},
913 | "output_type": "execute_result"
914 | }
915 | ],
916 | "source": [
917 | "len(df_all)"
918 | ]
919 | },
920 | {
921 | "cell_type": "markdown",
922 | "id": "temporal-fifth",
923 | "metadata": {},
924 | "source": [
925 | "`text` and `label` are the columns that are needed by Hugging Face Transformer package\n",
926 | "\n",
927 | "Item title is the text. \n",
928 | "Product Type is the label we are predicting"
929 | ]
930 | },
931 | {
932 | "cell_type": "code",
933 | "execution_count": 24,
934 | "id": "religious-failure",
935 | "metadata": {},
936 | "outputs": [],
937 | "source": [
938 | "df_all['label_name'] = df_all['product_type']\n",
939 | "df_all['text'] = df_all['item_name']"
940 | ]
941 | },
942 | {
943 | "cell_type": "code",
944 | "execution_count": null,
945 | "id": "disciplinary-fortune",
946 | "metadata": {},
947 | "outputs": [],
948 | "source": []
949 | },
950 | {
951 | "cell_type": "code",
952 | "execution_count": null,
953 | "id": "apparent-needle",
954 | "metadata": {},
955 | "outputs": [],
956 | "source": []
957 | },
958 | {
959 | "cell_type": "markdown",
960 | "id": "mineral-consensus",
961 | "metadata": {},
962 | "source": [
963 | "encode the product type to a numeric label"
964 | ]
965 | },
966 | {
967 | "cell_type": "code",
968 | "execution_count": 25,
969 | "id": "regulation-planner",
970 | "metadata": {},
971 | "outputs": [],
972 | "source": [
973 | "label_encoder = sklearn.preprocessing.LabelEncoder()"
974 | ]
975 | },
976 | {
977 | "cell_type": "code",
978 | "execution_count": 26,
979 | "id": "cultural-jonathan",
980 | "metadata": {},
981 | "outputs": [
982 | {
983 | "data": {
984 | "text/plain": [
985 | "LabelEncoder()"
986 | ]
987 | },
988 | "execution_count": 26,
989 | "metadata": {},
990 | "output_type": "execute_result"
991 | }
992 | ],
993 | "source": [
994 | "label_encoder.fit(df_all['label_name'])"
995 | ]
996 | },
997 | {
998 | "cell_type": "code",
999 | "execution_count": 27,
1000 | "id": "substantial-operation",
1001 | "metadata": {},
1002 | "outputs": [],
1003 | "source": [
1004 | "df_all['label'] = label_encoder.transform(df_all['label_name'])\n"
1005 | ]
1006 | },
1007 | {
1008 | "cell_type": "markdown",
1009 | "id": "dressed-defeat",
1010 | "metadata": {},
1011 | "source": [
1012 | "Allocate 60% for training , 20% validation and 20% for training"
1013 | ]
1014 | },
1015 | {
1016 | "cell_type": "code",
1017 | "execution_count": 28,
1018 | "id": "textile-bargain",
1019 | "metadata": {},
1020 | "outputs": [
1021 | {
1022 | "name": "stdout",
1023 | "output_type": "stream",
1024 | "text": [
1025 | "{'train': 72743, 'test': 24248, 'val': 24248}\n"
1026 | ]
1027 | }
1028 | ],
1029 | "source": [
1030 | "df_train, df_test = sklearn.model_selection.train_test_split(df_all, train_size=.6, stratify= df_all['label'] )\n",
1031 | "\n",
1032 | "\n",
1033 | "df_test, df_val = sklearn.model_selection.train_test_split(df_test, test_size=.5, stratify= df_test['label'] )\n",
1034 | "\n",
1035 | "\n",
1036 | "print ( \n",
1037 | "{\n",
1038 | " 'train': len(df_train)\n",
1039 | " ,'test': len(df_test)\n",
1040 | " ,'val': len(df_val)\n",
1041 | "}\n",
1042 | "\n",
1043 | ")"
1044 | ]
1045 | },
1046 | {
1047 | "cell_type": "markdown",
1048 | "id": "retained-speed",
1049 | "metadata": {},
1050 | "source": [
1051 | "## Create Hugging Face Dataset"
1052 | ]
1053 | },
1054 | {
1055 | "cell_type": "markdown",
1056 | "id": "cathedral-australian",
1057 | "metadata": {},
1058 | "source": [
1059 | "In order to later feed our model to HF transformers package, we need either Pytorch Dataloader or use HF [datasets](https://github.com/huggingface/datasets).\n",
1060 | "\n",
1061 | "`Datasets` can easily be used by TF/ Pytorch\n"
1062 | ]
1063 | },
1064 | {
1065 | "cell_type": "code",
1066 | "execution_count": 29,
1067 | "id": "stable-finder",
1068 | "metadata": {},
1069 | "outputs": [],
1070 | "source": [
1071 | "dataset_features = datasets.Features(\n",
1072 | " {'text': datasets.Value('string')\n",
1073 | " , 'item_name': datasets.Value('string')\n",
1074 | " , 'label': datasets.ClassLabel(names=list ( label_encoder.classes_ ))\n",
1075 | " , 'brand': datasets.Value('string')\n",
1076 | " , 'item_id': datasets.Value('string')\n",
1077 | " , 'main_image_id': datasets.Value('string')\n",
1078 | " , 'node': datasets.Value('string')\n",
1079 | "\n",
1080 | " }\n",
1081 | "\n",
1082 | ")"
1083 | ]
1084 | },
1085 | {
1086 | "cell_type": "code",
1087 | "execution_count": 30,
1088 | "id": "suffering-stream",
1089 | "metadata": {},
1090 | "outputs": [
1091 | {
1092 | "data": {
1093 | "text/plain": [
1094 | "dict_keys(['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'])"
1095 | ]
1096 | },
1097 | "execution_count": 30,
1098 | "metadata": {},
1099 | "output_type": "execute_result"
1100 | }
1101 | ],
1102 | "source": [
1103 | "dataset_features.keys()"
1104 | ]
1105 | },
1106 | {
1107 | "cell_type": "markdown",
1108 | "id": "located-submission",
1109 | "metadata": {},
1110 | "source": [
1111 | "create dataset dictionary with all the subsets"
1112 | ]
1113 | },
1114 | {
1115 | "cell_type": "code",
1116 | "execution_count": 31,
1117 | "id": "third-stylus",
1118 | "metadata": {},
1119 | "outputs": [],
1120 | "source": [
1121 | "interested_columns = dataset_features.keys()\n",
1122 | "\n",
1123 | "dataset_train = datasets.Dataset.from_pandas(df_train[interested_columns],features=dataset_features)\n",
1124 | "dataset_test = datasets.Dataset.from_pandas(df_test[interested_columns],features=dataset_features)\n",
1125 | "dataset_validation = datasets.Dataset.from_pandas(df_test[interested_columns],features=dataset_features)\n",
1126 | "\n",
1127 | "dataset_all = datasets.DatasetDict({\n",
1128 | " 'train': dataset_train,\n",
1129 | " 'test': dataset_test,\n",
1130 | " 'valid': dataset_validation }\n",
1131 | ")"
1132 | ]
1133 | },
1134 | {
1135 | "cell_type": "code",
1136 | "execution_count": 32,
1137 | "id": "operating-drove",
1138 | "metadata": {},
1139 | "outputs": [
1140 | {
1141 | "data": {
1142 | "text/plain": [
1143 | "DatasetDict({\n",
1144 | " train: Dataset({\n",
1145 | " features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],\n",
1146 | " num_rows: 72743\n",
1147 | " })\n",
1148 | " test: Dataset({\n",
1149 | " features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],\n",
1150 | " num_rows: 24248\n",
1151 | " })\n",
1152 | " valid: Dataset({\n",
1153 | " features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],\n",
1154 | " num_rows: 24248\n",
1155 | " })\n",
1156 | "})"
1157 | ]
1158 | },
1159 | "execution_count": 32,
1160 | "metadata": {},
1161 | "output_type": "execute_result"
1162 | }
1163 | ],
1164 | "source": [
1165 | "dataset_all"
1166 | ]
1167 | },
1168 | {
1169 | "cell_type": "code",
1170 | "execution_count": 33,
1171 | "id": "touched-buddy",
1172 | "metadata": {},
1173 | "outputs": [
1174 | {
1175 | "data": {
1176 | "text/plain": [
1177 | "{'text': 'Amazon Brand - Solimo Designer Light Blue Flower Photography 3D Printed Hard Back Case Mobile Cover for Sony Xperia L1',\n",
1178 | " 'item_name': 'Amazon Brand - Solimo Designer Light Blue Flower Photography 3D Printed Hard Back Case Mobile Cover for Sony Xperia L1',\n",
1179 | " 'label': 2,\n",
1180 | " 'brand': 'Amazon Brand - Solimo',\n",
1181 | " 'item_id': 'B07THC7RSK',\n",
1182 | " 'main_image_id': '71PBcKpr8jL',\n",
1183 | " 'node': '/Categories/Mobiles & Accessories/Mobile Accessories/Cases & Covers/Back & Bumper Cases'}"
1184 | ]
1185 | },
1186 | "execution_count": 33,
1187 | "metadata": {},
1188 | "output_type": "execute_result"
1189 | }
1190 | ],
1191 | "source": [
1192 | "dataset_all['train'][0]"
1193 | ]
1194 | },
1195 | {
1196 | "cell_type": "code",
1197 | "execution_count": 34,
1198 | "id": "wanted-ridge",
1199 | "metadata": {},
1200 | "outputs": [],
1201 | "source": [
1202 | "all_classes = dataset_all['train'].features['label'].names_file\n",
1203 | "all_classes"
1204 | ]
1205 | },
1206 | {
1207 | "cell_type": "code",
1208 | "execution_count": null,
1209 | "id": "supported-bottom",
1210 | "metadata": {},
1211 | "outputs": [],
1212 | "source": []
1213 | },
1214 | {
1215 | "cell_type": "markdown",
1216 | "id": "opening-suggestion",
1217 | "metadata": {},
1218 | "source": [
1219 | "## Persist Changes"
1220 | ]
1221 | },
1222 | {
1223 | "cell_type": "markdown",
1224 | "id": "communist-while",
1225 | "metadata": {},
1226 | "source": [
1227 | "save the dataset and load it "
1228 | ]
1229 | },
1230 | {
1231 | "cell_type": "code",
1232 | "execution_count": 35,
1233 | "id": "lonely-shame",
1234 | "metadata": {},
1235 | "outputs": [],
1236 | "source": [
1237 | "dataset_path = '../artifacts/dataset_processed/'"
1238 | ]
1239 | },
1240 | {
1241 | "cell_type": "code",
1242 | "execution_count": 36,
1243 | "id": "flush-report",
1244 | "metadata": {},
1245 | "outputs": [],
1246 | "source": [
1247 | "dataset_all.save_to_disk(dataset_path)"
1248 | ]
1249 | },
1250 | {
1251 | "cell_type": "code",
1252 | "execution_count": null,
1253 | "id": "national-italic",
1254 | "metadata": {},
1255 | "outputs": [],
1256 | "source": []
1257 | },
1258 | {
1259 | "cell_type": "code",
1260 | "execution_count": 37,
1261 | "id": "promotional-lambda",
1262 | "metadata": {},
1263 | "outputs": [
1264 | {
1265 | "data": {
1266 | "text/plain": [
1267 | "DatasetDict({\n",
1268 | " train: Dataset({\n",
1269 | " features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],\n",
1270 | " num_rows: 72743\n",
1271 | " })\n",
1272 | " test: Dataset({\n",
1273 | " features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],\n",
1274 | " num_rows: 24248\n",
1275 | " })\n",
1276 | " valid: Dataset({\n",
1277 | " features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],\n",
1278 | " num_rows: 24248\n",
1279 | " })\n",
1280 | "})"
1281 | ]
1282 | },
1283 | "execution_count": 37,
1284 | "metadata": {},
1285 | "output_type": "execute_result"
1286 | }
1287 | ],
1288 | "source": [
1289 | "datasets.load_from_disk(dataset_path)"
1290 | ]
1291 | },
1292 | {
1293 | "cell_type": "markdown",
1294 | "id": "surprising-thomas",
1295 | "metadata": {},
1296 | "source": [
1297 | "# References\n",
1298 | "\n",
1299 | "[Amazon Object Dataset](https://amazon-berkeley-objects.s3.amazonaws.com/index.html) \n",
1300 | "[Hugging Face Tutorial on Custom Dataset](https://github.com/huggingface/notebooks/blob/master/transformers_doc/custom_datasets.ipynb)"
1301 | ]
1302 | },
1303 | {
1304 | "cell_type": "code",
1305 | "execution_count": null,
1306 | "id": "animal-whole",
1307 | "metadata": {},
1308 | "outputs": [],
1309 | "source": []
1310 | }
1311 | ],
1312 | "metadata": {
1313 | "environment": {
1314 | "name": "rapids-gpu.0-18.m65",
1315 | "type": "gcloud",
1316 | "uri": "gcr.io/deeplearning-platform-release/rapids-gpu.0-18:m65"
1317 | },
1318 | "kernelspec": {
1319 | "display_name": "Python [conda env:pytorch]",
1320 | "language": "python",
1321 | "name": "conda-env-pytorch-py"
1322 | },
1323 | "language_info": {
1324 | "codemirror_mode": {
1325 | "name": "ipython",
1326 | "version": 3
1327 | },
1328 | "file_extension": ".py",
1329 | "mimetype": "text/x-python",
1330 | "name": "python",
1331 | "nbconvert_exporter": "python",
1332 | "pygments_lexer": "ipython3",
1333 | "version": "3.7.10"
1334 | }
1335 | },
1336 | "nbformat": 4,
1337 | "nbformat_minor": 5
1338 | }
1339 |
--------------------------------------------------------------------------------
/notebooks/02_timing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "cultural-excess",
6 | "metadata": {},
7 | "source": [
8 | "# Inference Review : different models"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "marked-reference",
14 | "metadata": {},
15 | "source": [
16 | "## About\n",
17 | "\n",
18 | "- Look at timing information of different bert models"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "id": "resident-carroll",
24 | "metadata": {},
25 | "source": [
26 | "## imports"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 1,
32 | "id": "appropriate-nevada",
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "import pathlib\n",
37 | "import sklearn\n",
38 | "import datasets\n",
39 | "import pandas as pd\n",
40 | "import torch\n",
41 | "\n",
42 | "import numpy as np\n",
43 | "import transformers\n",
44 | "import os\n",
45 | "import json\n",
46 | "from ts.utils.util import map_class_to_label\n",
47 | "from tqdm import tqdm, trange\n",
48 | "import time\n",
49 | "import torchviz\n",
50 | "import torch.nn as nn\n"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "id": "official-kingdom",
57 | "metadata": {},
58 | "outputs": [],
59 | "source": []
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 2,
64 | "id": "considerable-terminal",
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "query = \"men shoes\""
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "id": "optional-cross",
75 | "metadata": {},
76 | "outputs": [],
77 | "source": []
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "id": "tribal-depression",
82 | "metadata": {},
83 | "source": [
84 | "## Device Specs"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 3,
90 | "id": "silent-rouge",
91 | "metadata": {},
92 | "outputs": [
93 | {
94 | "name": "stdout",
95 | "output_type": "stream",
96 | "text": [
97 | "ATen/Parallel:\n",
98 | "\tat::get_num_threads() : 8\n",
99 | "\tat::get_num_interop_threads() : 8\n",
100 | "OpenMP 201511 (a.k.a. OpenMP 4.5)\n",
101 | "\tomp_get_max_threads() : 8\n",
102 | "Intel(R) oneAPI Math Kernel Library Version 2022.1-Product Build 20220311 for Intel(R) 64 architecture applications\n",
103 | "\tmkl_get_max_threads() : 8\n",
104 | "Intel(R) MKL-DNN v2.6.0 (Git Hash 52b5f107dd9cf10910aaa19cb47f3abf9b349815)\n",
105 | "std::thread::hardware_concurrency() : 16\n",
106 | "Environment variables:\n",
107 | "\tOMP_NUM_THREADS : [not set]\n",
108 | "\tMKL_NUM_THREADS : [not set]\n",
109 | "ATen parallel backend: OpenMP\n",
110 | "\n"
111 | ]
112 | }
113 | ],
114 | "source": [
115 | "print ( torch.__config__.parallel_info())"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "id": "ecological-essex",
122 | "metadata": {},
123 | "outputs": [],
124 | "source": []
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "id": "polyphonic-public",
130 | "metadata": {},
131 | "outputs": [],
132 | "source": []
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "id": "related-current",
137 | "metadata": {},
138 | "source": [
139 | "## Model: bert-large-uncased"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 4,
145 | "id": "solar-restoration",
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "base_model = \"bert-large-uncased\""
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 5,
155 | "id": "traditional-shore",
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "data": {
160 | "application/vnd.jupyter.widget-view+json": {
161 | "model_id": "f10152f05b4e4936a450292bb46cc535",
162 | "version_major": 2,
163 | "version_minor": 0
164 | },
165 | "text/plain": [
166 | "Downloading: 0%| | 0.00/571 [00:00, ?B/s]"
167 | ]
168 | },
169 | "metadata": {},
170 | "output_type": "display_data"
171 | },
172 | {
173 | "data": {
174 | "application/vnd.jupyter.widget-view+json": {
175 | "model_id": "7017cd8945644b709ae81e400155c171",
176 | "version_major": 2,
177 | "version_minor": 0
178 | },
179 | "text/plain": [
180 | "Downloading: 0%| | 0.00/1.25G [00:00, ?B/s]"
181 | ]
182 | },
183 | "metadata": {},
184 | "output_type": "display_data"
185 | },
186 | {
187 | "name": "stderr",
188 | "output_type": "stream",
189 | "text": [
190 | "Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']\n",
191 | "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
192 | "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
193 | "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
194 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
195 | ]
196 | },
197 | {
198 | "data": {
199 | "application/vnd.jupyter.widget-view+json": {
200 | "model_id": "684c59c684684b7c8dcccb4ae2760164",
201 | "version_major": 2,
202 | "version_minor": 0
203 | },
204 | "text/plain": [
205 | "Downloading: 0%| | 0.00/28.0 [00:00, ?B/s]"
206 | ]
207 | },
208 | "metadata": {},
209 | "output_type": "display_data"
210 | },
211 | {
212 | "data": {
213 | "application/vnd.jupyter.widget-view+json": {
214 | "model_id": "95eb4afa08b743f5b07f6299394bfdf5",
215 | "version_major": 2,
216 | "version_minor": 0
217 | },
218 | "text/plain": [
219 | "Downloading: 0%| | 0.00/226k [00:00, ?B/s]"
220 | ]
221 | },
222 | "metadata": {},
223 | "output_type": "display_data"
224 | },
225 | {
226 | "data": {
227 | "application/vnd.jupyter.widget-view+json": {
228 | "model_id": "c6e07a81db2848d49643eb6549d02ad8",
229 | "version_major": 2,
230 | "version_minor": 0
231 | },
232 | "text/plain": [
233 | "Downloading: 0%| | 0.00/455k [00:00, ?B/s]"
234 | ]
235 | },
236 | "metadata": {},
237 | "output_type": "display_data"
238 | }
239 | ],
240 | "source": [
241 | "model = transformers.AutoModelForSequenceClassification.from_pretrained(base_model)\n",
242 | "\n",
243 | "tokenizer = transformers.AutoTokenizer.from_pretrained(\n",
244 | " base_model\n",
245 | " )\n",
246 | "\n",
247 | "res = tokenizer.encode_plus(query, return_tensors=\"pt\", padding=\"max_length\", truncation=True)"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 6,
253 | "id": "rocky-arctic",
254 | "metadata": {},
255 | "outputs": [
256 | {
257 | "data": {
258 | "text/plain": [
259 | "{'input_ids': tensor([[ 101, 2273, 6007, 102, 0, 0, 0, 0, 0, 0, 0, 0,\n",
260 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
261 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
262 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
263 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
264 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
265 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
266 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
267 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
268 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
269 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
270 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
271 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
272 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
273 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
274 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
275 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
276 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
277 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
278 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
279 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
280 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
281 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
282 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
283 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
284 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
285 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
286 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
287 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
288 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
289 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
290 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
291 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
292 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
293 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
294 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
295 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
296 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
297 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
298 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
299 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
300 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
301 | " 0, 0, 0, 0, 0, 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
302 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
303 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
304 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
305 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
306 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
307 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
308 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
309 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
310 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
311 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
312 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
313 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
314 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
315 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
316 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
317 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
318 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
319 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
320 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
321 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
322 | " 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
323 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
324 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
325 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
326 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
327 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
328 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
329 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
330 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
331 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
332 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
333 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
334 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
335 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
336 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
337 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
338 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
339 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
340 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
341 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
342 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
343 | " 0, 0, 0, 0, 0, 0, 0, 0]])}"
344 | ]
345 | },
346 | "execution_count": 6,
347 | "metadata": {},
348 | "output_type": "execute_result"
349 | }
350 | ],
351 | "source": [
352 | "res"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": 7,
358 | "id": "corresponding-edgar",
359 | "metadata": {},
360 | "outputs": [
361 | {
362 | "name": "stdout",
363 | "output_type": "stream",
364 | "text": [
365 | "734 ms ± 25.5 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)\n"
366 | ]
367 | }
368 | ],
369 | "source": [
370 | "%%timeit -r 3 -n 5\n",
371 | "\n",
372 | "model_res = model(**res)\n",
373 | "model_res"
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "execution_count": 8,
379 | "id": "strong-physiology",
380 | "metadata": {},
381 | "outputs": [
382 | {
383 | "name": "stdout",
384 | "output_type": "stream",
385 | "text": [
386 | "The slowest run took 5.28 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
387 | "278 ms ± 228 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)\n"
388 | ]
389 | }
390 | ],
391 | "source": [
392 | "%%timeit -r 3 -n 5\n",
393 | "if torch.cuda.is_available():\n",
394 | " model_cuda = model.cuda()\n",
395 | " model_cuda(res['input_ids'].cuda(),res['attention_mask'].cuda())"
396 | ]
397 | },
398 | {
399 | "cell_type": "markdown",
400 | "id": "american-guide",
401 | "metadata": {},
402 | "source": [
403 | "## Model: bert-base-uncased"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": 9,
409 | "id": "nonprofit-plaza",
410 | "metadata": {},
411 | "outputs": [],
412 | "source": [
413 | "base_model = \"bert-base-uncased\""
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 10,
419 | "id": "charged-thirty",
420 | "metadata": {},
421 | "outputs": [
422 | {
423 | "data": {
424 | "application/vnd.jupyter.widget-view+json": {
425 | "model_id": "0b8c07dd566146fba60a06a2ffdb1232",
426 | "version_major": 2,
427 | "version_minor": 0
428 | },
429 | "text/plain": [
430 | "Downloading: 0%| | 0.00/570 [00:00, ?B/s]"
431 | ]
432 | },
433 | "metadata": {},
434 | "output_type": "display_data"
435 | },
436 | {
437 | "data": {
438 | "application/vnd.jupyter.widget-view+json": {
439 | "model_id": "1409f6c052b240369531dbceb05cd359",
440 | "version_major": 2,
441 | "version_minor": 0
442 | },
443 | "text/plain": [
444 | "Downloading: 0%| | 0.00/420M [00:00, ?B/s]"
445 | ]
446 | },
447 | "metadata": {},
448 | "output_type": "display_data"
449 | },
450 | {
451 | "name": "stderr",
452 | "output_type": "stream",
453 | "text": [
454 | "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']\n",
455 | "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
456 | "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
457 | "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
458 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
459 | ]
460 | },
461 | {
462 | "data": {
463 | "application/vnd.jupyter.widget-view+json": {
464 | "model_id": "8110170de678442f854eb4c17efaf84a",
465 | "version_major": 2,
466 | "version_minor": 0
467 | },
468 | "text/plain": [
469 | "Downloading: 0%| | 0.00/28.0 [00:00, ?B/s]"
470 | ]
471 | },
472 | "metadata": {},
473 | "output_type": "display_data"
474 | },
475 | {
476 | "data": {
477 | "application/vnd.jupyter.widget-view+json": {
478 | "model_id": "89133ce67be94708a03a45456c1f2863",
479 | "version_major": 2,
480 | "version_minor": 0
481 | },
482 | "text/plain": [
483 | "Downloading: 0%| | 0.00/226k [00:00, ?B/s]"
484 | ]
485 | },
486 | "metadata": {},
487 | "output_type": "display_data"
488 | },
489 | {
490 | "data": {
491 | "application/vnd.jupyter.widget-view+json": {
492 | "model_id": "57b9872952c44de8b98b288ab71b4ba2",
493 | "version_major": 2,
494 | "version_minor": 0
495 | },
496 | "text/plain": [
497 | "Downloading: 0%| | 0.00/455k [00:00, ?B/s]"
498 | ]
499 | },
500 | "metadata": {},
501 | "output_type": "display_data"
502 | }
503 | ],
504 | "source": [
505 | "model = transformers.AutoModelForSequenceClassification.from_pretrained(base_model)\n",
506 | "\n",
507 | "tokenizer = transformers.AutoTokenizer.from_pretrained(\n",
508 | " base_model\n",
509 | " )\n",
510 | "\n",
511 | "res = tokenizer.encode_plus(query, return_tensors=\"pt\", padding=\"max_length\", truncation=True)"
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": 11,
517 | "id": "cheap-california",
518 | "metadata": {},
519 | "outputs": [
520 | {
521 | "name": "stdout",
522 | "output_type": "stream",
523 | "text": [
524 | "217 ms ± 6.94 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)\n"
525 | ]
526 | }
527 | ],
528 | "source": [
529 | "%%timeit -r 3 -n 5\n",
530 | "\n",
531 | "model_res = model(**res)\n",
532 | "model_res"
533 | ]
534 | },
535 | {
536 | "cell_type": "code",
537 | "execution_count": 12,
538 | "id": "racial-request",
539 | "metadata": {},
540 | "outputs": [
541 | {
542 | "name": "stdout",
543 | "output_type": "stream",
544 | "text": [
545 | "43.8 ms ± 13.9 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)\n"
546 | ]
547 | }
548 | ],
549 | "source": [
550 | "%%timeit -r 3 -n 5\n",
551 | "if torch.cuda.is_available():\n",
552 | " model_cuda = model.cuda()\n",
553 | " model_cuda(res['input_ids'].cuda(),res['attention_mask'].cuda())"
554 | ]
555 | },
556 | {
557 | "cell_type": "markdown",
558 | "id": "eligible-paragraph",
559 | "metadata": {},
560 | "source": [
561 | "## Model: distilbert-base-uncased"
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": 13,
567 | "id": "altered-cotton",
568 | "metadata": {},
569 | "outputs": [],
570 | "source": [
571 | "base_model = \"distilbert-base-uncased\""
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": 14,
577 | "id": "convenient-possibility",
578 | "metadata": {},
579 | "outputs": [
580 | {
581 | "name": "stderr",
582 | "output_type": "stream",
583 | "text": [
584 | "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']\n",
585 | "- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
586 | "- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
587 | "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']\n",
588 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
589 | ]
590 | }
591 | ],
592 | "source": [
593 | "model = transformers.AutoModelForSequenceClassification.from_pretrained(base_model)\n",
594 | "\n",
595 | "tokenizer = transformers.AutoTokenizer.from_pretrained(\n",
596 | " base_model\n",
597 | " )\n",
598 | "\n",
599 | "res = tokenizer.encode_plus(query, return_tensors=\"pt\", padding=\"max_length\", truncation=True)"
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": 15,
605 | "id": "literary-particle",
606 | "metadata": {},
607 | "outputs": [
608 | {
609 | "name": "stdout",
610 | "output_type": "stream",
611 | "text": [
612 | "112 ms ± 4.13 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)\n"
613 | ]
614 | }
615 | ],
616 | "source": [
617 | "%%timeit -r 3 -n 5\n",
618 | "\n",
619 | "model_res = model(**res)\n",
620 | "model_res"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": 16,
626 | "id": "incorporated-conservative",
627 | "metadata": {},
628 | "outputs": [
629 | {
630 | "name": "stdout",
631 | "output_type": "stream",
632 | "text": [
633 | "26.6 ms ± 6.81 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)\n"
634 | ]
635 | }
636 | ],
637 | "source": [
638 | "%%timeit -r 3 -n 5\n",
639 | "if torch.cuda.is_available():\n",
640 | " model_cuda = model.cuda()\n",
641 | " model_cuda(res['input_ids'].cuda(),res['attention_mask'].cuda())"
642 | ]
643 | },
644 | {
645 | "cell_type": "code",
646 | "execution_count": null,
647 | "id": "cleared-nashville",
648 | "metadata": {},
649 | "outputs": [],
650 | "source": []
651 | },
652 | {
653 | "cell_type": "code",
654 | "execution_count": null,
655 | "id": "expected-adult",
656 | "metadata": {},
657 | "outputs": [],
658 | "source": []
659 | },
660 | {
661 | "cell_type": "code",
662 | "execution_count": null,
663 | "id": "isolated-template",
664 | "metadata": {},
665 | "outputs": [],
666 | "source": []
667 | },
668 | {
669 | "cell_type": "code",
670 | "execution_count": null,
671 | "id": "japanese-batch",
672 | "metadata": {},
673 | "outputs": [],
674 | "source": []
675 | },
676 | {
677 | "cell_type": "code",
678 | "execution_count": null,
679 | "id": "parliamentary-robin",
680 | "metadata": {},
681 | "outputs": [],
682 | "source": []
683 | },
684 | {
685 | "cell_type": "code",
686 | "execution_count": null,
687 | "id": "requested-anthropology",
688 | "metadata": {},
689 | "outputs": [],
690 | "source": []
691 | },
692 | {
693 | "cell_type": "code",
694 | "execution_count": null,
695 | "id": "failing-instrument",
696 | "metadata": {},
697 | "outputs": [],
698 | "source": []
699 | }
700 | ],
701 | "metadata": {
702 | "environment": {
703 | "kernel": "pyupgrade",
704 | "name": "pytorch-gpu.1-11.m94",
705 | "type": "gcloud",
706 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-11:m94"
707 | },
708 | "kernelspec": {
709 | "display_name": "pyupgrade",
710 | "language": "python",
711 | "name": "pyupgrade"
712 | },
713 | "language_info": {
714 | "codemirror_mode": {
715 | "name": "ipython",
716 | "version": 3
717 | },
718 | "file_extension": ".py",
719 | "mimetype": "text/x-python",
720 | "name": "python",
721 | "nbconvert_exporter": "python",
722 | "pygments_lexer": "ipython3",
723 | "version": "3.7.12"
724 | }
725 | },
726 | "nbformat": 4,
727 | "nbformat_minor": 5
728 | }
729 |
--------------------------------------------------------------------------------
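The `%%timeit` cells above compare the three checkpoints interactively. Outside a notebook, a small helper along the lines of the following sketch reproduces the same CPU comparison; the function name, loop count, and default query are illustrative and not part of the repository.

```python
# Sketch: rough CPU latency comparison mirroring the %%timeit cells above.
import time

import torch
import transformers


def time_model(base_model: str, query: str = "men shoes", runs: int = 5) -> float:
    """Return the mean seconds per forward pass for one checkpoint."""
    model = transformers.AutoModelForSequenceClassification.from_pretrained(base_model)
    tokenizer = transformers.AutoTokenizer.from_pretrained(base_model)
    enc = tokenizer.encode_plus(query, return_tensors="pt",
                                padding="max_length", truncation=True)
    model.eval()
    with torch.no_grad():
        start = time.perf_counter()
        for _ in range(runs):
            model(**enc)
    return (time.perf_counter() - start) / runs


if __name__ == "__main__":
    for name in ("bert-large-uncased", "bert-base-uncased", "distilbert-base-uncased"):
        print(f"{name}: {time_model(name) * 1000:.0f} ms per forward pass")
```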
/notebooks/04_packaging.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "disturbed-division",
6 | "metadata": {},
7 | "source": [
8 | "# Packaging Model"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "aggressive-pasta",
14 | "metadata": {},
15 | "source": [
16 | "## About\n",
17 | "\n",
18 | "- Package the given model using Torch Model Archive\n",
19 | "- Write a custom handler to support pre processing and post processing"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "id": "appointed-funeral",
25 | "metadata": {},
26 | "source": [
27 | "## Working directory"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "id": "respective-tourist",
33 | "metadata": {},
34 | "source": [
35 | "orignal model and the traced model we saved from before"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 1,
41 | "id": "04d57cd2-9d4d-486a-be0c-489b0b4cf3b8",
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "import os\n",
46 | "import sys"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "id": "2528f4c1-7963-4f54-8a21-ee7b06061897",
53 | "metadata": {},
54 | "outputs": [],
55 | "source": []
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "id": "c7c866ce-2428-4ac5-9bf6-292e5073faa4",
61 | "metadata": {},
62 | "outputs": [],
63 | "source": []
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 2,
68 | "id": "turkish-finland",
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "name": "stdout",
73 | "output_type": "stream",
74 | "text": [
75 | "distilbert-base-uncased distilbert-base-uncased__trace\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "!ls ../artifacts/model/"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "id": "specific-usage",
86 | "metadata": {},
87 | "source": [
88 | "directory contains tokenizer/ vocab / pytorch model"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 3,
94 | "id": "extreme-spirit",
95 | "metadata": {},
96 | "outputs": [
97 | {
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "config.json\t setup_config.json\t tokenizer_config.json\n",
102 | "index_to_name.json special_tokens_map.json training_args.bin\n",
103 | "pytorch_model.bin tokenizer.json\t vocab.txt\n"
104 | ]
105 | }
106 | ],
107 | "source": [
108 | "!ls ../artifacts/model/distilbert-base-uncased "
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 4,
114 | "id": "sporting-philip",
115 | "metadata": {},
116 | "outputs": [
117 | {
118 | "name": "stdout",
119 | "output_type": "stream",
120 | "text": [
121 | "index_to_name.json special_tokens_map.json traced_model.pt\n",
122 | "model_store\t tokenizer.json\t vocab.txt\n",
123 | "setup_config.json tokenizer_config.json\n"
124 | ]
125 | }
126 | ],
127 | "source": [
128 | "!ls ../artifacts/model/distilbert-base-uncased__trace"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "id": "electronic-liver",
135 | "metadata": {},
136 | "outputs": [],
137 | "source": []
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "id": "instructional-electric",
142 | "metadata": {},
143 | "source": [
144 | "## Torch Model Archiver\n",
145 | "\n",
146 | "TorchServe required the model and its dependant artifacts to be packaged in a single file. \n",
147 | "\n",
148 | "[torch-model-archiver](https://pypi.org/project/torch-model-archiver/) is a python package that can package the artifacts to a mar file"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 5,
154 | "id": "outdoor-agenda",
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "name": "stdout",
159 | "output_type": "stream",
160 | "text": [
161 | "usage: torch-model-archiver [-h] --model-name MODEL_NAME\n",
162 | " [--serialized-file SERIALIZED_FILE]\n",
163 | " [--model-file MODEL_FILE] --handler HANDLER\n",
164 | " [--extra-files EXTRA_FILES]\n",
165 | " [--runtime {python,python2,python3}]\n",
166 | " [--export-path EXPORT_PATH]\n",
167 | " [--archive-format {tgz,no-archive,default}] [-f]\n",
168 | " -v VERSION [-r REQUIREMENTS_FILE]\n",
169 | "\n",
170 | "Torch Model Archiver Tool\n",
171 | "\n",
172 | "optional arguments:\n",
173 | " -h, --help show this help message and exit\n",
174 | " --model-name MODEL_NAME\n",
175 | " Exported model name. Exported file will be named as\n",
176 | " model-name.mar and saved in current working directory if no --export-path is\n",
177 | " specified, else it will be saved under the export path\n",
178 | " --serialized-file SERIALIZED_FILE\n",
179 | " Path to .pt or .pth file containing state_dict in case of eager mode\n",
180 | " or an executable ScriptModule in case of TorchScript.\n",
181 | " --model-file MODEL_FILE\n",
182 | " Path to python file containing model architecture.\n",
183 | " This parameter is mandatory for eager mode models.\n",
184 | " The model architecture file must contain only one\n",
185 | " class definition extended from torch.nn.modules.\n",
186 | " --handler HANDLER TorchServe's default handler name\n",
187 | " or Handler path to handle custom inference logic.\n",
188 | " --extra-files EXTRA_FILES\n",
189 | " Comma separated path to extra dependency files.\n",
190 | " --runtime {python,python2,python3}\n",
191 | " The runtime specifies which language to run your inference code on.\n",
192 | " The default runtime is \"python\".\n",
193 | " --export-path EXPORT_PATH\n",
194 | " Path where the exported .mar file will be saved. This is an optional\n",
195 | " parameter. If --export-path is not specified, the file will be saved in the\n",
196 | " current working directory. \n",
197 | " --archive-format {tgz,no-archive,default}\n",
198 | " The format in which the model artifacts are archived.\n",
199 | " \"tgz\": This creates the model-archive in .tar.gz format.\n",
200 | " If platform hosting TorchServe requires model-artifacts to be in \".tar.gz\"\n",
201 | " use this option.\n",
202 | " \"no-archive\": This option creates an non-archived version of model artifacts\n",
203 | " at \"export-path/{model-name}\" location. As a result of this choice, \n",
204 | " MANIFEST file will be created at \"export-path/{model-name}\" location\n",
205 | " without archiving these model files\n",
206 | " \"default\": This creates the model-archive in .mar format.\n",
207 | " This is the default archiving format. Models archived in this format\n",
208 | " will be readily hostable on native TorchServe.\n",
209 | " -f, --force When the -f or --force flag is specified, an existing .mar file with same\n",
210 | " name as that provided in --model-name in the path specified by --export-path\n",
211 | " will overwritten\n",
212 | " -v VERSION, --version VERSION\n",
213 | " Model's version\n",
214 | " -r REQUIREMENTS_FILE, --requirements-file REQUIREMENTS_FILE\n",
215 | " Path to a requirements.txt containing model specific python dependency\n",
216 | " packages.\n"
217 | ]
218 | }
219 | ],
220 | "source": [
221 | "%%bash \n",
222 | "\n",
223 | "torch-model-archiver --help"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "id": "better-nicholas",
230 | "metadata": {},
231 | "outputs": [],
232 | "source": []
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "id": "insured-paradise",
237 | "metadata": {},
238 | "source": [
239 | "package the model artifact and actual handler code"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 6,
245 | "id": "87a46200-5d60-4f0e-af70-904f1f2089f7",
246 | "metadata": {},
247 | "outputs": [
248 | {
249 | "name": "stdout",
250 | "output_type": "stream",
251 | "text": [
252 | "/opt/conda/envs/pyupgrade/bin/torch-model-archiver\n"
253 | ]
254 | }
255 | ],
256 | "source": [
257 | "!which torch-model-archiver "
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 7,
263 | "id": "searching-testing",
264 | "metadata": {
265 | "scrolled": true
266 | },
267 | "outputs": [
268 | {
269 | "name": "stdout",
270 | "output_type": "stream",
271 | "text": [
272 | "/home/jupyter/workshop/pytorch-serving-workshop\n"
273 | ]
274 | },
275 | {
276 | "name": "stderr",
277 | "output_type": "stream",
278 | "text": [
279 | "WARNING - Overwriting artifacts/model/distilbert-base-uncased__trace/model_store/pt_classifier.mar ...\n"
280 | ]
281 | }
282 | ],
283 | "source": [
284 | "%%bash\n",
285 | "\n",
286 | "cd ..\n",
287 | "pwd\n",
288 | "\n",
289 | "ARTIFACT_BASE_DIR=\"artifacts/model/distilbert-base-uncased__trace\"\n",
290 | "\n",
291 | "MODEL_NAME=\"pt_classifier\"\n",
292 | "MODEL_VERSION=\"1.0\"\n",
293 | "MODEL_STORE=\"${ARTIFACT_BASE_DIR}/model_store\"\n",
294 | "MODEL_SERIALIZED_FILE=\"${ARTIFACT_BASE_DIR}/traced_model.pt\"\n",
295 | "\n",
296 | "TOKENIZER_FILES=\"${ARTIFACT_BASE_DIR}/tokenizer_config.json,${ARTIFACT_BASE_DIR}/special_tokens_map.json,${ARTIFACT_BASE_DIR}/vocab.txt,${ARTIFACT_BASE_DIR}/tokenizer.json\"\n",
297 | "MODEL_EXTRA_FILES=\"${ARTIFACT_BASE_DIR}/index_to_name.json,${ARTIFACT_BASE_DIR}/setup_config.json,${TOKENIZER_FILES}\"\n",
298 | "\n",
299 | "\n",
300 | "\n",
301 | "\n",
302 | "mkdir -p $MODEL_STORE\n",
303 | "\n",
304 | "torch-model-archiver --model-name ${MODEL_NAME} \\\n",
305 | "--version ${MODEL_VERSION} \\\n",
306 | "--serialized-file ${MODEL_SERIALIZED_FILE} \\\n",
307 | "--export-path ${MODEL_STORE} \\\n",
308 | "--extra-files ${MODEL_EXTRA_FILES} \\\n",
309 | "--handler ./serving/handler.py \\\n",
310 | "--force\n",
311 | "\n"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 8,
317 | "id": "familiar-hormone",
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "# %load ../serving/handler.py\n",
322 | "import json\n",
323 | "import logging\n",
324 | "import os\n",
325 | "import time\n",
326 | "from abc import ABC\n",
327 | "from collections.abc import Iterable\n",
328 | "import transformers\n",
329 | "import ast\n",
330 | "import torch\n",
331 | "\n",
332 | "import numpy as np\n",
333 | "from ts.metrics.dimension import Dimension\n",
334 | "\n",
335 | "logger = logging.getLogger(__name__)\n",
336 | "\n",
337 | "from ts.torch_handler.base_handler import BaseHandler\n",
338 | "\n",
339 | "from ts.utils.util import map_class_to_label\n",
340 | "\n",
341 | "import time\n",
342 | "\n",
343 | "\n",
344 | "logger = logging.getLogger(__name__)\n",
345 | "logger.info(\"Transformers version %s\",transformers.__version__)\n",
346 | "\n",
347 | "class CustomHandler(BaseHandler, ABC):\n",
348 | " \"\"\"\n",
349 | " Transformers handler class for sequence classification.\n",
350 | " \"\"\"\n",
351 | "\n",
352 | " def __init__(self):\n",
353 | " super(CustomHandler, self).__init__()\n",
354 | " self.initialized = False\n",
355 | "\n",
356 | " def initialize(self, ctx):\n",
357 | "\n",
358 | " \n",
359 | " self.manifest = ctx.manifest\n",
360 | " properties = ctx.system_properties\n",
361 | " model_dir = properties.get(\"model_dir\")\n",
362 | " serialized_file = self.manifest[\"model\"][\"serializedFile\"]\n",
363 | " model_pt_path = os.path.join(model_dir, serialized_file)\n",
364 | "\n",
365 | " self.device = torch.device(\n",
366 | " \"cuda:\" + str(properties.get(\"gpu_id\"))\n",
367 | " if torch.cuda.is_available() and properties.get(\"gpu_id\") is not None\n",
368 | " else \"cpu\"\n",
369 | " )\n",
370 | " \n",
371 | " # read configs for the mode, model_name, etc. from setup_config.json\n",
372 | " setup_config_path = os.path.join(model_dir, \"setup_config.json\")\n",
373 | " if os.path.isfile(setup_config_path):\n",
374 | " with open(setup_config_path) as setup_config_file:\n",
375 | " self.setup_config = json.load(setup_config_file)\n",
376 | " else:\n",
377 | " logger.warning(\"Missing the setup_config.json file.\")\n",
378 | "\n",
379 | "\n",
380 | " # Loading the model and tokenizer from checkpoint and config files based on the user's choice of mode\n",
381 | " # further setup config can be added.\n",
382 | " if self.setup_config[\"save_mode\"] == \"jit\":\n",
383 | " self.model = torch.jit.load(model_pt_path, map_location=self.device)\n",
384 | " elif self.setup_config[\"save_mode\"] == \"original\":\n",
385 | " self.model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir)\n",
386 | "\n",
387 | " self.model.to(self.device)\n",
388 | " \n",
389 | " else:\n",
390 | " logger.warning(\"Missing the checkpoint or state_dict.\")\n",
391 | "\n",
392 | " \n",
393 | " \n",
394 | " self.top_k = self.setup_config[\"top_k\"]\n",
395 | " self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir \n",
396 | " , do_lower_case=self.setup_config[\"do_lower_case\"]\n",
397 | " , torchscript=True)\n",
398 | "\n",
399 | " \n",
400 | " self.model.eval()\n",
401 | "\n",
402 | " logger.info(\n",
403 | " \"Transformer model from path %s loaded successfully\", model_dir\n",
404 | " )\n",
405 | "\n",
406 | " # Read the mapping file, index to object name\n",
407 | " mapping_file_path = os.path.join(model_dir, \"index_to_name.json\")\n",
408 | " \n",
409 | " if os.path.isfile(mapping_file_path):\n",
410 | " with open(mapping_file_path) as f:\n",
411 | " self.mapping = json.load(f)\n",
412 | " else:\n",
413 | " logger.warning(\"Missing the index_to_name.json file.\")\n",
414 | " \n",
415 | " self.initialized = True\n",
416 | "\n",
417 | " def preprocess(self, requests):\n",
418 | " \"\"\"Basic text preprocessing, based on the user's chocie of application mode.\n",
419 | " Args:\n",
420 | " requests (str): The Input data in the form of text is passed on to the preprocess\n",
421 | " function.\n",
422 | " Returns:\n",
423 | " list : The preprocess function returns a list of Tensor for the size of the word tokens.\n",
424 | " \"\"\"\n",
425 | " input_ids_batch = None\n",
426 | " attention_mask_batch = None\n",
427 | " for idx, data in enumerate(requests):\n",
428 | " request = data.get(\"data\")\n",
429 | " if request is None:\n",
430 | " request = data.get(\"body\")\n",
431 | " if isinstance(request, (bytes, bytearray)):\n",
432 | " request = request.decode('utf-8')\n",
433 | "\n",
434 | " input_text = request['text']\n",
435 | " max_length = self.setup_config[\"max_length\"]\n",
436 | " logger.info(\"Received text: '%s'\", input_text)\n",
437 | "\n",
438 | " # preprocessing text for sequence_classification and token_classification.\n",
439 | " inputs = self.tokenizer.encode_plus(input_text, max_length=int(max_length), pad_to_max_length=True, add_special_tokens=True, return_tensors='pt')\n",
440 | " \n",
441 | " \n",
442 | " input_ids = inputs[\"input_ids\"].to(self.device)\n",
443 | " attention_mask = inputs[\"attention_mask\"].to(self.device)\n",
444 | " # making a batch out of the recieved requests\n",
445 | " # attention masks are passed for cases where input tokens are padded.\n",
446 | " if input_ids.shape is not None:\n",
447 | " if input_ids_batch is None:\n",
448 | " input_ids_batch = input_ids\n",
449 | " attention_mask_batch = attention_mask\n",
450 | " else:\n",
451 | " input_ids_batch = torch.cat((input_ids_batch, input_ids), 0)\n",
452 | " attention_mask_batch = torch.cat((attention_mask_batch, attention_mask), 0)\n",
453 | " \n",
454 | " input_ids_batch = input_ids_batch.to(self.device)\n",
455 | " attention_mask_batch = attention_mask_batch.to(self.device)\n",
456 | " \n",
457 | " return (input_ids_batch, attention_mask_batch)\n",
458 | "\n",
459 | " def inference(self, input_batch):\n",
460 | "\n",
461 | " \n",
462 | " input_ids_batch, attention_mask_batch = input_batch\n",
463 | " inferences = []\n",
464 | " \n",
465 | " predictions = self.model(input_ids_batch, attention_mask_batch)\n",
466 | " \n",
467 | "# ps = torch.nn.functional.softmax(predictions.logits, dim=1)\n",
468 | "# probs, classes = torch.topk(ps, self.top_k, dim=1)\n",
469 | "# probs = probs.tolist()\n",
470 | "# classes = classes.tolist()\n",
471 | "\n",
472 | "# inferences = map_class_to_label(probs, self.mapping, classes)\n",
473 | " \n",
474 | " num_rows, num_cols = predictions[0].shape\n",
475 | " for i in range(num_rows):\n",
476 | " ps = torch.nn.functional.softmax(predictions[i], dim=1)\n",
477 | " probs, classes = torch.topk(ps, self.top_k, dim=1)\n",
478 | " probs = probs.tolist()\n",
479 | " classes = classes.tolist()\n",
480 | " \n",
481 | " friendly_labels = map_class_to_label(probs, self.mapping, classes)\n",
482 | " inferences.append(friendly_labels)\n",
483 | "\n",
484 | "\n",
485 | " return inferences\n",
486 | "\n",
487 | " def postprocess(self, inference_output):\n",
488 | "\n",
489 | " return inference_output\n",
490 | " \n",
491 | " \n",
492 | " def handle(self, data, context):\n",
493 | "\n",
494 | " # It can be used for pre or post processing if needed as additional request\n",
495 | " # information is available in context\n",
496 | " \n",
497 | " start_time = time.time()\n",
498 | " \n",
499 | " self.context = context\n",
500 | " metrics = self.context.metrics\n",
501 | " \n",
502 | " data_preprocess = self.preprocess(data)\n",
503 | " data_inference = self.inference(data_preprocess)\n",
504 | " data_postprocess = self.postprocess(data_inference)\n",
505 | " \n",
506 | " \n",
507 | " \n",
508 | " stop_time = time.time()\n",
509 | " metrics.add_time('HandlerTime', round((stop_time - start_time) * 1000, 2), None, 'ms')\n",
510 | " \n",
511 | " return data_postprocess\n"
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": null,
517 | "id": "foster-concept",
518 | "metadata": {},
519 | "outputs": [],
520 | "source": []
521 | },
522 | {
523 | "cell_type": "markdown",
524 | "id": "confused-discretion",
525 | "metadata": {},
526 | "source": [
527 | "if you would live to serve through Docker, lets copy the `model_store` artifact relative to the DockerFile folder"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": 9,
533 | "id": "intensive-contest",
534 | "metadata": {},
535 | "outputs": [],
536 | "source": [
537 | "%%bash\n",
538 | "cd .. \n",
539 | "\n",
540 | "rm -rf serving/model_store\n",
541 | "mkdir -p serving/model_store\n",
542 | "\n",
543 | "cp artifacts/model/distilbert-base-uncased__trace/model_store/* serving/model_store\n",
544 | "cp artifacts/model/distilbert-base-uncased__trace/setup_config.json serving/model_store/"
545 | ]
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": null,
550 | "id": "interesting-hollow",
551 | "metadata": {},
552 | "outputs": [],
553 | "source": []
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": null,
558 | "id": "ideal-specialist",
559 | "metadata": {},
560 | "outputs": [],
561 | "source": []
562 | },
563 | {
564 | "cell_type": "code",
565 | "execution_count": null,
566 | "id": "creative-ballot",
567 | "metadata": {},
568 | "outputs": [],
569 | "source": []
570 | },
571 | {
572 | "cell_type": "markdown",
573 | "id": "funky-summer",
574 | "metadata": {},
575 | "source": [
576 | "## Torchserve\n",
577 | "\n",
578 | "> TorchServe is a performant, flexible and easy to use tool for serving PyTorch eager mode and torschripted models.\n",
579 | "\n",
580 | "Ref: [TorchServe Docs](https://pytorch.org/serve/)"
581 | ]
582 | },
583 | {
584 | "cell_type": "markdown",
585 | "id": "indie-tokyo",
586 | "metadata": {},
587 | "source": [
588 | "below command starts torchserve"
589 | ]
590 | },
591 | {
592 | "cell_type": "code",
593 | "execution_count": 10,
594 | "id": "pretty-graphic",
595 | "metadata": {},
596 | "outputs": [],
597 | "source": [
598 | "%%bash --bg\n",
599 | "cd ..\n",
600 | "torchserve --ts-config ./serving/config.properties \\\n",
601 | "--start --model-store ./serving/model_store --ncs\n",
602 | "\n"
603 | ]
604 | },
605 | {
606 | "cell_type": "code",
607 | "execution_count": null,
608 | "id": "e9dce210",
609 | "metadata": {},
610 | "outputs": [],
611 | "source": [
612 | "%%bash\n",
613 | "\n",
614 | "echo \"waiting for some time for torchserve to start\"\n",
615 | "sleep 30"
616 | ]
617 | },
618 | {
619 | "cell_type": "code",
620 | "execution_count": 11,
621 | "id": "missing-champagne",
622 | "metadata": {},
623 | "outputs": [
624 | {
625 | "name": "stdout",
626 | "output_type": "stream",
627 | "text": [
628 | "access_log.log\tmodel_log.log model_metrics.log ts_log.log ts_metrics.log\n"
629 | ]
630 | }
631 | ],
632 | "source": [
633 | "!ls ../logs/"
634 | ]
635 | },
636 | {
637 | "cell_type": "code",
638 | "execution_count": 12,
639 | "id": "endangered-responsibility",
640 | "metadata": {},
641 | "outputs": [
642 | {
643 | "name": "stdout",
644 | "output_type": "stream",
645 | "text": [
646 | "2022-07-10T00:01:37,265 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Listening on port: /tmp/.ts.sock.9000\n",
647 | "2022-07-10T00:01:37,266 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - [PID]8127\n",
648 | "2022-07-10T00:01:37,266 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Torch worker started.\n",
649 | "2022-07-10T00:01:37,266 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Python runtime: 3.7.12\n",
650 | "2022-07-10T00:01:37,292 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Connection accepted: /tmp/.ts.sock.9000.\n",
651 | "2022-07-10T00:01:37,333 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - model_name: pt_classifier, batchSize: 1\n",
652 | "2022-07-10T00:01:37,530 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Transformers version 4.20.1\n",
653 | "2022-07-10T00:01:40,262 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Transformer model from path /tmp/models/dbe664e9d9464a0b83b3d662a607513a loaded successfully\n"
654 | ]
655 | }
656 | ],
657 | "source": [
658 | "!tail ../logs/model_log.log"
659 | ]
660 | },
661 | {
662 | "cell_type": "code",
663 | "execution_count": 13,
664 | "id": "forbidden-marriage",
665 | "metadata": {},
666 | "outputs": [
667 | {
668 | "name": "stdout",
669 | "output_type": "stream",
670 | "text": [
671 | "load_models=all\n",
672 | "inference_address=http://0.0.0.0:9080\n",
673 | "management_address=http://0.0.0.0:9081\n",
674 | "metrics_address=http://0.0.0.0:9082\n",
675 | "model_store=model_store\n",
676 | "async_logging=true"
677 | ]
678 | }
679 | ],
680 | "source": [
681 | "!cat ../serving/config.properties "
682 | ]
683 | },
684 | {
685 | "cell_type": "markdown",
686 | "id": "hazardous-slovak",
687 | "metadata": {},
688 | "source": [
689 | "below command stops torchserve"
690 | ]
691 | },
692 | {
693 | "cell_type": "code",
694 | "execution_count": 14,
695 | "id": "sixth-gardening",
696 | "metadata": {},
697 | "outputs": [],
698 | "source": [
699 | "#torchserve --stop"
700 | ]
701 | },
702 | {
703 | "cell_type": "code",
704 | "execution_count": null,
705 | "id": "floating-currency",
706 | "metadata": {},
707 | "outputs": [],
708 | "source": []
709 | },
710 | {
711 | "cell_type": "markdown",
712 | "id": "revolutionary-premises",
713 | "metadata": {},
714 | "source": [
715 | "List all the models loaded"
716 | ]
717 | },
718 | {
719 | "cell_type": "code",
720 | "execution_count": 15,
721 | "id": "false-council",
722 | "metadata": {},
723 | "outputs": [
724 | {
725 | "name": "stdout",
726 | "output_type": "stream",
727 | "text": [
728 | "curl: (7) Failed to connect to localhost port 9081: Connection refused\n"
729 | ]
730 | }
731 | ],
732 | "source": [
733 | "!curl \"http://localhost:9081/models\""
734 | ]
735 | },
736 | {
737 | "cell_type": "code",
738 | "execution_count": null,
739 | "id": "proprietary-disposition",
740 | "metadata": {},
741 | "outputs": [],
742 | "source": []
743 | },
744 | {
745 | "cell_type": "markdown",
746 | "id": "automated-division",
747 | "metadata": {},
748 | "source": [
749 | "get details on the model `pt_classifier`"
750 | ]
751 | },
752 | {
753 | "cell_type": "code",
754 | "execution_count": 16,
755 | "id": "fancy-rings",
756 | "metadata": {},
757 | "outputs": [
758 | {
759 | "name": "stdout",
760 | "output_type": "stream",
761 | "text": [
762 | "curl: (7) Failed to connect to localhost port 9081: Connection refused\n"
763 | ]
764 | }
765 | ],
766 | "source": [
767 | "!curl http://localhost:9081/models/pt_classifier"
768 | ]
769 | },
770 | {
771 | "cell_type": "code",
772 | "execution_count": null,
773 | "id": "fancy-judges",
774 | "metadata": {},
775 | "outputs": [],
776 | "source": []
777 | },
778 | {
779 | "cell_type": "markdown",
780 | "id": "premier-scheme",
781 | "metadata": {},
782 | "source": [
783 | "sample prediction"
784 | ]
785 | },
786 | {
787 | "cell_type": "code",
788 | "execution_count": 17,
789 | "id": "norman-trader",
790 | "metadata": {},
791 | "outputs": [
792 | {
793 | "name": "stdout",
794 | "output_type": "stream",
795 | "text": [
796 | "\n",
797 | "elasped time (sec):0.000585\n",
798 | "curl: (7) Failed to connect to localhost port 9080: Connection refused\n"
799 | ]
800 | }
801 | ],
802 | "source": [
803 | "! curl -X POST http://localhost:9080/predictions/pt_classifier \\\n",
804 | " -H 'Content-Type: application/json' \\\n",
805 | " -d '{\"text\":\"herbal tea\",\"request_id\":\"test_id\"}' \\\n",
806 | " -w \"\\nelasped time (sec):%{time_total}\\n\""
807 | ]
808 | },
809 | {
810 | "cell_type": "markdown",
811 | "id": "direct-sherman",
812 | "metadata": {},
813 | "source": [
814 | "sample prediction from a file"
815 | ]
816 | },
817 | {
818 | "cell_type": "code",
819 | "execution_count": 19,
820 | "id": "solid-internship",
821 | "metadata": {},
822 | "outputs": [
823 | {
824 | "name": "stdout",
825 | "output_type": "stream",
826 | "text": [
827 | "[\n",
828 | " {\n",
829 | " \"GROCERY\": 0.9995384216308594,\n",
830 | " \"HEALTH_PERSONAL_CARE\": 0.0001973821345018223,\n",
831 | " \"PET_SUPPLIES\": 8.77468119142577e-05,\n",
832 | " \"KITCHEN\": 5.9781144955195487e-05,\n",
833 | " \"HOME\": 1.9271317796665244e-05\n",
834 | " }\n",
835 | "]\n",
836 | "elasped time (sec):0.447966\n"
837 | ]
838 | },
839 | {
840 | "name": "stderr",
841 | "output_type": "stream",
842 | "text": [
843 | " % Total % Received % Xferd Average Speed Time Time Time Current\n",
844 | " Dload Upload Total Spent Left Speed\n",
845 | "100 264 100 213 100 51 476 114 --:--:-- --:--:-- --:--:-- 590\n"
846 | ]
847 | }
848 | ],
849 | "source": [
850 | "%%bash\n",
851 | "cd ..\n",
852 | "curl -X POST http://localhost:9080/predictions/pt_classifier \\\n",
853 | " -H 'Content-Type: application/json' \\\n",
854 | " -d @serving/sample_input.json \\\n",
855 | " -w \"\\nelasped time (sec):%{time_total}\\n\""
856 | ]
857 | },
858 | {
859 | "cell_type": "code",
860 | "execution_count": 20,
861 | "id": "african-donna",
862 | "metadata": {},
863 | "outputs": [],
864 | "source": [
865 | "import requests"
866 | ]
867 | },
868 | {
869 | "cell_type": "code",
870 | "execution_count": 21,
871 | "id": "aa92e2d6",
872 | "metadata": {},
873 | "outputs": [
874 | {
875 | "data": {
876 | "text/plain": [
877 | "[{'GROCERY': 0.9995384216308594,\n",
878 | " 'HEALTH_PERSONAL_CARE': 0.0001973821345018223,\n",
879 | " 'PET_SUPPLIES': 8.77468119142577e-05,\n",
880 | " 'KITCHEN': 5.9781144955195487e-05,\n",
881 | " 'HOME': 1.9271317796665244e-05}]"
882 | ]
883 | },
884 | "execution_count": 21,
885 | "metadata": {},
886 | "output_type": "execute_result"
887 | }
888 | ],
889 | "source": [
890 | "payload = {\"text\":\"herbal tea\",\"request_id\":\"test_id\"}\n",
891 | "\n",
892 | "endpoint = \"http://localhost:9080/predictions/pt_classifier\"\n",
893 | "\n",
894 | "res = requests.post(endpoint, json = payload)\n",
895 | "\n",
896 | "res.json()"
897 | ]
898 | },
899 | {
900 | "cell_type": "code",
901 | "execution_count": null,
902 | "id": "c1321402-ca4c-484c-9ec1-5d680c71840b",
903 | "metadata": {},
904 | "outputs": [],
905 | "source": []
906 | }
907 | ],
908 | "metadata": {
909 | "environment": {
910 | "kernel": "pyupgrade",
911 | "name": "pytorch-gpu.1-11.m94",
912 | "type": "gcloud",
913 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-11:m94"
914 | },
915 | "kernelspec": {
916 | "display_name": "pyupgrade",
917 | "language": "python",
918 | "name": "pyupgrade"
919 | },
920 | "language_info": {
921 | "codemirror_mode": {
922 | "name": "ipython",
923 | "version": 3
924 | },
925 | "file_extension": ".py",
926 | "mimetype": "text/x-python",
927 | "name": "python",
928 | "nbconvert_exporter": "python",
929 | "pygments_lexer": "ipython3",
930 | "version": "3.7.12"
931 | }
932 | },
933 | "nbformat": 4,
934 | "nbformat_minor": 5
935 | }
936 |
--------------------------------------------------------------------------------
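The packaging notebook above sleeps for a fixed 30 seconds and can still hit "Connection refused" when TorchServe is slow to start. A readiness poll like the sketch below avoids that; it assumes the inference address from `serving/config.properties` (port 9080) and TorchServe's standard `/ping` health endpoint.

```python
# Sketch: wait for TorchServe to report healthy before calling its APIs.
import time

import requests


def wait_for_torchserve(ping_url: str = "http://localhost:9080/ping",
                        timeout_s: int = 60) -> bool:
    """Poll the health endpoint until it answers or the timeout expires."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            if requests.get(ping_url, timeout=2).status_code == 200:
                return True
        except requests.exceptions.RequestException:
            pass  # server not accepting connections yet
        time.sleep(2)
    return False


if wait_for_torchserve():
    # management API: list registered models (same call as the curl cell above)
    print(requests.get("http://localhost:9081/models").json())
else:
    print("TorchServe did not become ready in time")
```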
/notebooks/utils.py:
--------------------------------------------------------------------------------
1 | import datasets
2 | import tqdm
3 | import torch
4 | import numpy as np
5 | from ts.utils.util import map_class_to_label
6 |
7 |
8 | def prediction_batch(model, dataset, device:str, batch_size = 32):
9 | metric_accuracy = datasets.load_metric('accuracy')
10 |
11 | l = len(dataset)
12 | all_y_preds = []
13 | # make sure model is in eval mode ; not computing gradients
14 | model.eval()
15 |
16 | # feed model to cpu/gpu device
17 | model = model.to(device)
18 |
19 | # iterate our dataset in batches
20 | for ndx in tqdm.trange(0, l, batch_size):
21 |
22 |         # take precomputed input ids and attention masks
23 | input_ids = dataset['input_ids'][ndx:ndx+batch_size].to(device)
24 | attention_mask = dataset['attention_mask'][ndx:ndx+batch_size].to(device)
25 |
26 | with torch.no_grad():
27 | res = model( input_ids = input_ids, attention_mask = attention_mask )
28 |
29 | # output of torchscript model doesn't have logits property
30 | #logits = res.logits.detach().cpu().numpy()
31 |
32 | logits = res[0].detach().cpu().numpy()
33 |
34 | y_preds = np.argmax(logits, axis=1)
35 |
36 | all_y_preds.extend(y_preds)
37 |
38 | # accuracy on whole dataset
39 | accuracy = metric_accuracy.compute(predictions = all_y_preds, references = dataset['label'])
40 |
41 | return accuracy
42 |
43 | def prediction(model, tokens_tensor, masks_tensors , id2label_str, topk =5):
44 | model.eval()
45 |
46 | tokens_tensor = tokens_tensor.to('cpu')
47 | masks_tensors = masks_tensors.to('cpu')
48 |
49 | res = model(tokens_tensor, masks_tensors)
50 |
51 | ps = torch.nn.functional.softmax(res[0], dim=1)
52 | probs, classes = torch.topk(ps, topk, dim=1)
53 | probs = probs.tolist()
54 | classes = classes.tolist()
55 |
56 | labels = map_class_to_label(probs, id2label_str, classes)
57 |
58 | return labels
59 |
--------------------------------------------------------------------------------
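A hedged usage sketch for the helpers above; the checkpoint path and query are placeholders taken from the notebooks, and the label map is rebuilt from the model config rather than shipped with this file.

```python
# Sketch: calling prediction() from notebooks/utils.py on a saved checkpoint.
import transformers

from utils import prediction

model_dir = "../artifacts/model/distilbert-base-uncased"  # placeholder path
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir)

enc = tokenizer.encode_plus("herbal tea", return_tensors="pt",
                            padding="max_length", truncation=True)

# map_class_to_label expects string class indices as keys
id2label = {str(i): label for i, label in model.config.id2label.items()}
topk = min(5, model.config.num_labels)

print(prediction(model, enc["input_ids"], enc["attention_mask"], id2label, topk=topk))
```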
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.12.*
2 | torch-model-archiver==0.6.0
3 | torchmetrics==0.9.*
4 | torchserve==0.6.*
5 | torchvision==0.13.*
6 | datasets==2.3.*
7 | transformers==4.20.*
8 | torchviz==0.0.2
9 | scikit-learn==1.0.*
10 | plotly==5.9.*
11 | wandb==0.12.*
12 | papermill==2.3.*
--------------------------------------------------------------------------------
/serving/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM pytorch/torchserve:0.4.0-cpu
2 |
3 | COPY requirements.txt .
4 | RUN pip install -r requirements.txt
5 |
6 | WORKDIR /home/model-server/
7 | #COPY * /home/model-server/
8 | COPY model_store model_store
9 | COPY * /home/model-server/
10 |
11 |
12 | USER model-server
13 |
14 |
15 | RUN pwd
16 | RUN ls
17 | CMD ["torchserve", \
18 | "--start", \
19 | "--ts-config=/home/model-server/config.properties"]
--------------------------------------------------------------------------------
/serving/config.properties:
--------------------------------------------------------------------------------
1 | load_models=all
2 | inference_address=http://0.0.0.0:9080
3 | management_address=http://0.0.0.0:9081
4 | metrics_address=http://0.0.0.0:9082
5 | model_store=model_store
6 | async_logging=true
--------------------------------------------------------------------------------
/serving/handler.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 | import time
5 | from abc import ABC
6 | from collections.abc import Iterable
7 | import transformers
8 | import ast
9 | import torch
10 |
11 | import numpy as np
12 | from ts.metrics.dimension import Dimension
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 | from ts.torch_handler.base_handler import BaseHandler
17 |
18 | from ts.utils.util import map_class_to_label
19 |
20 | import time
21 |
22 |
23 | logger = logging.getLogger(__name__)
24 | logger.info("Transformers version %s",transformers.__version__)
25 |
26 | class CustomHandler(BaseHandler, ABC):
27 | """
28 | Transformers handler class for sequence classification.
29 | """
30 |
31 | def __init__(self):
32 | super(CustomHandler, self).__init__()
33 | self.initialized = False
34 |
35 | def initialize(self, ctx):
36 |
37 |
38 | self.manifest = ctx.manifest
39 | properties = ctx.system_properties
40 | model_dir = properties.get("model_dir")
41 | serialized_file = self.manifest["model"]["serializedFile"]
42 | model_pt_path = os.path.join(model_dir, serialized_file)
43 |
44 | self.device = torch.device(
45 | "cuda:" + str(properties.get("gpu_id"))
46 | if torch.cuda.is_available() and properties.get("gpu_id") is not None
47 | else "cpu"
48 | )
49 |
50 | # read configs for the mode, model_name, etc. from setup_config.json
51 | setup_config_path = os.path.join(model_dir, "setup_config.json")
52 | if os.path.isfile(setup_config_path):
53 | with open(setup_config_path) as setup_config_file:
54 | self.setup_config = json.load(setup_config_file)
55 | else:
56 | logger.warning("Missing the setup_config.json file.")
57 |
58 |
59 | # Loading the model and tokenizer from checkpoint and config files based on the user's choice of mode
60 | # further setup config can be added.
61 | if self.setup_config["save_mode"] == "jit":
62 | self.model = torch.jit.load(model_pt_path, map_location=self.device)
63 | elif self.setup_config["save_mode"] == "original":
64 | self.model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir)
65 |
66 | self.model.to(self.device)
67 |
68 | else:
69 | logger.warning("Missing the checkpoint or state_dict.")
70 |
71 |
72 |
73 | self.top_k = self.setup_config["top_k"]
74 | self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir
75 | , do_lower_case=self.setup_config["do_lower_case"]
76 | , torchscript=True)
77 |
78 |
79 | self.model.eval()
80 |
81 | logger.info(
82 | "Transformer model from path %s loaded successfully", model_dir
83 | )
84 |
85 | # Read the mapping file, index to object name
86 | mapping_file_path = os.path.join(model_dir, "index_to_name.json")
87 |
88 | if os.path.isfile(mapping_file_path):
89 | with open(mapping_file_path) as f:
90 | self.mapping = json.load(f)
91 | else:
92 | logger.warning("Missing the index_to_name.json file.")
93 |
94 | self.initialized = True
95 |
96 | def preprocess(self, requests):
97 |         """Basic text preprocessing, based on the user's choice of application mode.
98 | Args:
99 | requests (str): The Input data in the form of text is passed on to the preprocess
100 | function.
101 | Returns:
102 | list : The preprocess function returns a list of Tensor for the size of the word tokens.
103 | """
104 | input_ids_batch = None
105 | attention_mask_batch = None
106 | for idx, data in enumerate(requests):
107 | request = data.get("data")
108 | if request is None:
109 | request = data.get("body")
110 | if isinstance(request, (bytes, bytearray)):
111 | request = request.decode('utf-8')
112 |
113 | input_text = request['text']
114 | max_length = self.setup_config["max_length"]
115 | logger.info("Received text: '%s'", input_text)
116 |
117 | # preprocessing text for sequence_classification and token_classification.
118 | inputs = self.tokenizer.encode_plus(input_text, max_length=int(max_length), pad_to_max_length=True, add_special_tokens=True, return_tensors='pt')
119 |
120 |
121 | input_ids = inputs["input_ids"].to(self.device)
122 | attention_mask = inputs["attention_mask"].to(self.device)
123 |             # making a batch out of the received requests
124 | # attention masks are passed for cases where input tokens are padded.
125 | if input_ids.shape is not None:
126 | if input_ids_batch is None:
127 | input_ids_batch = input_ids
128 | attention_mask_batch = attention_mask
129 | else:
130 | input_ids_batch = torch.cat((input_ids_batch, input_ids), 0)
131 | attention_mask_batch = torch.cat((attention_mask_batch, attention_mask), 0)
132 |
133 | input_ids_batch = input_ids_batch.to(self.device)
134 | attention_mask_batch = attention_mask_batch.to(self.device)
135 |
136 | return (input_ids_batch, attention_mask_batch)
137 |
138 | def inference(self, input_batch):
139 |         """Run the model on the batched inputs and return the top_k label/probability
140 |         mappings, one entry per request in the batch."""
141 | input_ids_batch, attention_mask_batch = input_batch
142 | inferences = []
143 |
144 | predictions = self.model(input_ids_batch, attention_mask_batch)
145 |
146 | # ps = torch.nn.functional.softmax(predictions.logits, dim=1)
147 | # probs, classes = torch.topk(ps, self.top_k, dim=1)
148 | # probs = probs.tolist()
149 | # classes = classes.tolist()
150 |
151 | # inferences = map_class_to_label(probs, self.mapping, classes)
152 |
153 | num_rows, num_cols = predictions[0].shape
154 | for i in range(num_rows):
155 |             ps = torch.nn.functional.softmax(predictions[0][i].unsqueeze(0), dim=1)  # logits for row i
156 | probs, classes = torch.topk(ps, self.top_k, dim=1)
157 | probs = probs.tolist()
158 | classes = classes.tolist()
159 |
160 | friendly_labels = map_class_to_label(probs, self.mapping, classes)
161 | inferences.append(friendly_labels)
162 |
163 |
164 | return inferences
165 |
166 | def postprocess(self, inference_output):
167 |
168 | return inference_output
169 |
170 |
171 | def handle(self, data, context):
172 |
173 | # It can be used for pre or post processing if needed as additional request
174 | # information is available in context
175 |
176 | start_time = time.time()
177 |
178 | self.context = context
179 | metrics = self.context.metrics
180 |
181 | data_preprocess = self.preprocess(data)
182 | data_inference = self.inference(data_preprocess)
183 | data_postprocess = self.postprocess(data_inference)
184 |
185 |
186 |
187 | stop_time = time.time()
188 | metrics.add_time('HandlerTime', round((stop_time - start_time) * 1000, 2), None, 'ms')
189 |
190 | return data_postprocess
191 |
--------------------------------------------------------------------------------
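The handler above resolves the serialized model through the archive manifest and expects `setup_config.json` (and optionally `index_to_name.json` plus the tokenizer files) to sit in the same model directory. The sketch below shows one way such an archive could be built with `torch-model-archiver`; the model name, file paths, and config values are illustrative assumptions, not the workshop's exact artifacts.

```bash
# Hedged sketch: names, paths, and values below are assumptions for illustration.
mkdir -p serving/model_store

# Keys read by handler.py: save_mode, top_k, do_lower_case, max_length.
cat > serving/setup_config.json <<'EOF'
{
  "save_mode": "jit",
  "top_k": 5,
  "do_lower_case": true,
  "max_length": 64
}
EOF

# Package the serialized model together with the extra files initialize() loads.
# For save_mode "jit", the tokenizer files (e.g. vocab.txt, tokenizer_config.json)
# also need to be listed in --extra-files so AutoTokenizer.from_pretrained(model_dir)
# can find them.
torch-model-archiver \
  --model-name pytorch_workshop_model \
  --version 1.0 \
  --serialized-file artifacts/model/traced_model.pt \
  --handler serving/handler.py \
  --extra-files "serving/setup_config.json,serving/index_to_name.json" \
  --export-path serving/model_store \
  --force
```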
/serving/requirements.txt:
--------------------------------------------------------------------------------
1 | torch-model-archiver==0.4.2
2 | torchserve==0.4.2
3 | torch==1.9.0
4 | torchmetrics==0.4.1
5 | captum==0.4.0
--------------------------------------------------------------------------------
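With the pins above installed, the archived model can be served locally with the `torchserve` CLI. A hedged sketch, assuming the model store and archive name from the packaging example above:

```bash
# Hedged sketch: assumes pytorch_workshop_model.mar exists in serving/model_store.
pip install -r serving/requirements.txt

torchserve --start \
  --model-store serving/model_store \
  --models pytorch_workshop_model=pytorch_workshop_model.mar \
  --ts-config serving/config.properties \
  --ncs

# Stop the server when finished:
# torchserve --stop
```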
/serving/sample_input.json:
--------------------------------------------------------------------------------
1 | {"text" :"Herbal Tea" , "request_id":"test_client"}
2 |
--------------------------------------------------------------------------------
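Once TorchServe is running, the sample payload above can be posted to the inference API (port 8080 by default). The model name in the URL is carried over from the packaging sketch and is an assumption:

```bash
# Hedged sketch: replace pytorch_workshop_model with the name used when archiving.
curl -X POST "http://localhost:8080/predictions/pytorch_workshop_model" \
  -H "Content-Type: application/json" \
  -d @serving/sample_input.json
```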
/setup.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "parental-token",
6 | "metadata": {},
7 | "source": [
8 | "# Setup"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "round-border",
14 | "metadata": {},
15 | "source": [
16 | "## About\n",
17 | "\n",
18 | "\n",
19 | "A simple notebook to:\n",
20 | "- clone git repo\n",
21 | "- install required python dependencies\n",
22 | "- download dataset\n",
23 | "- download trained model"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "id": "acd07905",
29 | "metadata": {},
30 | "source": [
31 | "We are cloning into /tmp for the JupyterHub setup.\n"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "id": "homeless-announcement",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "%%bash\n",
42 | "\n",
43 | "mkdir -p /tmp/workshop\n",
44 | "cd /tmp/workshop\n",
45 | "git clone https://github.com/npatta01/pytorch-serving-workshop.git -b main --depth 1"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "id": "proof-discipline",
51 | "metadata": {},
52 | "source": [
53 | "install needed packages"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "id": "relevant-claim",
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "!pip install -r /tmp/workshop/pytorch-serving-workshop/requirements.txt"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "id": "875c286c",
69 | "metadata": {},
70 | "source": [
71 | "download processed dataset and model"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "id": "spectacular-scott",
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "%%bash\n",
82 | "\n",
83 | "cd /tmp/workshop\n",
84 | "\n",
85 | "cd pytorch-serving-workshop\n",
86 | "mkdir -p artifacts/dataset_processed/amazon\n",
87 | "mkdir -p artifacts/dataset_processed/model\n",
88 | "\n",
89 | "\n",
90 | "cd artifacts\n",
91 | "\n",
92 | "\n",
93 | "# dataset\n",
94 | "echo \"downloading dataset\"\n",
95 | "wget https://github.com/npatta01/pytorch-serving-workshop/releases/download/v0.0.2/dataset_processed.zip\n",
96 | "unzip dataset_processed.zip\n",
97 | "\n",
98 | "\n",
99 | "# model trained on above dataset\n",
100 | "echo \"downloading model\"\n",
101 | "wget https://github.com/npatta01/pytorch-serving-workshop/releases/download/v0.0.2/model.zip \n",
102 | "unzip model.zip\n"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "id": "0ce1ca35",
108 | "metadata": {},
109 | "source": [
110 | "Download transformer models"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "id": "9073adaa",
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "import transformers"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "id": "c66c5e14",
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "for model_name in [\"bert-large-uncased\",\"bert-base-uncased\",\"distilbert-base-uncased\"]:\n",
131 | " model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)\n",
132 | "\n",
133 | " tokenizer = transformers.AutoTokenizer.from_pretrained(\n",
134 | " model_name\n",
135 | " )\n",
136 | "\n",
137 | " query = \"men shoes\"\n",
138 | " res = tokenizer.encode_plus(query, return_tensors=\"pt\", padding=\"max_length\", truncation=True)\n",
139 | "\n",
140 | " model_res = model(**res)\n"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "id": "neither-shipping",
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "!cp -r /tmp/workshop/ $HOME/workshop/"
151 | ]
152 | }
153 | ],
154 | "metadata": {
155 | "environment": {
156 | "kernel": "pyupgrade",
157 | "name": "pytorch-gpu.1-11.m94",
158 | "type": "gcloud",
159 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-11:m94"
160 | },
161 | "kernelspec": {
162 | "display_name": "pyupgrade",
163 | "language": "python",
164 | "name": "pyupgrade"
165 | },
166 | "language_info": {
167 | "codemirror_mode": {
168 | "name": "ipython",
169 | "version": 3
170 | },
171 | "file_extension": ".py",
172 | "mimetype": "text/x-python",
173 | "name": "python",
174 | "nbconvert_exporter": "python",
175 | "pygments_lexer": "ipython3",
176 | "version": "3.7.12"
177 | }
178 | },
179 | "nbformat": 4,
180 | "nbformat_minor": 5
181 | }
182 |
--------------------------------------------------------------------------------
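The setup notebook above is meant to be run top to bottom. On a headless machine it can also be executed non-interactively; a hedged sketch using nbconvert (the output filename is arbitrary):

```bash
# Hedged sketch: runs the setup notebook end to end without opening Jupyter.
jupyter nbconvert --to notebook --execute setup.ipynb --output setup_out.ipynb
```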
/workshop_infra/cert/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/pytorch-serving-workshop/56496a84f8485188e4ba8c472da192e428ad3a51/workshop_infra/cert/.gitkeep
--------------------------------------------------------------------------------
/workshop_infra/config.enc.yaml:
--------------------------------------------------------------------------------
1 | # https://zero-to-jupyterhub.readthedocs.io/en/latest/administrator/optimization.html#optimization
2 | scheduling:
3 | userScheduler:
4 | enabled: true
5 | podPriority:
6 | enabled: true
7 | userPlaceholder:
8 | enabled: true
9 |     # Five dummy user pods will be used as placeholders
10 | replicas: 5
11 | userPods:
12 | nodeAffinity:
13 | matchNodePurpose: require
14 | cull:
15 | enabled: true
16 | timeout: 3600
17 | every: 300
18 | singleuser:
19 | cpu:
20 | limit: 4
21 | guarantee: 2
22 | memory:
23 | limit: 16G
24 | guarantee: 8G
25 | image:
26 | # You should replace the "latest" tag with a fixed version from:
27 | # https://hub.docker.com/r/jupyter/datascience-notebook/tags/
28 | # Inspect the Dockerfile at:
29 | # https://github.com/jupyter/docker-stacks/tree/HEAD/datascience-notebook/Dockerfile
30 | name: gcr.io/np-public-training/pytorch-workshop
31 | tag: v1.0
32 | defaultUrl: /lab
33 | extraEnv:
34 | JUPYTERHUB_SINGLEUSER_APP: jupyter_server.serverapp.ServerApp
35 | lifecycleHooks:
36 | postStart:
37 | exec:
38 | command:
39 | - sh
40 | - -c
41 | - "cp -r /tmp/workshop /home/jovyan; \npwd\n"
42 | # proxy:
43 | # https:
44 | # enabled: true
45 | # hosts:
46 | # - hub.np.training
47 | # letsencrypt:
48 | # contactEmail: npatta01@gmail.com
49 | # service:
50 | # loadBalancerIP: "34.145.156.81"
51 | # proxy:
52 | # service:
53 | # loadBalancerIP: "34.145.156.81"
54 | proxy:
55 | https:
56 | enabled: true
57 | hosts:
58 | - hub.np.training
59 | type: secret
60 | secret:
61 | name: workshop-tls
62 | service:
63 | loadBalancerIP: 35.188.254.55
64 | hub:
65 | config:
66 | Authenticator:
67 | admin_users:
68 | - npatta01
69 | - vishalkumar95
70 | - reshamas
71 | GitHubOAuthenticator:
72 | client_id: ENC[AES256_GCM,data:GVr1cL8lruvBCXqNtaqou1j4urI=,iv:Orx6eg2BTmHaVjsJvVPieIm9d/BCCLzwldYvDZ59ES8=,tag:5PUGb0rpiRprBIdXmALeeg==,type:str]
73 | client_secret: ENC[AES256_GCM,data:5iBlhLROkt8k8pOVi6OW2BQqivmpSjpWSLeR1NXBQjRFrsbk5tpdwA==,iv:dsfxzR2YRkiL+5EOJBl9a/jEC55LoWuCtMKDA31DYDM=,tag:KH7DT4r+erb1+CGzA2KjcQ==,type:str]
74 | oauth_callback_url: https://hub.np.training/hub/oauth_callback
75 | JupyterHub:
76 | authenticator_class: github
77 | sops:
78 | kms: []
79 | gcp_kms:
80 | - resource_id: projects/np-public-training/locations/global/keyRings/sops/cryptoKeys/sops-key
81 | created_at: "2022-07-07T16:40:04Z"
82 | enc: CiQAtA68IVue/mrOfkHLaTjHYkrW6GgqEFBge+pVF/bSJ7gFscASSQDOyIoWJtFT6Rz7JAKCXlZFTTGzrsUQ0c1lHMZlkxSukkT6NfogdDGVwzy+JRA6GQLkmaeWWEPYy+VY/wP0ZYzm3qpQ4/YxSE8=
83 | azure_kv: []
84 | hc_vault: []
85 | age: []
86 | lastmodified: "2022-07-09T17:45:50Z"
87 | mac: ENC[AES256_GCM,data:j8mG4yhggQCMn6iS9BNA+0947KJcu7h31MBypEp+XJqgudz3UmSwRlF8UpuUKBVanVosOnSwKak8H11oZcsZBwxFZbEtiNXd8DKZJRNWsOb/Kdkk37ImUlbj1MAuIXh3xLlwFrFsaht5eTAdcri3VcXL0sPZYLDjtOv+YztpTBM=,iv:I+CfXmqeRXpq/84cFMym35/Vi7rDjX8MQ7UKG/6zmaw=,tag:qIAvUsO9ypg/mDsiyk4ing==,type:str]
88 | pgp: []
89 | encrypted_regex: ^(client_id|client_secret)$
90 | version: 3.7.3
91 |
--------------------------------------------------------------------------------
/workshop_infra/config_public.yaml:
--------------------------------------------------------------------------------
1 | # https://zero-to-jupyterhub.readthedocs.io/en/latest/administrator/optimization.html#optimization
2 | scheduling:
3 | userScheduler:
4 | enabled: true
5 | podPriority:
6 | enabled: true
7 | userPlaceholder:
8 | enabled: true
9 |     # One dummy user pod will be used as a placeholder
10 | replicas: 1
11 | userPods:
12 | nodeAffinity:
13 | matchNodePurpose: require
14 | cull:
15 | enabled: true
16 | timeout: 3600
17 | every: 300
18 | singleuser:
19 | cpu:
20 | limit: 4
21 | guarantee: 2
22 | memory:
23 | limit: 16G
24 | guarantee: 8G
25 | image:
26 | name: gcr.io/np-public-training/pytorch-workshop
27 | tag: v1.0
28 | defaultUrl: /lab
29 | extraEnv:
30 | JUPYTERHUB_SINGLEUSER_APP: jupyter_server.serverapp.ServerApp
31 | lifecycleHooks:
32 | postStart:
33 | exec:
34 | command:
35 | - sh
36 | - -c
37 | - "cp -r /tmp/workshop /home/jovyan; \npwd\n"
38 |
39 | hub:
40 | config:
41 | Authenticator:
42 | admin_users:
43 | - npatta01
44 | - vishalkumar95
45 | - reshamas
46 |
--------------------------------------------------------------------------------
/workshop_infra/docker-setup.sh:
--------------------------------------------------------------------------------
1 | cd /home/${NB_USER}
2 |
3 | mkdir -p pytorch-serving-workshop
4 |
5 | cd pytorch-serving-workshop
6 | mkdir -p artifacts/dataset_processed/amazon
7 | mkdir -p artifacts/dataset_processed/model
8 |
9 |
10 | cd artifacts
11 |
12 | BASE_URL="https://github.com/npatta01/pytorch-serving-workshop/releases/download/v0.0.1/"
13 |
14 | # dataset
15 | echo "downloading dataset"
16 | wget --quiet "$BASE_URL/dataset_processed.zip"
17 | unzip dataset_processed.zip
18 |
19 |
20 | # model trained on above dataset
21 | echo "downloading model"
22 | wget --quiet "$BASE_URL/model.zip"
23 | unzip model.zip
--------------------------------------------------------------------------------
/workshop_infra/setup.md:
--------------------------------------------------------------------------------
1 | # Workshop Setup
2 |
3 | The following are the commands and steps that were used to create a working JupyterHub installation for the workshop.
4 |
5 | The instructions assume that you are planning to use GCP and have gcloud set up.
6 |
7 |
8 | Most of the instructions are taken from [zero-to-jupyterhub](https://zero-to-jupyterhub.readthedocs.io/en/latest/index.html) project.
9 |
10 |
11 | ## Step 1: Common variables
12 |
13 | ```bash
14 | REGION="us-east4"
15 | ZONE="$REGION-a"
16 | NODE_TYPE_USER="n1-highmem-16"
17 |
18 | CLUSTER_NAME=workshop
19 | NODES_MIN=0
20 | NODES_MAX=200
21 |
22 | EMAIL="npatta01@gmail.com"
23 | GCP_PROJECT="np-public-training"
24 |
25 | HELM_NAMESPACE=$CLUSTER_NAME
26 |
27 | HELM_CHART_VERSION="1.2.0"
28 | ```
29 |
30 | ## Step 2: Create static IP address
31 |
32 | ```bash
33 | gcloud compute addresses create $CLUSTER_NAME \
34 | --region $REGION \
35 | --project $GCP_PROJECT
36 |
37 | gcloud compute addresses describe $CLUSTER_NAME \
38 | --region $REGION \
39 | --project $GCP_PROJECT
40 |
41 | ```
42 |
43 | Create an `A` record with your DNS provider.
44 |
45 | I am using the `hub` subdomain of my domain `np.training`.
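If the domain is managed in Cloud DNS, the `A` record can also be created from the CLI. A hedged sketch, assuming a managed zone named `np-training` (the zone name is an assumption; the IP is the static address reserved above):

```bash
# Hedged sketch: "np-training" is an assumed Cloud DNS managed-zone name.
STATIC_IP=$(gcloud compute addresses describe $CLUSTER_NAME \
    --region $REGION --project $GCP_PROJECT --format='value(address)')

gcloud dns record-sets create "hub.np.training." \
    --zone "np-training" \
    --type A --ttl 300 \
    --rrdatas "$STATIC_IP" \
    --project $GCP_PROJECT
```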
46 |
47 |
48 |
49 |
50 | ## Step 3: Create cluster
51 |
52 |
53 | ```bash
54 |
55 | gcloud container clusters create \
56 | --machine-type n1-standard-2 \
57 | --num-nodes 1 \
58 | --region $REGION \
59 | --cluster-version latest \
60 | $CLUSTER_NAME \
61 | --project $GCP_PROJECT
62 |
63 | ```
64 |
65 | Get kubectl credentials
66 |
67 | ```bash
68 | gcloud container clusters get-credentials \
69 | $CLUSTER_NAME \
70 | --region $REGION \
71 | --project $GCP_PROJECT
72 | ```
73 |
74 | Create admin access for user
75 |
76 | ```bash
77 | kubectl create clusterrolebinding cluster-admin-binding \
78 | --clusterrole=cluster-admin \
79 | --user $EMAIL
80 | ```
81 |
82 | Create a separate node pool for the Jupyter notebook user pods
83 |
84 | ```bash
85 | gcloud beta container node-pools create user-pool \
86 | --machine-type $NODE_TYPE_USER \
87 | --num-nodes 0 \
88 | --enable-autoscaling \
89 | --min-nodes $NODES_MIN \
90 | --max-nodes $NODES_MAX \
91 | --node-labels hub.jupyter.org/node-purpose=user \
92 | --node-taints hub.jupyter.org_dedicated=user:NoSchedule \
93 | --region $REGION \
94 | --cluster $CLUSTER_NAME \
95 | --project $GCP_PROJECT
96 | ```
97 |
98 |
99 | ## Step 3b: Cert (optional)
100 |
101 | By default the Helm chart we will use supports Let's Encrypt. However, I had trouble getting it to work,
102 | so I followed the steps below to create my own cert.
103 |
104 | Create a certificate signing request for "*.np.training":
105 |
106 | ```bash
107 | openssl req -nodes -newkey rsa:2048 \
108 | -keyout cert/server.key \
109 | -out cert/server.csr \
110 | -subj "/C=US/ST=New York/L=New York/O=NP Training./OU=IT/CN=*.np.training"
111 | ```
112 |
113 | I bought a wildcard cert from Namecheap
114 |
115 |
116 | Download the cert and create a Kubernetes TLS secret from it:
117 |
118 | ```bash
119 |
120 | gsutil cp "gs://np-training-private/certs/_star.np.training/*" workshop_infra/cert
121 |
122 |
123 | kubectl create namespace $HELM_NAMESPACE
124 | cd workshop_infra/cert
125 | kubectl create secret tls $HELM_NAMESPACE-tls --key="tls.key" --cert="tls.crt" --namespace $HELM_NAMESPACE
126 | cd ../../
127 |
128 | ```
129 |
130 | ## Step 4: Helm setup
131 |
132 | ```bash
133 |
134 | curl https://raw.githubusercontent.com/helm/helm/HEAD/scripts/get-helm-3 | bash
135 |
136 | helm version
137 |
138 | helm repo add jupyterhub https://jupyterhub.github.io/helm-chart/
139 | helm repo update
140 |
141 | ```
142 |
143 |
144 | ## Step 5: Update config file (optional)
145 |
146 |
147 | Build and push the Docker image:
148 |
149 | ```bash
150 | docker build -t gcr.io/$GCP_PROJECT/pytorch-workshop:v1.0 .
151 | docker push gcr.io/$GCP_PROJECT/pytorch-workshop:v1.0
152 |
153 | ```
154 |
155 | Set up encryption by creating a KMS keyring and key for sops:
156 |
157 | ```bash
158 | gcloud kms keyrings create sops --location global --project $GCP_PROJECT
159 | gcloud kms keys create sops-key --location global --keyring sops --purpose encryption --project $GCP_PROJECT
160 | gcloud kms keys list --location global --keyring sops --project $GCP_PROJECT
161 | ```
162 |
163 | Encrypt the working config with sops:
164 | ```bash
165 | sops --encrypt --gcp-kms projects/$GCP_PROJECT/locations/global/keyRings/sops/cryptoKeys/sops-key \
166 | --encrypted-regex '^(client_id|client_secret)$' \
167 | workshop_infra/config.yaml > workshop_infra/config.enc.yaml
168 | ```
169 | To recover the plaintext config for editing or deployment, decrypt it:
170 | ```bash
171 | sops --decrypt workshop_infra/config.enc.yaml > workshop_infra/config.yaml
172 | ```
173 |
174 |
175 | Replace the following values in [config.yaml](workshop_infra/config.yaml):
176 |
177 | - GitHubOAuthenticator
178 | - singleuser.image.name
179 | - scheduling.userPlaceholder.replicas
180 | - proxy.https.host
181 | - proxy.https.service.loadBalancerIP
182 |
183 |
184 |
185 | ## Step 6: Helm Install with authentication
186 |
187 | Set up JupyterHub with GitHub OAuth authentication:
188 | ```bash
189 | helm upgrade --cleanup-on-fail \
190 | --install $HELM_NAMESPACE jupyterhub/jupyterhub \
191 | --namespace $HELM_NAMESPACE \
192 | --create-namespace \
193 | --version $HELM_CHART_VERSION \
194 | --values workshop_infra/config.yaml
195 |
196 | ```
197 |
198 | ```bash
199 | kubectl --namespace=$HELM_NAMESPACE get pod
200 |
201 | kubectl --namespace=$HELM_NAMESPACE get svc proxy-public -o jsonpath='{.status.loadBalancer.ingress[].ip}'
202 | ```
203 |
204 |
205 | ## Step 6b: Helm install with no authentication (no auth)
206 |
207 | ```bash
208 | helm upgrade --cleanup-on-fail \
209 | --install $HELM_NAMESPACE-public jupyterhub/jupyterhub \
210 | --namespace $HELM_NAMESPACE-public \
211 | --create-namespace \
212 | --version $HELM_CHART_VERSION \
213 | --values workshop_infra/config_public.yaml
214 |
215 |
216 | kubectl --namespace=$HELM_NAMESPACE-public get pod
217 |
218 | kubectl --namespace=$HELM_NAMESPACE-public get svc proxy-public -o jsonpath='{.status.loadBalancer.ingress[].ip}'
219 | ```
220 |
221 | Add the external IP to your DNS records.
222 |
223 |
224 | ## Step 7: Cleanup (Helm Delete)
225 |
226 | ```bash
227 |
228 | helm delete $HELM_NAMESPACE --namespace $HELM_NAMESPACE
229 | kubectl delete namespace $HELM_NAMESPACE
230 |
231 | helm delete $HELM_NAMESPACE-public --namespace $HELM_NAMESPACE-public
232 | kubectl delete namespace $HELM_NAMESPACE-public
233 |
234 |
235 | gcloud container clusters delete $CLUSTER_NAME --region $REGION --project $GCP_PROJECT
236 |
237 | ```
--------------------------------------------------------------------------------