├── .gitignore
├── Dockerfile
├── README.md
├── assets
│   ├── setup_01.png
│   ├── setup_02.png
│   ├── slides.pdf
│   └── slides_cover.png
├── notebooks
│   ├── 00_code_snipets.ipynb
│   ├── 00_prepare_dataset.ipynb
│   ├── 01_train.ipynb
│   ├── 02_inference_review.ipynb
│   ├── 02_timing.ipynb
│   ├── 03_optimizing_model.ipynb
│   ├── 04_packaging.ipynb
│   └── utils.py
├── requirements.txt
├── serving
│   ├── Dockerfile
│   ├── config.properties
│   ├── handler.py
│   ├── requirements.txt
│   └── sample_input.json
├── setup.ipynb
└── workshop_infra
    ├── cert
    │   └── .gitkeep
    ├── config.enc.yaml
    ├── config_public.yaml
    ├── docker-setup.sh
    └── setup.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 |
3 | .pth
4 | .mar
5 | logs
6 | *.gz
7 | serving/model_store/
8 | __pycache__
9 | *.zip
10 | notebooks/trainer_*/
11 | notebooks/wandb/
12 | .venv
13 | workshop_infra/cert/*
14 | workshop_infra/config.yaml
15 |
16 | !/**/.gitkeep
17 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | #FROM jupyter/scipy-notebook:python-3.10.5
2 |
3 | FROM jupyter/scipy-notebook:python-3.8.8
4 |
5 |
6 |
7 | USER root
8 |
9 |
10 | RUN apt-get update && apt-get --yes install apt-utils && \
11 | apt-get --yes install htop tmux graphviz openjdk-11-jre-headless curl && \
12 | apt-get clean;
13 |
14 |
15 | # set the user back to original setting
16 | USER $NB_UID
17 |
18 |
19 |
20 | # Install from requirements.txt file
21 | COPY --chown=${NB_UID}:${NB_GID} requirements.txt /tmp/
22 |
23 | RUN pip install --no-cache-dir --requirement /tmp/requirements.txt && \
24 | fix-permissions "${CONDA_DIR}" && \
25 | fix-permissions "/home/${NB_USER}"
26 |
27 |
28 | #COPY --chown=${NB_UID}:${NB_GID} docker-setup.sh /tmp/
29 |
30 | COPY --chown=${NB_UID}:${NB_GID} setup.ipynb /tmp/
31 |
32 | RUN papermill /tmp/setup.ipynb /tmp/setup__out.ipynb -k python3 --log-output --log-level INFO --progress-bar && \
33 | fix-permissions "${CONDA_DIR}" && \
34 | fix-permissions "/home/${NB_USER}"
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Readme
2 |
3 |
4 | ## Overview
5 |
6 |
7 | This repo contains the notebooks for the PyTorch Serving Workshop.
8 |
9 | Note: We **do not** need a GPU runtime
10 |
11 | ## Setup
12 |
13 | If you are attending a workshop session, use this custom [JupyterHub](http://hub2.np.training), which has all the dependencies already set up.
14 |
15 |
16 |
17 | Otherwise, consider using [Binder](https://mybinder.org/v2/gh/npatta01/pytorch-serving-workshop/main).
18 |
19 |
20 |
21 | ## Contents
22 |
23 | There are five main notebooks.
24 |
25 | a. `00_prepare_dataset.ipynb`
26 |
27 | Notebook that prepares the e-commerce dataset and saves it.
28 |
29 | b. `01_train.ipynb`
30 |
31 | Trains a DistilBERT model.
32 |
33 | c. `02_inference_review.ipynb`
34 |
35 | Notebook that shows how to use the Hugging Face ecosystem and how to run inference with the model trained in the previous notebook.
36 |
37 | d. `03_optimizing_model.ipynb`
38 |
39 | Notebook that shows the impact of quantization and TorchScript (a rough sketch of the recipe follows this list).
40 |
41 |
42 | e. `04_packaging.ipynb`
43 |
44 | Notebook that shows how to use TorchServe to serve models (a sample client call is sketched after this list).
45 |
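The optimization step in `03_optimizing_model.ipynb` combines dynamic quantization with TorchScript tracing. The sketch below is only a rough illustration of that recipe, not the notebook's exact code; the checkpoint name, example query, sequence length, and output path are assumptions.

```python
import torch
import transformers

# Assumption: any fine-tuned sequence-classification checkpoint works the same way.
model_name = "distilbert-base-uncased"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    model_name, torchscript=True
)
model.eval()

# Dynamic quantization: store Linear-layer weights as int8 for faster CPU inference.
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

# TorchScript: trace with example inputs so the model can be serialized and later
# loaded without the original Python class definitions.
example = tokenizer(
    "cheap nike men running shoes",
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=128,
)
traced_model = torch.jit.trace(
    quantized_model, (example["input_ids"], example["attention_mask"])
)
torch.jit.save(traced_model, "traced_model.pt")
```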
46 |
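Once `04_packaging.ipynb` has produced a model archive and TorchServe is running, the model can be queried over the inference API (default port 8080). The snippet below is a hedged sketch: the model name `pt_classifier` comes from the archiving example in the notebooks, while the payload format is an assumption; the exact request body is defined by `serving/handler.py` and `serving/sample_input.json`.

```python
import requests

# Assumption: TorchServe is running locally with the "pt_classifier" model registered.
url = "http://localhost:8080/predictions/pt_classifier"

# Send the sample payload shipped with the repo; adjust if the handler expects a
# different body format.
with open("serving/sample_input.json", "rb") as f:
    response = requests.post(url, data=f)

response.raise_for_status()
print(response.json())  # predicted product types with scores (handler-dependent)
```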
47 | ## Slides
48 |
49 | [Slides (SlideShare)](https://www.slideshare.net/nidhinpattaniyil/serving-bert-models-in-production-with-torchserve)
50 |
51 |
52 | ## Video
53 |
54 | [Workshop recording (PyData Global 2021)](https://www.youtube.com/watch?v=sDGxzkOvxqY&ab_channel=PyData)
55 |
56 |
57 | ## References
58 |
59 | [Pydata 2021 Slides](https://www.slideshare.net/nidhinpattaniyil/serving-bert-models-in-production-with-torchserve)
60 |
61 | [Pydata 2021 Conference Page](https://pydata.org/global2021/schedule/presentation/136/serving-pytorch-models-in-production/)
62 |
63 |
64 | ## Libraries
65 |
66 | This repo uses the Hugging Face `transformers` and `datasets` packages.
67 |
68 | The dataset used is [Amazon Berkeley Objects (ABO) Dataset](https://amazon-berkeley-objects.s3.amazonaws.com/index.html) created by Amazon and UC Berkeley.
69 | For more details, refer to this [paper](https://arxiv.org/abs/2110.06199).
70 |
71 |
72 | ## Contact
73 |
74 | For help or feedback, please reach out to:
75 |
76 | - [Nidhin Pattaniyil](https://www.linkedin.com/in/nidhinpattaniyil/)
77 | - [Adway Dhillon](https://www.linkedin.com/in/adwaydhillon/)
78 | - [Vishal Rathi](https://www.linkedin.com/in/vishalkumarrathi/)
79 |
--------------------------------------------------------------------------------
/assets/setup_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/pytorch-serving-workshop/56496a84f8485188e4ba8c472da192e428ad3a51/assets/setup_01.png
--------------------------------------------------------------------------------
/assets/setup_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/pytorch-serving-workshop/56496a84f8485188e4ba8c472da192e428ad3a51/assets/setup_02.png
--------------------------------------------------------------------------------
/assets/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/pytorch-serving-workshop/56496a84f8485188e4ba8c472da192e428ad3a51/assets/slides.pdf
--------------------------------------------------------------------------------
/assets/slides_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/pytorch-serving-workshop/56496a84f8485188e4ba8c472da192e428ad3a51/assets/slides_cover.png
--------------------------------------------------------------------------------
/notebooks/00_code_snipets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "parliamentary-metropolitan",
6 | "metadata": {},
7 | "source": [
8 | "# About\n",
9 | "\n",
10 | "This is an internal notebok to help create code snippets "
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "id": "american-journalist",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": []
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "id": "suspended-attendance",
25 | "metadata": {},
26 | "outputs": [],
27 | "source": []
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 1,
32 | "id": "favorite-subdivision",
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "from transformers import BertTokenizer"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "id": "provincial-electron",
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "# Bert uses WordPiece Tokenizer\n",
47 | "# splitting words either into the full forms\n",
48 | "# (e.g., one word becomes one token) or into word piece\n",
49 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 3,
55 | "id": "homeless-employment",
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "data": {
60 | "text/plain": [
61 | "['cheap', 'nike', 'men', 'running', 'shoes']"
62 | ]
63 | },
64 | "execution_count": 3,
65 | "metadata": {},
66 | "output_type": "execute_result"
67 | }
68 | ],
69 | "source": [
70 | "tokenizer.tokenize(\"cheap nike men running shoes\")"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 4,
76 | "id": "legitimate-employee",
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "data": {
81 | "text/plain": [
82 | "['che', '##p', 'nike', 'men', 'shoes', 'run', '##ing', 'under', '100', '$']"
83 | ]
84 | },
85 | "execution_count": 4,
86 | "metadata": {},
87 | "output_type": "execute_result"
88 | }
89 | ],
90 | "source": [
91 | "# chep/runing is mispelled\n",
92 | "tokenizer.tokenize(\"chep nike men shoes runing under 100$ \")"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 5,
98 | "id": "mathematical-acceptance",
99 | "metadata": {},
100 | "outputs": [
101 | {
102 | "data": {
103 | "text/plain": [
104 | "30522"
105 | ]
106 | },
107 | "execution_count": 5,
108 | "metadata": {},
109 | "output_type": "execute_result"
110 | }
111 | ],
112 | "source": [
113 | "# size of vocabulary\n",
114 | "tokenizer.vocab_size"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "id": "applied-ribbon",
121 | "metadata": {},
122 | "outputs": [],
123 | "source": []
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "id": "incoming-mentor",
128 | "metadata": {},
129 | "source": [
130 | "# Training Code"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "id": "difficult-smooth",
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "# hugging face library to load existing/custom datasets\n",
141 | "import datasets\n",
142 | "# hugging face library contains tokenizers / models \n",
143 | "import transformers"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "id": "parallel-webster",
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "# dataset contains two columns \"text/label\"\n",
154 | "raw_datasets = datasets.load_from_disk(dataset_path)"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "id": "respected-nothing",
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "# use existing distilbert tokenizer\n",
165 | "tokenizer = transformers.AutoTokenizer.from_pretrained(\"distilbert-base-uncased\" )\n",
166 | "\n",
167 | "def tokenize_function(examples):\n",
168 | " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n",
169 | "\n",
170 | "# calculate ['input_ids' , 'attention_mask']\n",
171 | "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) "
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "id": "alleged-sapphire",
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "# use pretrained distilbert model\n",
182 | "model = transformers.AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\"\n",
183 | " , num_labels=len(labels) ... )"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "id": "stunning-storage",
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "training_args = transformers.TrainingArguments(\"trainer\",num_train_epochs=5...) \n",
194 | " )\n",
195 | "trainer = transformers.Trainer(\n",
196 | " model=model, \n",
197 | " args=training_args, \n",
198 | " train_dataset=tokenized_datasets['train'], \n",
199 | " eval_dataset=tokenized_datasets['validation'],.... )"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": null,
205 | "id": "current-rebate",
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "# train on datasets/argumets passed to trainer args\n",
210 | "trainer.train()\n"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "id": "single-mother",
217 | "metadata": {},
218 | "outputs": [],
219 | "source": []
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "id": "metallic-manor",
224 | "metadata": {},
225 | "source": [
226 | "# Inference Code"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "id": "structured-mirror",
233 | "metadata": {},
234 | "outputs": [],
235 | "source": [
236 | "query = 'comfortable men sandals'"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "id": "senior-sunset",
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "# compute input id / attention mask\n",
247 | "tokenized_res = tokenizer.encode_plus(query, return_tensors=\"pt\")"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "id": "continent-sullivan",
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "# pass input to model\n",
258 | "model_res = model(**tokenized_res)\n",
259 | "# get softmax of logits\n",
260 | "logits = model_res.logits\n",
261 | "softmax_res = torch.softmax(logits, dim=1).toList()[0]"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": null,
267 | "id": "color-polls",
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "# get the label and probability sorted\n",
272 | "predictions = list ( zip (labels , softmax_res ) )\n",
273 | "predictions = sorted (predictions , key=lambda x:x[1] , reverse =True)"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "id": "minimal-marketplace",
280 | "metadata": {},
281 | "outputs": [],
282 | "source": []
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": null,
287 | "id": "false-mississippi",
288 | "metadata": {},
289 | "outputs": [],
290 | "source": []
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "id": "acoustic-ensemble",
295 | "metadata": {},
296 | "source": [
297 | "# Torch Archiving"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "id": "aggressive-alexander",
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "# name and version of the model\n",
308 | "MODEL_NAME=\"pt_classifier\"\n",
309 | "MODEL_VERSION=\"1.0\"\n",
310 | "\n",
311 | "# folder where model is saved\n",
312 | "MODEL_STORE=\"model_store\"\n",
313 | "# path of saved pytorch models\n",
314 | "MODEL_SERIALIZED_FILE=\"traced_model.pt\"\n",
315 | "# path of extra files to include\n",
316 | "MODEL_EXTRA_FILES=\"index_to_name.json,setup_config.json\"\n",
317 | "# model code\n",
318 | "MODEL_CODE=\"handler.py\"\n",
319 | "\n",
320 | "\n",
321 | "torch-model-archiver --model-name ${MODEL_NAME} \\\n",
322 | "--version ${MODEL_VERSION} \\\n",
323 | "--serialized-file ${MODEL_SERIALIZED_FILE} \\\n",
324 | "--export-path ${MODEL_STORE} \\\n",
325 | "--extra-files ${MODEL_EXTRA_FILES} \\\n",
326 | "--handler ${MODEL_CODE} \\"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": null,
332 | "id": "brilliant-crisis",
333 | "metadata": {},
334 | "outputs": [],
335 | "source": []
336 | }
337 | ],
338 | "metadata": {
339 | "environment": {
340 | "name": "rapids-gpu.0-18.m65",
341 | "type": "gcloud",
342 | "uri": "gcr.io/deeplearning-platform-release/rapids-gpu.0-18:m65"
343 | },
344 | "kernelspec": {
345 | "display_name": "Python [conda env:pytorch]",
346 | "language": "python",
347 | "name": "conda-env-pytorch-py"
348 | },
349 | "language_info": {
350 | "codemirror_mode": {
351 | "name": "ipython",
352 | "version": 3
353 | },
354 | "file_extension": ".py",
355 | "mimetype": "text/x-python",
356 | "name": "python",
357 | "nbconvert_exporter": "python",
358 | "pygments_lexer": "ipython3",
359 | "version": "3.7.10"
360 | }
361 | },
362 | "nbformat": 4,
363 | "nbformat_minor": 5
364 | }
365 |
--------------------------------------------------------------------------------
/notebooks/00_prepare_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "swedish-certificate",
6 | "metadata": {},
7 | "source": [
8 | "# Prepare Dataset"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "laden-intellectual",
14 | "metadata": {},
15 | "source": [
16 | "## About\n",
17 | "This notebook contains the code to\n",
18 | "1. download the ABO dataset \n",
19 | "2. Clean the dataset to extract title/product type\n",
20 | "3. export dataset as HuggingFace compatible dataset"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "id": "hourly-grace",
27 | "metadata": {},
28 | "outputs": [],
29 | "source": []
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "id": "systematic-midwest",
34 | "metadata": {},
35 | "source": [
36 | "## Dataset"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "id": "caring-invitation",
42 | "metadata": {},
43 | "source": [
44 | "This notebook uses the [Amazon Berkeley Objects (ABO) Dataset](https://amazon-berkeley-objects.s3.amazonaws.com/index.html) . \n",
45 | "\n",
46 | "The dataset was created in partnership with Amazon and UC Berklely .\n",
47 | "\n",
48 | "For 147,702 it contains product metadata , images and 3D models. "
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "id": "indonesian-tennis",
55 | "metadata": {
56 | "jupyter": {
57 | "outputs_hidden": true
58 | }
59 | },
60 | "outputs": [],
61 | "source": [
62 | "%%bash \n",
63 | "cd ../artifacts/dataset_raw/amazon/\n",
64 | "wget https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-listings.tar\n",
65 | "tar -xvf abo-listings.tar"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "id": "sexual-folder",
72 | "metadata": {
73 | "scrolled": true
74 | },
75 | "outputs": [],
76 | "source": [
77 | "#!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles/All_Amazon_Meta.json.gz\n",
78 | "#!wget https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-listings.tar"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "id": "opposite-island",
85 | "metadata": {},
86 | "outputs": [],
87 | "source": []
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "id": "corrected-specification",
93 | "metadata": {},
94 | "outputs": [],
95 | "source": []
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "id": "exotic-guess",
100 | "metadata": {},
101 | "source": [
102 | "## Imports"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 1,
108 | "id": "eligible-magazine",
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "import pathlib\n",
113 | "import sklearn\n",
114 | "import datasets\n",
115 | "import pandas as pd\n",
116 | "import sklearn.preprocessing\n",
117 | "import sklearn.model_selection\n",
118 | "import glob\n",
119 | "import functools"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 2,
125 | "id": "raised-myanmar",
126 | "metadata": {},
127 | "outputs": [
128 | {
129 | "name": "stdout",
130 | "output_type": "stream",
131 | "text": [
132 | "/home/jupyter/tutorials/personal/pydata_bert/notebooks\r\n"
133 | ]
134 | }
135 | ],
136 | "source": [
137 | "!pwd"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "id": "foster-engagement",
143 | "metadata": {},
144 | "source": [
145 | "## Process Dataset"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 3,
151 | "id": "aboriginal-adobe",
152 | "metadata": {},
153 | "outputs": [
154 | {
155 | "name": "stdout",
156 | "output_type": "stream",
157 | "text": [
158 | "listings_0.json.gz listings_4.json.gz\tlistings_8.json.gz listings_c.json.gz\r\n",
159 | "listings_1.json.gz listings_5.json.gz\tlistings_9.json.gz listings_d.json.gz\r\n",
160 | "listings_2.json.gz listings_6.json.gz\tlistings_a.json.gz listings_e.json.gz\r\n",
161 | "listings_3.json.gz listings_7.json.gz\tlistings_b.json.gz listings_f.json.gz\r\n"
162 | ]
163 | }
164 | ],
165 | "source": [
166 | "!ls ../artifacts/dataset_raw/amazon/listings/metadata"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 4,
172 | "id": "hollow-berry",
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "dataset_path_raw = \"../artifacts/dataset_raw/amazon/listings/metadata\""
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 5,
182 | "id": "clean-investment",
183 | "metadata": {},
184 | "outputs": [
185 | {
186 | "data": {
187 | "text/plain": [
188 | "['../artifacts/dataset_raw/amazon/listings/metadata/listings_2.json.gz',\n",
189 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_9.json.gz',\n",
190 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_0.json.gz',\n",
191 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_1.json.gz',\n",
192 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_a.json.gz',\n",
193 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_7.json.gz',\n",
194 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_5.json.gz',\n",
195 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_6.json.gz',\n",
196 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_f.json.gz',\n",
197 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_3.json.gz',\n",
198 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_b.json.gz',\n",
199 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_c.json.gz',\n",
200 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_4.json.gz',\n",
201 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_e.json.gz',\n",
202 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_8.json.gz',\n",
203 | " '../artifacts/dataset_raw/amazon/listings/metadata/listings_d.json.gz']"
204 | ]
205 | },
206 | "execution_count": 5,
207 | "metadata": {},
208 | "output_type": "execute_result"
209 | }
210 | ],
211 | "source": [
212 | "glob.glob(f'{dataset_path_raw}/*.json.gz')"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "id": "exciting-homeless",
218 | "metadata": {},
219 | "source": [
220 | "load all 16 files"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 6,
226 | "id": "conventional-calculation",
227 | "metadata": {},
228 | "outputs": [],
229 | "source": [
230 | "df_raw = pd.concat(map(functools.partial(pd.read_json, lines=True ), \n",
231 | " glob.glob(f'{dataset_path_raw}/*.json.gz') )) "
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 7,
237 | "id": "sonic-staff",
238 | "metadata": {},
239 | "outputs": [
240 | {
241 | "data": {
242 | "text/html": [
243 | "
\n",
244 | "\n",
257 | "
\n",
258 | " \n",
259 | " \n",
260 | " | \n",
261 | " brand | \n",
262 | " bullet_point | \n",
263 | " color | \n",
264 | " fabric_type | \n",
265 | " item_id | \n",
266 | " item_name | \n",
267 | " model_name | \n",
268 | " model_number | \n",
269 | " product_type | \n",
270 | " style | \n",
271 | " ... | \n",
272 | " item_keywords | \n",
273 | " material | \n",
274 | " spin_id | \n",
275 | " 3dmodel_id | \n",
276 | " color_code | \n",
277 | " model_year | \n",
278 | " pattern | \n",
279 | " product_description | \n",
280 | " finish_type | \n",
281 | " item_shape | \n",
282 | "
\n",
283 | " \n",
284 | " \n",
285 | " \n",
286 | " 0 | \n",
287 | " [{'language_tag': 'de_DE', 'value': 'Amazon Es... | \n",
288 | " [{'language_tag': 'de_DE', 'value': 'Fällt gro... | \n",
289 | " [{'language_tag': 'de_DE', 'value': 'Mehrfarbi... | \n",
290 | " [{'language_tag': 'en_GB', 'value': '100% Cott... | \n",
291 | " B07HL25ZQM | \n",
292 | " [{'language_tag': 'en_GB', 'value': 'Amazon Es... | \n",
293 | " [{'language_tag': 'en_GB', 'value': '6-Pack Bi... | \n",
294 | " [{'value': 'P_AE3131_M6'}] | \n",
295 | " [{'value': 'BABY_PRODUCT'}] | \n",
296 | " [{'language_tag': 'de_DE', 'value': '6-Pack Bi... | \n",
297 | " ... | \n",
298 | " NaN | \n",
299 | " NaN | \n",
300 | " NaN | \n",
301 | " NaN | \n",
302 | " NaN | \n",
303 | " NaN | \n",
304 | " NaN | \n",
305 | " NaN | \n",
306 | " NaN | \n",
307 | " NaN | \n",
308 | "
\n",
309 | " \n",
310 | " 1 | \n",
311 | " [{'language_tag': 'en_GB', 'value': 'AmazonBas... | \n",
312 | " [{'language_tag': 'en_GB', 'value': 'Large dry... | \n",
313 | " NaN | \n",
314 | " NaN | \n",
315 | " B0825D4F6R | \n",
316 | " [{'language_tag': 'en_GB', 'value': 'AmazonBas... | \n",
317 | " NaN | \n",
318 | " [{'value': 'AMAZ2001'}] | \n",
319 | " [{'value': 'HOME'}] | \n",
320 | " [{'language_tag': 'en_GB', 'value': 'Deluxe'}] | \n",
321 | " ... | \n",
322 | " [{'language_tag': 'en_GB', 'value': 'tower lau... | \n",
323 | " NaN | \n",
324 | " NaN | \n",
325 | " NaN | \n",
326 | " NaN | \n",
327 | " NaN | \n",
328 | " NaN | \n",
329 | " NaN | \n",
330 | " NaN | \n",
331 | " NaN | \n",
332 | "
\n",
333 | " \n",
334 | " 2 | \n",
335 | " [{'language_tag': 'en_IN', 'value': 'Amazon Br... | \n",
336 | " [{'language_tag': 'en_IN', 'value': '3D Printe... | \n",
337 | " [{'language_tag': 'en_IN', 'standardized_value... | \n",
338 | " NaN | \n",
339 | " B07TF1FCFD | \n",
340 | " [{'language_tag': 'en_IN', 'value': 'Amazon Br... | \n",
341 | " [{'language_tag': 'en_IN', 'value': 'Samsung G... | \n",
342 | " [{'value': 'gz8587-SL40668'}] | \n",
343 | " [{'value': 'CELLULAR_PHONE_CASE'}] | \n",
344 | " NaN | \n",
345 | " ... | \n",
346 | " [{'language_tag': 'en_IN', 'value': 'mobile co... | \n",
347 | " NaN | \n",
348 | " NaN | \n",
349 | " NaN | \n",
350 | " NaN | \n",
351 | " NaN | \n",
352 | " NaN | \n",
353 | " NaN | \n",
354 | " NaN | \n",
355 | " NaN | \n",
356 | "
\n",
357 | " \n",
358 | " 3 | \n",
359 | " [{'language_tag': 'en_IN', 'value': 'Amazon Br... | \n",
360 | " [{'language_tag': 'en_IN', 'value': 'Snug fit ... | \n",
361 | " [{'language_tag': 'en_IN', 'standardized_value... | \n",
362 | " NaN | \n",
363 | " B08569SRJD | \n",
364 | " [{'language_tag': 'en_IN', 'value': 'Amazon Br... | \n",
365 | " [{'language_tag': 'en_IN', 'value': 'Nokia 7.2'}] | \n",
366 | " [{'value': 'UV10845-SL40357'}] | \n",
367 | " [{'value': 'CELLULAR_PHONE_CASE'}] | \n",
368 | " NaN | \n",
369 | " ... | \n",
370 | " [{'language_tag': 'en_IN', 'value': 'Back Cove... | \n",
371 | " [{'language_tag': 'en_IN', 'value': 'Silicon'}] | \n",
372 | " NaN | \n",
373 | " NaN | \n",
374 | " NaN | \n",
375 | " NaN | \n",
376 | " NaN | \n",
377 | " NaN | \n",
378 | " NaN | \n",
379 | " NaN | \n",
380 | "
\n",
381 | " \n",
382 | " 4 | \n",
383 | " [{'language_tag': 'en_US', 'value': 'Stone & B... | \n",
384 | " [{'language_tag': 'en_US', 'value': 'With mode... | \n",
385 | " [{'language_tag': 'en_US', 'value': 'Dark Grey'}] | \n",
386 | " NaN | \n",
387 | " B07B4G5RBN | \n",
388 | " [{'language_tag': 'zh_CN', 'value': 'Stone & B... | \n",
389 | " NaN | \n",
390 | " [{'value': 'UPH10095B'}] | \n",
391 | " [{'value': 'CHAIR'}] | \n",
392 | " NaN | \n",
393 | " ... | \n",
394 | " [{'language_tag': 'en_US', 'value': 'living-ro... | \n",
395 | " [{'language_tag': 'zh_CN', 'value': '灰石色'}, {'... | \n",
396 | " 485925ed | \n",
397 | " B07B4G5RBN | \n",
398 | " [#918F8C] | \n",
399 | " NaN | \n",
400 | " NaN | \n",
401 | " NaN | \n",
402 | " NaN | \n",
403 | " NaN | \n",
404 | "
\n",
405 | " \n",
406 | "
\n",
407 | "
5 rows × 28 columns
\n",
408 | "
"
409 | ],
410 | "text/plain": [
411 | " brand \\\n",
412 | "0 [{'language_tag': 'de_DE', 'value': 'Amazon Es... \n",
413 | "1 [{'language_tag': 'en_GB', 'value': 'AmazonBas... \n",
414 | "2 [{'language_tag': 'en_IN', 'value': 'Amazon Br... \n",
415 | "3 [{'language_tag': 'en_IN', 'value': 'Amazon Br... \n",
416 | "4 [{'language_tag': 'en_US', 'value': 'Stone & B... \n",
417 | "\n",
418 | " bullet_point \\\n",
419 | "0 [{'language_tag': 'de_DE', 'value': 'Fällt gro... \n",
420 | "1 [{'language_tag': 'en_GB', 'value': 'Large dry... \n",
421 | "2 [{'language_tag': 'en_IN', 'value': '3D Printe... \n",
422 | "3 [{'language_tag': 'en_IN', 'value': 'Snug fit ... \n",
423 | "4 [{'language_tag': 'en_US', 'value': 'With mode... \n",
424 | "\n",
425 | " color \\\n",
426 | "0 [{'language_tag': 'de_DE', 'value': 'Mehrfarbi... \n",
427 | "1 NaN \n",
428 | "2 [{'language_tag': 'en_IN', 'standardized_value... \n",
429 | "3 [{'language_tag': 'en_IN', 'standardized_value... \n",
430 | "4 [{'language_tag': 'en_US', 'value': 'Dark Grey'}] \n",
431 | "\n",
432 | " fabric_type item_id \\\n",
433 | "0 [{'language_tag': 'en_GB', 'value': '100% Cott... B07HL25ZQM \n",
434 | "1 NaN B0825D4F6R \n",
435 | "2 NaN B07TF1FCFD \n",
436 | "3 NaN B08569SRJD \n",
437 | "4 NaN B07B4G5RBN \n",
438 | "\n",
439 | " item_name \\\n",
440 | "0 [{'language_tag': 'en_GB', 'value': 'Amazon Es... \n",
441 | "1 [{'language_tag': 'en_GB', 'value': 'AmazonBas... \n",
442 | "2 [{'language_tag': 'en_IN', 'value': 'Amazon Br... \n",
443 | "3 [{'language_tag': 'en_IN', 'value': 'Amazon Br... \n",
444 | "4 [{'language_tag': 'zh_CN', 'value': 'Stone & B... \n",
445 | "\n",
446 | " model_name \\\n",
447 | "0 [{'language_tag': 'en_GB', 'value': '6-Pack Bi... \n",
448 | "1 NaN \n",
449 | "2 [{'language_tag': 'en_IN', 'value': 'Samsung G... \n",
450 | "3 [{'language_tag': 'en_IN', 'value': 'Nokia 7.2'}] \n",
451 | "4 NaN \n",
452 | "\n",
453 | " model_number product_type \\\n",
454 | "0 [{'value': 'P_AE3131_M6'}] [{'value': 'BABY_PRODUCT'}] \n",
455 | "1 [{'value': 'AMAZ2001'}] [{'value': 'HOME'}] \n",
456 | "2 [{'value': 'gz8587-SL40668'}] [{'value': 'CELLULAR_PHONE_CASE'}] \n",
457 | "3 [{'value': 'UV10845-SL40357'}] [{'value': 'CELLULAR_PHONE_CASE'}] \n",
458 | "4 [{'value': 'UPH10095B'}] [{'value': 'CHAIR'}] \n",
459 | "\n",
460 | " style ... \\\n",
461 | "0 [{'language_tag': 'de_DE', 'value': '6-Pack Bi... ... \n",
462 | "1 [{'language_tag': 'en_GB', 'value': 'Deluxe'}] ... \n",
463 | "2 NaN ... \n",
464 | "3 NaN ... \n",
465 | "4 NaN ... \n",
466 | "\n",
467 | " item_keywords \\\n",
468 | "0 NaN \n",
469 | "1 [{'language_tag': 'en_GB', 'value': 'tower lau... \n",
470 | "2 [{'language_tag': 'en_IN', 'value': 'mobile co... \n",
471 | "3 [{'language_tag': 'en_IN', 'value': 'Back Cove... \n",
472 | "4 [{'language_tag': 'en_US', 'value': 'living-ro... \n",
473 | "\n",
474 | " material spin_id 3dmodel_id \\\n",
475 | "0 NaN NaN NaN \n",
476 | "1 NaN NaN NaN \n",
477 | "2 NaN NaN NaN \n",
478 | "3 [{'language_tag': 'en_IN', 'value': 'Silicon'}] NaN NaN \n",
479 | "4 [{'language_tag': 'zh_CN', 'value': '灰石色'}, {'... 485925ed B07B4G5RBN \n",
480 | "\n",
481 | " color_code model_year pattern product_description finish_type item_shape \n",
482 | "0 NaN NaN NaN NaN NaN NaN \n",
483 | "1 NaN NaN NaN NaN NaN NaN \n",
484 | "2 NaN NaN NaN NaN NaN NaN \n",
485 | "3 NaN NaN NaN NaN NaN NaN \n",
486 | "4 [#918F8C] NaN NaN NaN NaN NaN \n",
487 | "\n",
488 | "[5 rows x 28 columns]"
489 | ]
490 | },
491 | "execution_count": 7,
492 | "metadata": {},
493 | "output_type": "execute_result"
494 | }
495 | ],
496 | "source": [
497 | "df_raw.head()"
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": 8,
503 | "id": "sophisticated-obligation",
504 | "metadata": {},
505 | "outputs": [
506 | {
507 | "data": {
508 | "text/plain": [
509 | "147702"
510 | ]
511 | },
512 | "execution_count": 8,
513 | "metadata": {},
514 | "output_type": "execute_result"
515 | }
516 | ],
517 | "source": [
518 | "len(df_raw)"
519 | ]
520 | },
521 | {
522 | "cell_type": "markdown",
523 | "id": "universal-document",
524 | "metadata": {},
525 | "source": [
526 | "sample record"
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": 9,
532 | "id": "corresponding-blast",
533 | "metadata": {},
534 | "outputs": [
535 | {
536 | "data": {
537 | "text/plain": [
538 | "{'brand': [{'language_tag': 'de_DE', 'value': 'Amazon Essentials'}],\n",
539 | " 'bullet_point': [{'language_tag': 'de_DE',\n",
540 | " 'value': 'Fällt gross aus; eventuell eine Größe kleiner bestellen'}],\n",
541 | " 'color': [{'language_tag': 'de_DE', 'value': 'Mehrfarbig(Girl Fruit)'}],\n",
542 | " 'fabric_type': [{'language_tag': 'en_GB', 'value': '100% Cotton'},\n",
543 | " {'language_tag': 'de_DE', 'value': '100 % Baumwolle'}],\n",
544 | " 'item_id': 'B07HL25ZQM',\n",
545 | " 'item_name': [{'language_tag': 'en_GB',\n",
546 | " 'value': 'Amazon Essentials Bib Set of 6'},\n",
547 | " {'language_tag': 'de_DE',\n",
548 | " 'value': 'Amazon Essentials 6-Pack Bib Set, Mehrfarbig(Girl Fruit), Einheitsgröße'}],\n",
549 | " 'model_name': [{'language_tag': 'en_GB', 'value': '6-Pack Bib Set'},\n",
550 | " {'language_tag': 'de_DE', 'value': '6-Pack Bib Set'}],\n",
551 | " 'model_number': [{'value': 'P_AE3131_M6'}],\n",
552 | " 'product_type': [{'value': 'BABY_PRODUCT'}],\n",
553 | " 'style': [{'language_tag': 'de_DE', 'value': '6-Pack Bib Set'}],\n",
554 | " 'main_image_id': '718mYsQTQbL',\n",
555 | " 'country': 'DE',\n",
556 | " 'marketplace': 'Amazon',\n",
557 | " 'domain_name': 'amazon.de',\n",
558 | " 'node': [{'node_id': 3968940031,\n",
559 | " 'node_name': '/Kategorien/Ernährung & Stillen/Lätzchen'}],\n",
560 | " 'item_dimensions': nan,\n",
561 | " 'item_weight': nan,\n",
562 | " 'other_image_id': nan,\n",
563 | " 'item_keywords': nan,\n",
564 | " 'material': nan,\n",
565 | " 'spin_id': nan,\n",
566 | " '3dmodel_id': nan,\n",
567 | " 'color_code': nan,\n",
568 | " 'model_year': nan,\n",
569 | " 'pattern': nan,\n",
570 | " 'product_description': nan,\n",
571 | " 'finish_type': nan,\n",
572 | " 'item_shape': nan}"
573 | ]
574 | },
575 | "execution_count": 9,
576 | "metadata": {},
577 | "output_type": "execute_result"
578 | }
579 | ],
580 | "source": [
581 | "df_raw.iloc[0].to_dict()"
582 | ]
583 | },
584 | {
585 | "cell_type": "markdown",
586 | "id": "referenced-championship",
587 | "metadata": {},
588 | "source": [
589 | "for this project, we only need `item_name` and `brand`. \n",
590 | "We can assume and take the first value for the fields"
591 | ]
592 | },
593 | {
594 | "cell_type": "code",
595 | "execution_count": 10,
596 | "id": "trying-training",
597 | "metadata": {},
598 | "outputs": [],
599 | "source": [
600 | "def parse_property(property_record:dict,property_name:str):\n",
601 | " try:\n",
602 | " r = property_record[property_name][0]\n",
603 | " if property_name ==\"node\":\n",
604 | " return r['node_name']\n",
605 | " else:\n",
606 | " return r['value']\n",
607 | " except Exception as e:\n",
608 | " return None\n",
609 | " \n",
610 | "def cleanup_record(raw_record:dict):\n",
611 | " \n",
612 | " \n",
613 | " record= {\n",
614 | " 'brand': parse_property(raw_record,'brand')\n",
615 | " ,'item_id': raw_record['item_id']\n",
616 | " ,'item_name': parse_property(raw_record,'item_name')\n",
617 | " ,'product_type': parse_property(raw_record,'product_type')\n",
618 | " ,'node': parse_property(raw_record, 'node')\n",
619 | " , 'main_image_id': raw_record['main_image_id']\n",
620 | " ,'product_description': raw_record['product_description']\n",
621 | "\n",
622 | " \n",
623 | " }\n",
624 | " \n",
625 | " return pd.Series(record)"
626 | ]
627 | },
628 | {
629 | "cell_type": "code",
630 | "execution_count": 11,
631 | "id": "bizarre-taste",
632 | "metadata": {},
633 | "outputs": [],
634 | "source": [
635 | "df = df_raw.apply(cleanup_record,axis=1)"
636 | ]
637 | },
638 | {
639 | "cell_type": "code",
640 | "execution_count": 12,
641 | "id": "earlier-education",
642 | "metadata": {},
643 | "outputs": [
644 | {
645 | "data": {
646 | "text/html": [
647 | "\n",
648 | "\n",
661 | "
\n",
662 | " \n",
663 | " \n",
664 | " | \n",
665 | " brand | \n",
666 | " item_id | \n",
667 | " item_name | \n",
668 | " product_type | \n",
669 | " node | \n",
670 | " main_image_id | \n",
671 | " product_description | \n",
672 | "
\n",
673 | " \n",
674 | " \n",
675 | " \n",
676 | " 0 | \n",
677 | " Amazon Essentials | \n",
678 | " B07HL25ZQM | \n",
679 | " Amazon Essentials Bib Set of 6 | \n",
680 | " BABY_PRODUCT | \n",
681 | " /Kategorien/Ernährung & Stillen/Lätzchen | \n",
682 | " 718mYsQTQbL | \n",
683 | " NaN | \n",
684 | "
\n",
685 | " \n",
686 | " 1 | \n",
687 | " AmazonBasics | \n",
688 | " B0825D4F6R | \n",
689 | " AmazonBasics 3-Tier Deluxe Tower Laundry Dryin... | \n",
690 | " HOME | \n",
691 | " /Home & Garden/Home & Kitchen/Categories/Stora... | \n",
692 | " 81lg2wto16L | \n",
693 | " NaN | \n",
694 | "
\n",
695 | " \n",
696 | " 2 | \n",
697 | " Amazon Brand - Solimo | \n",
698 | " B07TF1FCFD | \n",
699 | " Amazon Brand - Solimo Designer Number Eight 3D... | \n",
700 | " CELLULAR_PHONE_CASE | \n",
701 | " /Categories/Mobiles & Accessories/Mobile Acces... | \n",
702 | " 71R4R6x-tjL | \n",
703 | " NaN | \n",
704 | "
\n",
705 | " \n",
706 | " 3 | \n",
707 | " Amazon Brand - Solimo | \n",
708 | " B08569SRJD | \n",
709 | " Amazon Brand - Solimo Designer Dark Night View... | \n",
710 | " CELLULAR_PHONE_CASE | \n",
711 | " /Categories/Mobiles & Accessories/Mobile Acces... | \n",
712 | " 71QSAxIJagL | \n",
713 | " NaN | \n",
714 | "
\n",
715 | " \n",
716 | " 4 | \n",
717 | " Stone & Beam | \n",
718 | " B07B4G5RBN | \n",
719 | " Stone & Beam Varon 过渡日床, 灰石色 | \n",
720 | " CHAIR | \n",
721 | " /Categories/Furniture/Living Room Furniture/Ch... | \n",
722 | " 91UiRD6UcHL | \n",
723 | " NaN | \n",
724 | "
\n",
725 | " \n",
726 | "
\n",
727 | "
"
728 | ],
729 | "text/plain": [
730 | " brand item_id \\\n",
731 | "0 Amazon Essentials B07HL25ZQM \n",
732 | "1 AmazonBasics B0825D4F6R \n",
733 | "2 Amazon Brand - Solimo B07TF1FCFD \n",
734 | "3 Amazon Brand - Solimo B08569SRJD \n",
735 | "4 Stone & Beam B07B4G5RBN \n",
736 | "\n",
737 | " item_name product_type \\\n",
738 | "0 Amazon Essentials Bib Set of 6 BABY_PRODUCT \n",
739 | "1 AmazonBasics 3-Tier Deluxe Tower Laundry Dryin... HOME \n",
740 | "2 Amazon Brand - Solimo Designer Number Eight 3D... CELLULAR_PHONE_CASE \n",
741 | "3 Amazon Brand - Solimo Designer Dark Night View... CELLULAR_PHONE_CASE \n",
742 | "4 Stone & Beam Varon 过渡日床, 灰石色 CHAIR \n",
743 | "\n",
744 | " node main_image_id \\\n",
745 | "0 /Kategorien/Ernährung & Stillen/Lätzchen 718mYsQTQbL \n",
746 | "1 /Home & Garden/Home & Kitchen/Categories/Stora... 81lg2wto16L \n",
747 | "2 /Categories/Mobiles & Accessories/Mobile Acces... 71R4R6x-tjL \n",
748 | "3 /Categories/Mobiles & Accessories/Mobile Acces... 71QSAxIJagL \n",
749 | "4 /Categories/Furniture/Living Room Furniture/Ch... 91UiRD6UcHL \n",
750 | "\n",
751 | " product_description \n",
752 | "0 NaN \n",
753 | "1 NaN \n",
754 | "2 NaN \n",
755 | "3 NaN \n",
756 | "4 NaN "
757 | ]
758 | },
759 | "execution_count": 12,
760 | "metadata": {},
761 | "output_type": "execute_result"
762 | }
763 | ],
764 | "source": [
765 | "df.head()"
766 | ]
767 | },
768 | {
769 | "cell_type": "code",
770 | "execution_count": 13,
771 | "id": "respiratory-horizontal",
772 | "metadata": {},
773 | "outputs": [
774 | {
775 | "data": {
776 | "text/plain": [
777 | "Index(['brand', 'item_id', 'item_name', 'product_type', 'node',\n",
778 | " 'main_image_id', 'product_description'],\n",
779 | " dtype='object')"
780 | ]
781 | },
782 | "execution_count": 13,
783 | "metadata": {},
784 | "output_type": "execute_result"
785 | }
786 | ],
787 | "source": [
788 | "df.columns"
789 | ]
790 | },
791 | {
792 | "cell_type": "code",
793 | "execution_count": 14,
794 | "id": "honey-certification",
795 | "metadata": {},
796 | "outputs": [
797 | {
798 | "data": {
799 | "text/plain": [
800 | "CELLULAR_PHONE_CASE 64853\n",
801 | "SHOES 12965\n",
802 | "GROCERY 6546\n",
803 | "HOME 5264\n",
804 | "HOME_BED_AND_BATH 3082\n",
805 | " ... \n",
806 | "SOUS_VIDE_MACHINE 1\n",
807 | "SKIN_TREATMENT_MASK 1\n",
808 | "SCULPTURE 1\n",
809 | "THICKENING_AGENT 1\n",
810 | "TERMINAL_BLOCK 1\n",
811 | "Name: product_type, Length: 576, dtype: int64"
812 | ]
813 | },
814 | "execution_count": 14,
815 | "metadata": {},
816 | "output_type": "execute_result"
817 | }
818 | ],
819 | "source": [
820 | "df['product_type'].value_counts()"
821 | ]
822 | },
823 | {
824 | "cell_type": "markdown",
825 | "id": "protective-hudson",
826 | "metadata": {},
827 | "source": [
828 | "There are some product types that don't occur frequently. \n",
829 | "We should limit our training data to include at least 50+ product types"
830 | ]
831 | },
832 | {
833 | "cell_type": "code",
834 | "execution_count": 19,
835 | "id": "gentle-florist",
836 | "metadata": {},
837 | "outputs": [],
838 | "source": [
839 | "min_product_count = 500"
840 | ]
841 | },
842 | {
843 | "cell_type": "markdown",
844 | "id": "junior-timber",
845 | "metadata": {},
846 | "source": [
847 | "compute top product types"
848 | ]
849 | },
850 | {
851 | "cell_type": "code",
852 | "execution_count": 20,
853 | "id": "taken-presence",
854 | "metadata": {},
855 | "outputs": [],
856 | "source": [
857 | "top_products = df['product_type'].value_counts().loc[lambda x: x>min_product_count].index.tolist()"
858 | ]
859 | },
860 | {
861 | "cell_type": "code",
862 | "execution_count": null,
863 | "id": "shared-gilbert",
864 | "metadata": {},
865 | "outputs": [],
866 | "source": []
867 | },
868 | {
869 | "cell_type": "code",
870 | "execution_count": 21,
871 | "id": "noble-multimedia",
872 | "metadata": {},
873 | "outputs": [
874 | {
875 | "data": {
876 | "text/plain": [
877 | "(576, 31)"
878 | ]
879 | },
880 | "execution_count": 21,
881 | "metadata": {},
882 | "output_type": "execute_result"
883 | }
884 | ],
885 | "source": [
886 | "len(df['product_type'].value_counts() ) , len (top_products)"
887 | ]
888 | },
889 | {
890 | "cell_type": "code",
891 | "execution_count": 22,
892 | "id": "certified-galaxy",
893 | "metadata": {},
894 | "outputs": [],
895 | "source": [
896 | "df_all = df [ df['product_type'].isin(top_products) ].copy()\n"
897 | ]
898 | },
899 | {
900 | "cell_type": "code",
901 | "execution_count": 23,
902 | "id": "missing-extra",
903 | "metadata": {},
904 | "outputs": [
905 | {
906 | "data": {
907 | "text/plain": [
908 | "121239"
909 | ]
910 | },
911 | "execution_count": 23,
912 | "metadata": {},
913 | "output_type": "execute_result"
914 | }
915 | ],
916 | "source": [
917 | "len(df_all)"
918 | ]
919 | },
920 | {
921 | "cell_type": "markdown",
922 | "id": "temporal-fifth",
923 | "metadata": {},
924 | "source": [
925 | "`text` and `label` are the columns that are needed by Hugging Face Transformer package\n",
926 | "\n",
927 | "Item title is the text. \n",
928 | "Product Type is the label we are predicting"
929 | ]
930 | },
931 | {
932 | "cell_type": "code",
933 | "execution_count": 24,
934 | "id": "religious-failure",
935 | "metadata": {},
936 | "outputs": [],
937 | "source": [
938 | "df_all['label_name'] = df_all['product_type']\n",
939 | "df_all['text'] = df_all['item_name']"
940 | ]
941 | },
942 | {
943 | "cell_type": "code",
944 | "execution_count": null,
945 | "id": "disciplinary-fortune",
946 | "metadata": {},
947 | "outputs": [],
948 | "source": []
949 | },
950 | {
951 | "cell_type": "code",
952 | "execution_count": null,
953 | "id": "apparent-needle",
954 | "metadata": {},
955 | "outputs": [],
956 | "source": []
957 | },
958 | {
959 | "cell_type": "markdown",
960 | "id": "mineral-consensus",
961 | "metadata": {},
962 | "source": [
963 | "encode the product type to a numeric label"
964 | ]
965 | },
966 | {
967 | "cell_type": "code",
968 | "execution_count": 25,
969 | "id": "regulation-planner",
970 | "metadata": {},
971 | "outputs": [],
972 | "source": [
973 | "label_encoder = sklearn.preprocessing.LabelEncoder()"
974 | ]
975 | },
976 | {
977 | "cell_type": "code",
978 | "execution_count": 26,
979 | "id": "cultural-jonathan",
980 | "metadata": {},
981 | "outputs": [
982 | {
983 | "data": {
984 | "text/plain": [
985 | "LabelEncoder()"
986 | ]
987 | },
988 | "execution_count": 26,
989 | "metadata": {},
990 | "output_type": "execute_result"
991 | }
992 | ],
993 | "source": [
994 | "label_encoder.fit(df_all['label_name'])"
995 | ]
996 | },
997 | {
998 | "cell_type": "code",
999 | "execution_count": 27,
1000 | "id": "substantial-operation",
1001 | "metadata": {},
1002 | "outputs": [],
1003 | "source": [
1004 | "df_all['label'] = label_encoder.transform(df_all['label_name'])\n"
1005 | ]
1006 | },
1007 | {
1008 | "cell_type": "markdown",
1009 | "id": "dressed-defeat",
1010 | "metadata": {},
1011 | "source": [
1012 | "Allocate 60% for training , 20% validation and 20% for training"
1013 | ]
1014 | },
1015 | {
1016 | "cell_type": "code",
1017 | "execution_count": 28,
1018 | "id": "textile-bargain",
1019 | "metadata": {},
1020 | "outputs": [
1021 | {
1022 | "name": "stdout",
1023 | "output_type": "stream",
1024 | "text": [
1025 | "{'train': 72743, 'test': 24248, 'val': 24248}\n"
1026 | ]
1027 | }
1028 | ],
1029 | "source": [
1030 | "df_train, df_test = sklearn.model_selection.train_test_split(df_all, train_size=.6, stratify= df_all['label'] )\n",
1031 | "\n",
1032 | "\n",
1033 | "df_test, df_val = sklearn.model_selection.train_test_split(df_test, test_size=.5, stratify= df_test['label'] )\n",
1034 | "\n",
1035 | "\n",
1036 | "print ( \n",
1037 | "{\n",
1038 | " 'train': len(df_train)\n",
1039 | " ,'test': len(df_test)\n",
1040 | " ,'val': len(df_val)\n",
1041 | "}\n",
1042 | "\n",
1043 | ")"
1044 | ]
1045 | },
1046 | {
1047 | "cell_type": "markdown",
1048 | "id": "retained-speed",
1049 | "metadata": {},
1050 | "source": [
1051 | "## Create Hugging Face Dataset"
1052 | ]
1053 | },
1054 | {
1055 | "cell_type": "markdown",
1056 | "id": "cathedral-australian",
1057 | "metadata": {},
1058 | "source": [
1059 | "In order to later feed our model to HF transformers package, we need either Pytorch Dataloader or use HF [datasets](https://github.com/huggingface/datasets).\n",
1060 | "\n",
1061 | "`Datasets` can easily be used by TF/ Pytorch\n"
1062 | ]
1063 | },
1064 | {
1065 | "cell_type": "code",
1066 | "execution_count": 29,
1067 | "id": "stable-finder",
1068 | "metadata": {},
1069 | "outputs": [],
1070 | "source": [
1071 | "dataset_features = datasets.Features(\n",
1072 | " {'text': datasets.Value('string')\n",
1073 | " , 'item_name': datasets.Value('string')\n",
1074 | " , 'label': datasets.ClassLabel(names=list ( label_encoder.classes_ ))\n",
1075 | " , 'brand': datasets.Value('string')\n",
1076 | " , 'item_id': datasets.Value('string')\n",
1077 | " , 'main_image_id': datasets.Value('string')\n",
1078 | " , 'node': datasets.Value('string')\n",
1079 | "\n",
1080 | " }\n",
1081 | "\n",
1082 | ")"
1083 | ]
1084 | },
1085 | {
1086 | "cell_type": "code",
1087 | "execution_count": 30,
1088 | "id": "suffering-stream",
1089 | "metadata": {},
1090 | "outputs": [
1091 | {
1092 | "data": {
1093 | "text/plain": [
1094 | "dict_keys(['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'])"
1095 | ]
1096 | },
1097 | "execution_count": 30,
1098 | "metadata": {},
1099 | "output_type": "execute_result"
1100 | }
1101 | ],
1102 | "source": [
1103 | "dataset_features.keys()"
1104 | ]
1105 | },
1106 | {
1107 | "cell_type": "markdown",
1108 | "id": "located-submission",
1109 | "metadata": {},
1110 | "source": [
1111 | "create dataset dictionary with all the subsets"
1112 | ]
1113 | },
1114 | {
1115 | "cell_type": "code",
1116 | "execution_count": 31,
1117 | "id": "third-stylus",
1118 | "metadata": {},
1119 | "outputs": [],
1120 | "source": [
1121 | "interested_columns = dataset_features.keys()\n",
1122 | "\n",
1123 | "dataset_train = datasets.Dataset.from_pandas(df_train[interested_columns],features=dataset_features)\n",
1124 | "dataset_test = datasets.Dataset.from_pandas(df_test[interested_columns],features=dataset_features)\n",
1125 | "dataset_validation = datasets.Dataset.from_pandas(df_test[interested_columns],features=dataset_features)\n",
1126 | "\n",
1127 | "dataset_all = datasets.DatasetDict({\n",
1128 | " 'train': dataset_train,\n",
1129 | " 'test': dataset_test,\n",
1130 | " 'valid': dataset_validation }\n",
1131 | ")"
1132 | ]
1133 | },
1134 | {
1135 | "cell_type": "code",
1136 | "execution_count": 32,
1137 | "id": "operating-drove",
1138 | "metadata": {},
1139 | "outputs": [
1140 | {
1141 | "data": {
1142 | "text/plain": [
1143 | "DatasetDict({\n",
1144 | " train: Dataset({\n",
1145 | " features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],\n",
1146 | " num_rows: 72743\n",
1147 | " })\n",
1148 | " test: Dataset({\n",
1149 | " features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],\n",
1150 | " num_rows: 24248\n",
1151 | " })\n",
1152 | " valid: Dataset({\n",
1153 | " features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],\n",
1154 | " num_rows: 24248\n",
1155 | " })\n",
1156 | "})"
1157 | ]
1158 | },
1159 | "execution_count": 32,
1160 | "metadata": {},
1161 | "output_type": "execute_result"
1162 | }
1163 | ],
1164 | "source": [
1165 | "dataset_all"
1166 | ]
1167 | },
1168 | {
1169 | "cell_type": "code",
1170 | "execution_count": 33,
1171 | "id": "touched-buddy",
1172 | "metadata": {},
1173 | "outputs": [
1174 | {
1175 | "data": {
1176 | "text/plain": [
1177 | "{'text': 'Amazon Brand - Solimo Designer Light Blue Flower Photography 3D Printed Hard Back Case Mobile Cover for Sony Xperia L1',\n",
1178 | " 'item_name': 'Amazon Brand - Solimo Designer Light Blue Flower Photography 3D Printed Hard Back Case Mobile Cover for Sony Xperia L1',\n",
1179 | " 'label': 2,\n",
1180 | " 'brand': 'Amazon Brand - Solimo',\n",
1181 | " 'item_id': 'B07THC7RSK',\n",
1182 | " 'main_image_id': '71PBcKpr8jL',\n",
1183 | " 'node': '/Categories/Mobiles & Accessories/Mobile Accessories/Cases & Covers/Back & Bumper Cases'}"
1184 | ]
1185 | },
1186 | "execution_count": 33,
1187 | "metadata": {},
1188 | "output_type": "execute_result"
1189 | }
1190 | ],
1191 | "source": [
1192 | "dataset_all['train'][0]"
1193 | ]
1194 | },
1195 | {
1196 | "cell_type": "code",
1197 | "execution_count": 34,
1198 | "id": "wanted-ridge",
1199 | "metadata": {},
1200 | "outputs": [],
1201 | "source": [
1202 | "all_classes = dataset_all['train'].features['label'].names_file\n",
1203 | "all_classes"
1204 | ]
1205 | },
1206 | {
1207 | "cell_type": "code",
1208 | "execution_count": null,
1209 | "id": "supported-bottom",
1210 | "metadata": {},
1211 | "outputs": [],
1212 | "source": []
1213 | },
1214 | {
1215 | "cell_type": "markdown",
1216 | "id": "opening-suggestion",
1217 | "metadata": {},
1218 | "source": [
1219 | "## Persist Changes"
1220 | ]
1221 | },
1222 | {
1223 | "cell_type": "markdown",
1224 | "id": "communist-while",
1225 | "metadata": {},
1226 | "source": [
1227 | "save the dataset and load it "
1228 | ]
1229 | },
1230 | {
1231 | "cell_type": "code",
1232 | "execution_count": 35,
1233 | "id": "lonely-shame",
1234 | "metadata": {},
1235 | "outputs": [],
1236 | "source": [
1237 | "dataset_path = '../artifacts/dataset_processed/'"
1238 | ]
1239 | },
1240 | {
1241 | "cell_type": "code",
1242 | "execution_count": 36,
1243 | "id": "flush-report",
1244 | "metadata": {},
1245 | "outputs": [],
1246 | "source": [
1247 | "dataset_all.save_to_disk(dataset_path)"
1248 | ]
1249 | },
1250 | {
1251 | "cell_type": "code",
1252 | "execution_count": null,
1253 | "id": "national-italic",
1254 | "metadata": {},
1255 | "outputs": [],
1256 | "source": []
1257 | },
1258 | {
1259 | "cell_type": "code",
1260 | "execution_count": 37,
1261 | "id": "promotional-lambda",
1262 | "metadata": {},
1263 | "outputs": [
1264 | {
1265 | "data": {
1266 | "text/plain": [
1267 | "DatasetDict({\n",
1268 | " train: Dataset({\n",
1269 | " features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],\n",
1270 | " num_rows: 72743\n",
1271 | " })\n",
1272 | " test: Dataset({\n",
1273 | " features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],\n",
1274 | " num_rows: 24248\n",
1275 | " })\n",
1276 | " valid: Dataset({\n",
1277 | " features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],\n",
1278 | " num_rows: 24248\n",
1279 | " })\n",
1280 | "})"
1281 | ]
1282 | },
1283 | "execution_count": 37,
1284 | "metadata": {},
1285 | "output_type": "execute_result"
1286 | }
1287 | ],
1288 | "source": [
1289 | "datasets.load_from_disk(dataset_path)"
1290 | ]
1291 | },
1292 | {
1293 | "cell_type": "markdown",
1294 | "id": "surprising-thomas",
1295 | "metadata": {},
1296 | "source": [
1297 | "# References\n",
1298 | "\n",
1299 | "[Amazon Object Dataset](https://amazon-berkeley-objects.s3.amazonaws.com/index.html) \n",
1300 | "[Hugging Face Tutorial on Custom Dataset](https://github.com/huggingface/notebooks/blob/master/transformers_doc/custom_datasets.ipynb)"
1301 | ]
1302 | },
1303 | {
1304 | "cell_type": "code",
1305 | "execution_count": null,
1306 | "id": "animal-whole",
1307 | "metadata": {},
1308 | "outputs": [],
1309 | "source": []
1310 | }
1311 | ],
1312 | "metadata": {
1313 | "environment": {
1314 | "name": "rapids-gpu.0-18.m65",
1315 | "type": "gcloud",
1316 | "uri": "gcr.io/deeplearning-platform-release/rapids-gpu.0-18:m65"
1317 | },
1318 | "kernelspec": {
1319 | "display_name": "Python [conda env:pytorch]",
1320 | "language": "python",
1321 | "name": "conda-env-pytorch-py"
1322 | },
1323 | "language_info": {
1324 | "codemirror_mode": {
1325 | "name": "ipython",
1326 | "version": 3
1327 | },
1328 | "file_extension": ".py",
1329 | "mimetype": "text/x-python",
1330 | "name": "python",
1331 | "nbconvert_exporter": "python",
1332 | "pygments_lexer": "ipython3",
1333 | "version": "3.7.10"
1334 | }
1335 | },
1336 | "nbformat": 4,
1337 | "nbformat_minor": 5
1338 | }
1339 |
--------------------------------------------------------------------------------
/notebooks/02_timing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "cultural-excess",
6 | "metadata": {},
7 | "source": [
8 | "# Inference Review : different models"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "marked-reference",
14 | "metadata": {},
15 | "source": [
16 | "## About\n",
17 | "\n",
18 | "- Look at timing information of different bert models"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "id": "resident-carroll",
24 | "metadata": {},
25 | "source": [
26 | "## imports"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 1,
32 | "id": "appropriate-nevada",
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "import pathlib\n",
37 | "import sklearn\n",
38 | "import datasets\n",
39 | "import pandas as pd\n",
40 | "import torch\n",
41 | "\n",
42 | "import numpy as np\n",
43 | "import transformers\n",
44 | "import os\n",
45 | "import json\n",
46 | "from ts.utils.util import map_class_to_label\n",
47 | "from tqdm import tqdm, trange\n",
48 | "import time\n",
49 | "import torchviz\n",
50 | "import torch.nn as nn\n"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "id": "official-kingdom",
57 | "metadata": {},
58 | "outputs": [],
59 | "source": []
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 2,
64 | "id": "considerable-terminal",
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "query = \"men shoes\""
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "id": "optional-cross",
75 | "metadata": {},
76 | "outputs": [],
77 | "source": []
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "id": "tribal-depression",
82 | "metadata": {},
83 | "source": [
84 | "## Device Specs"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 3,
90 | "id": "silent-rouge",
91 | "metadata": {},
92 | "outputs": [
93 | {
94 | "name": "stdout",
95 | "output_type": "stream",
96 | "text": [
97 | "ATen/Parallel:\n",
98 | "\tat::get_num_threads() : 8\n",
99 | "\tat::get_num_interop_threads() : 8\n",
100 | "OpenMP 201511 (a.k.a. OpenMP 4.5)\n",
101 | "\tomp_get_max_threads() : 8\n",
102 | "Intel(R) oneAPI Math Kernel Library Version 2022.1-Product Build 20220311 for Intel(R) 64 architecture applications\n",
103 | "\tmkl_get_max_threads() : 8\n",
104 | "Intel(R) MKL-DNN v2.6.0 (Git Hash 52b5f107dd9cf10910aaa19cb47f3abf9b349815)\n",
105 | "std::thread::hardware_concurrency() : 16\n",
106 | "Environment variables:\n",
107 | "\tOMP_NUM_THREADS : [not set]\n",
108 | "\tMKL_NUM_THREADS : [not set]\n",
109 | "ATen parallel backend: OpenMP\n",
110 | "\n"
111 | ]
112 | }
113 | ],
114 | "source": [
115 | "print ( torch.__config__.parallel_info())"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "id": "ecological-essex",
122 | "metadata": {},
123 | "outputs": [],
124 | "source": []
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "id": "polyphonic-public",
130 | "metadata": {},
131 | "outputs": [],
132 | "source": []
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "id": "related-current",
137 | "metadata": {},
138 | "source": [
139 | "## Model: bert-large-uncased"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 4,
145 | "id": "solar-restoration",
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "base_model = \"bert-large-uncased\""
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 5,
155 | "id": "traditional-shore",
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "data": {
160 | "application/vnd.jupyter.widget-view+json": {
161 | "model_id": "f10152f05b4e4936a450292bb46cc535",
162 | "version_major": 2,
163 | "version_minor": 0
164 | },
165 | "text/plain": [
166 | "Downloading: 0%| | 0.00/571 [00:00, ?B/s]"
167 | ]
168 | },
169 | "metadata": {},
170 | "output_type": "display_data"
171 | },
172 | {
173 | "data": {
174 | "application/vnd.jupyter.widget-view+json": {
175 | "model_id": "7017cd8945644b709ae81e400155c171",
176 | "version_major": 2,
177 | "version_minor": 0
178 | },
179 | "text/plain": [
180 | "Downloading: 0%| | 0.00/1.25G [00:00, ?B/s]"
181 | ]
182 | },
183 | "metadata": {},
184 | "output_type": "display_data"
185 | },
186 | {
187 | "name": "stderr",
188 | "output_type": "stream",
189 | "text": [
190 | "Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']\n",
191 | "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
192 | "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
193 | "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
194 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
195 | ]
196 | },
197 | {
198 | "data": {
199 | "application/vnd.jupyter.widget-view+json": {
200 | "model_id": "684c59c684684b7c8dcccb4ae2760164",
201 | "version_major": 2,
202 | "version_minor": 0
203 | },
204 | "text/plain": [
205 | "Downloading: 0%| | 0.00/28.0 [00:00, ?B/s]"
206 | ]
207 | },
208 | "metadata": {},
209 | "output_type": "display_data"
210 | },
211 | {
212 | "data": {
213 | "application/vnd.jupyter.widget-view+json": {
214 | "model_id": "95eb4afa08b743f5b07f6299394bfdf5",
215 | "version_major": 2,
216 | "version_minor": 0
217 | },
218 | "text/plain": [
219 | "Downloading: 0%| | 0.00/226k [00:00, ?B/s]"
220 | ]
221 | },
222 | "metadata": {},
223 | "output_type": "display_data"
224 | },
225 | {
226 | "data": {
227 | "application/vnd.jupyter.widget-view+json": {
228 | "model_id": "c6e07a81db2848d49643eb6549d02ad8",
229 | "version_major": 2,
230 | "version_minor": 0
231 | },
232 | "text/plain": [
233 | "Downloading: 0%| | 0.00/455k [00:00, ?B/s]"
234 | ]
235 | },
236 | "metadata": {},
237 | "output_type": "display_data"
238 | }
239 | ],
240 | "source": [
241 | "model = transformers.AutoModelForSequenceClassification.from_pretrained(base_model)\n",
242 | "\n",
243 | "tokenizer = transformers.AutoTokenizer.from_pretrained(\n",
244 | " base_model\n",
245 | " )\n",
246 | "\n",
247 | "res = tokenizer.encode_plus(query, return_tensors=\"pt\", padding=\"max_length\", truncation=True)"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 6,
253 | "id": "rocky-arctic",
254 | "metadata": {},
255 | "outputs": [
256 | {
257 | "data": {
258 | "text/plain": [
259 | "{'input_ids': tensor([[ 101, 2273, 6007, 102, 0, 0, 0, 0, 0, 0, 0, 0,\n",
260 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
261 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
262 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
263 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
264 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
265 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
266 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
267 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
268 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
269 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
270 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
271 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
272 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
273 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
274 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
275 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
276 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
277 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
278 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
279 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
280 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
281 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
282 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
283 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
284 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
285 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
286 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
287 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
288 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
289 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
290 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
291 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
292 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
293 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
294 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
295 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
296 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
297 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
298 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
299 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
300 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
301 | " 0, 0, 0, 0, 0, 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
302 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
303 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
304 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
305 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
306 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
307 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
308 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
309 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
310 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
311 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
312 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
313 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
314 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
315 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
316 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
317 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
318 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
319 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
320 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
321 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
322 | " 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
323 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
324 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
325 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
326 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
327 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
328 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
329 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
330 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
331 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
332 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
333 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
334 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
335 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
336 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
337 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
338 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
339 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
340 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
341 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
342 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
343 | " 0, 0, 0, 0, 0, 0, 0, 0]])}"
344 | ]
345 | },
346 | "execution_count": 6,
347 | "metadata": {},
348 | "output_type": "execute_result"
349 | }
350 | ],
351 | "source": [
352 | "res"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": 7,
358 | "id": "corresponding-edgar",
359 | "metadata": {},
360 | "outputs": [
361 | {
362 | "name": "stdout",
363 | "output_type": "stream",
364 | "text": [
365 | "734 ms ± 25.5 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)\n"
366 | ]
367 | }
368 | ],
369 | "source": [
370 | "%%timeit -r 3 -n 5\n",
371 | "\n",
372 | "model_res = model(**res)\n",
373 | "model_res"
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "execution_count": 8,
379 | "id": "strong-physiology",
380 | "metadata": {},
381 | "outputs": [
382 | {
383 | "name": "stdout",
384 | "output_type": "stream",
385 | "text": [
386 | "The slowest run took 5.28 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
387 | "278 ms ± 228 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)\n"
388 | ]
389 | }
390 | ],
391 | "source": [
392 | "%%timeit -r 3 -n 5\n",
393 | "if torch.cuda.is_available():\n",
394 | " model_cuda = model.cuda()\n",
395 | " model_cuda(res['input_ids'].cuda(),res['attention_mask'].cuda())"
396 | ]
397 | },
398 | {
399 | "cell_type": "markdown",
400 | "id": "american-guide",
401 | "metadata": {},
402 | "source": [
403 | "## Model: bert-base-uncased"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": 9,
409 | "id": "nonprofit-plaza",
410 | "metadata": {},
411 | "outputs": [],
412 | "source": [
413 | "base_model = \"bert-base-uncased\""
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 10,
419 | "id": "charged-thirty",
420 | "metadata": {},
421 | "outputs": [
422 | {
423 | "data": {
424 | "application/vnd.jupyter.widget-view+json": {
425 | "model_id": "0b8c07dd566146fba60a06a2ffdb1232",
426 | "version_major": 2,
427 | "version_minor": 0
428 | },
429 | "text/plain": [
430 | "Downloading: 0%| | 0.00/570 [00:00, ?B/s]"
431 | ]
432 | },
433 | "metadata": {},
434 | "output_type": "display_data"
435 | },
436 | {
437 | "data": {
438 | "application/vnd.jupyter.widget-view+json": {
439 | "model_id": "1409f6c052b240369531dbceb05cd359",
440 | "version_major": 2,
441 | "version_minor": 0
442 | },
443 | "text/plain": [
444 | "Downloading: 0%| | 0.00/420M [00:00, ?B/s]"
445 | ]
446 | },
447 | "metadata": {},
448 | "output_type": "display_data"
449 | },
450 | {
451 | "name": "stderr",
452 | "output_type": "stream",
453 | "text": [
454 | "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']\n",
455 | "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
456 | "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
457 | "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
458 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
459 | ]
460 | },
461 | {
462 | "data": {
463 | "application/vnd.jupyter.widget-view+json": {
464 | "model_id": "8110170de678442f854eb4c17efaf84a",
465 | "version_major": 2,
466 | "version_minor": 0
467 | },
468 | "text/plain": [
469 | "Downloading: 0%| | 0.00/28.0 [00:00, ?B/s]"
470 | ]
471 | },
472 | "metadata": {},
473 | "output_type": "display_data"
474 | },
475 | {
476 | "data": {
477 | "application/vnd.jupyter.widget-view+json": {
478 | "model_id": "89133ce67be94708a03a45456c1f2863",
479 | "version_major": 2,
480 | "version_minor": 0
481 | },
482 | "text/plain": [
483 | "Downloading: 0%| | 0.00/226k [00:00, ?B/s]"
484 | ]
485 | },
486 | "metadata": {},
487 | "output_type": "display_data"
488 | },
489 | {
490 | "data": {
491 | "application/vnd.jupyter.widget-view+json": {
492 | "model_id": "57b9872952c44de8b98b288ab71b4ba2",
493 | "version_major": 2,
494 | "version_minor": 0
495 | },
496 | "text/plain": [
497 | "Downloading: 0%| | 0.00/455k [00:00, ?B/s]"
498 | ]
499 | },
500 | "metadata": {},
501 | "output_type": "display_data"
502 | }
503 | ],
504 | "source": [
505 | "model = transformers.AutoModelForSequenceClassification.from_pretrained(base_model)\n",
506 | "\n",
507 | "tokenizer = transformers.AutoTokenizer.from_pretrained(\n",
508 | " base_model\n",
509 | " )\n",
510 | "\n",
511 | "res = tokenizer.encode_plus(query, return_tensors=\"pt\", padding=\"max_length\", truncation=True)"
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": 11,
517 | "id": "cheap-california",
518 | "metadata": {},
519 | "outputs": [
520 | {
521 | "name": "stdout",
522 | "output_type": "stream",
523 | "text": [
524 | "217 ms ± 6.94 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)\n"
525 | ]
526 | }
527 | ],
528 | "source": [
529 | "%%timeit -r 3 -n 5\n",
530 | "\n",
531 | "model_res = model(**res)\n",
532 | "model_res"
533 | ]
534 | },
535 | {
536 | "cell_type": "code",
537 | "execution_count": 12,
538 | "id": "racial-request",
539 | "metadata": {},
540 | "outputs": [
541 | {
542 | "name": "stdout",
543 | "output_type": "stream",
544 | "text": [
545 | "43.8 ms ± 13.9 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)\n"
546 | ]
547 | }
548 | ],
549 | "source": [
550 | "%%timeit -r 3 -n 5\n",
551 | "if torch.cuda.is_available():\n",
552 | " model_cuda = model.cuda()\n",
553 | " model_cuda(res['input_ids'].cuda(),res['attention_mask'].cuda())"
554 | ]
555 | },
556 | {
557 | "cell_type": "markdown",
558 | "id": "eligible-paragraph",
559 | "metadata": {},
560 | "source": [
561 | "## Model: distilbert-base-uncased"
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": 13,
567 | "id": "altered-cotton",
568 | "metadata": {},
569 | "outputs": [],
570 | "source": [
571 | "base_model = \"distilbert-base-uncased\""
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": 14,
577 | "id": "convenient-possibility",
578 | "metadata": {},
579 | "outputs": [
580 | {
581 | "name": "stderr",
582 | "output_type": "stream",
583 | "text": [
584 | "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']\n",
585 | "- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
586 | "- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
587 | "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']\n",
588 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
589 | ]
590 | }
591 | ],
592 | "source": [
593 | "model = transformers.AutoModelForSequenceClassification.from_pretrained(base_model)\n",
594 | "\n",
595 | "tokenizer = transformers.AutoTokenizer.from_pretrained(\n",
596 | " base_model\n",
597 | " )\n",
598 | "\n",
599 | "res = tokenizer.encode_plus(query, return_tensors=\"pt\", padding=\"max_length\", truncation=True)"
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": 15,
605 | "id": "literary-particle",
606 | "metadata": {},
607 | "outputs": [
608 | {
609 | "name": "stdout",
610 | "output_type": "stream",
611 | "text": [
612 | "112 ms ± 4.13 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)\n"
613 | ]
614 | }
615 | ],
616 | "source": [
617 | "%%timeit -r 3 -n 5\n",
618 | "\n",
619 | "model_res = model(**res)\n",
620 | "model_res"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": 16,
626 | "id": "incorporated-conservative",
627 | "metadata": {},
628 | "outputs": [
629 | {
630 | "name": "stdout",
631 | "output_type": "stream",
632 | "text": [
633 | "26.6 ms ± 6.81 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)\n"
634 | ]
635 | }
636 | ],
637 | "source": [
638 | "%%timeit -r 3 -n 5\n",
639 | "if torch.cuda.is_available():\n",
640 | " model_cuda = model.cuda()\n",
641 | " model_cuda(res['input_ids'].cuda(),res['attention_mask'].cuda())"
642 | ]
643 | },
644 | {
645 | "cell_type": "code",
646 | "execution_count": null,
647 | "id": "cleared-nashville",
648 | "metadata": {},
649 | "outputs": [],
650 | "source": []
651 | },
652 | {
653 | "cell_type": "code",
654 | "execution_count": null,
655 | "id": "expected-adult",
656 | "metadata": {},
657 | "outputs": [],
658 | "source": []
659 | },
660 | {
661 | "cell_type": "code",
662 | "execution_count": null,
663 | "id": "isolated-template",
664 | "metadata": {},
665 | "outputs": [],
666 | "source": []
667 | },
668 | {
669 | "cell_type": "code",
670 | "execution_count": null,
671 | "id": "japanese-batch",
672 | "metadata": {},
673 | "outputs": [],
674 | "source": []
675 | },
676 | {
677 | "cell_type": "code",
678 | "execution_count": null,
679 | "id": "parliamentary-robin",
680 | "metadata": {},
681 | "outputs": [],
682 | "source": []
683 | },
684 | {
685 | "cell_type": "code",
686 | "execution_count": null,
687 | "id": "requested-anthropology",
688 | "metadata": {},
689 | "outputs": [],
690 | "source": []
691 | },
692 | {
693 | "cell_type": "code",
694 | "execution_count": null,
695 | "id": "failing-instrument",
696 | "metadata": {},
697 | "outputs": [],
698 | "source": []
699 | }
700 | ],
701 | "metadata": {
702 | "environment": {
703 | "kernel": "pyupgrade",
704 | "name": "pytorch-gpu.1-11.m94",
705 | "type": "gcloud",
706 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-11:m94"
707 | },
708 | "kernelspec": {
709 | "display_name": "pyupgrade",
710 | "language": "python",
711 | "name": "pyupgrade"
712 | },
713 | "language_info": {
714 | "codemirror_mode": {
715 | "name": "ipython",
716 | "version": 3
717 | },
718 | "file_extension": ".py",
719 | "mimetype": "text/x-python",
720 | "name": "python",
721 | "nbconvert_exporter": "python",
722 | "pygments_lexer": "ipython3",
723 | "version": "3.7.12"
724 | }
725 | },
726 | "nbformat": 4,
727 | "nbformat_minor": 5
728 | }
729 |
--------------------------------------------------------------------------------
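The `%%timeit` cells above compare the three checkpoints interactively. Outside a notebook, a small helper along the lines of the following sketch reproduces the same CPU comparison; the function name, loop count, and default query are illustrative and not part of the repository.

```python
# Sketch: rough CPU latency comparison mirroring the %%timeit cells above.
import time

import torch
import transformers


def time_model(base_model: str, query: str = "men shoes", runs: int = 5) -> float:
    """Return the mean seconds per forward pass for one checkpoint."""
    model = transformers.AutoModelForSequenceClassification.from_pretrained(base_model)
    tokenizer = transformers.AutoTokenizer.from_pretrained(base_model)
    enc = tokenizer.encode_plus(query, return_tensors="pt",
                                padding="max_length", truncation=True)
    model.eval()
    with torch.no_grad():
        start = time.perf_counter()
        for _ in range(runs):
            model(**enc)
    return (time.perf_counter() - start) / runs


if __name__ == "__main__":
    for name in ("bert-large-uncased", "bert-base-uncased", "distilbert-base-uncased"):
        print(f"{name}: {time_model(name) * 1000:.0f} ms per forward pass")
```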
/notebooks/04_packaging.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "disturbed-division",
6 | "metadata": {},
7 | "source": [
8 | "# Packaging Model"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "aggressive-pasta",
14 | "metadata": {},
15 | "source": [
16 | "## About\n",
17 | "\n",
18 | "- Package the given model using Torch Model Archive\n",
19 | "- Write a custom handler to support pre processing and post processing"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "id": "appointed-funeral",
25 | "metadata": {},
26 | "source": [
27 | "## Working directory"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "id": "respective-tourist",
33 | "metadata": {},
34 | "source": [
35 | "orignal model and the traced model we saved from before"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 1,
41 | "id": "04d57cd2-9d4d-486a-be0c-489b0b4cf3b8",
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "import os\n",
46 | "import sys"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "id": "2528f4c1-7963-4f54-8a21-ee7b06061897",
53 | "metadata": {},
54 | "outputs": [],
55 | "source": []
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "id": "c7c866ce-2428-4ac5-9bf6-292e5073faa4",
61 | "metadata": {},
62 | "outputs": [],
63 | "source": []
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 2,
68 | "id": "turkish-finland",
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "name": "stdout",
73 | "output_type": "stream",
74 | "text": [
75 | "distilbert-base-uncased distilbert-base-uncased__trace\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "!ls ../artifacts/model/"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "id": "specific-usage",
86 | "metadata": {},
87 | "source": [
88 | "directory contains tokenizer/ vocab / pytorch model"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 3,
94 | "id": "extreme-spirit",
95 | "metadata": {},
96 | "outputs": [
97 | {
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "config.json\t setup_config.json\t tokenizer_config.json\n",
102 | "index_to_name.json special_tokens_map.json training_args.bin\n",
103 | "pytorch_model.bin tokenizer.json\t vocab.txt\n"
104 | ]
105 | }
106 | ],
107 | "source": [
108 | "!ls ../artifacts/model/distilbert-base-uncased "
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 4,
114 | "id": "sporting-philip",
115 | "metadata": {},
116 | "outputs": [
117 | {
118 | "name": "stdout",
119 | "output_type": "stream",
120 | "text": [
121 | "index_to_name.json special_tokens_map.json traced_model.pt\n",
122 | "model_store\t tokenizer.json\t vocab.txt\n",
123 | "setup_config.json tokenizer_config.json\n"
124 | ]
125 | }
126 | ],
127 | "source": [
128 | "!ls ../artifacts/model/distilbert-base-uncased__trace"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "id": "electronic-liver",
135 | "metadata": {},
136 | "outputs": [],
137 | "source": []
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "id": "instructional-electric",
142 | "metadata": {},
143 | "source": [
144 | "## Torch Model Archiver\n",
145 | "\n",
146 | "TorchServe required the model and its dependant artifacts to be packaged in a single file. \n",
147 | "\n",
148 | "[torch-model-archiver](https://pypi.org/project/torch-model-archiver/) is a python package that can package the artifacts to a mar file"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 5,
154 | "id": "outdoor-agenda",
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "name": "stdout",
159 | "output_type": "stream",
160 | "text": [
161 | "usage: torch-model-archiver [-h] --model-name MODEL_NAME\n",
162 | " [--serialized-file SERIALIZED_FILE]\n",
163 | " [--model-file MODEL_FILE] --handler HANDLER\n",
164 | " [--extra-files EXTRA_FILES]\n",
165 | " [--runtime {python,python2,python3}]\n",
166 | " [--export-path EXPORT_PATH]\n",
167 | " [--archive-format {tgz,no-archive,default}] [-f]\n",
168 | " -v VERSION [-r REQUIREMENTS_FILE]\n",
169 | "\n",
170 | "Torch Model Archiver Tool\n",
171 | "\n",
172 | "optional arguments:\n",
173 | " -h, --help show this help message and exit\n",
174 | " --model-name MODEL_NAME\n",
175 | " Exported model name. Exported file will be named as\n",
176 | " model-name.mar and saved in current working directory if no --export-path is\n",
177 | " specified, else it will be saved under the export path\n",
178 | " --serialized-file SERIALIZED_FILE\n",
179 | " Path to .pt or .pth file containing state_dict in case of eager mode\n",
180 | " or an executable ScriptModule in case of TorchScript.\n",
181 | " --model-file MODEL_FILE\n",
182 | " Path to python file containing model architecture.\n",
183 | " This parameter is mandatory for eager mode models.\n",
184 | " The model architecture file must contain only one\n",
185 | " class definition extended from torch.nn.modules.\n",
186 | " --handler HANDLER TorchServe's default handler name\n",
187 | " or Handler path to handle custom inference logic.\n",
188 | " --extra-files EXTRA_FILES\n",
189 | " Comma separated path to extra dependency files.\n",
190 | " --runtime {python,python2,python3}\n",
191 | " The runtime specifies which language to run your inference code on.\n",
192 | " The default runtime is \"python\".\n",
193 | " --export-path EXPORT_PATH\n",
194 | " Path where the exported .mar file will be saved. This is an optional\n",
195 | " parameter. If --export-path is not specified, the file will be saved in the\n",
196 | " current working directory. \n",
197 | " --archive-format {tgz,no-archive,default}\n",
198 | " The format in which the model artifacts are archived.\n",
199 | " \"tgz\": This creates the model-archive in .tar.gz format.\n",
200 | " If platform hosting TorchServe requires model-artifacts to be in \".tar.gz\"\n",
201 | " use this option.\n",
202 | " \"no-archive\": This option creates an non-archived version of model artifacts\n",
203 | " at \"export-path/{model-name}\" location. As a result of this choice, \n",
204 | " MANIFEST file will be created at \"export-path/{model-name}\" location\n",
205 | " without archiving these model files\n",
206 | " \"default\": This creates the model-archive in .mar format.\n",
207 | " This is the default archiving format. Models archived in this format\n",
208 | " will be readily hostable on native TorchServe.\n",
209 | " -f, --force When the -f or --force flag is specified, an existing .mar file with same\n",
210 | " name as that provided in --model-name in the path specified by --export-path\n",
211 | " will overwritten\n",
212 | " -v VERSION, --version VERSION\n",
213 | " Model's version\n",
214 | " -r REQUIREMENTS_FILE, --requirements-file REQUIREMENTS_FILE\n",
215 | " Path to a requirements.txt containing model specific python dependency\n",
216 | " packages.\n"
217 | ]
218 | }
219 | ],
220 | "source": [
221 | "%%bash \n",
222 | "\n",
223 | "torch-model-archiver --help"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "id": "better-nicholas",
230 | "metadata": {},
231 | "outputs": [],
232 | "source": []
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "id": "insured-paradise",
237 | "metadata": {},
238 | "source": [
239 | "package the model artifact and actual handler code"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 6,
245 | "id": "87a46200-5d60-4f0e-af70-904f1f2089f7",
246 | "metadata": {},
247 | "outputs": [
248 | {
249 | "name": "stdout",
250 | "output_type": "stream",
251 | "text": [
252 | "/opt/conda/envs/pyupgrade/bin/torch-model-archiver\n"
253 | ]
254 | }
255 | ],
256 | "source": [
257 | "!which torch-model-archiver "
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 7,
263 | "id": "searching-testing",
264 | "metadata": {
265 | "scrolled": true
266 | },
267 | "outputs": [
268 | {
269 | "name": "stdout",
270 | "output_type": "stream",
271 | "text": [
272 | "/home/jupyter/workshop/pytorch-serving-workshop\n"
273 | ]
274 | },
275 | {
276 | "name": "stderr",
277 | "output_type": "stream",
278 | "text": [
279 | "WARNING - Overwriting artifacts/model/distilbert-base-uncased__trace/model_store/pt_classifier.mar ...\n"
280 | ]
281 | }
282 | ],
283 | "source": [
284 | "%%bash\n",
285 | "\n",
286 | "cd ..\n",
287 | "pwd\n",
288 | "\n",
289 | "ARTIFACT_BASE_DIR=\"artifacts/model/distilbert-base-uncased__trace\"\n",
290 | "\n",
291 | "MODEL_NAME=\"pt_classifier\"\n",
292 | "MODEL_VERSION=\"1.0\"\n",
293 | "MODEL_STORE=\"${ARTIFACT_BASE_DIR}/model_store\"\n",
294 | "MODEL_SERIALIZED_FILE=\"${ARTIFACT_BASE_DIR}/traced_model.pt\"\n",
295 | "\n",
296 | "TOKENIZER_FILES=\"${ARTIFACT_BASE_DIR}/tokenizer_config.json,${ARTIFACT_BASE_DIR}/special_tokens_map.json,${ARTIFACT_BASE_DIR}/vocab.txt,${ARTIFACT_BASE_DIR}/tokenizer.json\"\n",
297 | "MODEL_EXTRA_FILES=\"${ARTIFACT_BASE_DIR}/index_to_name.json,${ARTIFACT_BASE_DIR}/setup_config.json,${TOKENIZER_FILES}\"\n",
298 | "\n",
299 | "\n",
300 | "\n",
301 | "\n",
302 | "mkdir -p $MODEL_STORE\n",
303 | "\n",
304 | "torch-model-archiver --model-name ${MODEL_NAME} \\\n",
305 | "--version ${MODEL_VERSION} \\\n",
306 | "--serialized-file ${MODEL_SERIALIZED_FILE} \\\n",
307 | "--export-path ${MODEL_STORE} \\\n",
308 | "--extra-files ${MODEL_EXTRA_FILES} \\\n",
309 | "--handler ./serving/handler.py \\\n",
310 | "--force\n",
311 | "\n"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 8,
317 | "id": "familiar-hormone",
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "# %load ../serving/handler.py\n",
322 | "import json\n",
323 | "import logging\n",
324 | "import os\n",
325 | "import time\n",
326 | "from abc import ABC\n",
327 | "from collections.abc import Iterable\n",
328 | "import transformers\n",
329 | "import ast\n",
330 | "import torch\n",
331 | "\n",
332 | "import numpy as np\n",
333 | "from ts.metrics.dimension import Dimension\n",
334 | "\n",
335 | "logger = logging.getLogger(__name__)\n",
336 | "\n",
337 | "from ts.torch_handler.base_handler import BaseHandler\n",
338 | "\n",
339 | "from ts.utils.util import map_class_to_label\n",
340 | "\n",
341 | "import time\n",
342 | "\n",
343 | "\n",
344 | "logger = logging.getLogger(__name__)\n",
345 | "logger.info(\"Transformers version %s\",transformers.__version__)\n",
346 | "\n",
347 | "class CustomHandler(BaseHandler, ABC):\n",
348 | " \"\"\"\n",
349 | " Transformers handler class for sequence classification.\n",
350 | " \"\"\"\n",
351 | "\n",
352 | " def __init__(self):\n",
353 | " super(CustomHandler, self).__init__()\n",
354 | " self.initialized = False\n",
355 | "\n",
356 | " def initialize(self, ctx):\n",
357 | "\n",
358 | " \n",
359 | " self.manifest = ctx.manifest\n",
360 | " properties = ctx.system_properties\n",
361 | " model_dir = properties.get(\"model_dir\")\n",
362 | " serialized_file = self.manifest[\"model\"][\"serializedFile\"]\n",
363 | " model_pt_path = os.path.join(model_dir, serialized_file)\n",
364 | "\n",
365 | " self.device = torch.device(\n",
366 | " \"cuda:\" + str(properties.get(\"gpu_id\"))\n",
367 | " if torch.cuda.is_available() and properties.get(\"gpu_id\") is not None\n",
368 | " else \"cpu\"\n",
369 | " )\n",
370 | " \n",
371 | " # read configs for the mode, model_name, etc. from setup_config.json\n",
372 | " setup_config_path = os.path.join(model_dir, \"setup_config.json\")\n",
373 | " if os.path.isfile(setup_config_path):\n",
374 | " with open(setup_config_path) as setup_config_file:\n",
375 | " self.setup_config = json.load(setup_config_file)\n",
376 | " else:\n",
377 | " logger.warning(\"Missing the setup_config.json file.\")\n",
378 | "\n",
379 | "\n",
380 | " # Loading the model and tokenizer from checkpoint and config files based on the user's choice of mode\n",
381 | " # further setup config can be added.\n",
382 | " if self.setup_config[\"save_mode\"] == \"jit\":\n",
383 | " self.model = torch.jit.load(model_pt_path, map_location=self.device)\n",
384 | " elif self.setup_config[\"save_mode\"] == \"original\":\n",
385 | " self.model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir)\n",
386 | "\n",
387 | " self.model.to(self.device)\n",
388 | " \n",
389 | " else:\n",
390 | " logger.warning(\"Missing the checkpoint or state_dict.\")\n",
391 | "\n",
392 | " \n",
393 | " \n",
394 | " self.top_k = self.setup_config[\"top_k\"]\n",
395 | " self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir \n",
396 | " , do_lower_case=self.setup_config[\"do_lower_case\"]\n",
397 | " , torchscript=True)\n",
398 | "\n",
399 | " \n",
400 | " self.model.eval()\n",
401 | "\n",
402 | " logger.info(\n",
403 | " \"Transformer model from path %s loaded successfully\", model_dir\n",
404 | " )\n",
405 | "\n",
406 | " # Read the mapping file, index to object name\n",
407 | " mapping_file_path = os.path.join(model_dir, \"index_to_name.json\")\n",
408 | " \n",
409 | " if os.path.isfile(mapping_file_path):\n",
410 | " with open(mapping_file_path) as f:\n",
411 | " self.mapping = json.load(f)\n",
412 | " else:\n",
413 | " logger.warning(\"Missing the index_to_name.json file.\")\n",
414 | " \n",
415 | " self.initialized = True\n",
416 | "\n",
417 | " def preprocess(self, requests):\n",
418 | " \"\"\"Basic text preprocessing, based on the user's chocie of application mode.\n",
419 | " Args:\n",
420 | " requests (str): The Input data in the form of text is passed on to the preprocess\n",
421 | " function.\n",
422 | " Returns:\n",
423 | " list : The preprocess function returns a list of Tensor for the size of the word tokens.\n",
424 | " \"\"\"\n",
425 | " input_ids_batch = None\n",
426 | " attention_mask_batch = None\n",
427 | " for idx, data in enumerate(requests):\n",
428 | " request = data.get(\"data\")\n",
429 | " if request is None:\n",
430 | " request = data.get(\"body\")\n",
431 | " if isinstance(request, (bytes, bytearray)):\n",
432 | " request = request.decode('utf-8')\n",
433 | "\n",
434 | " input_text = request['text']\n",
435 | " max_length = self.setup_config[\"max_length\"]\n",
436 | " logger.info(\"Received text: '%s'\", input_text)\n",
437 | "\n",
438 | " # preprocessing text for sequence_classification and token_classification.\n",
439 | " inputs = self.tokenizer.encode_plus(input_text, max_length=int(max_length), pad_to_max_length=True, add_special_tokens=True, return_tensors='pt')\n",
440 | " \n",
441 | " \n",
442 | " input_ids = inputs[\"input_ids\"].to(self.device)\n",
443 | " attention_mask = inputs[\"attention_mask\"].to(self.device)\n",
444 | " # making a batch out of the recieved requests\n",
445 | " # attention masks are passed for cases where input tokens are padded.\n",
446 | " if input_ids.shape is not None:\n",
447 | " if input_ids_batch is None:\n",
448 | " input_ids_batch = input_ids\n",
449 | " attention_mask_batch = attention_mask\n",
450 | " else:\n",
451 | " input_ids_batch = torch.cat((input_ids_batch, input_ids), 0)\n",
452 | " attention_mask_batch = torch.cat((attention_mask_batch, attention_mask), 0)\n",
453 | " \n",
454 | " input_ids_batch = input_ids_batch.to(self.device)\n",
455 | " attention_mask_batch = attention_mask_batch.to(self.device)\n",
456 | " \n",
457 | " return (input_ids_batch, attention_mask_batch)\n",
458 | "\n",
459 | " def inference(self, input_batch):\n",
460 | "\n",
461 | " \n",
462 | " input_ids_batch, attention_mask_batch = input_batch\n",
463 | " inferences = []\n",
464 | " \n",
465 | " predictions = self.model(input_ids_batch, attention_mask_batch)\n",
466 | " \n",
467 | "# ps = torch.nn.functional.softmax(predictions.logits, dim=1)\n",
468 | "# probs, classes = torch.topk(ps, self.top_k, dim=1)\n",
469 | "# probs = probs.tolist()\n",
470 | "# classes = classes.tolist()\n",
471 | "\n",
472 | "# inferences = map_class_to_label(probs, self.mapping, classes)\n",
473 | " \n",
474 | " num_rows, num_cols = predictions[0].shape\n",
475 | " for i in range(num_rows):\n",
476 | " ps = torch.nn.functional.softmax(predictions[i], dim=1)\n",
477 | " probs, classes = torch.topk(ps, self.top_k, dim=1)\n",
478 | " probs = probs.tolist()\n",
479 | " classes = classes.tolist()\n",
480 | " \n",
481 | " friendly_labels = map_class_to_label(probs, self.mapping, classes)\n",
482 | " inferences.append(friendly_labels)\n",
483 | "\n",
484 | "\n",
485 | " return inferences\n",
486 | "\n",
487 | " def postprocess(self, inference_output):\n",
488 | "\n",
489 | " return inference_output\n",
490 | " \n",
491 | " \n",
492 | " def handle(self, data, context):\n",
493 | "\n",
494 | " # It can be used for pre or post processing if needed as additional request\n",
495 | " # information is available in context\n",
496 | " \n",
497 | " start_time = time.time()\n",
498 | " \n",
499 | " self.context = context\n",
500 | " metrics = self.context.metrics\n",
501 | " \n",
502 | " data_preprocess = self.preprocess(data)\n",
503 | " data_inference = self.inference(data_preprocess)\n",
504 | " data_postprocess = self.postprocess(data_inference)\n",
505 | " \n",
506 | " \n",
507 | " \n",
508 | " stop_time = time.time()\n",
509 | " metrics.add_time('HandlerTime', round((stop_time - start_time) * 1000, 2), None, 'ms')\n",
510 | " \n",
511 | " return data_postprocess\n"
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": null,
517 | "id": "foster-concept",
518 | "metadata": {},
519 | "outputs": [],
520 | "source": []
521 | },
522 | {
523 | "cell_type": "markdown",
524 | "id": "confused-discretion",
525 | "metadata": {},
526 | "source": [
527 | "if you would live to serve through Docker, lets copy the `model_store` artifact relative to the DockerFile folder"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": 9,
533 | "id": "intensive-contest",
534 | "metadata": {},
535 | "outputs": [],
536 | "source": [
537 | "%%bash\n",
538 | "cd .. \n",
539 | "\n",
540 | "rm -rf serving/model_store\n",
541 | "mkdir -p serving/model_store\n",
542 | "\n",
543 | "cp artifacts/model/distilbert-base-uncased__trace/model_store/* serving/model_store\n",
544 | "cp artifacts/model/distilbert-base-uncased__trace/setup_config.json serving/model_store/"
545 | ]
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": null,
550 | "id": "interesting-hollow",
551 | "metadata": {},
552 | "outputs": [],
553 | "source": []
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": null,
558 | "id": "ideal-specialist",
559 | "metadata": {},
560 | "outputs": [],
561 | "source": []
562 | },
563 | {
564 | "cell_type": "code",
565 | "execution_count": null,
566 | "id": "creative-ballot",
567 | "metadata": {},
568 | "outputs": [],
569 | "source": []
570 | },
571 | {
572 | "cell_type": "markdown",
573 | "id": "funky-summer",
574 | "metadata": {},
575 | "source": [
576 | "## Torchserve\n",
577 | "\n",
578 | "> TorchServe is a performant, flexible and easy to use tool for serving PyTorch eager mode and torschripted models.\n",
579 | "\n",
580 | "Ref: [TorchServe Docs](https://pytorch.org/serve/)"
581 | ]
582 | },
583 | {
584 | "cell_type": "markdown",
585 | "id": "indie-tokyo",
586 | "metadata": {},
587 | "source": [
588 | "below command starts torchserve"
589 | ]
590 | },
591 | {
592 | "cell_type": "code",
593 | "execution_count": 10,
594 | "id": "pretty-graphic",
595 | "metadata": {},
596 | "outputs": [],
597 | "source": [
598 | "%%bash --bg\n",
599 | "cd ..\n",
600 | "torchserve --ts-config ./serving/config.properties \\\n",
601 | "--start --model-store ./serving/model_store --ncs\n",
602 | "\n"
603 | ]
604 | },
605 | {
606 | "cell_type": "code",
607 | "execution_count": null,
608 | "id": "e9dce210",
609 | "metadata": {},
610 | "outputs": [],
611 | "source": [
612 | "%%bash\n",
613 | "\n",
614 | "echo \"waiting for some time for torchserve to start\"\n",
615 | "sleep 30"
616 | ]
617 | },
618 | {
619 | "cell_type": "code",
620 | "execution_count": 11,
621 | "id": "missing-champagne",
622 | "metadata": {},
623 | "outputs": [
624 | {
625 | "name": "stdout",
626 | "output_type": "stream",
627 | "text": [
628 | "access_log.log\tmodel_log.log model_metrics.log ts_log.log ts_metrics.log\n"
629 | ]
630 | }
631 | ],
632 | "source": [
633 | "!ls ../logs/"
634 | ]
635 | },
636 | {
637 | "cell_type": "code",
638 | "execution_count": 12,
639 | "id": "endangered-responsibility",
640 | "metadata": {},
641 | "outputs": [
642 | {
643 | "name": "stdout",
644 | "output_type": "stream",
645 | "text": [
646 | "2022-07-10T00:01:37,265 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Listening on port: /tmp/.ts.sock.9000\n",
647 | "2022-07-10T00:01:37,266 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - [PID]8127\n",
648 | "2022-07-10T00:01:37,266 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Torch worker started.\n",
649 | "2022-07-10T00:01:37,266 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Python runtime: 3.7.12\n",
650 | "2022-07-10T00:01:37,292 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Connection accepted: /tmp/.ts.sock.9000.\n",
651 | "2022-07-10T00:01:37,333 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - model_name: pt_classifier, batchSize: 1\n",
652 | "2022-07-10T00:01:37,530 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Transformers version 4.20.1\n",
653 | "2022-07-10T00:01:40,262 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Transformer model from path /tmp/models/dbe664e9d9464a0b83b3d662a607513a loaded successfully\n"
654 | ]
655 | }
656 | ],
657 | "source": [
658 | "!tail ../logs/model_log.log"
659 | ]
660 | },
661 | {
662 | "cell_type": "code",
663 | "execution_count": 13,
664 | "id": "forbidden-marriage",
665 | "metadata": {},
666 | "outputs": [
667 | {
668 | "name": "stdout",
669 | "output_type": "stream",
670 | "text": [
671 | "load_models=all\n",
672 | "inference_address=http://0.0.0.0:9080\n",
673 | "management_address=http://0.0.0.0:9081\n",
674 | "metrics_address=http://0.0.0.0:9082\n",
675 | "model_store=model_store\n",
676 | "async_logging=true"
677 | ]
678 | }
679 | ],
680 | "source": [
681 | "!cat ../serving/config.properties "
682 | ]
683 | },
684 | {
685 | "cell_type": "markdown",
686 | "id": "hazardous-slovak",
687 | "metadata": {},
688 | "source": [
689 | "below command stops torchserve"
690 | ]
691 | },
692 | {
693 | "cell_type": "code",
694 | "execution_count": 14,
695 | "id": "sixth-gardening",
696 | "metadata": {},
697 | "outputs": [],
698 | "source": [
699 | "#torchserve --stop"
700 | ]
701 | },
702 | {
703 | "cell_type": "code",
704 | "execution_count": null,
705 | "id": "floating-currency",
706 | "metadata": {},
707 | "outputs": [],
708 | "source": []
709 | },
710 | {
711 | "cell_type": "markdown",
712 | "id": "revolutionary-premises",
713 | "metadata": {},
714 | "source": [
715 | "List all the models loaded"
716 | ]
717 | },
718 | {
719 | "cell_type": "code",
720 | "execution_count": 15,
721 | "id": "false-council",
722 | "metadata": {},
723 | "outputs": [
724 | {
725 | "name": "stdout",
726 | "output_type": "stream",
727 | "text": [
728 | "curl: (7) Failed to connect to localhost port 9081: Connection refused\n"
729 | ]
730 | }
731 | ],
732 | "source": [
733 | "!curl \"http://localhost:9081/models\""
734 | ]
735 | },
736 | {
737 | "cell_type": "code",
738 | "execution_count": null,
739 | "id": "proprietary-disposition",
740 | "metadata": {},
741 | "outputs": [],
742 | "source": []
743 | },
744 | {
745 | "cell_type": "markdown",
746 | "id": "automated-division",
747 | "metadata": {},
748 | "source": [
749 | "get details on the model `pt_classifier`"
750 | ]
751 | },
752 | {
753 | "cell_type": "code",
754 | "execution_count": 16,
755 | "id": "fancy-rings",
756 | "metadata": {},
757 | "outputs": [
758 | {
759 | "name": "stdout",
760 | "output_type": "stream",
761 | "text": [
762 | "curl: (7) Failed to connect to localhost port 9081: Connection refused\n"
763 | ]
764 | }
765 | ],
766 | "source": [
767 | "!curl http://localhost:9081/models/pt_classifier"
768 | ]
769 | },
770 | {
771 | "cell_type": "code",
772 | "execution_count": null,
773 | "id": "fancy-judges",
774 | "metadata": {},
775 | "outputs": [],
776 | "source": []
777 | },
778 | {
779 | "cell_type": "markdown",
780 | "id": "premier-scheme",
781 | "metadata": {},
782 | "source": [
783 | "sample prediction"
784 | ]
785 | },
786 | {
787 | "cell_type": "code",
788 | "execution_count": 17,
789 | "id": "norman-trader",
790 | "metadata": {},
791 | "outputs": [
792 | {
793 | "name": "stdout",
794 | "output_type": "stream",
795 | "text": [
796 | "\n",
797 | "elasped time (sec):0.000585\n",
798 | "curl: (7) Failed to connect to localhost port 9080: Connection refused\n"
799 | ]
800 | }
801 | ],
802 | "source": [
803 | "! curl -X POST http://localhost:9080/predictions/pt_classifier \\\n",
804 | " -H 'Content-Type: application/json' \\\n",
805 | " -d '{\"text\":\"herbal tea\",\"request_id\":\"test_id\"}' \\\n",
806 | " -w \"\\nelasped time (sec):%{time_total}\\n\""
807 | ]
808 | },
809 | {
810 | "cell_type": "markdown",
811 | "id": "direct-sherman",
812 | "metadata": {},
813 | "source": [
814 | "sample prediction from a file"
815 | ]
816 | },
817 | {
818 | "cell_type": "code",
819 | "execution_count": 19,
820 | "id": "solid-internship",
821 | "metadata": {},
822 | "outputs": [
823 | {
824 | "name": "stdout",
825 | "output_type": "stream",
826 | "text": [
827 | "[\n",
828 | " {\n",
829 | " \"GROCERY\": 0.9995384216308594,\n",
830 | " \"HEALTH_PERSONAL_CARE\": 0.0001973821345018223,\n",
831 | " \"PET_SUPPLIES\": 8.77468119142577e-05,\n",
832 | " \"KITCHEN\": 5.9781144955195487e-05,\n",
833 | " \"HOME\": 1.9271317796665244e-05\n",
834 | " }\n",
835 | "]\n",
836 | "elasped time (sec):0.447966\n"
837 | ]
838 | },
839 | {
840 | "name": "stderr",
841 | "output_type": "stream",
842 | "text": [
843 | " % Total % Received % Xferd Average Speed Time Time Time Current\n",
844 | " Dload Upload Total Spent Left Speed\n",
845 | "100 264 100 213 100 51 476 114 --:--:-- --:--:-- --:--:-- 590\n"
846 | ]
847 | }
848 | ],
849 | "source": [
850 | "%%bash\n",
851 | "cd ..\n",
852 | "curl -X POST http://localhost:9080/predictions/pt_classifier \\\n",
853 | " -H 'Content-Type: application/json' \\\n",
854 | " -d @serving/sample_input.json \\\n",
855 | " -w \"\\nelasped time (sec):%{time_total}\\n\""
856 | ]
857 | },
858 | {
859 | "cell_type": "code",
860 | "execution_count": 20,
861 | "id": "african-donna",
862 | "metadata": {},
863 | "outputs": [],
864 | "source": [
865 | "import requests"
866 | ]
867 | },
868 | {
869 | "cell_type": "code",
870 | "execution_count": 21,
871 | "id": "aa92e2d6",
872 | "metadata": {},
873 | "outputs": [
874 | {
875 | "data": {
876 | "text/plain": [
877 | "[{'GROCERY': 0.9995384216308594,\n",
878 | " 'HEALTH_PERSONAL_CARE': 0.0001973821345018223,\n",
879 | " 'PET_SUPPLIES': 8.77468119142577e-05,\n",
880 | " 'KITCHEN': 5.9781144955195487e-05,\n",
881 | " 'HOME': 1.9271317796665244e-05}]"
882 | ]
883 | },
884 | "execution_count": 21,
885 | "metadata": {},
886 | "output_type": "execute_result"
887 | }
888 | ],
889 | "source": [
890 | "payload = {\"text\":\"herbal tea\",\"request_id\":\"test_id\"}\n",
891 | "\n",
892 | "endpoint = \"http://localhost:9080/predictions/pt_classifier\"\n",
893 | "\n",
894 | "res = requests.post(endpoint, json = payload)\n",
895 | "\n",
896 | "res.json()"
897 | ]
898 | },
899 | {
900 | "cell_type": "code",
901 | "execution_count": null,
902 | "id": "c1321402-ca4c-484c-9ec1-5d680c71840b",
903 | "metadata": {},
904 | "outputs": [],
905 | "source": []
906 | }
907 | ],
908 | "metadata": {
909 | "environment": {
910 | "kernel": "pyupgrade",
911 | "name": "pytorch-gpu.1-11.m94",
912 | "type": "gcloud",
913 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-11:m94"
914 | },
915 | "kernelspec": {
916 | "display_name": "pyupgrade",
917 | "language": "python",
918 | "name": "pyupgrade"
919 | },
920 | "language_info": {
921 | "codemirror_mode": {
922 | "name": "ipython",
923 | "version": 3
924 | },
925 | "file_extension": ".py",
926 | "mimetype": "text/x-python",
927 | "name": "python",
928 | "nbconvert_exporter": "python",
929 | "pygments_lexer": "ipython3",
930 | "version": "3.7.12"
931 | }
932 | },
933 | "nbformat": 4,
934 | "nbformat_minor": 5
935 | }
936 |
--------------------------------------------------------------------------------
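The packaging notebook above sleeps for a fixed 30 seconds and can still hit "Connection refused" when TorchServe is slow to start. A readiness poll like the sketch below avoids that; it assumes the inference address from `serving/config.properties` (port 9080) and TorchServe's standard `/ping` health endpoint.

```python
# Sketch: wait for TorchServe to report healthy before calling its APIs.
import time

import requests


def wait_for_torchserve(ping_url: str = "http://localhost:9080/ping",
                        timeout_s: int = 60) -> bool:
    """Poll the health endpoint until it answers or the timeout expires."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            if requests.get(ping_url, timeout=2).status_code == 200:
                return True
        except requests.exceptions.RequestException:
            pass  # server not accepting connections yet
        time.sleep(2)
    return False


if wait_for_torchserve():
    # management API: list registered models (same call as the curl cell above)
    print(requests.get("http://localhost:9081/models").json())
else:
    print("TorchServe did not become ready in time")
```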
/notebooks/utils.py:
--------------------------------------------------------------------------------
1 | import datasets
2 | import tqdm
3 | import torch
4 | import numpy as np
5 | from ts.utils.util import map_class_to_label
6 |
7 |
8 | def prediction_batch(model, dataset, device:str, batch_size = 32):
9 | metric_accuracy = datasets.load_metric('accuracy')
10 |
11 | l = len(dataset)
12 | all_y_preds = []
13 | # make sure model is in eval mode ; not computing gradients
14 | model.eval()
15 |
16 | # feed model to cpu/gpu device
17 | model = model.to(device)
18 |
19 | # iterate our dataset in batches
20 | for ndx in tqdm.trange(0, l, batch_size):
21 |
22 |         # take precomputed input ids and attention masks
23 | input_ids = dataset['input_ids'][ndx:ndx+batch_size].to(device)
24 | attention_mask = dataset['attention_mask'][ndx:ndx+batch_size].to(device)
25 |
26 | with torch.no_grad():
27 | res = model( input_ids = input_ids, attention_mask = attention_mask )
28 |
29 | # output of torchscript model doesn't have logits property
30 | #logits = res.logits.detach().cpu().numpy()
31 |
32 | logits = res[0].detach().cpu().numpy()
33 |
34 | y_preds = np.argmax(logits, axis=1)
35 |
36 | all_y_preds.extend(y_preds)
37 |
38 | # accuracy on whole dataset
39 | accuracy = metric_accuracy.compute(predictions = all_y_preds, references = dataset['label'])
40 |
41 | return accuracy
42 |
43 | def prediction(model, tokens_tensor, masks_tensors , id2label_str, topk =5):
44 | model.eval()
45 |
46 | tokens_tensor = tokens_tensor.to('cpu')
47 | masks_tensors = masks_tensors.to('cpu')
48 |
49 | res = model(tokens_tensor, masks_tensors)
50 |
51 | ps = torch.nn.functional.softmax(res[0], dim=1)
52 | probs, classes = torch.topk(ps, topk, dim=1)
53 | probs = probs.tolist()
54 | classes = classes.tolist()
55 |
56 | labels = map_class_to_label(probs, id2label_str, classes)
57 |
58 | return labels
59 |
--------------------------------------------------------------------------------
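A hedged usage sketch for the helpers above; the checkpoint path and query are placeholders taken from the notebooks, and the label map is rebuilt from the model config rather than shipped with this file.

```python
# Sketch: calling prediction() from notebooks/utils.py on a saved checkpoint.
import transformers

from utils import prediction

model_dir = "../artifacts/model/distilbert-base-uncased"  # placeholder path
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir)

enc = tokenizer.encode_plus("herbal tea", return_tensors="pt",
                            padding="max_length", truncation=True)

# map_class_to_label expects string class indices as keys
id2label = {str(i): label for i, label in model.config.id2label.items()}
topk = min(5, model.config.num_labels)

print(prediction(model, enc["input_ids"], enc["attention_mask"], id2label, topk=topk))
```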
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.12.*
2 | torch-model-archiver==0.6.0
3 | torchmetrics==0.9.*
4 | torchserve==0.6.*
5 | torchvision==0.13.*
6 | datasets==2.3.*
7 | transformers==4.20.*
8 | torchviz==0.0.2
9 | scikit-learn==1.0.*
10 | plotly==5.9.*
11 | wandb==0.12.*
12 | papermill==2.3.*
--------------------------------------------------------------------------------
/serving/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM pytorch/torchserve:0.4.0-cpu
2 |
3 | COPY requirements.txt .
4 | RUN pip install -r requirements.txt
5 |
6 | WORKDIR /home/model-server/
7 | #COPY * /home/model-server/
8 | COPY model_store model_store
9 | COPY * /home/model-server/
10 |
11 |
12 | USER model-server
13 |
14 |
15 | RUN pwd
16 | RUN ls
17 | CMD ["torchserve", \
18 | "--start", \
19 | "--ts-config=/home/model-server/config.properties"]
--------------------------------------------------------------------------------
/serving/config.properties:
--------------------------------------------------------------------------------
1 | load_models=all
2 | inference_address=http://0.0.0.0:9080
3 | management_address=http://0.0.0.0:9081
4 | metrics_address=http://0.0.0.0:9082
5 | model_store=model_store
6 | async_logging=true
--------------------------------------------------------------------------------
/serving/handler.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 | import time
5 | from abc import ABC
6 | from collections.abc import Iterable
7 | import transformers
8 | import ast
9 | import torch
10 |
11 | import numpy as np
12 | from ts.metrics.dimension import Dimension
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 | from ts.torch_handler.base_handler import BaseHandler
17 |
18 | from ts.utils.util import map_class_to_label
19 |
20 | import time
21 |
22 |
23 | logger = logging.getLogger(__name__)
24 | logger.info("Transformers version %s",transformers.__version__)
25 |
26 | class CustomHandler(BaseHandler, ABC):
27 | """
28 | Transformers handler class for sequence classification.
29 | """
30 |
31 | def __init__(self):
32 | super(CustomHandler, self).__init__()
33 | self.initialized = False
34 |
35 | def initialize(self, ctx):
36 |
37 |
38 | self.manifest = ctx.manifest
39 | properties = ctx.system_properties
40 | model_dir = properties.get("model_dir")
41 | serialized_file = self.manifest["model"]["serializedFile"]
42 | model_pt_path = os.path.join(model_dir, serialized_file)
43 |
44 | self.device = torch.device(
45 | "cuda:" + str(properties.get("gpu_id"))
46 | if torch.cuda.is_available() and properties.get("gpu_id") is not None
47 | else "cpu"
48 | )
49 |
50 | # read configs for the mode, model_name, etc. from setup_config.json
51 | setup_config_path = os.path.join(model_dir, "setup_config.json")
52 | if os.path.isfile(setup_config_path):
53 | with open(setup_config_path) as setup_config_file:
54 | self.setup_config = json.load(setup_config_file)
55 | else:
56 | logger.warning("Missing the setup_config.json file.")
57 |
58 |
59 | # Loading the model and tokenizer from checkpoint and config files based on the user's choice of mode
60 | # further setup config can be added.
61 | if self.setup_config["save_mode"] == "jit":
62 | self.model = torch.jit.load(model_pt_path, map_location=self.device)
63 | elif self.setup_config["save_mode"] == "original":
64 | self.model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir)
65 |
66 | self.model.to(self.device)
67 |
68 | else:
69 | logger.warning("Missing the checkpoint or state_dict.")
70 |
71 |
72 |
73 | self.top_k = self.setup_config["top_k"]
74 | self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir
75 | , do_lower_case=self.setup_config["do_lower_case"]
76 | , torchscript=True)
77 |
78 |
79 | self.model.eval()
80 |
81 | logger.info(
82 | "Transformer model from path %s loaded successfully", model_dir
83 | )
84 |
85 | # Read the mapping file, index to object name
86 | mapping_file_path = os.path.join(model_dir, "index_to_name.json")
87 |
88 | if os.path.isfile(mapping_file_path):
89 | with open(mapping_file_path) as f:
90 | self.mapping = json.load(f)
91 | else:
92 | logger.warning("Missing the index_to_name.json file.")
93 |
94 | self.initialized = True
95 |
96 | def preprocess(self, requests):
97 |         """Basic text preprocessing, based on the user's choice of application mode.
98 | Args:
99 | requests (str): The Input data in the form of text is passed on to the preprocess
100 | function.
101 | Returns:
102 | list : The preprocess function returns a list of Tensor for the size of the word tokens.
103 | """
104 | input_ids_batch = None
105 | attention_mask_batch = None
106 | for idx, data in enumerate(requests):
107 | request = data.get("data")
108 | if request is None:
109 | request = data.get("body")
110 | if isinstance(request, (bytes, bytearray)):
111 | request = request.decode('utf-8')
112 |
113 | input_text = request['text']
114 | max_length = self.setup_config["max_length"]
115 | logger.info("Received text: '%s'", input_text)
116 |
117 | # preprocessing text for sequence_classification and token_classification.
118 | inputs = self.tokenizer.encode_plus(input_text, max_length=int(max_length), pad_to_max_length=True, add_special_tokens=True, return_tensors='pt')
119 |
120 |
121 | input_ids = inputs["input_ids"].to(self.device)
122 | attention_mask = inputs["attention_mask"].to(self.device)
123 |             # making a batch out of the received requests
124 | # attention masks are passed for cases where input tokens are padded.
125 | if input_ids.shape is not None:
126 | if input_ids_batch is None:
127 | input_ids_batch = input_ids
128 | attention_mask_batch = attention_mask
129 | else:
130 | input_ids_batch = torch.cat((input_ids_batch, input_ids), 0)
131 | attention_mask_batch = torch.cat((attention_mask_batch, attention_mask), 0)
132 |
133 | input_ids_batch = input_ids_batch.to(self.device)
134 | attention_mask_batch = attention_mask_batch.to(self.device)
135 |
136 | return (input_ids_batch, attention_mask_batch)
137 |
138 | def inference(self, input_batch):
139 |         """Run the model on the batched inputs and return the top_k label/probability
140 |         mappings, one entry per request in the batch."""
141 | input_ids_batch, attention_mask_batch = input_batch
142 | inferences = []
143 |
144 | predictions = self.model(input_ids_batch, attention_mask_batch)
145 |
146 | # ps = torch.nn.functional.softmax(predictions.logits, dim=1)
147 | # probs, classes = torch.topk(ps, self.top_k, dim=1)
148 | # probs = probs.tolist()
149 | # classes = classes.tolist()
150 |
151 | # inferences = map_class_to_label(probs, self.mapping, classes)
152 |
153 | num_rows, num_cols = predictions[0].shape
154 | for i in range(num_rows):
155 |             ps = torch.nn.functional.softmax(predictions[0][i].unsqueeze(0), dim=1)  # logits for row i
156 | probs, classes = torch.topk(ps, self.top_k, dim=1)
157 | probs = probs.tolist()
158 | classes = classes.tolist()
159 |
160 | friendly_labels = map_class_to_label(probs, self.mapping, classes)
161 | inferences.append(friendly_labels)
162 |
163 |
164 | return inferences
165 |
166 | def postprocess(self, inference_output):
167 |
168 | return inference_output
169 |
170 |
171 | def handle(self, data, context):
172 |
173 | # It can be used for pre or post processing if needed as additional request
174 | # information is available in context
175 |
176 | start_time = time.time()
177 |
178 | self.context = context
179 | metrics = self.context.metrics
180 |
181 | data_preprocess = self.preprocess(data)
182 | data_inference = self.inference(data_preprocess)
183 | data_postprocess = self.postprocess(data_inference)
184 |
185 |
186 |
187 | stop_time = time.time()
188 | metrics.add_time('HandlerTime', round((stop_time - start_time) * 1000, 2), None, 'ms')
189 |
190 | return data_postprocess
191 |
--------------------------------------------------------------------------------
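The handler above resolves the serialized model through the archive manifest and expects `setup_config.json` (and optionally `index_to_name.json` plus the tokenizer files) to sit in the same model directory. The sketch below shows one way such an archive could be built with `torch-model-archiver`; the model name, file paths, and config values are illustrative assumptions, not the workshop's exact artifacts.

```bash
# Hedged sketch: names, paths, and values below are assumptions for illustration.
mkdir -p serving/model_store

# Keys read by handler.py: save_mode, top_k, do_lower_case, max_length.
cat > serving/setup_config.json <<'EOF'
{
  "save_mode": "jit",
  "top_k": 5,
  "do_lower_case": true,
  "max_length": 64
}
EOF

# Package the serialized model together with the extra files initialize() loads.
# For save_mode "jit", the tokenizer files (e.g. vocab.txt, tokenizer_config.json)
# also need to be listed in --extra-files so AutoTokenizer.from_pretrained(model_dir)
# can find them.
torch-model-archiver \
  --model-name pytorch_workshop_model \
  --version 1.0 \
  --serialized-file artifacts/model/traced_model.pt \
  --handler serving/handler.py \
  --extra-files "serving/setup_config.json,serving/index_to_name.json" \
  --export-path serving/model_store \
  --force
```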
/serving/requirements.txt:
--------------------------------------------------------------------------------
1 | torch-model-archiver==0.4.2
2 | torchserve==0.4.2
3 | torch==1.9.0
4 | torchmetrics==0.4.1
5 | captum==0.4.0
--------------------------------------------------------------------------------
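With the pins above installed, the archived model can be served locally with the `torchserve` CLI. A hedged sketch, assuming the model store and archive name from the packaging example above:

```bash
# Hedged sketch: assumes pytorch_workshop_model.mar exists in serving/model_store.
pip install -r serving/requirements.txt

torchserve --start \
  --model-store serving/model_store \
  --models pytorch_workshop_model=pytorch_workshop_model.mar \
  --ts-config serving/config.properties \
  --ncs

# Stop the server when finished:
# torchserve --stop
```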
/serving/sample_input.json:
--------------------------------------------------------------------------------
1 | {"text" :"Herbal Tea" , "request_id":"test_client"}
2 |
--------------------------------------------------------------------------------
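Once TorchServe is running, the sample payload above can be posted to the inference API (port 8080 by default). The model name in the URL is carried over from the packaging sketch and is an assumption:

```bash
# Hedged sketch: replace pytorch_workshop_model with the name used when archiving.
curl -X POST "http://localhost:8080/predictions/pytorch_workshop_model" \
  -H "Content-Type: application/json" \
  -d @serving/sample_input.json
```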
/setup.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "parental-token",
6 | "metadata": {},
7 | "source": [
8 | "# Setup"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "round-border",
14 | "metadata": {},
15 | "source": [
16 | "## About\n",
17 | "\n",
18 | "\n",
19 | "A simple notebook to:\n",
20 | "- clone git repo\n",
21 | "- install required python dependencies\n",
22 | "- download dataset\n",
23 | "- download trained model"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "id": "acd07905",
29 | "metadata": {},
30 | "source": [
31 | "We are cloning into /tmp for the JupyterHub setup.\n"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "id": "homeless-announcement",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "%%bash\n",
42 | "\n",
43 | "mkdir -p /tmp/workshop\n",
44 | "cd /tmp/workshop\n",
45 | "git clone https://github.com/npatta01/pytorch-serving-workshop.git -b main --depth 1"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "id": "proof-discipline",
51 | "metadata": {},
52 | "source": [
53 | "install needed packages"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "id": "relevant-claim",
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "!pip install -r /tmp/workshop/pytorch-serving-workshop/requirements.txt"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "id": "875c286c",
69 | "metadata": {},
70 | "source": [
71 | "download processed dataset and model"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "id": "spectacular-scott",
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "%%bash\n",
82 | "\n",
83 | "cd /tmp/workshop\n",
84 | "\n",
85 | "cd pytorch-serving-workshop\n",
86 | "mkdir -p artifacts/dataset_processed/amazon\n",
87 | "mkdir -p artifacts/dataset_processed/model\n",
88 | "\n",
89 | "\n",
90 | "cd artifacts\n",
91 | "\n",
92 | "\n",
93 | "# dataset\n",
94 | "echo \"downloading dataset\"\n",
95 | "wget https://github.com/npatta01/pytorch-serving-workshop/releases/download/v0.0.2/dataset_processed.zip\n",
96 | "unzip dataset_processed.zip\n",
97 | "\n",
98 | "\n",
99 | "# model trained on above dataset\n",
100 | "echo \"downloading model\"\n",
101 | "wget https://github.com/npatta01/pytorch-serving-workshop/releases/download/v0.0.2/model.zip \n",
102 | "unzip model.zip\n"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "id": "0ce1ca35",
108 | "metadata": {},
109 | "source": [
110 | "Download transformer models"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "id": "9073adaa",
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "import transformers"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "id": "c66c5e14",
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "for model_name in [\"bert-large-uncased\",\"bert-base-uncased\",\"distilbert-base-uncased\"]:\n",
131 | " model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)\n",
132 | "\n",
133 | " tokenizer = transformers.AutoTokenizer.from_pretrained(\n",
134 | " model_name\n",
135 | " )\n",
136 | "\n",
137 | " query = \"men shoes\"\n",
138 | " res = tokenizer.encode_plus(query, return_tensors=\"pt\", padding=\"max_length\", truncation=True)\n",
139 | "\n",
140 | " model_res = model(**res)\n"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "id": "neither-shipping",
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "!cp -r /tmp/workshop/ $HOME/workshop/"
151 | ]
152 | }
153 | ],
154 | "metadata": {
155 | "environment": {
156 | "kernel": "pyupgrade",
157 | "name": "pytorch-gpu.1-11.m94",
158 | "type": "gcloud",
159 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-11:m94"
160 | },
161 | "kernelspec": {
162 | "display_name": "pyupgrade",
163 | "language": "python",
164 | "name": "pyupgrade"
165 | },
166 | "language_info": {
167 | "codemirror_mode": {
168 | "name": "ipython",
169 | "version": 3
170 | },
171 | "file_extension": ".py",
172 | "mimetype": "text/x-python",
173 | "name": "python",
174 | "nbconvert_exporter": "python",
175 | "pygments_lexer": "ipython3",
176 | "version": "3.7.12"
177 | }
178 | },
179 | "nbformat": 4,
180 | "nbformat_minor": 5
181 | }
182 |
--------------------------------------------------------------------------------
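The setup notebook above is meant to be run top to bottom. On a headless machine it can also be executed non-interactively; a hedged sketch using nbconvert (the output filename is arbitrary):

```bash
# Hedged sketch: runs the setup notebook end to end without opening Jupyter.
jupyter nbconvert --to notebook --execute setup.ipynb --output setup_out.ipynb
```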
/workshop_infra/cert/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/pytorch-serving-workshop/56496a84f8485188e4ba8c472da192e428ad3a51/workshop_infra/cert/.gitkeep
--------------------------------------------------------------------------------
/workshop_infra/config.enc.yaml:
--------------------------------------------------------------------------------
1 | # https://zero-to-jupyterhub.readthedocs.io/en/latest/administrator/optimization.html#optimization
2 | scheduling:
3 | userScheduler:
4 | enabled: true
5 | podPriority:
6 | enabled: true
7 | userPlaceholder:
8 | enabled: true
9 |     # Five dummy user pods will be used as placeholders
10 | replicas: 5
11 | userPods:
12 | nodeAffinity:
13 | matchNodePurpose: require
14 | cull:
15 | enabled: true
16 | timeout: 3600
17 | every: 300
18 | singleuser:
19 | cpu:
20 | limit: 4
21 | guarantee: 2
22 | memory:
23 | limit: 16G
24 | guarantee: 8G
25 | image:
26 | # You should replace the "latest" tag with a fixed version from:
27 | # https://hub.docker.com/r/jupyter/datascience-notebook/tags/
28 | # Inspect the Dockerfile at:
29 | # https://github.com/jupyter/docker-stacks/tree/HEAD/datascience-notebook/Dockerfile
30 | name: gcr.io/np-public-training/pytorch-workshop
31 | tag: v1.0
32 | defaultUrl: /lab
33 | extraEnv:
34 | JUPYTERHUB_SINGLEUSER_APP: jupyter_server.serverapp.ServerApp
35 | lifecycleHooks:
36 | postStart:
37 | exec:
38 | command:
39 | - sh
40 | - -c
41 | - "cp -r /tmp/workshop /home/jovyan; \npwd\n"
42 | # proxy:
43 | # https:
44 | # enabled: true
45 | # hosts:
46 | # - hub.np.training
47 | # letsencrypt:
48 | # contactEmail: npatta01@gmail.com
49 | # service:
50 | # loadBalancerIP: "34.145.156.81"
51 | # proxy:
52 | # service:
53 | # loadBalancerIP: "34.145.156.81"
54 | proxy:
55 | https:
56 | enabled: true
57 | hosts:
58 | - hub.np.training
59 | type: secret
60 | secret:
61 | name: workshop-tls
62 | service:
63 | loadBalancerIP: 35.188.254.55
64 | hub:
65 | config:
66 | Authenticator:
67 | admin_users:
68 | - npatta01
69 | - vishalkumar95
70 | - reshamas
71 | GitHubOAuthenticator:
72 | client_id: ENC[AES256_GCM,data:GVr1cL8lruvBCXqNtaqou1j4urI=,iv:Orx6eg2BTmHaVjsJvVPieIm9d/BCCLzwldYvDZ59ES8=,tag:5PUGb0rpiRprBIdXmALeeg==,type:str]
73 | client_secret: ENC[AES256_GCM,data:5iBlhLROkt8k8pOVi6OW2BQqivmpSjpWSLeR1NXBQjRFrsbk5tpdwA==,iv:dsfxzR2YRkiL+5EOJBl9a/jEC55LoWuCtMKDA31DYDM=,tag:KH7DT4r+erb1+CGzA2KjcQ==,type:str]
74 | oauth_callback_url: https://hub.np.training/hub/oauth_callback
75 | JupyterHub:
76 | authenticator_class: github
77 | sops:
78 | kms: []
79 | gcp_kms:
80 | - resource_id: projects/np-public-training/locations/global/keyRings/sops/cryptoKeys/sops-key
81 | created_at: "2022-07-07T16:40:04Z"
82 | enc: CiQAtA68IVue/mrOfkHLaTjHYkrW6GgqEFBge+pVF/bSJ7gFscASSQDOyIoWJtFT6Rz7JAKCXlZFTTGzrsUQ0c1lHMZlkxSukkT6NfogdDGVwzy+JRA6GQLkmaeWWEPYy+VY/wP0ZYzm3qpQ4/YxSE8=
83 | azure_kv: []
84 | hc_vault: []
85 | age: []
86 | lastmodified: "2022-07-09T17:45:50Z"
87 | mac: ENC[AES256_GCM,data:j8mG4yhggQCMn6iS9BNA+0947KJcu7h31MBypEp+XJqgudz3UmSwRlF8UpuUKBVanVosOnSwKak8H11oZcsZBwxFZbEtiNXd8DKZJRNWsOb/Kdkk37ImUlbj1MAuIXh3xLlwFrFsaht5eTAdcri3VcXL0sPZYLDjtOv+YztpTBM=,iv:I+CfXmqeRXpq/84cFMym35/Vi7rDjX8MQ7UKG/6zmaw=,tag:qIAvUsO9ypg/mDsiyk4ing==,type:str]
88 | pgp: []
89 | encrypted_regex: ^(client_id|client_secret)$
90 | version: 3.7.3
91 |
--------------------------------------------------------------------------------
/workshop_infra/config_public.yaml:
--------------------------------------------------------------------------------
1 | # https://zero-to-jupyterhub.readthedocs.io/en/latest/administrator/optimization.html#optimization
2 | scheduling:
3 | userScheduler:
4 | enabled: true
5 | podPriority:
6 | enabled: true
7 | userPlaceholder:
8 | enabled: true
9 |     # One dummy user pod will be used as a placeholder
10 | replicas: 1
11 | userPods:
12 | nodeAffinity:
13 | matchNodePurpose: require
14 | cull:
15 | enabled: true
16 | timeout: 3600
17 | every: 300
18 | singleuser:
19 | cpu:
20 | limit: 4
21 | guarantee: 2
22 | memory:
23 | limit: 16G
24 | guarantee: 8G
25 | image:
26 | name: gcr.io/np-public-training/pytorch-workshop
27 | tag: v1.0
28 | defaultUrl: /lab
29 | extraEnv:
30 | JUPYTERHUB_SINGLEUSER_APP: jupyter_server.serverapp.ServerApp
31 | lifecycleHooks:
32 | postStart:
33 | exec:
34 | command:
35 | - sh
36 | - -c
37 | - "cp -r /tmp/workshop /home/jovyan; \npwd\n"
38 |
39 | hub:
40 | config:
41 | Authenticator:
42 | admin_users:
43 | - npatta01
44 | - vishalkumar95
45 | - reshamas
46 |
--------------------------------------------------------------------------------
/workshop_infra/docker-setup.sh:
--------------------------------------------------------------------------------
1 | cd /home/${NB_USER}
2 |
3 | mkdir -p pytorch-serving-workshop
4 |
5 | cd pytorch-serving-workshop
6 | mkdir -p artifacts/dataset_processed/amazon
7 | mkdir -p artifacts/dataset_processed/model
8 |
9 |
10 | cd artifacts
11 |
12 | BASE_URL="https://github.com/npatta01/pytorch-serving-workshop/releases/download/v0.0.1/"
13 |
14 | # dataset
15 | echo "downloading dataset"
16 | wget --quiet "$BASE_URL/dataset_processed.zip"
17 | unzip dataset_processed.zip
18 |
19 |
20 | # model trained on above dataset
21 | echo "downloading model"
22 | wget --quiet "$BASE_URL/model.zip"
23 | unzip model.zip
--------------------------------------------------------------------------------
/workshop_infra/setup.md:
--------------------------------------------------------------------------------
1 | # Workshop Setup
2 |
3 | The following are the commands and steps that were used to create a working JupyterHub installation for the workshop.
4 |
5 | The instructions assume that you are planning to use GCP and have gcloud set up.
6 |
7 |
8 | Most of the instructions are taken from [zero-to-jupyterhub](https://zero-to-jupyterhub.readthedocs.io/en/latest/index.html) project.
9 |
10 |
11 | ## Step 1: Common variables
12 |
13 | ```bash
14 | REGION="us-east4"
15 | ZONE="$REGION-a"
16 | NODE_TYPE_USER="n1-highmem-16"
17 |
18 | CLUSTER_NAME=workshop
19 | NODES_MIN=0
20 | NODES_MAX=200
21 |
22 | EMAIL="npatta01@gmail.com"
23 | GCP_PROJECT="np-public-training"
24 |
25 | HELM_NAMESPACE=$CLUSTER_NAME
26 |
27 | HELM_CHART_VERSION="1.2.0"
28 | ```
29 |
30 | ## Step 2: Create static IP address
31 |
32 | ```bash
33 | gcloud compute addresses create $CLUSTER_NAME \
34 | --region $REGION \
35 | --project $GCP_PROJECT
36 |
37 | gcloud compute addresses describe $CLUSTER_NAME \
38 | --region $REGION \
39 | --project $GCP_PROJECT
40 |
41 | ```
42 |
43 | Create an `A` record with your DNS provider.
44 |
45 | I am using the `hub` subdomain of my domain `np.training`.
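If the domain is managed in Cloud DNS, the `A` record can also be created from the CLI. A hedged sketch, assuming a managed zone named `np-training` (the zone name is an assumption; the IP is the static address reserved above):

```bash
# Hedged sketch: "np-training" is an assumed Cloud DNS managed-zone name.
STATIC_IP=$(gcloud compute addresses describe $CLUSTER_NAME \
    --region $REGION --project $GCP_PROJECT --format='value(address)')

gcloud dns record-sets create "hub.np.training." \
    --zone "np-training" \
    --type A --ttl 300 \
    --rrdatas "$STATIC_IP" \
    --project $GCP_PROJECT
```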
46 |
47 |
48 |
49 |
50 | ## Step 3: Create cluster
51 |
52 |
53 | ```bash
54 |
55 | gcloud container clusters create \
56 | --machine-type n1-standard-2 \
57 | --num-nodes 1 \
58 | --region $REGION \
59 | --cluster-version latest \
60 | $CLUSTER_NAME \
61 | --project $GCP_PROJECT
62 |
63 | ```
64 |
65 | Get kubectl credentials
66 |
67 | ```bash
68 | gcloud container clusters get-credentials \
69 | $CLUSTER_NAME \
70 | --region $REGION \
71 | --project $GCP_PROJECT
72 | ```
73 |
74 | Create admin access for user
75 |
76 | ```bash
77 | kubectl create clusterrolebinding cluster-admin-binding \
78 | --clusterrole=cluster-admin \
79 | --user $EMAIL
80 | ```
81 |
82 | Create a separate node pool for the Jupyter notebook user pods
83 |
84 | ```bash
85 | gcloud beta container node-pools create user-pool \
86 | --machine-type $NODE_TYPE_USER \
87 | --num-nodes 0 \
88 | --enable-autoscaling \
89 | --min-nodes $NODES_MIN \
90 | --max-nodes $NODES_MAX \
91 | --node-labels hub.jupyter.org/node-purpose=user \
92 | --node-taints hub.jupyter.org_dedicated=user:NoSchedule \
93 | --region $REGION \
94 | --cluster $CLUSTER_NAME \
95 | --project $GCP_PROJECT
96 | ```
97 |
98 |
99 | ## Step 3b: Cert (optional)
100 |
101 | By default the Helm chart we will use supports Let's Encrypt. However, I had trouble getting it to work,
102 | so I followed the steps below to create my own cert.
103 |
104 | Create a certificate signing request for "*.np.training":
105 |
106 | ```bash
107 | openssl req -nodes -newkey rsa:2048 \
108 | -keyout cert/server.key \
109 | -out cert/server.csr \
110 | -subj "/C=US/ST=New York/L=New York/O=NP Training./OU=IT/CN=*.np.training"
111 | ```
112 |
113 | I bought a wildcard cert from Namecheap
114 |
115 |
116 | Download the cert and create a Kubernetes TLS secret from it:
117 |
118 | ```bash
119 |
120 | gsutil cp "gs://np-training-private/certs/_star.np.training/*" workshop_infra/cert
121 |
122 |
123 | kubectl create namespace $HELM_NAMESPACE
124 | cd workshop_infra/cert
125 | kubectl create secret tls $HELM_NAMESPACE-tls --key="tls.key" --cert="tls.crt" --namespace $HELM_NAMESPACE
126 | cd ../../
127 |
128 | ```
129 |
130 | ## Step 4: Helm setup
131 |
132 | ```bash
133 |
134 | curl https://raw.githubusercontent.com/helm/helm/HEAD/scripts/get-helm-3 | bash
135 |
136 | helm version
137 |
138 | helm repo add jupyterhub https://jupyterhub.github.io/helm-chart/
139 | helm repo update
140 |
141 | ```
142 |
143 |
144 | ## Step 5: Update config file (optional)
145 |
146 |
147 | Build and push the Docker image:
148 |
149 | ```bash
150 | docker build -t gcr.io/$GCP_PROJECT/pytorch-workshop:v1.0 .
151 | docker push gcr.io/$GCP_PROJECT/pytorch-workshop:v1.0
152 |
153 | ```
154 |
155 | Set up encryption by creating a KMS keyring and key for sops:
156 |
157 | ```bash
158 | gcloud kms keyrings create sops --location global --project $GCP_PROJECT
159 | gcloud kms keys create sops-key --location global --keyring sops --purpose encryption --project $GCP_PROJECT
160 | gcloud kms keys list --location global --keyring sops --project $GCP_PROJECT
161 | ```
162 |
163 | Encrypt the working config with sops:
164 | ```bash
165 | sops --encrypt --gcp-kms projects/$GCP_PROJECT/locations/global/keyRings/sops/cryptoKeys/sops-key \
166 | --encrypted-regex '^(client_id|client_secret)$' \
167 | workshop_infra/config.yaml > workshop_infra/config.enc.yaml
168 | ```
169 | To recover the plaintext config for editing or deployment, decrypt it:
170 | ```bash
171 | sops --decrypt workshop_infra/config.enc.yaml > workshop_infra/config.yaml
172 | ```
173 |
174 |
175 | Replace the following values in [config.yaml](workshop_infra/config.yaml):
176 |
177 | - GitHubOAuthenticator
178 | - singleuser.image.name
179 | - scheduling.userPlaceholder.replicas
180 | - proxy.https.host
181 | - proxy.https.service.loadBalancerIP
182 |
183 |
184 |
185 | ## Step 6: Helm Install with authentication
186 |
187 | Set up JupyterHub with GitHub OAuth authentication:
188 | ```bash
189 | helm upgrade --cleanup-on-fail \
190 | --install $HELM_NAMESPACE jupyterhub/jupyterhub \
191 | --namespace $HELM_NAMESPACE \
192 | --create-namespace \
193 | --version $HELM_CHART_VERSION \
194 | --values workshop_infra/config.yaml
195 |
196 | ```
197 |
198 | ```bash
199 | kubectl --namespace=$HELM_NAMESPACE get pod
200 |
201 | kubectl --namespace=$HELM_NAMESPACE get svc proxy-public -o jsonpath='{.status.loadBalancer.ingress[].ip}'
202 | ```
203 |
204 |
205 | ## Step 6b: Helm install with no authentication (no auth)
206 |
207 | ```bash
208 | helm upgrade --cleanup-on-fail \
209 | --install $HELM_NAMESPACE-public jupyterhub/jupyterhub \
210 | --namespace $HELM_NAMESPACE-public \
211 | --create-namespace \
212 | --version $HELM_CHART_VERSION \
213 | --values workshop_infra/config_public.yaml
214 |
215 |
216 | kubectl --namespace=$HELM_NAMESPACE-public get pod
217 |
218 | kubectl --namespace=$HELM_NAMESPACE-public get svc proxy-public -o jsonpath='{.status.loadBalancer.ingress[].ip}'
219 | ```
220 |
221 | Add the external IP to your DNS records.
222 |
223 |
224 | ## Step 7: Cleanup (Helm Delete)
225 |
226 | ```bash
227 |
228 | helm delete $HELM_NAMESPACE --namespace $HELM_NAMESPACE
229 | kubectl delete namespace $HELM_NAMESPACE
230 |
231 | helm delete $HELM_NAMESPACE-public --namespace $HELM_NAMESPACE-public
232 | kubectl delete namespace $HELM_NAMESPACE-public
233 |
234 |
235 | gcloud container clusters delete $CLUSTER_NAME --region $REGION --project $GCP_PROJECT
236 |
237 | ```
--------------------------------------------------------------------------------