├── .gitignore
├── README.md
├── code
│   └── inference.py
├── convert_gptj.py
├── experiments.ipynb
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | .DS_Store
6 | 
7 | # C extensions
8 | *.so
9 | 
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | pip-wheel-metadata/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 | 
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 | 
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 | 
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 | 
55 | # Translations
56 | *.mo
57 | *.pot
58 | 
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 | 
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 | 
69 | # Scrapy stuff:
70 | .scrapy
71 | 
72 | # Sphinx documentation
73 | docs/_build/
74 | 
75 | # PyBuilder
76 | target/
77 | 
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 | 
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 | 
85 | # pyenv
86 | .python-version
87 | 
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 | 
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 | 
98 | # Celery stuff
99 | celerybeat-schedule
100 | celerybeat.pid
101 | 
102 | # SageMath parsed files
103 | *.sage.py
104 | 
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 | 
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 | 
118 | # Rope project settings
119 | .ropeproject
120 | 
121 | # mkdocs documentation
122 | /site
123 | 
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 | 
129 | # Pyre type checker
130 | .pyre/
131 | 
132 | tmp/
133 | model.tar.gz
134 | *.pt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # sagemaker-gpt-j
2 | 
3 | This repository contains instructions and code for running `GPT-J` inference with Amazon SageMaker.
4 | 
5 | 
6 | ## Getting Started
7 | 
8 | Create the `model.tar.gz` model artifact with the `convert_gptj.py` script.
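For example, with the pinned dependencies from `requirements.txt` installed and an S3 bucket your AWS role can write to (the bucket name below is a placeholder), the conversion can be run as:

```bash
# downloads the fp16 weights, saves them with torch.save, bundles code/inference.py,
# and uploads model.tar.gz to s3://<your-bucket>/gpt-j/model.tar.gz
python convert_gptj.py --bucket_name <your-bucket>
```

The script prints the S3 URI of the uploaded archive. With a `model.tar.gz` in S3 (either your own or the public artifact referenced below), deploy the model to a SageMaker real-time endpoint: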
9 | 
10 | 
11 | ```python
12 | from sagemaker.huggingface import HuggingFaceModel
13 | import sagemaker
14 | 
15 | # IAM role with permissions to create endpoint
16 | role = sagemaker.get_execution_role()
17 | 
18 | # public S3 URI to gpt-j artifact
19 | model_uri="s3://huggingface-sagemaker-models/transformers/4.12.3/pytorch/1.9.1/gpt-j/model.tar.gz"
20 | 
21 | # create Hugging Face Model Class
22 | huggingface_model = HuggingFaceModel(
23 |     model_data=model_uri,
24 |     transformers_version='4.12.3',
25 |     pytorch_version='1.9.1',
26 |     py_version='py38',
27 |     role=role,
28 | )
29 | 
30 | # deploy model to SageMaker Inference
31 | predictor = huggingface_model.deploy(
32 |     initial_instance_count=1, # number of instances
33 |     instance_type='ml.g4dn.xlarge' #'ml.p3.2xlarge' # ec2 instance type
34 | )
35 | ```
36 | 
--------------------------------------------------------------------------------
/code/inference.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | from transformers import AutoTokenizer, pipeline
4 | 
5 | # file name used by convert_gptj.py when saving the model with torch.save
6 | GPT_WEIGHTS_NAME = "gptj.pt"
7 | 
8 | 
9 | def model_fn(model_dir):
10 |     # load the pickled GPT-J model and the tokenizer bundled in model.tar.gz
11 |     model = torch.load(os.path.join(model_dir, GPT_WEIGHTS_NAME))
12 |     tokenizer = AutoTokenizer.from_pretrained(model_dir)
13 | 
14 |     # use the first GPU if one is available, otherwise fall back to CPU
15 |     if torch.cuda.is_available():
16 |         device = 0
17 |     else:
18 |         device = -1
19 | 
20 |     # return a text-generation pipeline; the serving stack uses the object
21 |     # returned by model_fn to handle incoming requests
22 |     generation = pipeline(
23 |         "text-generation", model=model, tokenizer=tokenizer, device=device
24 |     )
25 | 
26 |     return generation
27 | 
--------------------------------------------------------------------------------
/convert_gptj.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import shutil
4 | import tarfile
5 | import argparse
6 | import boto3
7 | import torch
8 | from transformers import AutoTokenizer, GPTJForCausalLM
9 | 
10 | 
11 | def compress(tar_dir=None, output_file="model.tar.gz"):
12 |     with tarfile.open(output_file, "w:gz") as tar:
13 |         tar.add(tar_dir, arcname=os.path.sep)
14 | 
15 | 
16 | def upload_file_to_s3(bucket_name=None, file_name="model.tar.gz", key_prefix=""):
17 |     s3 = boto3.resource("s3")
18 |     key_prefix_with_file_name = os.path.join(key_prefix, file_name)
19 |     s3.Bucket(bucket_name).upload_file(file_name, key_prefix_with_file_name)
20 |     return f"s3://{bucket_name}/{key_prefix_with_file_name}"
21 | 
22 | 
23 | def convert(bucket_name="hf-sagemaker-inference"):
24 |     model_save_dir = "./tmp"
25 |     key_prefix = "gpt-j"
26 |     src_inference_script = os.path.join("code", "inference.py")
27 |     dst_inference_script = os.path.join(model_save_dir, "code")
28 | 
29 |     os.makedirs(model_save_dir, exist_ok=True)
30 |     os.makedirs(dst_inference_script, exist_ok=True)
31 | 
32 |     # load fp16 model
33 |     print("Loading model from `EleutherAI/gpt-j-6B`")
34 |     model = GPTJForCausalLM.from_pretrained(
35 |         "EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16
36 |     )
37 |     print("saving model with `torch.save`")
38 |     torch.save(model, os.path.join(model_save_dir, "gptj.pt"))
39 | 
40 |     print("saving tokenizer")
41 |     tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
42 |     tokenizer.save_pretrained(model_save_dir)
43 | 
44 |     # copy inference script
45 |     print("copying inference.py script")
46 |     shutil.copy(src_inference_script, dst_inference_script)
47 | 
48 |     # create archive
49 |     print("creating `model.tar.gz` archive")
50 |     compress(model_save_dir)
51 | 
52 |     # upload to s3
53 |     print(
54 |         f"uploading `model.tar.gz` archive to s3://{bucket_name}/{key_prefix}/model.tar.gz"
55 |     )
56 |     model_uri = upload_file_to_s3(bucket_name=bucket_name, key_prefix=key_prefix)
57 |     print(f"Successfully uploaded to {model_uri}")
58 | 
59 |     sys.stdout.write(model_uri)
60 |     return model_uri
61 | 
62 | 
63 | def parse_args():
64 |     parser = argparse.ArgumentParser()
65 |     parser.add_argument("--bucket_name", type=str, default=None)
66 |     return parser.parse_args()
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     # parse args
71 |     args = parse_args()
72 | 
73 |     if not args.bucket_name:
74 |         raise ValueError(
75 |             "Please provide a valid `bucket_name` when running `python convert_gptj.py --bucket_name <bucket-name>`"
76 |         )
77 | 
78 |     # convert the model and upload the archive to S3
79 |     convert(args.bucket_name)
80 | 
--------------------------------------------------------------------------------
/experiments.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# Sample Notebook on how to run inference using `GPT-J`\n",
8 |     "\n",
9 |     "The GPT-J model was released in the [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax) repository by Ben Wang and Aran Komatsuzaki. It is a GPT-2-like causal language model trained on the [Pile](https://pile.eleuther.ai/) dataset.\n",
10 |     "\n",
11 |     "This model was contributed by [Stella Biderman](https://huggingface.co/stellaathena).\n",
12 |     "\n",
13 |     "\n",
14 |     "Documentation: [GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj#gptj)\n",
15 |     "\n"
16 |    ]
17 |   },
18 |   {
19 |    "cell_type": "code",
20 |    "execution_count": null,
21 |    "metadata": {},
22 |    "outputs": [],
23 |    "source": [
24 |     "!pip install transformers==4.12.3 torch==1.9.1 --upgrade"
25 |    ]
26 |   },
27 |   {
28 |    "cell_type": "code",
29 |    "execution_count": 5,
30 |    "metadata": {},
31 |    "outputs": [],
32 |    "source": [
33 |     "import transformers\n",
34 |     "import torch\n",
35 |     "\n",
36 |     "assert transformers.__version__ == \"4.12.3\", f\"wrong transformers version: {transformers.__version__}\"\n",
37 |     "assert \"1.9.1\" in torch.__version__, f\"wrong torch version: {torch.__version__}\""
38 |    ]
39 |   },
40 |   {
41 |    "cell_type": "markdown",
42 |    "metadata": {},
43 |    "source": [
44 |     "We are going to use the [fp16 branch](https://huggingface.co/EleutherAI/gpt-j-6B/tree/float16), which stores the weights in half precision and therefore reduces the memory needed to load the model. With these weights it should take roughly 12.1GB of CPU RAM to load the model."
45 |    ]
46 |   },
47 |   {
48 |    "cell_type": "markdown",
49 |    "metadata": {},
50 |    "source": [
51 |     "## Loading the model and using the `generate` method\n",
52 |     "\n",
53 |     "Since we are using the `fp16` branch of the model, it should fit on a 16GB GPU (P3 or T4) for inference.\n",
54 |     "Downloading the fp16 branch of the model (11.3GB) to the `ec2` machine took 3 minutes and 32 seconds. 
Loading the model into memory took another 3 minutes\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 1, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "application/vnd.jupyter.widget-view+json": { 65 | "model_id": "3d49d3cbf1e74e6d8eeb206148a1295a", 66 | "version_major": 2, 67 | "version_minor": 0 68 | }, 69 | "text/plain": [ 70 | "Downloading: 0%| | 0.00/836 [00:00 after loading and saving model + copying `inference.py`" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "%bash\n", 364 | "tar zcvf model.tar.gz *\n", 365 | "aws s3 cp model.tar.gz s3://hf-sagemaker-inference/gpt-j/model.tar.gz\n" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "## Deploy endpoint" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "!pip install sagemaker" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 29, 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "name": "stdout", 391 | "output_type": "stream", 392 | "text": [ 393 | "---------------!" 394 | ] 395 | } 396 | ], 397 | "source": [ 398 | "from sagemaker.huggingface import HuggingFaceModel\n", 399 | "import boto3\n", 400 | "import os\n", 401 | "\n", 402 | "os.environ[\"AWS_DEFAULT_REGION\"]=\"us-east-1\"\n", 403 | "\n", 404 | "\n", 405 | "iam_role=\"sagemaker_execution_role\"\n", 406 | "model_uri=\"s3://hf-sagemaker-inference/gpt-j/model.tar.gz\"\n", 407 | "\n", 408 | "iam_client = boto3.client('iam')\n", 409 | "role = iam_client.get_role(RoleName=iam_role)['Role']['Arn']\n", 410 | "\n", 411 | "# create Hugging Face Model Class\n", 412 | "huggingface_model = HuggingFaceModel(\n", 413 | " model_data=model_uri,\n", 414 | "\ttransformers_version='4.12',\n", 415 | "\tpytorch_version='1.9',\n", 416 | "\tpy_version='py38',\n", 417 | "\trole=role, \n", 418 | ")\n", 419 | "\n", 420 | "\n", 421 | "# deploy model to SageMaker Inference\n", 422 | "predictor = huggingface_model.deploy(\n", 423 | "\tinitial_instance_count=1, # number of instances\n", 424 | "\tinstance_type='ml.g4dn.xlarge' #'ml.p3.2xlarge' # ec2 instance type\n", 425 | ")\n" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 30, 431 | "metadata": {}, 432 | "outputs": [ 433 | { 434 | "data": { 435 | "text/plain": [ 436 | "[{'generated_text': 'Can you please let us know more details about your \\nexperiences with the bookkeeper.\\n\\nI received a call from Chris Foster requesting that you review the below \\nAgreement and return with any comments. \\n\\nAs a'}]" 437 | ] 438 | }, 439 | "execution_count": 30, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "predictor.predict({\n", 446 | "\t'inputs': \"Can you please let us know more details about your \"\n", 447 | "})" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 31, 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "data": { 457 | "text/plain": [ 458 | "[{'generated_text': 'Can you please let us know more details about your xtraday, xtrading and portfolio strategies?\\nSo far, I have read that you have used a 15% drawdown when you exited the equity fund. 
Is this a safe'}]" 459 | ] 460 | }, 461 | "execution_count": 31, 462 | "metadata": {}, 463 | "output_type": "execute_result" 464 | } 465 | ], 466 | "source": [ 467 | "predictor.predict({\n", 468 | "\t'inputs': \"Can you please let us know more details about your \"\n", 469 | "})" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "parameterized request" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 23, 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "data": { 486 | "text/plain": [ 487 | "[{'generated_text': 'Can you please let us know more details about your \\nissue?\\n\\nA:\\n\\nThe problem was caused by my lack of understanding on how web sockets \\n worked. Once I understood how they work; I was able to fix'}]" 488 | ] 489 | }, 490 | "execution_count": 23, 491 | "metadata": {}, 492 | "output_type": "execute_result" 493 | } 494 | ], 495 | "source": [ 496 | "predictor.predict({\n", 497 | "\t'inputs': \"Can you please let us know more details about your \",\n", 498 | " \"parameters\" : {\n", 499 | " \"min_length\": 120,\n", 500 | " \"temperature\": 0.9,\n", 501 | " }\n", 502 | "})" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": {}, 508 | "source": [ 509 | "custom end of sequence token. " 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "from transformers import AutoTokenizer\n", 519 | "tokenizer = AutoTokenizer.from_pretrained(\"EleutherAI/gpt-j-6B\")\n", 520 | "\n", 521 | "end_sequence=\".\"\n", 522 | "temparature=40\n", 523 | "max_generated_token_length=50\n", 524 | "input=\"Can you please let us know more details about your \"\n", 525 | "\n", 526 | "predictor.predict({\n", 527 | "\t'inputs': input,\n", 528 | " \"parameters\" : {\n", 529 | " \"min_length\": int(len(input) + max_generated_token_length),\n", 530 | " \"temperature\":temparature,\n", 531 | " \"eos_token_id\": tokenizer.convert_tokens_to_ids(end_sequence)\n", 532 | " }\n", 533 | "})" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 32, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "predictor.delete_endpoint()" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [] 551 | } 552 | ], 553 | "metadata": { 554 | "interpreter": { 555 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" 556 | }, 557 | "kernelspec": { 558 | "display_name": "Python 3.8.10 64-bit", 559 | "language": "python", 560 | "name": "python3" 561 | }, 562 | "language_info": { 563 | "codemirror_mode": { 564 | "name": "ipython", 565 | "version": 3 566 | }, 567 | "file_extension": ".py", 568 | "mimetype": "text/x-python", 569 | "name": "python", 570 | "nbconvert_exporter": "python", 571 | "pygments_lexer": "ipython3", 572 | "version": "3.8.10" 573 | }, 574 | "orig_nbformat": 4 575 | }, 576 | "nbformat": 4, 577 | "nbformat_minor": 2 578 | } 579 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.9.1 2 | transformers==4.12.3 3 | boto3 --------------------------------------------------------------------------------