├── .gitignore
├── README.md
├── code
│   └── inference.py
├── convert_gptj.py
├── experiments.ipynb
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | .DS_Store
6 | 
7 | # C extensions
8 | *.so
9 | 
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | pip-wheel-metadata/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 | 
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 | 
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 | 
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 | 
55 | # Translations
56 | *.mo
57 | *.pot
58 | 
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 | 
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 | 
69 | # Scrapy stuff:
70 | .scrapy
71 | 
72 | # Sphinx documentation
73 | docs/_build/
74 | 
75 | # PyBuilder
76 | target/
77 | 
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 | 
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 | 
85 | # pyenv
86 | .python-version
87 | 
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 | 
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 | 
98 | # Celery stuff
99 | celerybeat-schedule
100 | celerybeat.pid
101 | 
102 | # SageMath parsed files
103 | *.sage.py
104 | 
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 | 
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 | 
118 | # Rope project settings
119 | .ropeproject
120 | 
121 | # mkdocs documentation
122 | /site
123 | 
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 | 
129 | # Pyre type checker
130 | .pyre/
131 | 
132 | tmp/
133 | model.tar.gz
134 | *.pt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # sagemaker-gpt-j
2 | 
3 | This repository contains instructions and code for running `GPT-J` inference with Amazon SageMaker.
4 | 
5 | 
6 | ## Getting Started
7 | 
8 | Create the `model.tar.gz` model artifact with the `convert_gptj.py` script.
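For example, with the pinned dependencies from `requirements.txt` installed and an S3 bucket your AWS role can write to (the bucket name below is a placeholder), the conversion can be run as:

```bash
# downloads the fp16 weights, saves them with torch.save, bundles code/inference.py,
# and uploads model.tar.gz to s3://<your-bucket>/gpt-j/model.tar.gz
python convert_gptj.py --bucket_name <your-bucket>
```

The script prints the S3 URI of the uploaded archive. With a `model.tar.gz` in S3 (either your own or the public artifact referenced below), deploy the model to a SageMaker real-time endpoint: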
9 | 
10 | 
11 | ```python
12 | from sagemaker.huggingface import HuggingFaceModel
13 | import sagemaker
14 | 
15 | # IAM role with permissions to create endpoint
16 | role = sagemaker.get_execution_role()
17 | 
18 | # public S3 URI to gpt-j artifact
19 | model_uri="s3://huggingface-sagemaker-models/transformers/4.12.3/pytorch/1.9.1/gpt-j/model.tar.gz"
20 | 
21 | # create Hugging Face Model Class
22 | huggingface_model = HuggingFaceModel(
23 |     model_data=model_uri,
24 |     transformers_version='4.12.3',
25 |     pytorch_version='1.9.1',
26 |     py_version='py38',
27 |     role=role,
28 | )
29 | 
30 | # deploy model to SageMaker Inference
31 | predictor = huggingface_model.deploy(
32 |     initial_instance_count=1, # number of instances
33 |     instance_type='ml.g4dn.xlarge' #'ml.p3.2xlarge' # ec2 instance type
34 | )
35 | ```
36 | 
--------------------------------------------------------------------------------
/code/inference.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | from transformers import AutoTokenizer, pipeline
4 | 
5 | # file name used by convert_gptj.py when saving the model with torch.save
6 | GPT_WEIGHTS_NAME = "gptj.pt"
7 | 
8 | 
9 | def model_fn(model_dir):
10 |     # load the pickled GPT-J model and the tokenizer bundled in model.tar.gz
11 |     model = torch.load(os.path.join(model_dir, GPT_WEIGHTS_NAME))
12 |     tokenizer = AutoTokenizer.from_pretrained(model_dir)
13 | 
14 |     # use the first GPU if one is available, otherwise fall back to CPU
15 |     if torch.cuda.is_available():
16 |         device = 0
17 |     else:
18 |         device = -1
19 | 
20 |     # return a text-generation pipeline; the serving stack uses the object
21 |     # returned by model_fn to handle incoming requests
22 |     generation = pipeline(
23 |         "text-generation", model=model, tokenizer=tokenizer, device=device
24 |     )
25 | 
26 |     return generation
27 | 
--------------------------------------------------------------------------------
/convert_gptj.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import shutil
4 | import tarfile
5 | import argparse
6 | import boto3
7 | import torch
8 | from transformers import AutoTokenizer, GPTJForCausalLM
9 | 
10 | 
11 | def compress(tar_dir=None, output_file="model.tar.gz"):
12 |     with tarfile.open(output_file, "w:gz") as tar:
13 |         tar.add(tar_dir, arcname=os.path.sep)
14 | 
15 | 
16 | def upload_file_to_s3(bucket_name=None, file_name="model.tar.gz", key_prefix=""):
17 |     s3 = boto3.resource("s3")
18 |     key_prefix_with_file_name = os.path.join(key_prefix, file_name)
19 |     s3.Bucket(bucket_name).upload_file(file_name, key_prefix_with_file_name)
20 |     return f"s3://{bucket_name}/{key_prefix_with_file_name}"
21 | 
22 | 
23 | def convert(bucket_name="hf-sagemaker-inference"):
24 |     model_save_dir = "./tmp"
25 |     key_prefix = "gpt-j"
26 |     src_inference_script = os.path.join("code", "inference.py")
27 |     dst_inference_script = os.path.join(model_save_dir, "code")
28 | 
29 |     os.makedirs(model_save_dir, exist_ok=True)
30 |     os.makedirs(dst_inference_script, exist_ok=True)
31 | 
32 |     # load fp16 model
33 |     print("Loading model from `EleutherAI/gpt-j-6B`")
34 |     model = GPTJForCausalLM.from_pretrained(
35 |         "EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16
36 |     )
37 |     print("saving model with `torch.save`")
38 |     torch.save(model, os.path.join(model_save_dir, "gptj.pt"))
39 | 
40 |     print("saving tokenizer")
41 |     tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
42 |     tokenizer.save_pretrained(model_save_dir)
43 | 
44 |     # copy inference script
45 |     print("copying inference.py script")
46 |     shutil.copy(src_inference_script, dst_inference_script)
47 | 
48 |     # create archive
49 |     print("creating `model.tar.gz` archive")
50 |     compress(model_save_dir)
51 | 
52 |     # upload to s3
53 |     print(
54 |         f"uploading `model.tar.gz` archive to s3://{bucket_name}/{key_prefix}/model.tar.gz"
55 |     )
56 |     model_uri = upload_file_to_s3(bucket_name=bucket_name, key_prefix=key_prefix)
57 |     print(f"Successfully uploaded to {model_uri}")
58 | 
59 |     sys.stdout.write(model_uri)
60 |     return model_uri
61 | 
62 | 
63 | def parse_args():
64 |     parser = argparse.ArgumentParser()
65 |     parser.add_argument("--bucket_name", type=str, default=None)
66 |     return parser.parse_args()
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     # parse args
71 |     args = parse_args()
72 | 
73 |     if not args.bucket_name:
74 |         raise ValueError(
75 |             "Please provide a valid `bucket_name` when running `python convert_gptj.py --bucket_name <bucket-name>`"
76 |         )
77 | 
78 |     # convert the model and upload the archive to S3
79 |     convert(args.bucket_name)
80 | 
--------------------------------------------------------------------------------
/experiments.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# Sample Notebook on how to run inference using `GPT-J`\n",
8 |     "\n",
9 |     "The GPT-J model was released in the [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax) repository by Ben Wang and Aran Komatsuzaki. It is a GPT-2-like causal language model trained on the [Pile](https://pile.eleuther.ai/) dataset.\n",
10 |     "\n",
11 |     "This model was contributed by [Stella Biderman](https://huggingface.co/stellaathena).\n",
12 |     "\n",
13 |     "\n",
14 |     "Documentation: [GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj#gptj)\n",
15 |     "\n"
16 |    ]
17 |   },
18 |   {
19 |    "cell_type": "code",
20 |    "execution_count": null,
21 |    "metadata": {},
22 |    "outputs": [],
23 |    "source": [
24 |     "!pip install transformers==4.12.3 torch==1.9.1 --upgrade"
25 |    ]
26 |   },
27 |   {
28 |    "cell_type": "code",
29 |    "execution_count": 5,
30 |    "metadata": {},
31 |    "outputs": [],
32 |    "source": [
33 |     "import transformers\n",
34 |     "import torch\n",
35 |     "\n",
36 |     "assert transformers.__version__ == \"4.12.3\", f\"wrong transformers version: {transformers.__version__}\"\n",
37 |     "assert \"1.9.1\" in torch.__version__, f\"wrong torch version: {torch.__version__}\""
38 |    ]
39 |   },
40 |   {
41 |    "cell_type": "markdown",
42 |    "metadata": {},
43 |    "source": [
44 |     "We are going to use the [fp16 branch](https://huggingface.co/EleutherAI/gpt-j-6B/tree/float16), which stores the weights in half precision and therefore reduces the memory needed to load the model. With these weights it should take roughly 12.1GB of CPU RAM to load the model."
45 |    ]
46 |   },
47 |   {
48 |    "cell_type": "markdown",
49 |    "metadata": {},
50 |    "source": [
51 |     "## Loading the model and using the `generate` method\n",
52 |     "\n",
53 |     "Since we are using the `fp16` branch of the model, it should fit on a 16GB GPU (P3 or T4) for inference.\n",
54 |     "Downloading the fp16 branch of the model (11.3GB) to the `ec2` machine took 3 minutes and 32 seconds. 
Loading the model into memory took another 3 minutes\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 1, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "application/vnd.jupyter.widget-view+json": { 65 | "model_id": "3d49d3cbf1e74e6d8eeb206148a1295a", 66 | "version_major": 2, 67 | "version_minor": 0 68 | }, 69 | "text/plain": [ 70 | "Downloading: 0%| | 0.00/836 [00:00 after loading and saving model + copying `inference.py`" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "%bash\n", 364 | "tar zcvf model.tar.gz *\n", 365 | "aws s3 cp model.tar.gz s3://hf-sagemaker-inference/gpt-j/model.tar.gz\n" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "## Deploy endpoint" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "!pip install sagemaker" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 29, 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "name": "stdout", 391 | "output_type": "stream", 392 | "text": [ 393 | "---------------!" 394 | ] 395 | } 396 | ], 397 | "source": [ 398 | "from sagemaker.huggingface import HuggingFaceModel\n", 399 | "import boto3\n", 400 | "import os\n", 401 | "\n", 402 | "os.environ[\"AWS_DEFAULT_REGION\"]=\"us-east-1\"\n", 403 | "\n", 404 | "\n", 405 | "iam_role=\"sagemaker_execution_role\"\n", 406 | "model_uri=\"s3://hf-sagemaker-inference/gpt-j/model.tar.gz\"\n", 407 | "\n", 408 | "iam_client = boto3.client('iam')\n", 409 | "role = iam_client.get_role(RoleName=iam_role)['Role']['Arn']\n", 410 | "\n", 411 | "# create Hugging Face Model Class\n", 412 | "huggingface_model = HuggingFaceModel(\n", 413 | " model_data=model_uri,\n", 414 | "\ttransformers_version='4.12',\n", 415 | "\tpytorch_version='1.9',\n", 416 | "\tpy_version='py38',\n", 417 | "\trole=role, \n", 418 | ")\n", 419 | "\n", 420 | "\n", 421 | "# deploy model to SageMaker Inference\n", 422 | "predictor = huggingface_model.deploy(\n", 423 | "\tinitial_instance_count=1, # number of instances\n", 424 | "\tinstance_type='ml.g4dn.xlarge' #'ml.p3.2xlarge' # ec2 instance type\n", 425 | ")\n" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 30, 431 | "metadata": {}, 432 | "outputs": [ 433 | { 434 | "data": { 435 | "text/plain": [ 436 | "[{'generated_text': 'Can you please let us know more details about your \\nexperiences with the bookkeeper.\\n\\nI received a call from Chris Foster requesting that you review the below \\nAgreement and return with any comments. \\n\\nAs a'}]" 437 | ] 438 | }, 439 | "execution_count": 30, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "predictor.predict({\n", 446 | "\t'inputs': \"Can you please let us know more details about your \"\n", 447 | "})" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 31, 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "data": { 457 | "text/plain": [ 458 | "[{'generated_text': 'Can you please let us know more details about your xtraday, xtrading and portfolio strategies?\\nSo far, I have read that you have used a 15% drawdown when you exited the equity fund. 
Is this a safe'}]" 459 | ] 460 | }, 461 | "execution_count": 31, 462 | "metadata": {}, 463 | "output_type": "execute_result" 464 | } 465 | ], 466 | "source": [ 467 | "predictor.predict({\n", 468 | "\t'inputs': \"Can you please let us know more details about your \"\n", 469 | "})" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "parameterized request" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 23, 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "data": { 486 | "text/plain": [ 487 | "[{'generated_text': 'Can you please let us know more details about your \\nissue?\\n\\nA:\\n\\nThe problem was caused by my lack of understanding on how web sockets \\n worked. Once I understood how they work; I was able to fix'}]" 488 | ] 489 | }, 490 | "execution_count": 23, 491 | "metadata": {}, 492 | "output_type": "execute_result" 493 | } 494 | ], 495 | "source": [ 496 | "predictor.predict({\n", 497 | "\t'inputs': \"Can you please let us know more details about your \",\n", 498 | " \"parameters\" : {\n", 499 | " \"min_length\": 120,\n", 500 | " \"temperature\": 0.9,\n", 501 | " }\n", 502 | "})" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": {}, 508 | "source": [ 509 | "custom end of sequence token. " 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "from transformers import AutoTokenizer\n", 519 | "tokenizer = AutoTokenizer.from_pretrained(\"EleutherAI/gpt-j-6B\")\n", 520 | "\n", 521 | "end_sequence=\".\"\n", 522 | "temparature=40\n", 523 | "max_generated_token_length=50\n", 524 | "input=\"Can you please let us know more details about your \"\n", 525 | "\n", 526 | "predictor.predict({\n", 527 | "\t'inputs': input,\n", 528 | " \"parameters\" : {\n", 529 | " \"min_length\": int(len(input) + max_generated_token_length),\n", 530 | " \"temperature\":temparature,\n", 531 | " \"eos_token_id\": tokenizer.convert_tokens_to_ids(end_sequence)\n", 532 | " }\n", 533 | "})" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 32, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "predictor.delete_endpoint()" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [] 551 | } 552 | ], 553 | "metadata": { 554 | "interpreter": { 555 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" 556 | }, 557 | "kernelspec": { 558 | "display_name": "Python 3.8.10 64-bit", 559 | "language": "python", 560 | "name": "python3" 561 | }, 562 | "language_info": { 563 | "codemirror_mode": { 564 | "name": "ipython", 565 | "version": 3 566 | }, 567 | "file_extension": ".py", 568 | "mimetype": "text/x-python", 569 | "name": "python", 570 | "nbconvert_exporter": "python", 571 | "pygments_lexer": "ipython3", 572 | "version": "3.8.10" 573 | }, 574 | "orig_nbformat": 4 575 | }, 576 | "nbformat": 4, 577 | "nbformat_minor": 2 578 | } 579 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.9.1 2 | transformers==4.12.3 3 | boto3 --------------------------------------------------------------------------------