├── .gitignore
├── PEFT
├── 0_setup.ipynb
├── 1_prepare-dataset-alpaca-method.ipynb
├── 1_prepare-dataset-chunk-method.ipynb
├── 2_local-infer-debug-lora.ipynb
├── 2_local-train-debug-lora.ipynb
├── 3_sm-train-lora.ipynb
├── 4_sm-serving-djl.ipynb
├── readme.md
└── src
│ ├── local-run-wandb.sh
│ ├── local-run.sh
│ ├── requirements.txt
│ ├── run-wandb.sh
│ ├── run.sh
│ ├── secrets.env
│ └── train.py
├── RAG-SageMaker
├── dataset
│ └── fsi_smart_faq_ko.csv
├── images
│ ├── TensorShard.png
│ ├── architecture-rag-opensearch.png
│ ├── open1.png
│ ├── open2.png
│ ├── open3.png
│ ├── open4.png
│ ├── open5.png
│ ├── open6.png
│ ├── open7.png
│ ├── open8.png
│ └── rag-lang.png
├── indexer
│ └── fsi_faq_indexer_ko
│ │ ├── index.faiss
│ │ └── index.pkl
├── rag-fsi-data-workshop
│ ├── README.md
│ ├── TASK-0_Setup.ipynb
│ ├── TASK-1_Embedding_Vector_Model_Creation.ipynb
│ ├── TASK-2-optional_Polyglot_12.8B_Korea_LLM_Model_Creation.ipynb
│ ├── TASK-2_Polyglot_5.8B_Korea_LLM_Model_Creation.ipynb
│ ├── TASK-3_FSI_FAQ_Faiss_Vector_Search_Local_Store_Test.ipynb
│ ├── TASK-4_OpenSearch_Creation_and_Vector_Insertion.ipynb
│ ├── TASK-5_OpenSearch_LLM_RAG_Streamlit_Chatbot_Example.py
│ ├── requirements.txt
│ └── src
│ │ └── kullm-polyglot-5-8b-v2
│ │ ├── model.py
│ │ └── serving.properties
└── utils
│ ├── inference_utils.py
│ ├── kullm.json
│ └── streamlit_util.py
├── README.md
├── common_code
├── inference_lib.py
└── kullm.json
├── images
├── Nfloat.png
├── TensorShard.png
├── lora.png
├── lora_eq1.png
├── lora_r.png
├── qlora_eq.png
├── qlora_fig1.png
└── quantization.png
├── templates
├── README.md
├── alpaca.json
├── alpaca_legacy.json
├── alpaca_short.json
├── korwkv.json
├── kullm.json
└── vigogne.json
└── utils
├── __init__.py
├── callbacks.py
├── common_lib.py
└── inference_lib.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | target/
3 | .DS_STORE
4 | .settings/*
5 | javadocs/*
6 | .classpath
7 | .project
8 | logs/
9 | .idea
10 | __pycache__/
11 |
12 | # IntelliJ IDEA
13 | **/.idea
14 | *.iml
15 | *.py[cod]
16 | *$py.class
17 |
18 | # C extensions
19 | *.so
20 |
21 | # Distribution / packaging
22 | .Python
23 | build/
24 | develop-eggs/
25 | dist/
26 | downloads/
27 | eggs/
28 | .eggs/
29 | lib/
30 | lib64/
31 | parts/
32 | sdist/
33 | var/
34 | wheels/
35 | share/python-wheels/
36 | *.egg-info/
37 | .installed.cfg
38 | *.egg
39 | MANIFEST
40 |
41 | ### JetBrains template
42 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
43 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
44 |
45 | # User-specific stuff:
46 | .idea/workspace.xml
47 | .idea/tasks.xml
48 |
49 | # Sensitive or high-churn files:
50 | .idea/dataSources/
51 | .idea/dataSources.ids
52 | .idea/dataSources.xml
53 | .idea/dataSources.local.xml
54 | .idea/sqlDataSources.xml
55 | .idea/dynamic.xml
56 | .idea/uiDesigner.xml
57 |
58 | # PyInstaller
59 | # Usually these files are written by a python script from a template
60 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
61 | *.manifest
62 | *.spec
63 |
64 | # Installer logs
65 | pip-log.txt
66 | pip-delete-this-directory.txt
67 |
68 | # Unit test / coverage reports
69 | htmlcov/
70 | .tox/
71 | .nox/
72 | .coverage
73 | .coverage.*
74 | .cache
75 | nosetests.xml
76 | coverage.xml
77 | *.cover
78 | *.py,cover
79 | .hypothesis/
80 | .pytest_cache/
81 | cover/
82 |
83 | # Translations
84 | *.mo
85 | *.pot
86 |
87 | # Django stuff:
88 | *.log
89 | local_settings.py
90 | db.sqlite3
91 | db.sqlite3-journal
92 |
93 | # Flask stuff:
94 | instance/
95 | .webassets-cache
96 |
97 | # Scrapy stuff:
98 | .scrapy
99 |
100 | # Sphinx documentation
101 | docs/_build/
102 |
103 | # PyBuilder
104 | .pybuilder/
105 |
106 | # Jupyter Notebook
107 | .ipynb_checkpoints
108 |
109 | # IPython
110 | profile_default/
111 | ipython_config.py
112 |
113 | # pyenv
114 | # For a library or package, you might want to ignore these files since the code is
115 | # intended to run in multiple environments; otherwise, check them in:
116 | # .python-version
117 |
118 | # pipenv
119 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
120 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
121 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
122 | # install all needed dependencies.
123 | #Pipfile.lock
124 |
125 | # poetry
126 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
127 | # This is especially recommended for binary packages to ensure reproducibility, and is more
128 | # commonly ignored for libraries.
129 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
130 | #poetry.lock
131 |
132 | # pdm
133 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
134 | #pdm.lock
135 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
136 | # in version control.
137 | # https://pdm.fming.dev/#use-with-ide
138 | .pdm.toml
139 |
140 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
141 | __pypackages__/
142 |
143 | # Celery stuff
144 | celerybeat-schedule
145 | celerybeat.pid
146 |
147 | # SageMath parsed files
148 | *.sage.py
149 |
150 | # Environments
151 | .env
152 | .venv
153 | env/
154 | venv/
155 | ENV/
156 | env.bak/
157 | venv.bak/
158 |
159 | # Spyder project settings
160 | .spyderproject
161 | .spyproject
162 |
163 | # Rope project settings
164 | .ropeproject
165 |
166 | # mkdocs documentation
167 | /site
168 |
169 | # mypy
170 | .mypy_cache/
171 | .dmypy.json
172 | dmypy.json
173 |
174 | # Pyre type checker
175 | .pyre/
176 |
177 | # pytype static type analyzer
178 | .pytype/
179 |
180 | # Cython debug symbols
181 | cython_debug/
182 |
183 | .ipynb_checkpoints
184 | */.ipynb_checkpoints/*
185 | *.pyc
186 |
187 | # PyCharm
188 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
189 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
190 | # and can be added to the global gitignore or merged into this file. For a more nuclear
191 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
192 | #.idea/
193 |
--------------------------------------------------------------------------------
/PEFT/0_setup.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "7c5f0126-694e-411b-9380-86f4c1b70d24",
7 | "metadata": {
8 | "tags": []
9 | },
10 | "outputs": [],
11 | "source": [
12 | "!pip install -r src/requirements.txt"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "id": "98ae8673-d1d3-40ea-be62-f1c3aeeead3e",
19 | "metadata": {
20 | "tags": []
21 | },
22 | "outputs": [],
23 | "source": [
24 | "!pip install -qU boto3 botocore huggingface_hub sagemaker langchain deepspeed wandb"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "source": [
30 | "### Change Docker image path to EBS\n",
31 | "#### SageMaker 노트북 인스턴스일 경우 docker image 저장소의 공간이 작아 마운트한 EBS로 변경을 진행합니다.\n",
32 | "SageMaker 노트북 인스턴스에서 로컬 모드 디버깅 시 종종 `No space left` 관련 오류가 발생합니다. 따라서, 도커 이미지/컨테이너가 저장될 폴더를 SageMaker EBS (Amazon Elastic Block Store) 볼륨으로 변경하는 것을 권장합니다. 도커 이미지/컨테이너는 기본적으로 EBS가 아닌 루트 볼륨에 저장하기 때문에(루트 볼륨의 크기는 사용자가 임의로 조정할 수 없습니다!) 고용량의 이미지들을 빌드하면 용량이 꽉 차기 때문입니다."
33 | ],
34 | "metadata": {
35 | "collapsed": false
36 | }
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "outputs": [],
42 | "source": [
43 | "%%bash\n",
44 | "\n",
45 | "#!/usr/bin/env bash\n",
46 | "\n",
47 | "echo '{\n",
48 | " \"runtimes\": {\n",
49 | " \"nvidia\": {\n",
50 | " \"path\": \"nvidia-container-runtime\",\n",
51 | " \"runtimeArgs\": []\n",
52 | " }\n",
53 | " }\n",
54 | "}' > daemon.json\n",
55 | "\n",
56 | "sudo cp daemon.json /etc/docker/daemon.json && rm daemon.json\n",
57 | "\n",
58 | "DAEMON_PATH=\"/etc/docker\"\n",
59 | "MEMORY_SIZE=10G\n",
60 | "\n",
61 | "FLAG=$(cat $DAEMON_PATH/daemon.json | jq 'has(\"data-root\")')\n",
62 | "# echo $FLAG\n",
63 | "\n",
64 | "if [ \"$FLAG\" == true ]; then\n",
65 | " echo \"Already revised\"\n",
66 | "else\n",
67 | " echo \"Add data-root and default-shm-size=$MEMORY_SIZE\"\n",
68 | " sudo cp $DAEMON_PATH/daemon.json $DAEMON_PATH/daemon.json.bak\n",
69 | " sudo cat $DAEMON_PATH/daemon.json.bak | jq '. += {\"data-root\":\"/home/ec2-user/SageMaker/.container/docker\",\"default-shm-size\":\"'$MEMORY_SIZE'\"}' | sudo tee $DAEMON_PATH/daemon.json > /dev/null\n",
70 | " sudo service docker restart\n",
71 | " echo \"Docker Restart\"\n",
72 | "fi\n",
73 | "\n",
74 | "sudo docker info | grep Root"
75 | ],
76 | "metadata": {
77 | "collapsed": false
78 | }
79 | }
80 | ],
81 | "metadata": {
82 | "kernelspec": {
83 | "display_name": "conda_pytorch_p310",
84 | "language": "python",
85 | "name": "conda_pytorch_p310"
86 | },
87 | "language_info": {
88 | "codemirror_mode": {
89 | "name": "ipython",
90 | "version": 3
91 | },
92 | "file_extension": ".py",
93 | "mimetype": "text/x-python",
94 | "name": "python",
95 | "nbconvert_exporter": "python",
96 | "pygments_lexer": "ipython3",
97 | "version": "3.10.10"
98 | }
99 | },
100 | "nbformat": 4,
101 | "nbformat_minor": 5
102 | }
103 |
--------------------------------------------------------------------------------
/PEFT/2_local-train-debug-lora.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "55cfa3d1-2187-4176-8c23-19bdf0925364",
6 | "metadata": {},
7 | "source": [
8 | "# Korean LLM (Large Language Model) fine-tuning on Local environment (Debugging)\n",
9 | "---\n",
10 | "\n",
11 | "- 허깅페이스 인증 정보 설정: `huggingface-cli login`\n",
12 | " - https://huggingface.co/join\n",
13 | " - https://huggingface.co/settings/tokens\n",
14 | " \n",
15 | "\n",
16 | "## Overview \n",
17 | "\n",
18 | "본격적으로 SageMaker 훈련 인스턴스로 훈련을 수행하기 전에 SageMaker Notebook / SageMaker Studio / SageMaker Studio Lab 에서 샘플 데이터로 디버깅을 수행합니다.\n",
19 | "물론 온프레미스 환경에서 디버깅을 수행할 수 있다면, 기존 환경과 동일하게 디버깅을 수행하면 됩니다.\n"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "id": "d8e8020b-9964-4198-aed5-0efca878bf5a",
26 | "metadata": {
27 | "tags": []
28 | },
29 | "outputs": [],
30 | "source": [
31 | "%load_ext autoreload\n",
32 | "%autoreload 2\n",
33 | "import sys\n",
34 | "import os\n",
35 | "import torch\n",
36 | "import transformers\n",
37 | "from datasets import load_dataset, load_from_disk\n",
38 | "from transformers import GPTNeoXForCausalLM, GPTNeoXTokenizerFast\n",
39 | "\n",
40 | "sys.path.append('../utils')\n",
41 | "sys.path.append('../templates')"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 2,
47 | "id": "96b7bd04-d647-45cc-a340-84505df99f67",
48 | "metadata": {
49 | "tags": []
50 | },
51 | "outputs": [],
52 | "source": [
53 | "%store -r bucket_prefix dataset_prefix s3_data_path dataset_prefix_all"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 3,
59 | "id": "e2745978-138f-4b70-8693-64859f4704ea",
60 | "metadata": {
61 | "tags": []
62 | },
63 | "outputs": [],
64 | "source": [
65 | "try:\n",
66 | " dataset_prefix\n",
67 | "except NameError:\n",
68 | " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
69 | " print(\"[ERROR] 1번 모듈 노트북을 다시 실행해 주세요.\")\n",
70 | " print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 4,
76 | "id": "d44b3088-f0b9-4897-a556-933475a9880a",
77 | "metadata": {
78 | "tags": []
79 | },
80 | "outputs": [],
81 | "source": [
82 | "lm_dataset = load_from_disk(dataset_prefix)"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 5,
88 | "id": "70a0cd87",
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "#lm_dataset = load_from_disk(dataset_prefix_all)"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 6,
98 | "id": "dfe7d512",
99 | "metadata": {},
100 | "outputs": [
101 | {
102 | "data": {
103 | "text/plain": [
104 | "Dataset({\n",
105 | " features: ['input_ids', 'attention_mask', 'labels'],\n",
106 | " num_rows: 50\n",
107 | "})"
108 | ]
109 | },
110 | "execution_count": 6,
111 | "metadata": {},
112 | "output_type": "execute_result"
113 | }
114 | ],
115 | "source": [
116 | "lm_dataset"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 7,
122 | "id": "5497012c",
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "data": {
127 | "text/plain": [
128 | "'chunk-train'"
129 | ]
130 | },
131 | "execution_count": 7,
132 | "metadata": {},
133 | "output_type": "execute_result"
134 | }
135 | ],
136 | "source": [
137 | "dataset_prefix"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "id": "624f77db-7a07-4439-98c0-2e19cc72e0cb",
143 | "metadata": {},
144 | "source": [
145 | "
\n",
146 | "\n",
147 | "## 1. Load Model\n",
148 | "---\n",
149 | "한정된 GPU 메모리 안에 LLM 모델을 로드하는 것은 매우 어렵습니다. 예컨대 20B의 모델을 로드하려면 fp32 기준으로 80GB 이상의 메모리가 필요하고 fp16 기준으로도 40GB 이상의 GPU 메모리가 필요하며, 파인 튜닝을 수행하는 경우는 이보다 더욱 많은 GPU 메모리가 필요합니다. 이런 경우 4비트 양자화와 LoRA를 사용하면 범용적으로 사용하고 있는 16GB 및 24GB GPU 메모리로도 파인 튜닝이 가능합니다. 현 기준으로는 4비트 양자화를 지원하는 QLoRA 기법이 가장 널리 사용되고 있으며 bitsandbytes를 사용하여 QLoRA를 쉽게 적용할 수 있습니다. QLoRA는 양자화된 파라미터의 분포 범위를 정규 분포 내로 억제하여 정밀도의 저하를 방지하는 4비트 NormalFloat 양자화 양자화를 적용하는 정수에 대해서도 양자화를 적용하는 이중 양자화, 그리고 optimizer state 등의 데이터를 CPU 메모리에 저장하는 페이징 기법을 적용하여 GPU 메모리 사용량을 억제합니다. QLoRA에 대한 자세한 내용은 논문 (https://arxiv.org/pdf/2305.14314.pdf) 을 참조하기 바랍니다.\n",
150 | "\n",
151 | "### Create a bitsandbytes configuration"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 8,
157 | "id": "965584cc-3956-4711-8eb4-4fbdaa41f397",
158 | "metadata": {
159 | "tags": []
160 | },
161 | "outputs": [],
162 | "source": [
163 | "from transformers import BitsAndBytesConfig\n",
164 | "quant_4bit = True\n",
165 | "quant_8bit = False\n",
166 | "\n",
167 | "if quant_4bit:\n",
168 | " nf4_config = BitsAndBytesConfig(\n",
169 | " load_in_4bit=True,\n",
170 | " bnb_4bit_quant_type=\"nf4\",\n",
171 | " bnb_4bit_use_double_quant=True,\n",
172 | " bnb_4bit_compute_dtype=torch.bfloat16\n",
173 | ")\n",
174 | "else:\n",
175 | " nf4_config = None"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 9,
181 | "id": "f50e68b0",
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "torch.cuda.empty_cache()"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 10,
191 | "id": "9da734ff-5488-4ead-9004-9b5a049b0070",
192 | "metadata": {
193 | "tags": []
194 | },
195 | "outputs": [],
196 | "source": [
197 | "import os\n",
198 | "from pathlib import Path\n",
199 | "from huggingface_hub import snapshot_download\n",
200 | "\n",
201 | "HF_MODEL_ID = \"nlpai-lab/kullm-polyglot-12.8b-v2\"\n",
202 | "\n",
203 | "# create model dir\n",
204 | "model_name = HF_MODEL_ID.split(\"/\")[-1].replace('.', '-')\n",
205 | "model_tar_dir = Path(f\"/home/ec2-user/SageMaker/models/{model_name}\")"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 11,
211 | "id": "7b565d1a-5969-4bd3-8233-cce8ad1e245e",
212 | "metadata": {
213 | "tags": []
214 | },
215 | "outputs": [
216 | {
217 | "name": "stderr",
218 | "output_type": "stream",
219 | "text": [
220 | "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
221 | "The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. \n",
222 | "The class this function is called from is 'GPTNeoXTokenizerFast'.\n",
223 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
224 | ]
225 | },
226 | {
227 | "data": {
228 | "application/vnd.jupyter.widget-view+json": {
229 | "model_id": "363ea0fd653b4a6f8cf238d773f684c9",
230 | "version_major": 2,
231 | "version_minor": 0
232 | },
233 | "text/plain": [
234 | "Loading checkpoint shards: 0%| | 0/3 [00:00, ?it/s]"
235 | ]
236 | },
237 | "metadata": {},
238 | "output_type": "display_data"
239 | }
240 | ],
241 | "source": [
242 | "device_map = \"auto\"\n",
243 | "\n",
244 | "tokenizer = GPTNeoXTokenizerFast.from_pretrained(HF_MODEL_ID)\n",
245 | "\n",
246 | "model = GPTNeoXForCausalLM.from_pretrained(\n",
247 | " model_tar_dir,\n",
248 | " load_in_8bit=True if quant_8bit else False,\n",
249 | " torch_dtype=torch.float16,\n",
250 | " device_map=device_map,\n",
251 | " #cache_dir=cache_dir,\n",
252 | " quantization_config=nf4_config,\n",
253 | ")"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "id": "a3b37ebe-c475-47ee-993f-464c0d7201d0",
259 | "metadata": {},
260 | "source": [
261 | "### Create LoRA config\n",
262 | "LoRA 설정에 대한 자세한 내용은 아래를 참조해 주세요.\n",
263 | "- https://huggingface.co/docs/peft/conceptual_guides/lora"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 12,
269 | "id": "d6d0fcf8-67a0-44cc-9b89-c6f26a8dcc2d",
270 | "metadata": {
271 | "tags": []
272 | },
273 | "outputs": [],
274 | "source": [
275 | "from peft import (\n",
276 | " LoraConfig,\n",
277 | " get_peft_model,\n",
278 | " get_peft_model_state_dict,\n",
279 | " prepare_model_for_kbit_training,\n",
280 | " set_peft_model_state_dict,\n",
281 | ")\n",
282 | "\n",
283 | "model = prepare_model_for_kbit_training(model)\n",
284 | "\n",
285 | "lora_r = 8\n",
286 | "lora_alpha = 32\n",
287 | "lora_dropout = 0.05\n",
288 | "lora_target_modules = [\"query_key_value\", \"xxx\"]\n",
289 | " \n",
290 | "config = LoraConfig(\n",
291 | " r=lora_r,\n",
292 | " lora_alpha=lora_alpha,\n",
293 | " target_modules=lora_target_modules,\n",
294 | " lora_dropout=lora_dropout,\n",
295 | " bias=\"none\",\n",
296 | " task_type=\"CAUSAL_LM\",\n",
297 | ")\n",
298 | "model = get_peft_model(model, config)"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 13,
304 | "id": "35312b9f-98d5-410b-b120-4156c5cce185",
305 | "metadata": {
306 | "tags": []
307 | },
308 | "outputs": [
309 | {
310 | "name": "stdout",
311 | "output_type": "stream",
312 | "text": [
313 | "trainable params: 6,553,600 || all params: 12,900,157,440 || trainable%: 0.05080248074863806\n"
314 | ]
315 | }
316 | ],
317 | "source": [
318 | "model.print_trainable_parameters()"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "id": "f488f765-058c-434b-8ed1-b64a490cf96c",
324 | "metadata": {},
325 | "source": [
326 | "
\n",
327 | "\n",
328 | "## 2. Training\n",
329 | "---\n",
330 | "### Setting Hyperparameters"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": 15,
336 | "id": "415f898f-760f-4ffa-ba5c-3e93c0dee47b",
337 | "metadata": {
338 | "tags": []
339 | },
340 | "outputs": [],
341 | "source": [
342 | "train_data = lm_dataset\n",
343 | "val_data = None\n",
344 | "num_epochs = 3\n",
345 | "batch_size = 2\n",
346 | "\n",
347 | "learning_rate = 3e-5\n",
348 | "gradient_accumulation_steps = 2\n",
349 | "val_set_size = 0\n",
350 | "output_dir = 'output'\n",
351 | "world_size = 1\n",
352 | "ddp = world_size != 1\n",
353 | "group_by_length = False"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": 16,
359 | "id": "55083ad0-a671-4634-9912-e036b18f29d9",
360 | "metadata": {
361 | "tags": []
362 | },
363 | "outputs": [],
364 | "source": [
365 | "train_args = transformers.TrainingArguments(\n",
366 | " per_device_train_batch_size=batch_size,\n",
367 | " gradient_accumulation_steps=gradient_accumulation_steps,\n",
368 | " warmup_steps=100,\n",
369 | " num_train_epochs=num_epochs,\n",
370 | " learning_rate=learning_rate,\n",
371 | " bf16=True,\n",
372 | " logging_steps=2,\n",
373 | " optim=\"paged_adamw_8bit\",\n",
374 | " evaluation_strategy=\"steps\" if val_set_size > 0 else \"no\",\n",
375 | " save_strategy=\"steps\",\n",
376 | " eval_steps=200 if val_set_size > 0 else None,\n",
377 | " save_steps=10,\n",
378 | " output_dir=output_dir,\n",
379 | " load_best_model_at_end=True if val_set_size > 0 else False,\n",
380 | " ddp_find_unused_parameters=False if ddp else None,\n",
381 | " report_to=\"none\",\n",
382 | " group_by_length=group_by_length,\n",
383 | ")\n",
384 | "\n",
385 | "trainer = transformers.Trainer(\n",
386 | " model=model,\n",
387 | " train_dataset=train_data,\n",
388 | " eval_dataset=val_data,\n",
389 | " args=train_args,\n",
390 | " data_collator=transformers.DataCollatorForSeq2Seq(\n",
391 | " tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
392 | " ),\n",
393 | ")"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": 17,
399 | "id": "6208dfc0",
400 | "metadata": {},
401 | "outputs": [
402 | {
403 | "data": {
404 | "text/plain": [
405 | "PeftModelForCausalLM(\n",
406 | " (base_model): LoraModel(\n",
407 | " (model): GPTNeoXForCausalLM(\n",
408 | " (gpt_neox): GPTNeoXModel(\n",
409 | " (embed_in): Embedding(30080, 5120)\n",
410 | " (emb_dropout): Dropout(p=0.0, inplace=False)\n",
411 | " (layers): ModuleList(\n",
412 | " (0-39): 40 x GPTNeoXLayer(\n",
413 | " (input_layernorm): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)\n",
414 | " (post_attention_layernorm): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)\n",
415 | " (post_attention_dropout): Dropout(p=0.0, inplace=False)\n",
416 | " (post_mlp_dropout): Dropout(p=0.0, inplace=False)\n",
417 | " (attention): GPTNeoXAttention(\n",
418 | " (rotary_emb): GPTNeoXRotaryEmbedding()\n",
419 | " (query_key_value): lora.Linear4bit(\n",
420 | " (base_layer): Linear4bit(in_features=5120, out_features=15360, bias=True)\n",
421 | " (lora_dropout): ModuleDict(\n",
422 | " (default): Dropout(p=0.05, inplace=False)\n",
423 | " )\n",
424 | " (lora_A): ModuleDict(\n",
425 | " (default): Linear(in_features=5120, out_features=8, bias=False)\n",
426 | " )\n",
427 | " (lora_B): ModuleDict(\n",
428 | " (default): Linear(in_features=8, out_features=15360, bias=False)\n",
429 | " )\n",
430 | " (lora_embedding_A): ParameterDict()\n",
431 | " (lora_embedding_B): ParameterDict()\n",
432 | " )\n",
433 | " (dense): Linear4bit(in_features=5120, out_features=5120, bias=True)\n",
434 | " (attention_dropout): Dropout(p=0.0, inplace=False)\n",
435 | " )\n",
436 | " (mlp): GPTNeoXMLP(\n",
437 | " (dense_h_to_4h): Linear4bit(in_features=5120, out_features=20480, bias=True)\n",
438 | " (dense_4h_to_h): Linear4bit(in_features=20480, out_features=5120, bias=True)\n",
439 | " (act): GELUActivation()\n",
440 | " )\n",
441 | " )\n",
442 | " )\n",
443 | " (final_layer_norm): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)\n",
444 | " )\n",
445 | " (embed_out): Linear(in_features=5120, out_features=30080, bias=False)\n",
446 | " )\n",
447 | " )\n",
448 | ")"
449 | ]
450 | },
451 | "execution_count": 17,
452 | "metadata": {},
453 | "output_type": "execute_result"
454 | }
455 | ],
456 | "source": [
457 | "model"
458 | ]
459 | },
460 | {
461 | "cell_type": "markdown",
462 | "id": "66470982-1dad-4e3d-9e29-a736fab51cf0",
463 | "metadata": {},
464 | "source": [
465 | "### Start Training"
466 | ]
467 | },
468 | {
469 | "cell_type": "code",
470 | "execution_count": 18,
471 | "id": "2a70473b-78dd-4ede-9b81-4980e9404c3f",
472 | "metadata": {
473 | "tags": []
474 | },
475 | "outputs": [
476 | {
477 | "name": "stderr",
478 | "output_type": "stream",
479 | "text": [
480 | "You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
481 | "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
482 | " warnings.warn(\n"
483 | ]
484 | },
485 | {
486 | "data": {
487 | "text/html": [
488 | "\n",
489 | "
Step | \n", 498 | "Training Loss | \n", 499 | "
---|---|
2 | \n", 504 | "1.970600 | \n", 505 | "
4 | \n", 508 | "1.837500 | \n", 509 | "
6 | \n", 512 | "1.886500 | \n", 513 | "
8 | \n", 516 | "1.978900 | \n", 517 | "
10 | \n", 520 | "1.923500 | \n", 521 | "
12 | \n", 524 | "1.842900 | \n", 525 | "
14 | \n", 528 | "1.934400 | \n", 529 | "
16 | \n", 532 | "1.957300 | \n", 533 | "
18 | \n", 536 | "1.807000 | \n", 537 | "
20 | \n", 540 | "1.895300 | \n", 541 | "
22 | \n", 544 | "1.961800 | \n", 545 | "
24 | \n", 548 | "1.897600 | \n", 549 | "
26 | \n", 552 | "1.890900 | \n", 553 | "
28 | \n", 556 | "1.920700 | \n", 557 | "
30 | \n", 560 | "1.868100 | \n", 561 | "
32 | \n", 564 | "1.872200 | \n", 565 | "
34 | \n", 568 | "1.945600 | \n", 569 | "
36 | \n", 572 | "1.869000 | \n", 573 | "
"
576 | ],
577 | "text/plain": [
578 | "
\n",
89 | "\n",
90 | "## 1. Merge PEFT model\n",
91 | "---"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "id": "82d3ff2f-c8bf-4bd8-86e3-ea801c2ea983",
98 | "metadata": {
99 | "tags": []
100 | },
101 | "outputs": [],
102 | "source": [
103 | "local_peft_model_dir = 'model_from_sagemaker'\n",
104 | "\n",
105 | "if not os.path.exists(local_peft_model_dir):\n",
106 | " os.makedirs(local_peft_model_dir)\n",
107 | "\n",
108 | "!aws s3 cp {estimator.model_data} {local_peft_model_dir}/model.tar.gz\n",
109 | "!tar -xzf {local_peft_model_dir}/model.tar.gz -C {local_peft_model_dir}\n",
110 | "!rm {local_peft_model_dir}/model.tar.gz"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "id": "8ddab477-5c3d-49d4-b025-3ab50a335b7c",
117 | "metadata": {
118 | "tags": []
119 | },
120 | "outputs": [],
121 | "source": [
122 | "def modify_base_model_path(peft_model_dir, base_model_path):\n",
123 | " with open(f\"{peft_model_dir}/adapter_config.json\", \"r\") as jsonfile:\n",
124 | " data = json.load(jsonfile)\n",
125 | " data[\"base_model_name_or_path\"] = base_model_path\n",
126 | " with open(f\"{peft_model_dir}/adapter_config.json\", \"w\") as jsonfile:\n",
127 | " json.dump(data, jsonfile)\n",
128 | "\n",
129 | "def byte_transform(byte_size, to):\n",
130 | " a = {'k': 1, 'm': 2, 'g': 3, 't': 4, 'p': 5, 'e': 6}\n",
131 | " r = float(byte_size)\n",
132 | " for i in range(a[to]):\n",
133 | " r = r / 1024\n",
134 | " return round(r, 4)\n",
135 | "\n",
136 | "size = 0\n",
137 | "for ele in os.scandir(local_peft_model_dir):\n",
138 | " size += os.path.getsize(ele)\n",
139 | "gb_size = byte_transform(size, 'g')"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "id": "e12ae953-6057-4587-8ead-987097cc6131",
146 | "metadata": {
147 | "tags": []
148 | },
149 | "outputs": [],
150 | "source": [
151 | "if gb_size < 0.5:\n",
152 | " base_model_path = \"/home/ec2-user/SageMaker/models/kullm-polyglot-12-8b-v2\"\n",
153 | " modify_base_model_path(local_peft_model_dir, base_model_path)\n",
154 | " \n",
155 | " local_model_dir = \"model_from_sagemaker_merged\" \n",
156 | " print(f'Save merged model: {local_model_dir}') \n",
157 | " os.makedirs(local_model_dir, exist_ok=True)\n",
158 | " model = AutoPeftModelForCausalLM.from_pretrained(local_peft_model_dir, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16)\n",
159 | " merged_model = model.merge_and_unload()\n",
160 | " merged_model.save_pretrained(local_model_dir, safe_serialization=True)\n",
161 | " \n",
162 | " tokenizer = AutoTokenizer.from_pretrained(local_peft_model_dir)\n",
163 | " tokenizer.save_pretrained(local_model_dir)\n",
164 | "else:\n",
165 | " local_model_dir = local_peft_model_dir"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "id": "4406229a",
171 | "metadata": {},
172 | "source": [
173 | "
\n",
174 | "\n",
175 | "## 2. Upload LLM model to S3\n",
176 | "---"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "id": "26c4d12b-05b1-43d6-91c6-9de80720c534",
183 | "metadata": {
184 | "tags": []
185 | },
186 | "outputs": [],
187 | "source": [
188 | "model_prefix = \"kkulm-qlora-12-8B\"\n",
189 | "model_tar_dir = f\"{os.getcwd()}/model_from_sagemaker_merged\"\n",
190 | "\n",
191 | "bucket_prefix = 'ko-llms/serving' \n",
192 | "s3_code_prefix = f\"{bucket_prefix}/{model_prefix}/code\" # folder within bucket where code artifact will go\n",
193 | "s3_model_prefix = f\"{bucket_prefix}/{model_prefix}/model\" # folder where model checkpoint will go\n",
194 | "s3_model_artifact = f\"s3://{bucket}/{s3_model_prefix}\"\n",
195 | "\n",
196 | "print(f\"S3 code prefix \\n {s3_code_prefix}\")\n",
197 | "print(f\"S3 model prefix: \\n {s3_model_prefix}\")\n",
198 | "print(f\"S3 model artifact path: \\n {s3_model_artifact}\")"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": null,
204 | "id": "3974a980-1452-4823-98ac-68516bae28cb",
205 | "metadata": {
206 | "tags": []
207 | },
208 | "outputs": [],
209 | "source": [
210 | "%%bash\n",
211 | "aws configure set default.s3.max_concurrent_requests 100\n",
212 | "aws configure set default.s3.max_queue_size 10000\n",
213 | "aws configure set default.s3.multipart_threshold 1GB\n",
214 | "aws configure set default.s3.multipart_chunksize 64MB"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "id": "040f782f-4815-4ee9-8605-d259d61e0e97",
221 | "metadata": {
222 | "tags": []
223 | },
224 | "outputs": [],
225 | "source": [
226 | "!aws s3 sync {model_tar_dir} {s3_model_artifact}\n",
227 | "print(f\"Model uploaded to --- > {s3_model_artifact}\")\n",
228 | "print(f\"We will set option.s3url={s3_model_artifact}\")"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "id": "09506187-4c72-49ee-a450-53855eb2f8d3",
235 | "metadata": {
236 | "tags": []
237 | },
238 | "outputs": [],
239 | "source": [
240 | "src_path = f\"serving_src/{model_prefix}\"\n",
241 | "!rm -rf {src_path}\n",
242 | "os.makedirs(src_path, exist_ok=True)"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "id": "51a8cbe6-b1eb-4618-8217-d0ecd8c6428e",
249 | "metadata": {
250 | "tags": []
251 | },
252 | "outputs": [],
253 | "source": [
254 | "%%writefile {src_path}/serving.properties\n",
255 | "option.s3url={{s3url}}\n",
256 | "\n",
257 | "engine=DeepSpeed\n",
258 | "\n",
259 | "# passing extra options to model.py or built-in handler\n",
260 | "job_queue_size=100\n",
261 | "batch_size=1\n",
262 | "max_batch_delay=1\n",
263 | "max_idle_time=60\n",
264 | "\n",
265 | "# Built-in entrypoint\n",
266 | "#option.entryPoint=djl_python.deepspeed\n",
267 | "\n",
268 | "# Hugging Face model id\n",
269 | "#option.model_id=EleutherAI/gpt-j-6B\n",
270 | "\n",
271 | "# defines custom environment variables\n",
272 | "#env=SERVING_NUMBER_OF_NETTY_THREADS=2\n",
273 | "\n",
274 | "# Allows to load DeepSpeed workers in parallel\n",
275 | "option.parallel_loading=true\n",
276 | "\n",
277 | "# specify tensor parallel degree (number of partitions)\n",
278 | "option.tensor_parallel_degree=4\n",
279 | "\n",
280 | "# specify per model timeout\n",
281 | "option.model_loading_timeout=600\n",
282 | "#option.predict_timeout=240\n",
283 | "\n",
284 | "# mark the model as failure after python process crashing 10 times\n",
285 | "retry_threshold=0\n",
286 | "\n",
287 | "option.task=text-generation"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "id": "89f67d0e-6bd4-4919-a400-16f38f7ebcbb",
294 | "metadata": {
295 | "tags": []
296 | },
297 | "outputs": [],
298 | "source": [
299 | "from pathlib import Path\n",
300 | "jinja_env = jinja2.Environment() \n",
301 | "# we plug in the appropriate model location into our `serving.properties` file based on the region in which this notebook is running\n",
302 | "template = jinja_env.from_string(Path(f\"{src_path}/serving.properties\").open().read())\n",
303 | "Path(f\"{src_path}/serving.properties\").open(\"w\").write(template.render(s3url=s3_model_artifact))\n",
304 | "!pygmentize {src_path}/serving.properties | cat -n"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "id": "9ff83d21-555f-47a4-8b73-24fe97db441e",
311 | "metadata": {
312 | "tags": []
313 | },
314 | "outputs": [],
315 | "source": [
316 | "%%writefile {src_path}/model.py\n",
317 | "from djl_python import Input, Output\n",
318 | "import os\n",
319 | "import deepspeed\n",
320 | "import torch\n",
321 | "import logging\n",
322 | "from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer\n",
323 | "from transformers import GPTNeoXLayer\n",
324 | "\n",
325 | "predictor = None\n",
326 | "\n",
327 | "def get_model(properties):\n",
328 | " \n",
329 | " tp_degree = properties[\"tensor_parallel_degree\"]\n",
330 | " model_location = properties[\"model_dir\"]\n",
331 | " if \"model_id\" in properties:\n",
332 | " model_location = properties[\"model_id\"]\n",
333 | " task = properties[\"task\"]\n",
334 | " \n",
335 | " logging.info(f\"Loading model in {model_location}\") \n",
336 | " local_rank = int(os.getenv(\"LOCAL_RANK\", \"0\"))\n",
337 | "\n",
338 | " tokenizer = AutoTokenizer.from_pretrained(model_location)\n",
339 | "\n",
340 | " model = AutoModelForCausalLM.from_pretrained(\n",
341 | " model_location,\n",
342 | " torch_dtype=torch.float16,\n",
343 | " low_cpu_mem_usage=True,\n",
344 | " )\n",
345 | " \n",
346 | " model.requires_grad_(False)\n",
347 | " model.eval()\n",
348 | " \n",
349 | " ds_config = {\n",
350 | " \"tensor_parallel\": {\"tp_size\": tp_degree},\n",
351 | " \"dtype\": model.dtype,\n",
352 | " \"injection_policy\": {\n",
353 | " GPTNeoXLayer:('attention.dense', 'mlp.dense_4h_to_h')\n",
354 | " }\n",
355 | " }\n",
356 | " logging.info(f\"Starting DeepSpeed init with TP={tp_degree}\") \n",
357 | " model = deepspeed.init_inference(model, ds_config) \n",
358 | " \n",
359 | " generator = pipeline(\n",
360 | " task=task, model=model, tokenizer=tokenizer, device=local_rank\n",
361 | " )\n",
362 | " # https://huggingface.co/docs/hub/models-tasks\n",
363 | " return generator\n",
364 | " \n",
365 | "def handle(inputs: Input) -> None:\n",
366 | " \"\"\"\n",
367 | " inputs: Contains the configurations from serving.properties\n",
368 | " \"\"\" \n",
369 | " global predictor\n",
370 | " if not predictor:\n",
371 | " predictor = get_model(inputs.get_properties())\n",
372 | "\n",
373 | " if inputs.is_empty():\n",
374 | " # Model server makes an empty call to warmup the model on startup\n",
375 | " logging.info(\"is_empty\")\n",
376 | " return None\n",
377 | "\n",
378 | " data = inputs.get_as_json() #inputs.get_as_string()\n",
379 | " logging.info(\"data:\", data)\n",
380 | " \n",
381 | " input_prompt, params = data[\"inputs\"], data[\"parameters\"]\n",
382 | " result = predictor(input_prompt, **params)\n",
383 | " logging.info(\"result:\", result)\n",
384 | "\n",
385 | " return Output().add_as_json(result) #Output().add(result) "
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": null,
391 | "id": "cfeca701-a44f-496b-a841-bdbb354bd11c",
392 | "metadata": {
393 | "tags": []
394 | },
395 | "outputs": [],
396 | "source": [
397 | "!rm -rf model.tar.gz\n",
398 | "!tar czvf model.tar.gz -C {src_path} ."
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": null,
404 | "id": "b8f234e3-5588-4bbd-a95f-9c74aa50944e",
405 | "metadata": {
406 | "tags": []
407 | },
408 | "outputs": [],
409 | "source": [
410 | "s3_code_artifact = sess.upload_data(\"model.tar.gz\", bucket, s3_code_prefix)\n",
411 | "print(f\"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}\")\n",
412 | "!rm -rf model.tar.gz"
413 | ]
414 | },
415 | {
416 | "cell_type": "markdown",
417 | "id": "c4834546-f482-41ae-b94e-901f8be01e5e",
418 | "metadata": {},
419 | "source": [
420 | "
\n",
421 | "\n",
422 | "## 3. Serve LLM Model on SageMaker\n",
423 | "---\n",
424 | "### Create SageMaker Model\n",
425 | "\n",
426 | "SageMaker 엔드포인트 생성 매개변수 VolumeSizeInGB를 지정할 때 마운트되는 Amazon EBS(Amazon Elastic Block Store) 볼륨에 /tmp를 매핑하기 때문에 컨테이너는 인스턴스의 `/tmp` 공간에 모델을 다운로드합니다. 이때 s5cmd (https://github.com/peak/s5cmd) 를 활용하므로 대용량 모델을 빠르게 다운로드할 수 있습니다.\n",
427 | "볼륨 인스턴스와 함께 미리 빌드되어 제공되는 p4dn과 같은 인스턴스의 경우 컨테이너의 `/tmp`를 계속 활용할 수 있습니다. "
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": null,
433 | "id": "cf1e0f9c-3eb4-441e-8f22-1e9292abf3bb",
434 | "metadata": {
435 | "tags": []
436 | },
437 | "outputs": [],
438 | "source": [
439 | "from sagemaker.utils import name_from_base\n",
440 | "from sagemaker import image_uris\n",
441 | "\n",
442 | "img_uri = image_uris.retrieve(framework=\"djl-deepspeed\", region=region, version=\"0.23.0\")\n",
443 | "model_name = name_from_base(f\"{model_prefix}\")\n",
444 | "print(model_name)\n",
445 | "\n",
446 | "model_response = sm_client.create_model(\n",
447 | " ModelName=model_name,\n",
448 | " ExecutionRoleArn=role,\n",
449 | " PrimaryContainer={\"Image\": img_uri, \"ModelDataUrl\": s3_code_artifact},\n",
450 | ")\n",
451 | "model_arn = model_response[\"ModelArn\"]\n",
452 | "print(f\"Created Model: {model_arn}\")"
453 | ]
454 | },
455 | {
456 | "cell_type": "markdown",
457 | "id": "f7aee43a-87cf-4a0f-ad70-5715c745d7ad",
458 | "metadata": {},
459 | "source": [
460 | "### Create SageMaker Endpoint"
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": null,
466 | "id": "c82f3d85-070a-4cd5-bbfa-6c2ec4afd7d5",
467 | "metadata": {
468 | "tags": []
469 | },
470 | "outputs": [],
471 | "source": [
472 | "endpoint_config_name = f\"{model_name}-config\"\n",
473 | "endpoint_name = f\"{model_name}-endpoint\"\n",
474 | "variant_name = \"variant1\"\n",
475 | "instance_type = \"ml.g5.12xlarge\"\n",
476 | "initial_instance_count = 1\n",
477 | "\n",
478 | "prod_variants = [\n",
479 | " {\n",
480 | " \"VariantName\": \"variant1\",\n",
481 | " \"ModelName\": model_name,\n",
482 | " \"InstanceType\": instance_type,\n",
483 | " \"InitialInstanceCount\": initial_instance_count,\n",
484 | " # \"ModelDataDownloadTimeoutInSeconds\": 2400,\n",
485 | " \"ContainerStartupHealthCheckTimeoutInSeconds\": 600,\n",
486 | " }\n",
487 | "]\n",
488 | "\n",
489 | "endpoint_config_response = sm_client.create_endpoint_config(\n",
490 | " EndpointConfigName=endpoint_config_name,\n",
491 | " ProductionVariants=prod_variants\n",
492 | ")\n",
493 | "\n",
494 | "endpoint_response = sm_client.create_endpoint(\n",
495 | " EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name\n",
496 | ")\n",
497 | "print(f\"Created Endpoint: {endpoint_response['EndpointArn']}\")"
498 | ]
499 | },
500 | {
501 | "cell_type": "markdown",
502 | "id": "bdf730b4-80ca-4553-be16-60d82be22ae1",
503 | "metadata": {},
504 | "source": [
505 | "엔드포인트가 생성되는 동안 아래의 문서를 같이 확인해 보세요.\n",
506 | "- https://docs.aws.amazon.com/sagemaker/latest/dg/large-model-inference-dlc.html"
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "execution_count": null,
512 | "id": "80136e12-0108-49ce-9689-736fdd4a5602",
513 | "metadata": {
514 | "tags": []
515 | },
516 | "outputs": [],
517 | "source": [
518 | "from IPython.display import display, HTML\n",
519 | "def make_console_link(region, endpoint_name, task='[SageMaker LLM Serving]'):\n",
520 | " endpoint_link = f' {task} Check Endpoint Status' \n",
521 | " return endpoint_link\n",
522 | "\n",
523 | "endpoint_link = make_console_link(region, endpoint_name)\n",
524 | "display(HTML(endpoint_link))"
525 | ]
526 | },
527 | {
528 | "cell_type": "code",
529 | "execution_count": null,
530 | "id": "b97b0056-5d95-441f-9c5c-1ec1908008c3",
531 | "metadata": {
532 | "tags": []
533 | },
534 | "outputs": [],
535 | "source": [
536 | "%%time \n",
537 | "from inference_lib import describe_endpoint, Prompter\n",
538 | "describe_endpoint(endpoint_name) "
539 | ]
540 | },
541 | {
542 | "cell_type": "markdown",
543 | "id": "bf69e880-441f-487c-b458-8bb12df5e0bd",
544 | "metadata": {},
545 | "source": [
546 | "
\n",
547 | "\n",
548 | "## 4. Inference\n",
549 | "---\n",
550 | "\n",
551 | "엔드포인트를 호출할 때 이 텍스트를 JSON 페이로드 내에 제공해야 합니다. 이 JSON 페이로드에는 length, sampling strategy, output token sequence restrictions을 제어하는 데 도움이 되는 원하는 추론 매개변수가 포함될 수 있습니다. 허깅페이스 트랜스포머 transformers 라이브러리에는 [사용 가능한 페이로드 매개변수](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig)의 전체 목록이 정의되어 있지만, 중요한 페이로드 매개변수는 다음과 같이 정의되어 있습니다:\n",
552 | "\n",
553 | "* **do_sample (`bool`)** – logits sampling 활성화\n",
554 | "* **max_new_tokens (`int`)** – 생성된 토큰의 최대 수\n",
555 | "* **best_of (`int`)** – best_of 개의 시퀀스를 생성하고 가장 높은 토큰 로그 확률이 있는 경우 반환\n",
556 | "* **repetition_penalty (`float`)** – 반복 패널티에 대한 파라미터, 1.0은 패널티가 없음을 의미하여 Greedy 서치와 동일, 커질수록 다양한 결과를 얻을 수 있으며, 자세한 사항은 [this paper](https://arxiv.org/pdf/1909.05858.pdf)을 참고\n",
557 | "* **return_full_text (`bool`)** – 생성된 텍스트에 프롬프트를 추가할지 여부\n",
558 | "* **seed (`int`)** – Random sampling seed\n",
559 | "* **stop_sequences (`List[str]`)** – `stop_sequences` 가 생성되면 토큰 생성을 중지\n",
560 | "* **temperature (`float`)** – logits 분포 모듈화에 사용되는 값\n",
561 | "* **top_k (`int`)** – 상위 K개 만큼 가장 높은 확률 어휘 토큰의 수\n",
562 | "* **top_p (`float`)** – 1 보다 작게 설정하게 되며, 상위부터 정렬했을 때 가능한 토큰들의 확률을 합산하여 `top_p` 이상의 가장 작은 집합을 유지\n",
563 | "* **truncate (`int`)** – 입력 토큰을 지정된 크기로 잘라냄\n",
564 | "* **typical_p (`float`)** – typical Decoding 양으로, 자세한 사항은 [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666)을 참고\n",
565 | "* **watermark (`bool`)** – [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)가 Watermarking\n",
566 | "* **decoder_input_details (`bool`)** – decoder input token logprobs와 ids를 반환"
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "execution_count": null,
572 | "id": "ebbb0f5e-3bbd-43d1-b482-9310cc2095f9",
573 | "metadata": {
574 | "tags": []
575 | },
576 | "outputs": [],
577 | "source": [
578 | "params = {\n",
579 | " \"do_sample\": True,\n",
580 | " \"max_new_tokens\": 128,\n",
581 | " \"temperature\": 0.7,\n",
582 | " \"top_p\": 0.9,\n",
583 | " \"return_full_text\": False,\n",
584 | " \"repetition_penalty\": 1.1,\n",
585 | " \"presence_penalty\": None,\n",
586 | " \"eos_token_id\": 2,\n",
587 | "}"
588 | ]
589 | },
590 | {
591 | "cell_type": "code",
592 | "execution_count": null,
593 | "id": "40280725-bfbc-4057-9d31-33aec765d606",
594 | "metadata": {
595 | "tags": []
596 | },
597 | "outputs": [],
598 | "source": [
599 | "import json\n",
600 | "from inference_lib import KoLLMSageMakerEndpoint, Prompter\n",
601 | "ep = KoLLMSageMakerEndpoint(endpoint_name)"
602 | ]
603 | },
604 | {
605 | "cell_type": "code",
606 | "execution_count": null,
607 | "id": "3450e7d7-9c04-44f6-9ca6-d756fc9d3757",
608 | "metadata": {
609 | "tags": []
610 | },
611 | "outputs": [],
612 | "source": [
613 | "\n",
614 | "instruction = \"다음 글을 알기 쉽게 요약해 주세요.\"\n",
615 | "context = \"\"\"\n",
616 | "대규모 언어 모델(LLM; Large Language Models) 분야의 발전과 LLM이 가치 있는 인사이트를 제공하는 문제 세트 수가 계속 증가하고 있다는 소식을 들어보셨을 겁니다. \n",
617 | "대규모 데이터 세트와 여러 작업을 통해 훈련된 대규모 모델은 훈련되지 않은 특정 작업에도 잘 일반화됩니다. 이러한 모델을 파운데이션 모델(foundation model)이라고 하며, 스탠포드 HAI 연구소(Stanford Institute for Human-Centered Artificial Intelligence)에서 처음 대중화한 용어입니다. \n",
618 | "이러한 파운데이션 모델은 프롬프트 엔지니어링(prompt engineering) 기법의 도움으로 잘 활용할 수 있지만, 유스케이스가 도메인에 매우 특화되어 있거나 작업의 종류가 매우 다양하여 모델을 추가로 커스터마이징해야 하는 경우가 많습니다. \n",
619 | "특정 도메인이나 작업에 대한 대규모 모델의 성능을 개선하기 위한 한 가지 접근 방식은 더 작은 작업별 데이터 세트로 모델을 추가로 훈련하는 것입니다. 파인 튜닝(fine-tuning)으로 알려진 이 접근 방식은 LLM의 정확도를 성공적으로 개선하지만, 모든 모델 가중치를 수정해야 합니다. \n",
620 | "파인 튜닝 데이터 세트 크기가 훨씬 작기 때문에 사전 훈련(pre-training)하는 것 보다 훨씬 빠르지만 여전히 상당한 컴퓨팅 성능과 메모리가 필요합니다. \n",
621 | "파인 튜닝은 원본 모델의 모든 파라미터 가중치를 수정하므로 비용이 많이 들고 원본 모델과 동일한 크기의 모델을 생성하므로 스토리지 용량에도 부담이 됩니다.\n",
622 | "\"\"\"\n",
623 | "payload = ep.get_payload(instruction, context, params)"
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": null,
629 | "id": "1d0706d2-2918-4e4b-8536-ed90134c5d22",
630 | "metadata": {
631 | "tags": []
632 | },
633 | "outputs": [],
634 | "source": [
635 | "%%time\n",
636 | "generated_text = ep.infer(payload, verbose=True)"
637 | ]
638 | },
639 | {
640 | "cell_type": "markdown",
641 | "id": "1a9a65c8-ec81-4e35-afe0-69e0862c2ac0",
642 | "metadata": {},
643 | "source": [
644 | "
\n",
645 | "\n",
646 | "## 5. Clean Up\n",
647 | "---"
648 | ]
649 | },
650 | {
651 | "cell_type": "code",
652 | "execution_count": null,
653 | "id": "c3dc8bbf-fd0f-4a76-8c39-068fd77e56d0",
654 | "metadata": {
655 | "tags": []
656 | },
657 | "outputs": [],
658 | "source": [
659 | "!rm -rf {local_peft_model_dir} {local_model_dir} serving_src"
660 | ]
661 | },
662 | {
663 | "cell_type": "code",
664 | "execution_count": null,
665 | "id": "74de01d5-8b2c-43da-bb66-3063a7289cf4",
666 | "metadata": {
667 | "tags": []
668 | },
669 | "outputs": [],
670 | "source": [
671 | "# - Delete the end point\n",
672 | "sm_client.delete_endpoint(EndpointName=endpoint_name)\n",
673 | "# - In case the end point failed we still want to delete the model\n",
674 | "sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)\n",
675 | "sm_client.delete_model(ModelName=model_name)"
676 | ]
677 | }
678 | ],
679 | "metadata": {
680 | "kernelspec": {
681 | "display_name": "conda_pytorch_p310",
682 | "language": "python",
683 | "name": "conda_pytorch_p310"
684 | },
685 | "language_info": {
686 | "codemirror_mode": {
687 | "name": "ipython",
688 | "version": 3
689 | },
690 | "file_extension": ".py",
691 | "mimetype": "text/x-python",
692 | "name": "python",
693 | "nbconvert_exporter": "python",
694 | "pygments_lexer": "ipython3",
695 | "version": "3.10.13"
696 | }
697 | },
698 | "nbformat": 4,
699 | "nbformat_minor": 5
700 | }
701 |
--------------------------------------------------------------------------------
/PEFT/readme.md:
--------------------------------------------------------------------------------
1 | # QLORA: Efficient Finetuning of Quantized LLMs
2 | "QLORA"는 대규모 언어 모델(Large Language Models, LLMs)을 효율적으로 파인튜닝(finetuning)하는 새로운 방법을 제시합니다. 이 방법은 단일 48GB GPU에서 65B 파라미터 모델을 파인튜닝할 수 있을 만큼 메모리 사용을 크게 줄입니다. QL O RA는 4비트로 양자화된 사전 훈련된 언어 모델을 통해 그래디언트를 역전파하여 Low Rank Adapters (LoRA)를 파인튜닝합니다. Amazon SageMaker에서는 Huggingface의 모델을 이용하여 QLORA 파인튜닝을 쉽게 할수 있습니다
3 |
4 | Paper: https://arxiv.org/pdf/2305.14314.pdf
5 |
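As a rough illustration of that recipe, here is a minimal sketch (hypothetical code assembled from the configuration used later in `2_local-train-debug-lora.ipynb`, not a file shipped in this folder): the frozen base model is loaded in 4-bit NF4 via bitsandbytes, and only the LoRA adapter weights are trained.

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 4-bit NormalFloat (NF4) quantization with double quantization -- the "Q" in QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The quantized base model stays frozen; gradients are backpropagated through it
model = AutoModelForCausalLM.from_pretrained(
    "nlpai-lab/kullm-polyglot-12.8b-v2",
    quantization_config=bnb_config,
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)

# Low-rank adapters on the attention projection are the only trainable parameters
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["query_key_value"],
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # roughly 0.05% of the 12.9B parameters are trainable
```

After training, the adapter can be merged back into the base weights (`merge_and_unload()`, as done before serving in `4_sm-serving-djl.ipynb`), so the endpoint does not need the PEFT runtime.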
6 | > **Note:** This code was implemented and verified on a SageMaker Notebook Instance, because it relies on packages that require Java (such as konlpy) and on steps that run PEFT fine-tuning on a local GPU instance.
7 | >
8 | > **Tested instance specs:**
9 | > - Type: ml.g5.2xlarge (1x A10G) to ml.g5.12xlarge (4x A10G)
10 | > - EBS: 100 GB or more
11 | >
12 | > **Execution order:**
13 | > 1. Training data preprocessing
14 | > - `1_prepare-dataset-alpaca-method.ipynb`
15 | > - `1_prepare-dataset-chunk-method.ipynb`
16 | >
17 | > 2. Local QLoRA PEFT training
18 | > - `2_local-train-debug-lora.ipynb`
19 | > - Verified on ml.g5.xlarge.
20 | >
21 | > 3. Load and test the locally QLoRA PEFT-tuned LLM
22 | > - `2_local-infer-debug-lora.ipynb`
23 | > - Verified on ml.g5.xlarge.
24 | >
25 | > 4. SageMaker training job - QLoRA PEFT
26 | > - `3_sm-train-lora.ipynb`
27 | > - The code is configured to train on an ml.g5.12xlarge instance.
28 | >
29 | > 5. SageMaker Endpoint API inference deployment - QLoRA PEFT
30 | > - `4_sm-serving-djl.ipynb`
31 | > - The code runs inference with the base parameters after merging in the QLoRA-trained parameters.
32 | > - Requires ml.g5.12xlarge or larger (4x A10G needed).
33 |
34 | ## 1. Introduction
35 | Fine-tuning large language models (LLMs) is a very effective way to improve their performance, but fine-tuning very large models is expensive and requires a lot of GPU memory. For example, regular 16-bit fine-tuning of a 65B-parameter LLaMA model requires more than 780GB of GPU memory. QLoRA was proposed to address this problem: it fine-tunes a model quantized to 4 bits while greatly reducing memory usage without degrading performance. This makes it possible to fine-tune large models even on a single GPU, greatly improving the accessibility of such models.
36 |
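As a back-of-the-envelope check on that figure (my estimate, not a number quoted from the paper's text): standard 16-bit fine-tuning with Adam holds fp16 weights (2 bytes/parameter), fp16 gradients (2 bytes/parameter), and roughly 8 bytes/parameter of fp32 optimizer state, so

$$65\text{B} \times (2 + 2 + 8)\ \text{bytes/parameter} \approx 780\ \text{GB},$$

before any activation memory is counted.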
37 |
38 | ## 2. Background for Understanding QLoRA
39 |
40 | ### 2-a. Block-wise k-bit Quantization
41 |
42 | #### Concept
43 |
44 | - **Quantization**: The process of converting input data into a representation that carries less information, typically by converting a data type with more bits into one with fewer bits. For example, a 32-bit floating-point value can be converted to a 4-bit integer.
45 |
46 |
51 |
52 |
53 | #### Basic Method
54 |
55 | - **Normalization**: The input data type is normalized into the range of the target data type so that the full range of the low-bit data type is used. This is done by normalizing by the absolute maximum of the input elements.
56 | - **Quantization Constant**: This constant is used during the quantization process and is usually denoted $c$.
57 | - For example, quantizing a 32-bit floating point (FP32) tensor into an Int8 tensor with range $[−127, 127]$:
58 |
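A worked form of this absmax example (my notation, following Eq. 1 of the QLoRA paper rather than anything specific to this repository):

$$\mathbf{X}^{\text{Int8}} = \operatorname{round}\!\left(\frac{127}{\operatorname{absmax}(\mathbf{X}^{\text{FP32}})}\,\mathbf{X}^{\text{FP32}}\right) = \operatorname{round}\!\left(c^{\text{FP32}} \cdot \mathbf{X}^{\text{FP32}}\right), \qquad \operatorname{dequant}\!\left(c^{\text{FP32}}, \mathbf{X}^{\text{Int8}}\right) = \frac{\mathbf{X}^{\text{Int8}}}{c^{\text{FP32}}} \approx \mathbf{X}^{\text{FP32}}$$

where $c^{\text{FP32}} = 127 / \operatorname{absmax}(\mathbf{X}^{\text{FP32}})$ is the quantization constant described above. Block-wise quantization computes one such constant per block of the input tensor, so a single outlier only degrades the precision of its own block.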
59 |
61 |
81 |
115 |
170 |
199 |
215 |
--------------------------------------------------------------------------------
/RAG-SageMaker/rag-fsi-data-workshop/TASK-2-optional_Polyglot_12.8B_Korea_LLM_Model_Creation.ipynb:
--------------------------------------------------------------------------------
23 |
54 |
67 | " \n",
68 | "\n",
69 | "## 1. Download LLM model and upload it to S3\n",
70 | "---"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 4,
76 | "id": "8a104756-a389-403c-bb3a-b512ed6b948d",
77 | "metadata": {
78 | "tags": []
79 | },
80 | "outputs": [],
81 | "source": [
82 | "from huggingface_hub import snapshot_download\n",
83 | "from pathlib import Path\n",
84 | "\n",
85 | "model_id = \"nlpai-lab/kullm-polyglot-12.8b-v2\"\n",
86 | "model_prefix = model_id.split('/')[-1].replace('.', '-')\n",
87 | "\n",
88 | "s3_code_prefix = f\"ko-llm/{model_prefix}/code\" # folder within bucket where code artifact will go\n",
89 | "s3_model_prefix = f\"ko-llm/{model_prefix}/model\" # folder where model checkpoint will go"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "id": "99fca204-ff77-49c5-bc76-b5383c8ded4e",
95 | "metadata": {},
96 | "source": [
97 | "
\n",
98 | "\n",
99 | "## 2. Model Serving Scripts\n",
100 | "---\n",
101 | "### Create `serving.properties`\n",
102 | "\n",
103 | "이 설정 파일은 어떤 추론 최적화 라이브러리를 사용할지, 어떤 설정을 사용할지 DJL Serving에 알려주는 설정 파일입니다. 필요에 따라 적절한 구성을 설정할 수 있습니다.\n",
104 | "\n",
105 | "모델이 레이어에 따라 분할되는 파이프라인 병렬화(Pipeline Parallelism)를 사용하는 허깅페이스 Accelerate와 달리, DeepSpeed는 각 레이어(텐서)가 여러 디바이스에 걸쳐 샤딩되는 텐서 병렬화(Tensor Parallelism)를 사용합니다. 파이프라인 병렬 처리 접근 방식에서는 데이터가 각 GPU 장치를 통해 순차적으로 흐르지만, 텐서 병렬 처리는 데이터가 모든 GPU 장치로 전송되어 각 GPU에서 부분적인 결과가 계산됩니다. 그런 다음 All-Gather 연산을 통해 부분 결과를 수집하여 최종 결과를 계산합니다. 따라서, 텐서 병렬화가 일반적으로 더 높은 GPU 활용률과 더 나은 성능을 제공합니다.\n",
106 | "\n",
107 | "- `option.s3url` - 모델 파일의 위치를 지정합니다. 또는`option.model_id` 옵션을 대신 사용하여 허깅페이스 허브에서 모델을 지정할 수 있습니다(예: EleutherAI/gpt-j-6B). 그러면 허브에서 모델이 자동으로 다운로드됩니다. s3url 접근 방식은 자체 환경 내에서 모델 아티팩트를 호스팅할 수 있고 DJL 추론 컨테이너 내에서 최적화된 접근 방식을 활용하여 S3에서 호스팅 인스턴스로 모델을 전송함으로써 더 빠른 모델 배포가 가능합니다.\n",
108 | "\n",
109 | "`serving.properties`의 일반적인 설정법과 자세한 내용은 https://docs.aws.amazon.com/sagemaker/latest/dg/large-model-inference-configuration.html 를 참조하세요.\n",
110 | "\n",
111 | ""
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 5,
117 | "id": "7e960108-f6bf-4b8d-9ea6-78b6ca3c6915",
118 | "metadata": {
119 | "tags": []
120 | },
121 | "outputs": [],
122 | "source": [
123 | "src_path = f\"src/{model_prefix}\"\n",
124 | "!rm -rf {src_path}\n",
125 | "os.makedirs(src_path, exist_ok=True)"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 6,
131 | "id": "5beab7e1-205c-47b1-b27a-0120614caec2",
132 | "metadata": {
133 | "tags": []
134 | },
135 | "outputs": [
136 | {
137 | "name": "stdout",
138 | "output_type": "stream",
139 | "text": [
140 | "Writing src/kullm-polyglot-12-8b-v2/serving.properties\n"
141 | ]
142 | }
143 | ],
144 | "source": [
145 | "%%writefile {src_path}/serving.properties\n",
146 | "\n",
147 | "engine=DeepSpeed\n",
148 | "\n",
149 | "# passing extra options to model.py or built-in handler\n",
150 | "job_queue_size=100\n",
151 | "batch_size=1\n",
152 | "max_batch_delay=1\n",
153 | "max_idle_time=60\n",
154 | "\n",
155 | "# Built-in entrypoint\n",
156 | "#option.entryPoint=djl_python.deepspeed\n",
157 | "\n",
158 | "# Hugging Face model id\n",
159 | "#option.model_id={{model_id}}\n",
160 | "\n",
161 | "# defines custom environment variables\n",
162 | "#env=SERVING_NUMBER_OF_NETTY_THREADS=2\n",
163 | "\n",
164 | "# Allows to load DeepSpeed workers in parallel\n",
165 | "option.parallel_loading=true\n",
166 | "\n",
167 | "# specify tensor parallel degree (number of partitions)\n",
168 | "option.tensor_parallel_degree=4\n",
169 | "\n",
170 | "# specify per model timeout\n",
171 | "option.model_loading_timeout=600\n",
172 | "#option.predict_timeout=240\n",
173 | "\n",
174 | "# mark the model as failure after python process crashing 10 times\n",
175 | "retry_threshold=0\n",
176 | "\n",
177 | "option.task=text-generation"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "id": "76217c91-0a17-4895-a400-e8bd9889f00d",
183 | "metadata": {},
184 | "source": [
185 | "### Create model.py with custom inference code\n",
186 | "빌트인 추론 코드로 no-code로 배포할 수도 있지만, 커스텀 추론 코드를 작성하는 것도 가능합니다."
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 7,
192 | "id": "ff9c01fd-d6cc-4dfc-9dfe-8a63ebd8cf60",
193 | "metadata": {
194 | "tags": []
195 | },
196 | "outputs": [
197 | {
198 | "name": "stdout",
199 | "output_type": "stream",
200 | "text": [
201 | "Writing src/kullm-polyglot-12-8b-v2/model.py\n"
202 | ]
203 | }
204 | ],
205 | "source": [
206 | "%%writefile {src_path}/model.py\n",
207 | "from djl_python import Input, Output\n",
208 | "import os\n",
209 | "import deepspeed\n",
210 | "import torch\n",
211 | "import logging\n",
212 | "from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer\n",
213 | "from transformers import GPTNeoXLayer\n",
214 | "\n",
215 | "predictor = None\n",
216 | "\n",
217 | "def get_model(properties):\n",
218 | " \n",
219 | " tp_degree = properties[\"tensor_parallel_degree\"]\n",
220 | " # model_location = properties[\"model_dir\"]\n",
221 | " # if \"model_id\" in properties:\n",
222 | " # model_location = properties[\"model_id\"]\n",
223 | " model_location = \"nlpai-lab/kullm-polyglot-12.8b-v2\" \n",
224 | " task = properties[\"task\"]\n",
225 | " \n",
226 | " logging.info(f\"Loading model in {model_location}\") \n",
227 | " local_rank = int(os.getenv(\"LOCAL_RANK\", \"0\"))\n",
228 | "\n",
229 | " tokenizer = AutoTokenizer.from_pretrained(model_location)\n",
230 | "\n",
231 | " model = AutoModelForCausalLM.from_pretrained(\n",
232 | " model_location,\n",
233 | " torch_dtype=torch.float16,\n",
234 | " low_cpu_mem_usage=True,\n",
235 | " )\n",
236 | " \n",
237 | " model.requires_grad_(False)\n",
238 | " model.eval()\n",
239 | " \n",
240 | " ds_config = {\n",
241 | " \"tensor_parallel\": {\"tp_size\": tp_degree},\n",
242 | " \"dtype\": model.dtype,\n",
243 | " \"injection_policy\": {\n",
244 | " GPTNeoXLayer:('attention.dense', 'mlp.dense_4h_to_h')\n",
245 | " }\n",
246 | " }\n",
247 | " logging.info(f\"Starting DeepSpeed init with TP={tp_degree}\") \n",
248 | " model = deepspeed.init_inference(model, ds_config) \n",
249 | " \n",
250 | " generator = pipeline(\n",
251 | " task=task, model=model, tokenizer=tokenizer, device=local_rank\n",
252 | " )\n",
253 | " # https://huggingface.co/docs/hub/models-tasks\n",
254 | " return generator\n",
255 | " \n",
256 | "def handle(inputs: Input) -> None:\n",
257 | " \"\"\"\n",
258 | " inputs: Contains the configurations from serving.properties\n",
259 | " \"\"\" \n",
260 | " global predictor\n",
261 | " if not predictor:\n",
262 | " predictor = get_model(inputs.get_properties())\n",
263 | "\n",
264 | " if inputs.is_empty():\n",
265 | " # Model server makes an empty call to warmup the model on startup\n",
266 | " logging.info(\"is_empty\")\n",
267 | " return None\n",
268 | "\n",
269 | " data = inputs.get_as_json() #inputs.get_as_string()\n",
270 | " logging.info(\"data:\", data)\n",
271 | " \n",
272 | " input_prompt, params = data[\"inputs\"], data[\"parameters\"]\n",
273 | " result = predictor(input_prompt, **params)\n",
274 | " logging.info(\"result:\", result)\n",
275 | "\n",
276 | " return Output().add_as_json(result) #Output().add(result)"
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "id": "a063b634-3a4f-49ec-8779-fdda2818326e",
282 | "metadata": {},
283 | "source": [
284 | "### Create the Tarball and then upload to S3"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": 8,
290 | "id": "90fa1063-839a-46be-9ca4-f6b9c14efbc7",
291 | "metadata": {
292 | "tags": []
293 | },
294 | "outputs": [
295 | {
296 | "name": "stdout",
297 | "output_type": "stream",
298 | "text": [
299 | "./\n",
300 | "./serving.properties\n",
301 | "./model.py\n"
302 | ]
303 | }
304 | ],
305 | "source": [
306 | "!rm -rf model.tar.gz\n",
307 | "!tar czvf model.tar.gz -C {src_path} ."
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 9,
313 | "id": "1d60fd8a-8a0e-497c-8628-98eb26772cd9",
314 | "metadata": {
315 | "tags": []
316 | },
317 | "outputs": [
318 | {
319 | "name": "stdout",
320 | "output_type": "stream",
321 | "text": [
322 | "S3 Code or Model tar ball uploaded to --- > s3://sagemaker-us-east-1-143656149352/ko-llm/kullm-polyglot-12-8b-v2/code/model.tar.gz\n"
323 | ]
324 | }
325 | ],
326 | "source": [
327 | "s3_code_artifact = sess.upload_data(\"model.tar.gz\", bucket, s3_code_prefix)\n",
328 | "print(f\"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}\")\n",
329 | "!rm -rf model.tar.gz"
330 | ]
331 | },
332 | {
333 | "cell_type": "markdown",
334 | "id": "57ea2358-3ba1-4ff3-9ca4-df772b59770d",
335 | "metadata": {},
336 | "source": [
337 | "
\n",
338 | "\n",
339 | "## 3. Serve LLM Model on SageMaker\n",
340 | "\n",
341 | "---"
342 | ]
343 | },
344 | {
345 | "cell_type": "markdown",
346 | "id": "7820b1c3-7854-433d-bbb6-03193abefa22",
347 | "metadata": {},
348 | "source": [
349 | "### Create SageMaker Model\n",
350 | "\n",
351 | "SageMaker 엔드포인트 생성 매개변수 VolumeSizeInGB를 지정할 때 마운트되는 Amazon EBS(Amazon Elastic Block Store) 볼륨에 /tmp를 매핑하기 때문에 컨테이너는 인스턴스의 `/tmp` 공간에 모델을 다운로드합니다. 이때 s5cmd (https://github.com/peak/s5cmd) 를 활용하므로 대용량 모델을 빠르게 다운로드할 수 있습니다.\n",
352 | "볼륨 인스턴스와 함께 미리 빌드되어 제공되는 p4dn과 같은 인스턴스의 경우 컨테이너의 `/tmp`를 계속 활용할 수 있습니다. "
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": 10,
358 | "id": "598b6ded-ba9c-4f25-b862-090546607b98",
359 | "metadata": {
360 | "tags": []
361 | },
362 | "outputs": [
363 | {
364 | "name": "stdout",
365 | "output_type": "stream",
366 | "text": [
367 | "kullm-polyglot-12-8b-v2-2023-07-23-14-04-08-969\n",
368 | "Created Model: arn:aws:sagemaker:us-east-1:143656149352:model/kullm-polyglot-12-8b-v2-2023-07-23-14-04-08-969\n"
369 | ]
370 | }
371 | ],
372 | "source": [
373 | "from sagemaker.utils import name_from_base\n",
374 | "from sagemaker import image_uris\n",
375 | "\n",
376 | "img_uri = image_uris.retrieve(framework=\"djl-deepspeed\", region=region, version=\"0.23.0\")\n",
377 | "model_name = name_from_base(f\"{model_prefix}\")\n",
378 | "print(model_name)\n",
379 | "\n",
380 | "model_response = sm_client.create_model(\n",
381 | " ModelName=model_name,\n",
382 | " ExecutionRoleArn=role,\n",
383 | " PrimaryContainer={\"Image\": img_uri, \"ModelDataUrl\": s3_code_artifact},\n",
384 | ")\n",
385 | "model_arn = model_response[\"ModelArn\"]\n",
386 | "print(f\"Created Model: {model_arn}\")"
387 | ]
388 | },
389 | {
390 | "cell_type": "markdown",
391 | "id": "a96783b7-9e6a-4bed-8ff9-c779d9e628e4",
392 | "metadata": {},
393 | "source": [
394 | "### Create SageMaker Endpoint"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 11,
400 | "id": "0d2f670c-57f4-4092-af29-b1416829e9dd",
401 | "metadata": {
402 | "tags": []
403 | },
404 | "outputs": [
405 | {
406 | "name": "stdout",
407 | "output_type": "stream",
408 | "text": [
409 | "Created Endpoint: arn:aws:sagemaker:us-east-1:143656149352:endpoint/kullm-polyglot-12-8b-v2-2023-07-23-14-04-08-969-endpoint\n"
410 | ]
411 | }
412 | ],
413 | "source": [
414 | "endpoint_config_name = f\"{model_name}-config\"\n",
415 | "endpoint_name = f\"{model_name}-endpoint\"\n",
416 | "variant_name = \"variant1\"\n",
417 | "instance_type = \"ml.g5.12xlarge\"\n",
418 | "initial_instance_count = 1\n",
419 | "\n",
420 | "prod_variants = [\n",
421 | " {\n",
422 | " \"VariantName\": \"variant1\",\n",
423 | " \"ModelName\": model_name,\n",
424 | " \"InstanceType\": instance_type,\n",
425 | " \"InitialInstanceCount\": initial_instance_count,\n",
426 | " # \"ModelDataDownloadTimeoutInSeconds\": 2400,\n",
427 | " \"ContainerStartupHealthCheckTimeoutInSeconds\": 1600,\n",
428 | " }\n",
429 | "]\n",
430 | "\n",
431 | "endpoint_config_response = sm_client.create_endpoint_config(\n",
432 | " EndpointConfigName=endpoint_config_name,\n",
433 | " ProductionVariants=prod_variants\n",
434 | ")\n",
435 | "\n",
436 | "endpoint_response = sm_client.create_endpoint(\n",
437 | " EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name\n",
438 | ")\n",
439 | "print(f\"Created Endpoint: {endpoint_response['EndpointArn']}\")"
440 | ]
441 | },
442 | {
443 | "cell_type": "markdown",
444 | "id": "0bf78f45-b06e-431c-9048-3ade776cac07",
445 | "metadata": {},
446 | "source": [
447 | "엔드포인트가 생성되는 동안 아래의 문서를 같이 확인해 보세요.\n",
448 | "- https://docs.aws.amazon.com/sagemaker/latest/dg/large-model-inference-dlc.html"
449 | ]
450 | },
451 | {
452 | "cell_type": "code",
453 | "execution_count": 12,
454 | "id": "6122a3f8-78b6-42b9-b390-af8942d8e30c",
455 | "metadata": {
456 | "tags": []
457 | },
458 | "outputs": [
459 | {
460 | "data": {
461 | "text/html": [
462 | " [SageMaker LLM Serving] Check Endpoint Status"
463 | ],
464 | "text/plain": [
465 | "
\n",
527 | "\n",
528 | "## 4. Inference\n",
529 | "---"
530 | ]
531 | },
532 | {
533 | "cell_type": "markdown",
534 | "id": "cd435259-7952-4a76-b4ee-3360da5dd7c5",
535 | "metadata": {},
536 | "source": [
537 | "엔드포인트를 호출할 때 이 텍스트를 JSON 페이로드 내에 제공해야 합니다. 이 JSON 페이로드에는 length, sampling strategy, output token sequence restrictions을 제어하는 데 도움이 되는 원하는 추론 매개변수가 포함될 수 있습니다. 허깅페이스 트랜스포머 transformers 라이브러리에는 [사용 가능한 페이로드 매개변수](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig)의 전체 목록이 정의되어 있지만, 중요한 페이로드 매개변수는 다음과 같이 정의되어 있습니다:\n",
538 | "\n",
539 | "* **do_sample (`bool`)** – logits sampling 활성화\n",
540 | "* **max_new_tokens (`int`)** – 생성된 토큰의 최대 수\n",
541 | "* **best_of (`int`)** – best_of 개의 시퀀스를 생성하고 가장 높은 토큰 로그 확률이 있는 경우 반환\n",
542 | "* **repetition_penalty (`float`)** – 반복 패널티에 대한 파라미터, 1.0은 패널티가 없음을 의미하여 Greedy 서치와 동일, 커질수록 다양한 결과를 얻을 수 있으며, 자세한 사항은 [this paper](https://arxiv.org/pdf/1909.05858.pdf)을 참고\n",
543 | "* **return_full_text (`bool`)** – 생성된 텍스트에 프롬프트를 추가할지 여부\n",
544 | "* **seed (`int`)** – Random sampling seed\n",
545 | "* **stop_sequences (`List[str]`)** – `stop_sequences` 가 생성되면 토큰 생성을 중지\n",
546 | "* **temperature (`float`)** – logits 분포 모듈화에 사용되는 값\n",
547 | "* **top_k (`int`)** – 상위 K개 만큼 가장 높은 확률 어휘 토큰의 수\n",
548 | "* **top_p (`float`)** – 1 보다 작게 설정하게 되며, 상위부터 정렬했을 때 가능한 토큰들의 확률을 합산하여 `top_p` 이상의 가장 작은 집합을 유지\n",
549 | "* **truncate (`int`)** – 입력 토큰을 지정된 크기로 잘라냄\n",
550 | "* **typical_p (`float`)** – typical Decoding 양으로, 자세한 사항은 [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666)을 참고\n",
551 | "* **watermark (`bool`)** – [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)가 Watermarking\n",
552 | "* **decoder_input_details (`bool`)** – decoder input token logprobs와 ids를 반환"
553 | ]
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": null,
558 | "id": "12185205-fc49-4716-aacd-c1017a8541a0",
559 | "metadata": {
560 | "tags": []
561 | },
562 | "outputs": [],
563 | "source": [
564 | "params = {\n",
565 | " \"do_sample\": False,\n",
566 | " \"max_new_tokens\": 128,\n",
567 | " \"temperature\": 0.4,\n",
568 | " \"top_p\": 0.9,\n",
569 | " \"return_full_text\": False,\n",
570 | " \"repetition_penalty\": 1.1,\n",
571 | " \"presence_penalty\": None,\n",
572 | " \"eos_token_id\": 2,\n",
573 | "}"
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": 14,
579 | "id": "922d07d1-7bc6-457d-b18e-9bb4831e4f02",
580 | "metadata": {
581 | "tags": []
582 | },
583 | "outputs": [],
584 | "source": [
585 | "import json\n",
586 | "from inference_utils import KoLLMSageMakerEndpoint\n",
587 | "pred = KoLLMSageMakerEndpoint(endpoint_name)"
588 | ]
589 | },
590 | {
591 | "cell_type": "code",
592 | "execution_count": null,
593 | "id": "e4c4579f-7ef7-40d5-a943-ae8dc137b8bd",
594 | "metadata": {
595 | "tags": []
596 | },
597 | "outputs": [],
598 | "source": [
599 | "instruction = \"다음 글을 요약해 주세요.\"\n",
600 | "context = \"\"\"\n",
601 | "엔터프라이즈 환경에서 생성 AI와 대규모 언어 모델(LLM; Large Language Models)의 가장 일반적인 유스케이스 중 하나는 기업의 지식 코퍼스를 기반으로 질문에 답변하는 것입니다. Amazon Lex는 AI 기반 챗봇을 구축하기 위한 프레임워크를 제공합니다. 사전 훈련된 파운데이션 모델(FM; Foundation Models)은 다양한 주제에 대한 요약, 텍스트 생성, 질문 답변과 같은 자연어 이해(NLU; Natural Language Understanding) 작업은 잘 수행하지만, 훈련 데이터의 일부로 보지 못한 콘텐츠에 대한 질문에는 정확한(오답 없이) 답변을 제공하는 데 어려움을 겪거나 완전히 실패합니다. 또한 FM은 특정 시점의 데이터 스냅샷으로 훈련하기에 추론 시점에 새로운 데이터에 액세스할 수 있는 고유한 기능이 없기에 잠재적으로 부정확하거나 부적절한 답변을 제공할 수 있습니다.\n",
602 | "\n",
603 | "이 문제를 해결하기 위해 흔히 사용되는 접근 방식은 검색 증강 생성(RAG; Retrieval Augmented Generation)이라는 기법을 사용하는 것입니다. RAG 기반 접근 방식에서는 LLM을 사용하여 사용자 질문을 벡터 임베딩으로 변환한 다음, 엔터프라이즈 지식 코퍼스에 대한 임베딩이 미리 채워진 벡터 데이터베이스에서 이러한 임베딩에 대한 유사성 검색을 수행합니다. 소수의 유사한 문서(일반적으로 3개)가 사용자 질문과 함께 다른 LLM에 제공된 ‘프롬프트’에 컨텍스트로 추가되고, 해당 LLM은 프롬프트에 컨텍스트로 제공된 정보를 사용하여 사용자 질문에 대한 답변을 생성합니다. RAG 모델은 매개변수 메모리(parametric memory)는 사전 훈련된 seq2seq 모델이고 비매개변수 메모리(non-parametric memory)는 사전 훈련된 신경망 검색기로 액세스되는 위키백과의 고밀도 벡터 색인 모델로 2020년에 Lewis 등이 도입했습니다. RAG 기반 접근 방식의 전반적 구조를 이해하려면 Question answering using Retrieval Augmented Generation with foundation models in Amazon SageMaker JumpStart 블로그를 참조하기 바랍니다.\n",
604 | "\"\"\"\n",
605 | "payload = pred.get_payload(instruction, context, params)"
606 | ]
607 | },
608 | {
609 | "cell_type": "code",
610 | "execution_count": null,
611 | "id": "62f94f9b-9b69-408d-bec5-68a195879b00",
612 | "metadata": {},
613 | "outputs": [],
614 | "source": [
615 | "%%time\n",
616 | "generated_text = pred.infer(payload, verbose=True)"
617 | ]
618 | },
619 | {
620 | "cell_type": "code",
621 | "execution_count": null,
622 | "id": "6d6bcdb9-b73d-468f-8606-6bf4b2f90a56",
623 | "metadata": {},
624 | "outputs": [],
625 | "source": [
626 | "endpoint_name_text = endpoint_name\n",
627 | "%store endpoint_name_text"
628 | ]
629 | },
630 | {
631 | "cell_type": "markdown",
632 | "id": "625410bd-e2e9-4d57-bb35-bddf5cf20301",
633 | "metadata": {},
634 | "source": [
635 | "
\n",
636 | "\n",
637 | "## 5. Clean Up\n",
638 | "---"
639 | ]
640 | },
641 | {
642 | "cell_type": "code",
643 | "execution_count": null,
644 | "id": "0b5d242e-3b9f-42e4-9ea0-c9596cae540d",
645 | "metadata": {},
646 | "outputs": [],
647 | "source": [
648 | "!rm -rf src"
649 | ]
650 | },
651 | {
652 | "cell_type": "code",
653 | "execution_count": null,
654 | "id": "44680c92-0623-46a0-9d3a-2efa262f9af6",
655 | "metadata": {
656 | "tags": []
657 | },
658 | "outputs": [],
659 | "source": [
660 | "# - Delete the end point\n",
661 | "sm_client.delete_endpoint(EndpointName=endpoint_name)\n",
662 | "# - In case the end point failed we still want to delete the model\n",
663 | "sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)\n",
664 | "sm_client.delete_model(ModelName=model_name)"
665 | ]
666 | },
667 | {
668 | "cell_type": "markdown",
669 | "id": "7feeb821-db0a-4a48-8550-b0146705b8d5",
670 | "metadata": {
671 | "tags": []
672 | },
673 | "source": [
674 | "
\n",
675 | "\n",
676 | "# References\n",
677 | "---\n",
678 | "\n",
679 | "- Model 정보\n",
680 | " - kullm-polyglot-5.8b-v2\n",
681 | " - This model is a parameter-efficient fine-tuned version of EleutherAI/polyglot-ko-5.8b on a KULLM v2\n",
682 | " - https://huggingface.co/nlpai-lab/kullm-polyglot-5.8b-v2 \n",
683 | " - kullm-polyglot-12.8b-v2\n",
684 | " - This model is a fine-tuned version of EleutherAI/polyglot-ko-12.8b on a KULLM v2\n",
685 | " - https://huggingface.co/nlpai-lab/kullm-polyglot-12.8b-v2\n",
686 | " - beomi/KoAlpaca-Polyglot-12.8B\n",
687 | " - This model is a fine-tuned version of EleutherAI/polyglot-ko-12.8b on a KoAlpaca Dataset v1.1b\n",
688 | " - https://huggingface.co/beomi/KoAlpaca-Polyglot-12.8B\n",
689 | " - EleutherAI/polyglot-ko-12.8b\n",
690 | " - Polyglot-Ko-12.8B was trained for 167 billion tokens over 301,000 steps on 256 A100 GPUs with the GPT-NeoX framework. It was trained as an autoregressive language model, using cross-entropy loss to maximize the likelihood of predicting the next token.\n",
691 | " - License: Apache 2.0\n",
692 | " - https://huggingface.co/EleutherAI/polyglot-ko-12.8b \n",
693 | "- 코드\n",
694 | " - [Boto3](https://github.com/aws/amazon-sagemaker-examples/blob/main/advanced_functionality/pytorch_deploy_large_GPT_model/GPT-J-6B-model-parallel-inference-DJL.ipynb)\n",
695 | " - [Python SDK](https://github.com/aws/amazon-sagemaker-examples/blob/main/inference/generativeai/deepspeed/GPT-J-6B_DJLServing_with_PySDK.ipynb)\n",
696 | " - [Kor LLM on SageMaker](https://github.com/gonsoomoon-ml/Kor-LLM-On-SageMaker)\n",
697 | " - [AWS Generative AI Workshop for Korean language](https://github.com/aws-samples/aws-ai-ml-workshop-kr/tree/master/genai)"
698 | ]
699 | }
700 | ],
701 | "metadata": {
702 | "kernelspec": {
703 | "display_name": "conda_pytorch_p310",
704 | "language": "python",
705 | "name": "conda_pytorch_p310"
706 | },
707 | "language_info": {
708 | "codemirror_mode": {
709 | "name": "ipython",
710 | "version": 3
711 | },
712 | "file_extension": ".py",
713 | "mimetype": "text/x-python",
714 | "name": "python",
715 | "nbconvert_exporter": "python",
716 | "pygments_lexer": "ipython3",
717 | "version": "3.10.10"
718 | }
719 | },
720 | "nbformat": 4,
721 | "nbformat_minor": 5
722 | }
723 |
--------------------------------------------------------------------------------
/RAG-SageMaker/rag-fsi-data-workshop/TASK-5_OpenSearch_LLM_RAG_Streamlit_Chatbot_Example.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import sys
3 | import json
4 | import boto3
5 | import numpy as np
6 | from typing import Any, Dict, List, Optional
7 | from langchain.embeddings import SagemakerEndpointEmbeddings
8 | from langchain.llms.sagemaker_endpoint import LLMContentHandler, SagemakerEndpoint
9 | from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
10 | from langchain import PromptTemplate
11 | from langchain.chains.question_answering import load_qa_chain
12 | from streamlit_chat import message
13 | from langchain.indexes import VectorstoreIndexCreator
14 | from langchain.vectorstores import Chroma, AtlasDB, FAISS
15 | from langchain.document_loaders.csv_loader import CSVLoader
16 | from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
17 | import csv
18 | from langchain.vectorstores import OpenSearchVectorSearch
19 | import os
20 | import copy
21 |
22 | import sys
23 | sys.path.append('../utils') # add the shared utils folder to the module search path
24 | from streamlit_util import KoSimCSERobertaContentHandler, KullmContentHandler, SagemakerEndpointEmbeddingsJumpStart
25 |
26 | ##########################################################################################################################################################################
27 | # pip install -r ./requirements.txt in the system terminal
28 | #
29 | #
30 | # Case 1: SageMaker notebook instance
31 | # On a notebook instance, the Streamlit URL replaces everything from `lab` onward with `proxy/8501/`.
32 | ## uri : https://ori-cuda-version.notebook.us-east-1.sagemaker.aws/lab/tree/main.ipynb
33 | ## streamlit : https://ori-cuda-version.notebook.us-east-1.sagemaker.aws/proxy/8501/
34 | #
35 | # Case 2: SageMaker Studio
36 | # In Studio, the Streamlit URL replaces everything from `lab` onward in the domain with `proxy/8501/webapp`.
37 | ## uri > https://d-l2kk7xvxmnbl.studio.us-east-1.sagemaker.aws/jupyter/default/lab?
38 | ## streamlit : https://d-l2kk7xvxmnbl.studio.us-east-1.sagemaker.aws/jupyter/default/proxy/8501/webapp
39 | # Reference : https://aws.amazon.com/ko/blogs/tech/build-a-powerful-question-answering-bot-with-amazon-sagemaker-amazon-opensearch-service-streamlit-and-langchain/
40 | #########################################################################################################################################################################
41 |
42 |
43 | ######## AWS Setting
44 | aws_region = 'us-east-1'
45 | region ='us-east-1'
46 | service ='es'
47 |
48 | ######## For SageMaker
49 | # LLM Endpoint Name :
50 | llm_endpoint_name = 'kullm-polyglot-5-8b-v2-2023-08-23-15-47-39-450-endpoint'
51 | # Embedding Vector Model Endpoint Name :
52 | embvec_endpoint_name= 'KoSimCSE-roberta-2023-08-23-14-07-12'
53 |
54 | ######## For OpenSearch
55 | # Opensearch index name :
56 | index_name = 'fsi-sample'
57 | # Opensearch domain_endpoin name :
58 | opensearch_domain_endpoint = "https://search-ragopensearch-2pz3fgitugmvrz7vbngitqljzu.us-east-1.es.amazonaws.com"
59 | # Opensearch master user auth
60 | username = 'raguser'
61 | password = 'MarsEarth1!'
62 |
63 | #aws_access_key = os.environ['AWS_ACCESS_KEY']
64 | #aws_secret_key =os.environ['AWS_SECRET_KEY']
65 | ##########################################################################################################################################################################
66 | # Number of documents to retrieve (search rank count)
67 | faiss_k = 3
68 | 
69 | # KULLM LLM parameter settings
70 | params = {
71 | 'do_sample': False,
72 | 'max_new_tokens': 512, #128
73 |     'temperature': 1.0,  # 0.5 ~ 1.0, default = 1.0; higher values make the next generated tokens more random
74 | 'top_k': 0,
75 | 'top_p': 0.9,
76 | 'return_full_text': False,
77 | 'repetition_penalty': 1.1,
78 | 'presence_penalty': None,
79 | 'eos_token_id': 2
80 | }
81 | ##########################################################################################################################################################################
82 |
83 |
84 | def load_chain(llm_endpoint_name):
85 |     # Load the KULLM LLM
86 | LLMTextContentHandler = KullmContentHandler()
87 | endpoint_name_text = llm_endpoint_name
88 | seperator = "||SPEPERATOR||"
89 |
90 | llm_text = SagemakerEndpoint(
91 | endpoint_name=endpoint_name_text,
92 | region_name=aws_region,
93 | model_kwargs=params,
94 | content_handler=LLMTextContentHandler,
95 | )
96 | prompt_template = ''.join(["{context}", seperator, "{question}"])
97 |
98 | PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
99 | chain = load_qa_chain(llm=llm_text, chain_type="stuff", prompt=PROMPT, verbose=True)
100 | return chain
101 |
102 | ##################################################################################################
103 | # FAISS VectorStore - OpenSearch
104 | ##################################################################################################
105 | def load_emb_vec(embvec_endpoint_name):
106 | LLMEmbHandler = KoSimCSERobertaContentHandler()
107 | emb_vec = SagemakerEndpointEmbeddingsJumpStart(
108 | endpoint_name=embvec_endpoint_name,
109 | region_name=aws_region,
110 | content_handler=LLMEmbHandler,
111 | )
112 | return emb_vec
113 |
114 | # opensearch score seems like ranking
115 | def filter_and_remove_score_opensearch_vector_score(res, cutoff_score = 0.006, variance=0.95):
116 |     # Get the highest score
117 | highest_score = max(score for doc, score in res)
118 | print('highest_score : ', highest_score)
119 |     # If the highest score is below the cutoff score, return an empty list
120 | if highest_score < cutoff_score:
121 | return []
122 |     # Calculate the lower bound for scores
123 | lower_bound = highest_score * variance
124 | print('lower_bound : ', lower_bound)
125 | # Filter the list and remove the score
126 | res = [doc for doc, score in res if score >= lower_bound]
127 |
128 | return res
129 |
130 |
131 | def get_similiar_docs(query, k=5, fetch_k=300, score=True, bank="신한은행"):
132 | print("bank : ", bank)
133 | #query = f'{bank}, {query}'
134 | print("query : ",query)
135 |
136 | if score:
137 | pre_similar_doc = vectro_db.similarity_search_with_score(
138 | query,
139 | k=k,
140 | fetch_k=fetch_k,
141 | search_type="approximate_search", # approximate_search, script_scoring, painless_scripting
142 | space_type="l2", # "l2", "l1", "linf", "cosinesimil", "innerproduct", "hammingbit";
143 | pre_filter={"bool": {"filter": {"term": {"text": bank}}}},
144 | boolean_filter={"bool": {"filter": {"term": {"text": bank}}}}
145 | # filter=dict(source=bank)
146 | )
147 | print('jhs : ', pre_similar_doc)
148 | pretty_print_documents(pre_similar_doc)
149 | similar_docs = filter_and_remove_score_opensearch_vector_score(pre_similar_doc)
150 | else:
151 | similar_docs = vectro_db.similarity_search(
152 | query,
153 | k=k,
154 | search_type="approximate_search", # approximate_search, script_scoring, painless_scripting
155 |             space_type="l2", # "l2", "l1", "linf", "cosinesimil", "innerproduct", "hammingbit";
156 | pre_filter={"bool": {"filter": {"term": {"text": bank}}}},
157 | boolean_filter={"bool": {"filter": {"term": {"text": bank}}}}
158 |
159 | )
160 | similar_docs_copy = copy.deepcopy(similar_docs)
161 |
162 | # print('similar_docs_copy : \n', similar_docs_copy)
163 |
164 | return similar_docs_copy
165 |
166 | # Load the embedding vector model
167 | emb_vec = load_emb_vec(embvec_endpoint_name)
168 |
169 | # Load the LLM
170 | chain = load_chain(llm_endpoint_name)
171 |
172 | http_auth = (username, password) # opensearch user
173 |
174 | #OpenSearch Vector Indexer
175 |
176 | vectro_db = OpenSearchVectorSearch(
177 | index_name=index_name,
178 | opensearch_url=opensearch_domain_endpoint,
179 | embedding_function=emb_vec,
180 | http_auth=http_auth,
181 | is_aoss = False,
182 | engine="faiss",
183 |     space_type="l2"
184 | )
185 |
186 | ##################################################################################################
187 | def pretty_print_documents(response):
188 | for doc, score in response:
189 | print(f'\nScore: {score}')
190 | print(f'Document Number: {doc.metadata["row"]}')
191 | print(f'Source: {doc.metadata["source"]}')
192 |
193 | # Split the page content into lines
194 | lines = doc.page_content.split("\n")
195 |
196 | # Extract and print each piece of information if it exists
197 | for line in lines:
198 | split_line = line.split(": ")
199 | if len(split_line) > 1:
200 | print(f'{split_line[0]}: {split_line[1]}')
201 |
202 | print('-' * 50)
203 |
204 |
205 | def get_answer(query):
206 | k = 3
207 | search_query = query
208 |
209 | similar_docs = get_similiar_docs(search_query, k=k, bank='신한은행')
210 |
211 | llm_query = ''+query+' Category에 대한 Information을 찾아서 설명해주세요.'
212 |
213 | if not similar_docs:
214 | llm_query = query
215 |
216 | answer = chain.run(input_documents=similar_docs, question=llm_query)
217 |
218 | return answer
219 |
220 |
221 |
222 | ##################################################################################################
223 | # Streamlit UI
224 | # From here down is all the StreamLit UI.
225 | ##################################################################################################
226 | st.set_page_config(page_title="FSI RAG FAQ Demo vectorstore mode", page_icon="🦜", layout="wide")
227 | st.header("🦜 FSI RAG Demo - Opensearch vectorstore with LLM mode")
228 |
229 | def get_text():
230 | input_text = st.text_input("You: ", "", key="input")
231 | return input_text
232 |
233 | # Get input from the user.
234 | # user_input = get_text()
235 |
236 | # if "generated" not in st.session_state:
237 | # st.session_state["generated"] = []
238 | #
239 | # if "past" not in st.session_state:
240 | # st.session_state["past"] = []
241 | #
242 | # # Check whether the user provided input.
243 | # if user_input:
244 | # output = get_answer(user_input)
245 | # print("OUTPUT : ", output)
246 | # st.session_state.past.append(user_input)
247 | # st.session_state.generated.append(output)
248 | #
249 | #
250 | #
251 | #
252 | # if st.session_state["generated"]:
253 | #
254 | # for i in range(len(st.session_state["generated"]) - 1, -1, -1):
255 | # message(st.session_state["generated"][i], key=str(i))
256 | # message(st.session_state["past"][i], is_user=True, key=str(i) + "_user")
257 |
258 |
259 |
260 | from langchain.callbacks import StreamlitCallbackHandler
261 | if "messages" not in st.session_state:
262 | st.session_state["messages"] = []
263 |
264 | for msg in st.session_state.messages:
265 | st.chat_message(msg["role"]).write(msg["content"])
266 |
267 | if prompt := st.chat_input(placeholder="여기에 금융 FAQ 질문해주세요"):
268 | st.session_state.messages.append({"role": "user", "content": prompt})
269 | st.chat_message("user").write(prompt)
270 |
271 |
272 | with st.chat_message("assistant"):
273 | st_cb = StreamlitCallbackHandler(st.container(), expand_new_thoughts=False)
274 | response = get_answer(prompt)
275 | st.session_state.messages.append({"role": "assistant", "content": response})
276 | st.write(response)
277 |
--------------------------------------------------------------------------------
/RAG-SageMaker/rag-fsi-data-workshop/requirements.txt:
--------------------------------------------------------------------------------
1 | boto3
2 | streamlit
3 | langchain
4 | streamlit_chat
5 | opensearch-py
--------------------------------------------------------------------------------
/RAG-SageMaker/rag-fsi-data-workshop/src/kullm-polyglot-5-8b-v2/model.py:
--------------------------------------------------------------------------------
1 | from djl_python import Input, Output
2 | import os
3 | import deepspeed
4 | import torch
5 | import logging
6 | from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
7 | from transformers import GPTNeoXLayer
8 |
9 | predictor = None
10 |
11 | def get_model(properties):
12 |
13 | tp_degree = properties["tensor_parallel_degree"]
14 | model_location = properties["model_dir"]
15 | if "model_id" in properties:
16 | model_location = properties["model_id"]
17 | task = properties["task"]
18 |
19 | logging.info(f"Loading model in {model_location}")
20 | local_rank = int(os.getenv("LOCAL_RANK", "0"))
21 |
22 | tokenizer = AutoTokenizer.from_pretrained(model_location)
23 |
24 | model = AutoModelForCausalLM.from_pretrained(
25 | model_location,
26 | torch_dtype=torch.float16,
27 | low_cpu_mem_usage=True,
28 | )
29 |
30 | model.requires_grad_(False)
31 | model.eval()
32 |
33 | ds_config = {
34 | "tensor_parallel": {"tp_size": tp_degree},
35 | "dtype": model.dtype,
36 | "injection_policy": {
37 | GPTNeoXLayer:('attention.dense', 'mlp.dense_4h_to_h')
38 | }
39 | }
40 | logging.info(f"Starting DeepSpeed init with TP={tp_degree}")
41 | model = deepspeed.init_inference(model, ds_config)
42 |
43 | generator = pipeline(
44 | task=task, model=model, tokenizer=tokenizer, device=local_rank
45 | )
46 | # https://huggingface.co/docs/hub/models-tasks
47 | return generator
48 |
49 | def handle(inputs: Input) -> None:
50 | """
51 | inputs: Contains the configurations from serving.properties
52 | """
53 | global predictor
54 | if not predictor:
55 | predictor = get_model(inputs.get_properties())
56 |
57 | if inputs.is_empty():
58 | # Model server makes an empty call to warmup the model on startup
59 | logging.info("is_empty")
60 | return None
61 |
62 | data = inputs.get_as_json() #inputs.get_as_string()
63 |     logging.info("data: %s", data)
64 |
65 | input_prompt, params = data["inputs"], data["parameters"]
66 | result = predictor(input_prompt, **params)
67 |     logging.info("result: %s", result)
68 |
69 | return Output().add_as_json(result) #Output().add(result)
70 |
--------------------------------------------------------------------------------
/RAG-SageMaker/rag-fsi-data-workshop/src/kullm-polyglot-5-8b-v2/serving.properties:
--------------------------------------------------------------------------------
1 |
2 | engine=DeepSpeed
3 |
4 | # passing extra options to model.py or built-in handler
5 | job_queue_size=100
6 | batch_size=1
7 | max_batch_delay=1
8 | max_idle_time=60
9 |
10 | # Built-in entrypoint
11 | #option.entryPoint=djl_python.deepspeed
12 |
13 | # Hugging Face model id
14 | option.model_id=nlpai-lab/kullm-polyglot-5.8b-v2
15 |
16 | # defines custom environment variables
17 | #env=SERVING_NUMBER_OF_NETTY_THREADS=2
18 |
19 | # Allows to load DeepSpeed workers in parallel
20 | option.parallel_loading=true
21 |
22 | # specify tensor parallel degree (number of partitions)
23 | option.tensor_parallel_degree=1
24 |
25 | # specify per model timeout
26 | option.model_loading_timeout=600
27 | #option.predict_timeout=240
28 |
29 | # mark the model as failure after python process crashing 10 times
30 | retry_threshold=0
31 |
32 | option.task=text-generation
--------------------------------------------------------------------------------
/RAG-SageMaker/utils/inference_utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import time
3 | import boto3
4 | import os.path as osp
5 | from typing import Union
6 | import pprint
7 | """
8 | A dedicated helper to manage templates and prompt building.
9 | """
10 |
11 | class Prompter(object):
12 |
13 | __slots__ = ("template", "_verbose")
14 |
15 | def __init__(self, template_name: str = "", verbose: bool = False):
16 | self._verbose = verbose
17 | if not template_name:
18 | # Enforce the default here, so the constructor can be called with '' and will not break.
19 | template_name = "alpaca"
20 | #file_name = osp.join("templates", f"{template_name}.json")
21 | file_name = osp.join("../utils", f"{template_name}.json")
22 | if not osp.exists(file_name):
23 | raise ValueError(f"Can't read {file_name}")
24 | with open(file_name) as fp:
25 | self.template = json.load(fp)
26 | if self._verbose:
27 | print(
28 | f"Using prompt template {template_name}: {self.template['description']}"
29 | )
30 |
31 | def generate_prompt(
32 | self,
33 | instruction: str,
34 | input: Union[None, str] = None,
35 | label: Union[None, str] = None,
36 | ) -> str:
37 | # returns the full prompt from instruction and optional input
38 | # if a label (=response, =output) is provided, it's also appended.
39 | if input:
40 | res = self.template["prompt_input"].format(
41 | instruction=instruction, input=input
42 | )
43 | else:
44 | res = self.template["prompt_no_input"].format(
45 | instruction=instruction
46 | )
47 | if label:
48 | res = f"{res}{label}"
49 | if self._verbose:
50 | print(res)
51 | return res
52 |
53 | def get_response(self, output: str) -> str:
54 | return output.split(self.template["response_split"])[1].strip()
55 |
56 | def invoke_inference(endpoint_name, prompt):
57 |
58 | client = boto3.client("sagemaker-runtime")
59 | content_type = "application/json"
60 |
61 | response = client.invoke_endpoint(
62 | EndpointName=endpoint_name,
63 | ContentType=content_type,
64 | Body=json.dumps(prompt)
65 | )
66 | res = response["Body"].read().decode()
67 |
68 | return res
69 |
70 | def parse_response(query_response):
71 |
72 | def traverse(o, tree_types=(list, tuple)):
73 | if isinstance(o, tree_types):
74 | for value in o:
75 | for subvalue in traverse(value, tree_types):
76 | yield subvalue
77 | else:
78 | yield o
79 |
80 | data = eval(query_response)
81 |
82 | listRes = []
83 | for value in traverse(data):
84 | listRes.append(value["generated_text"])
85 |
86 | if len(listRes) >= 2: return listRes
87 | else: return listRes[0].strip()
88 |
89 |
90 |
91 | class KoLLMSageMakerEndpoint(object):
92 | def __init__(self, endpoint_name):
93 | self.endpoint_name = endpoint_name
94 | self.prompter = Prompter("kullm")
95 | self.smr_client = boto3.client('sagemaker-runtime')
96 |
97 | def get_payload(self, instruction, input_text, params):
98 | prompter = Prompter("kullm")
99 | prompt = prompter.generate_prompt(instruction, input_text)
100 | payload = {
101 | 'inputs': prompt,
102 | 'parameters': params
103 | }
104 | return payload
105 |
106 | def infer(self, payload, verbose=True):
107 |
108 | content_type = "application/json"
109 | response = self.smr_client.invoke_endpoint(
110 | EndpointName=self.endpoint_name,
111 | ContentType=content_type,
112 | Body=json.dumps(payload)
113 | )
114 |
115 | #model_predictions = json.loads(response['Body'].read().decode())
116 | #s = model_predictions[0]['generated_text']
117 | #generated_text = self.prompter.get_response(s)
118 | res = response["Body"].read().decode()
119 | generated_text = parse_response(res)
120 | generated_text = generated_text.split('###')[0]
121 | if verbose:
122 | pprint.pprint(f'Response: {generated_text}')
123 | return generated_text
124 |
--------------------------------------------------------------------------------
/RAG-SageMaker/utils/kullm.json:
--------------------------------------------------------------------------------
1 | {
2 | "description": "Alpaca-LoRA에서 사용하는 템플릿입니다.",
3 | "prompt_input": "아래는 작업을 설명하는 명령어와 추가 컨텍스트를 제공하는 입력이 짝을 이루는 예제입니다. 요청을 적절히 완료하는 응답을 작성하세요.\n\n### 명령어:\n{instruction}\n\n### 입력:\n{input}\n\n### 응답:\n",
4 | "prompt_no_input": "아래는 작업을 설명하는 명령어입니다. 요청을 적절히 완료하는 응답을 작성하세요.\n\n### 명령어:\n{instruction}\n\n### 응답:\n",
5 | "response_split": "### 응답:"
6 | }
--------------------------------------------------------------------------------
/RAG-SageMaker/utils/streamlit_util.py:
--------------------------------------------------------------------------------
1 | import json
2 | import boto3
3 | import numpy as np
4 | from inference_utils import Prompter
5 | from typing import Any, Dict, List, Optional
6 | from langchain.embeddings import SagemakerEndpointEmbeddings
7 | from langchain.llms.sagemaker_endpoint import LLMContentHandler, SagemakerEndpoint
8 | from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
9 |
10 | prompter = Prompter("kullm")
11 |
12 | class KullmContentHandler(LLMContentHandler):
13 | content_type = "application/json"
14 | accepts = "application/json"
15 |
16 | def transform_input(self, prompt: str, model_kwargs={}) -> bytes:
17 | '''
18 |         Preprocess the input data and return the serialized request body.
19 | '''
20 | context, question = prompt.split("||SPEPERATOR||")
21 | prompt = prompter.generate_prompt(question, context)
22 |
23 | # print ("prompt", prompt)
24 | payload = {
25 | 'inputs': [prompt],
26 | 'parameters': model_kwargs
27 | }
28 |
29 | input_str = json.dumps(payload)
30 |
31 | return input_str.encode('utf-8')
32 |
33 | def transform_output(self, output: bytes) -> str:
34 | response_json = json.loads(output.read().decode("utf-8"))
35 | generated_text = response_json[0][0]["generated_text"]
36 |
37 | return generated_text
38 |
39 |
40 | class SagemakerEndpointEmbeddingsJumpStart(SagemakerEndpointEmbeddings):
41 | def embed_documents(self, texts: List[str], chunk_size: int = 1) -> List[List[float]]:
42 | """Compute doc embeddings using a SageMaker Inference Endpoint.
43 |
44 | Args:
45 | texts: The list of texts to embed.
46 | chunk_size: The chunk size defines how many input texts will
47 | be grouped together as request. If None, will use the
48 | chunk size specified by the class.
49 |
50 | Returns:
51 | List of embeddings, one for each text.
52 | """
53 | results = []
54 | _chunk_size = len(texts) if chunk_size > len(texts) else chunk_size
55 |
56 | print("text size: ", len(texts))
57 | print("_chunk_size: ", _chunk_size)
58 |
59 | for i in range(0, len(texts), _chunk_size):
60 | # print (i, texts[i : i + _chunk_size])
61 | response = self._embedding_func(texts[i: i + _chunk_size])
62 | # print (i, response, len(response[0].shape))
63 |
64 | results.extend(response)
65 | return results
66 |
67 |
68 | class KoSimCSERobertaContentHandler(EmbeddingsContentHandler):
69 | content_type = "application/json"
70 | accepts = "application/json"
71 |
72 | def transform_input(self, prompt: str, model_kwargs={}) -> bytes:
73 |
74 | input_str = json.dumps({"inputs": prompt, **model_kwargs})
75 |
76 | return input_str.encode("utf-8")
77 |
78 | def transform_output(self, output: bytes) -> str:
79 |
80 | response_json = json.loads(output.read().decode("utf-8"))
81 | ndim = np.array(response_json).ndim
82 |
83 | if ndim == 4:
84 | # Original shape (1, 1, n, 768)
85 | emb = response_json[0][0][0]
86 | emb = np.expand_dims(emb, axis=0).tolist()
87 | elif ndim == 2:
88 | # Original shape (n, 1)
89 | emb = []
90 | for ele in response_json:
91 | e = ele[0][0]
92 | emb.append(e)
93 | else:
94 | print(f"Other # of dimension: {ndim}")
95 | emb = None
96 | return emb
97 |
98 |
99 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AWS-LLM-SageMaker
2 |
3 | This hands-on workshop, aimed at developers and solution builders, introduces how to leverage foundation models (FMs) with [Amazon SageMaker](https://aws.amazon.com/sagemaker/).
4 | 
5 | The labs walk through several of the generative AI usage patterns customers adopt most often, showing techniques that create organizational value by improving productivity.
6 | This is achieved by leveraging foundation models that help with drafting emails, summarizing text, answering questions, building chatbots, and generating images.
7 | 
8 | 
9 | ### Workshop materials published to AWS Samples GitHub
10 | 
11 | These materials are published to the AWS Samples GitHub. The materials in this repository are always updated ahead of the official AWS Samples content.
12 | 
13 | #### LLM - RAG: Opensearch with SageMaker Endpoint LLM Polyglot
14 | - Topic: LLM - RAG: Opensearch with SageMaker Endpoint LLM Polyglot
15 | - Upstream link: https://github.com/aws-samples/aws-ai-ml-workshop-kr/tree/master/genai/aws-gen-ai-kr/20_applications/04_rag_finance_opensearch_sllm_workshop
16 | - Last synced: 2024.04.25
17 | 
18 | #### [Tuner] QLoRA fine-tuning
19 | - Topic: [Tuner] QLoRA fine-tuning
20 | - Upstream link: https://github.com/aws-samples/aws-ai-ml-workshop-kr/tree/master/genai/aws-gen-ai-kr/30_fine_tune/01-instruction-tuning-peft-qlora
21 | - Last synced: 2024.04.18
22 | 
23 | ## LLM - RAG: OpenSearch with SageMaker Endpoint LLM Polyglot
24 | 1. [Hands-on RAG (Retrieval-Augmented Generation) with Amazon SageMaker and Amazon OpenSearch](https://github.com/hyeonsangjeon/AWS-LLM-SageMaker/tree/main/RAG-SageMaker/rag-fsi-data-workshop) -
25 | RAG (Retrieval-Augmented Generation) is an NLP architecture that combines information retrieval with text generation. In this lab you will see, through a simple exercise, how RAG integrates with Amazon OpenSearch to strengthen retrieval over external, trusted databases and documents.
26 | You will gain hands-on experience implementing this pattern with SageMaker endpoints, embedding ingestion into Amazon OpenSearch, the SDK, and open-source software such as [LangChain](https://python.langchain.com/docs/get_started/introduction) and [FAISS](https://faiss.ai/index.html) (a minimal retrieval sketch follows below).
27 |
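For a concrete feel of the retrieval step (the full Streamlit app lives in `RAG-SageMaker/rag-fsi-data-workshop/TASK-5_OpenSearch_LLM_RAG_Streamlit_Chatbot_Example.py`), here is a minimal sketch that wires a SageMaker embedding endpoint to an OpenSearch vector index through LangChain. The endpoint name, domain URL, credentials, and sample question below are placeholders, not values from the workshop.

```python
from langchain.vectorstores import OpenSearchVectorSearch
from streamlit_util import SagemakerEndpointEmbeddingsJumpStart, KoSimCSERobertaContentHandler

# Embedding model served on a SageMaker endpoint (endpoint name is a placeholder).
emb_vec = SagemakerEndpointEmbeddingsJumpStart(
    endpoint_name="<your-KoSimCSE-roberta-endpoint>",
    region_name="us-east-1",
    content_handler=KoSimCSERobertaContentHandler(),
)

# OpenSearch index that already contains the FAQ embeddings (URL and credentials are placeholders).
vector_db = OpenSearchVectorSearch(
    index_name="fsi-sample",
    opensearch_url="https://<your-opensearch-domain>.us-east-1.es.amazonaws.com",
    embedding_function=emb_vec,
    http_auth=("<user>", "<password>"),
)

# Retrieve the top-3 passages most similar to the question;
# these are then added as context to the LLM endpoint's prompt.
docs = vector_db.similarity_search("타행 OTP 등록은 어떻게 하나요?", k=3)
```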
28 |
29 | #### SageMaker Training deep-learning-containers
30 | - Deep learning base container image list: https://github.com/aws/deep-learning-containers/blob/master/available_images.md
31 |
32 | ## [Tuner] QLoRA fine-tuning
33 | - [KULLM-Polyglot-12.8B](PEFT)
34 |
35 | ### Filenames
36 | - `1_prepare-dataset-alpaca-method.ipynb`: Prepares the training dataset from an instruction dataset by tokenizing each sample individually.
37 | - `1_prepare-dataset-chunk-method.ipynb`: Prepares the training dataset from an instruction dataset by concatenating all samples and splitting them into chunks of a fixed chunk size (see the sketch after this list).
38 | - `2_local-train-debug-lora.ipynb`: Debugs training with a small sample of data in the development environment before running it on a training instance. If you are already familiar with fine-tuning, skip this hands-on and proceed to `3_sm-train-lora.ipynb`.
39 | - `3_sm-train-lora.ipynb`: Runs fine-tuning on a SageMaker training instance.
40 |
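To make the difference between the two dataset-preparation methods concrete, the following is a minimal, illustrative sketch of the chunk method (not code from the notebooks): all tokenized samples are concatenated and then cut into fixed-size blocks.

```python
from itertools import chain

def chunk_examples(tokenized_samples, chunk_size=1024):
    """Concatenate lists of token ids and split them into fixed-size chunks (sketch only)."""
    all_ids = list(chain.from_iterable(tokenized_samples))
    # Drop the tail so that every chunk has exactly chunk_size tokens.
    total = (len(all_ids) // chunk_size) * chunk_size
    return [all_ids[i:i + chunk_size] for i in range(0, total, chunk_size)]

# Toy example: three "tokenized" samples cut into blocks of 4 ids.
print(chunk_examples([[1, 2, 3], [4, 5, 6, 7], [8, 9]], chunk_size=4))
# -> [[1, 2, 3, 4], [5, 6, 7, 8]]
```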
41 |
--------------------------------------------------------------------------------
/common_code/inference_lib.py:
--------------------------------------------------------------------------------
1 | import boto3
2 | import time
3 | import json
4 |
5 |
6 | """
7 | A dedicated helper to manage templates and prompt building.
8 | """
9 |
10 | import json
11 | import os.path as osp
12 | from typing import Union
13 | import os
14 | import pathlib
15 |
16 | class Prompter(object):
17 | __slots__ = ("template", "_verbose")
18 |
19 | def __init__(self, template_name: str = "", verbose: bool = False):
20 | self._verbose = verbose
21 | if not template_name:
22 | # Enforce the default here, so the constructor can be called with '' and will not break.
23 | template_name = "alpaca"
24 | #file_name = osp.join("templates", f"{template_name}.json")
25 | # file_name = str(pathlib.Path().home()) + '/Kor-LLM-On-SageMaker/common_code' # + f"{template_name}.json"
26 | file_name = osp.join("/root/Kor-LLM-On-SageMaker/common_code", f"{template_name}.json")
27 |
28 |
29 |
30 | # path = pathlib.Path.cwd()
31 | # print("Pathlib: ", path)
32 |
33 | # print(pathlib.Path().home())
34 |
35 | # path = pathlib.Path().home() + '/Kor-LLM-On-SageMaker/common_code'
36 |
37 |
38 | # print("pwd: ", os.getcwd())
39 | # file_name = f"{template_name}.json"
40 | if not osp.exists(file_name):
41 | raise ValueError(f"Can't read {file_name}")
42 | with open(file_name) as fp:
43 | self.template = json.load(fp)
44 | if self._verbose:
45 | print(
46 | f"Using prompt template {template_name}: {self.template['description']}"
47 | )
48 |
49 | def generate_prompt(
50 | self,
51 | instruction: str,
52 | input: Union[None, str] = None,
53 | label: Union[None, str] = None,
54 | ) -> str:
55 | # returns the full prompt from instruction and optional input
56 | # if a label (=response, =output) is provided, it's also appended.
57 | if input:
58 | res = self.template["prompt_input"].format(
59 | instruction=instruction, input=input
60 | )
61 | else:
62 | res = self.template["prompt_no_input"].format(
63 | instruction=instruction
64 | )
65 | if label:
66 | res = f"{res}{label}"
67 | if self._verbose:
68 | print(res)
69 | return res
70 |
71 | def get_response(self, output: str) -> str:
72 | return output.split(self.template["response_split"])[1].strip()
73 |
74 | def describe_endpoint(endpoint_name):
75 | '''
76 |     Check whether the endpoint has been created; if it is still creating, wait for it to finish.
77 | '''
78 | sm_client = boto3.client("sagemaker")
79 |
80 | while(True):
81 | response = sm_client.describe_endpoint(
82 | EndpointName= endpoint_name
83 | )
84 | status = response['EndpointStatus']
85 | if status == 'Creating':
86 | print("Endpoint is ", status)
87 | time.sleep(60)
88 | else:
89 | print("Endpoint is ", status)
90 | break
91 |
92 |
93 | def invoke_inference(endpoint_name, prompt):
94 | '''
95 |     Invoke the endpoint with a KoAlpaca prompt.
96 | '''
97 | client = boto3.client("sagemaker-runtime")
98 |
99 | content_type = "text/plain"
100 | response = client.invoke_endpoint(
101 | EndpointName=endpoint_name, ContentType=content_type, Body=prompt
102 | )
103 | #print(response["Body"].read())
104 | res = response["Body"].read().decode()
105 | print (eval(res)[0]['generated_text'])
106 |
107 | def invoke_inference_DJ(endpoint_name, prompt):
108 |
109 |     '''
110 |     A variant of invoke_inference.
111 |     Named differently for now because invoke_inference is already used elsewhere;
112 |     the two functions will be merged later.
113 |     '''
114 | 
115 |     '''
116 |     Invoke the endpoint with a KoAlpaca prompt.
117 |     '''
118 |
119 | client = boto3.client("sagemaker-runtime")
120 |
121 | content_type = "application/json"
122 | response = client.invoke_endpoint(
123 | EndpointName=endpoint_name,
124 | ContentType=content_type,
125 | Body=json.dumps(prompt)
126 | )
127 |
128 | res = response["Body"].read().decode()
129 | # print (res)
130 |
131 | return res
132 |
133 | def query_endpoint_with_text_payload(plain_text, endpoint_name, content_type="text/plain"):
134 | '''
135 |     Used when content_type is text/plain.
136 | '''
137 | client = boto3.client("runtime.sagemaker")
138 | response = client.invoke_endpoint(
139 | EndpointName=endpoint_name, ContentType=content_type, Body=plain_text
140 | )
141 | return response
142 |
143 |
144 | def parse_response_text_model(query_response):
145 | '''
146 |     Used when content_type is text/plain.
147 | '''
148 |
149 | model_predictions = json.loads(query_response["Body"].read())
150 | # print("model_predictions: \n", model_predictions)
151 | generated_text = model_predictions[0]["generated_text"]
152 | return generated_text
153 |
154 | def parse_response(query_response):
155 |
156 | def traverse(o, tree_types=(list, tuple)):
157 | if isinstance(o, tree_types):
158 | for value in o:
159 | for subvalue in traverse(value, tree_types):
160 | yield subvalue
161 | else:
162 | yield o
163 |
164 | data = eval(query_response)
165 |
166 | listRes = []
167 | for value in traverse(data):
168 | listRes.append(value["generated_text"])
169 |
170 | if len(listRes) >= 2: return listRes
171 | else: return listRes[0].strip()
172 |
173 | ################################################
174 | # Embedding Handler
175 | ################################################
176 |
177 | # from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
178 | # from langchain.embeddings import SagemakerEndpointEmbeddings
179 | # from langchain.llms.sagemaker_endpoint import ContentHandlerBase
180 | # from typing import Any, Dict, List, Optional
181 |
182 | # class SagemakerEndpointEmbeddingsJumpStart(SagemakerEndpointEmbeddings):
183 | # def embed_documents(self, texts: List[str], chunk_size: int = 5) -> List[List[float]]:
184 | # """Compute doc embeddings using a SageMaker Inference Endpoint.
185 |
186 | # Args:
187 | # texts: The list of texts to embed.
188 | # chunk_size: The chunk size defines how many input texts will
189 | # be grouped together as request. If None, will use the
190 | # chunk size specified by the class.
191 |
192 | # Returns:
193 | # List of embeddings, one for each text.
194 | # """
195 | # results = []
196 | # _chunk_size = len(texts) if chunk_size > len(texts) else chunk_size
197 |
198 | # # print("text size: ", len(texts))
199 | # # print("_chunk_size: ", _chunk_size)
200 |
201 | # for i in range(0, len(texts), _chunk_size):
202 | # response = self._embedding_func(texts[i : i + _chunk_size])
203 | # print
204 | # results.extend(response)
205 | # return results
206 |
207 | # import numpy as np
208 |
209 | # class KoSimCSERobertaContentHandler(EmbeddingsContentHandler):
210 | # content_type = "application/json"
211 | # accepts = "application/json"
212 |
213 | # def transform_input(self, prompt: str, model_kwargs={}) -> bytes:
214 | # input_str = json.dumps({"inputs": prompt, **model_kwargs})
215 | # return input_str.encode("utf-8")
216 |
217 | # def transform_output(self, output: bytes) -> str:
218 | # response_json = json.loads(output.read().decode("utf-8"))
219 | # ndim = np.array(response_json).ndim
220 | # # print("response_json ndim: \n", ndim)
221 | # # print("response_json shape: \n", np.array(response_json).shape)
222 | # if ndim == 4:
223 | # # Original shape (1, 1, n, 768)
224 | # emb = response_json[0][0][0]
225 | # emb = np.expand_dims(emb, axis=0).tolist()
226 | # # print("emb shape: ", np.array(emb).shape)
227 | # # print("emb TYPE: ", type(emb))
228 | # elif ndim == 2:
229 | # # Original shape (n, 1)
230 | # # print(response_json[0])
231 | # emb = []
232 | # for ele in response_json:
233 | # # print(np.array(response_json[0]).shape)
234 | # e = ele[0][0]
235 | # #emb = np.expand_dims(emb, axis=0).tolist()
236 | # # print("emb shape: ", np.array(emb).shape)
237 | # # print("emb TYPE: ", type(emb))
238 | # emb.append(e)
239 | # # print("emb_list shape: ", np.array(emb).shape)
240 | # # print("emb_list TYPE: ", type(emb))
241 | # else:
242 | # print(f"Other # of dimension: {ndim}")
243 | # emb = None
244 | # return emb
245 |
246 |
247 | # ################################################
248 | # # LLM Handler
249 | # ################################################
250 | # from langchain.llms.sagemaker_endpoint import LLMContentHandler
251 | # import json
252 |
253 | # class KoAlpacaContentHandler(LLMContentHandler):
254 | # content_type = "application/json"
255 | # accepts = "application/json"
256 |
257 | # def transform_input(self, prompt: str, model_kwargs={}) -> bytes:
258 | # input_str = json.dumps({"text_inputs": prompt, **model_kwargs})
259 | # return input_str.encode("utf-8")
260 |
261 | # def transform_output(self, output: bytes) -> str:
262 | # print("In KoAlpacaContentHandler")
263 | # # print("output: ", output)
264 | # response_json = json.loads(output.read().decode("utf-8"))
265 | # print("response_json: ", response_json)
266 | # # return response_json["generated_texts"][0]
267 | # doc = response_json[0]['generated_text']
268 | # doc = json.loads(doc)
269 | # doc = doc['text_inputs']
270 | # return doc
271 |
--------------------------------------------------------------------------------
/common_code/kullm.json:
--------------------------------------------------------------------------------
1 | {
2 | "description": "Alpaca-LoRA에서 사용하는 템플릿입니다.",
3 | "prompt_input": "아래는 작업을 설명하는 명령어와 추가 컨텍스트를 제공하는 입력이 짝을 이루는 예제입니다. 요청을 적절히 완료하는 응답을 작성하세요.\n\n### 명령어:\n{instruction}\n\n### 입력:\n{input}\n\n### 응답:\n",
4 | "prompt_no_input": "아래는 작업을 설명하는 명령어입니다. 요청을 적절히 완료하는 응답을 작성하세요.\n\n### 명령어:\n{instruction}\n\n### 응답:\n",
5 | "response_split": "### 응답:"
6 | }
--------------------------------------------------------------------------------
/images/Nfloat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyeonsangjeon/AWS-LLM-SageMaker/6949934df9fa40cc8ff2afda07a3250b7a126f11/images/Nfloat.png
--------------------------------------------------------------------------------
/images/TensorShard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyeonsangjeon/AWS-LLM-SageMaker/6949934df9fa40cc8ff2afda07a3250b7a126f11/images/TensorShard.png
--------------------------------------------------------------------------------
/images/lora.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyeonsangjeon/AWS-LLM-SageMaker/6949934df9fa40cc8ff2afda07a3250b7a126f11/images/lora.png
--------------------------------------------------------------------------------
/images/lora_eq1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyeonsangjeon/AWS-LLM-SageMaker/6949934df9fa40cc8ff2afda07a3250b7a126f11/images/lora_eq1.png
--------------------------------------------------------------------------------
/images/lora_r.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyeonsangjeon/AWS-LLM-SageMaker/6949934df9fa40cc8ff2afda07a3250b7a126f11/images/lora_r.png
--------------------------------------------------------------------------------
/images/qlora_eq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyeonsangjeon/AWS-LLM-SageMaker/6949934df9fa40cc8ff2afda07a3250b7a126f11/images/qlora_eq.png
--------------------------------------------------------------------------------
/images/qlora_fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyeonsangjeon/AWS-LLM-SageMaker/6949934df9fa40cc8ff2afda07a3250b7a126f11/images/qlora_fig1.png
--------------------------------------------------------------------------------
/images/quantization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyeonsangjeon/AWS-LLM-SageMaker/6949934df9fa40cc8ff2afda07a3250b7a126f11/images/quantization.png
--------------------------------------------------------------------------------
/templates/README.md:
--------------------------------------------------------------------------------
1 | # Prompt templates
2 |
3 | This directory contains template styles for the prompts used to finetune LoRA models.
4 |
5 | ## Format
6 |
7 | A template is described via a JSON file with the following keys:
8 |
9 | - `prompt_input`: The template to use when input is not None. Uses `{instruction}` and `{input}` placeholders.
10 | - `prompt_no_input`: The template to use when input is None. Uses the `{instruction}` placeholder.
11 | - `description`: A short description of the template, with possible use cases.
12 | - `response_split`: The text to use as separator when cutting real response from the model output.
13 |
14 | No `{response}` placeholder was used, since the response is always the last element of the template and is just to be concatenated to the rest.
15 |
16 | ## Example template
17 |
18 | The default template, used unless otherwise specified, is `alpaca.json`
19 |
20 | ```json
21 | {
22 | "description": "Template used by Alpaca-LoRA.",
23 | "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
24 | "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
25 | "response_split": "### Response:"
26 | }
27 |
28 | ```
29 |
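The snippet below is a small, self-contained sketch of how these keys are typically consumed; the `templates/alpaca.json` path, the sample instruction, and the stand-in model output are assumptions for illustration only.

```python
import json

# Load a template and render a prompt from an instruction/input pair (sketch only).
with open("templates/alpaca.json") as fp:
    template = json.load(fp)

prompt = template["prompt_input"].format(
    instruction="Summarize the following text.",
    input="LoRA adds small low-rank adapter matrices to a frozen base model.",
)

# Stand-in for what a fine-tuned model might return (prompt echoed back plus an answer).
model_output = prompt + "LoRA fine-tunes only the small adapter matrices."

# Everything after `response_split` is treated as the model's answer.
answer = model_output.split(template["response_split"])[-1].strip()
print(answer)
```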
30 | ## Current templates
31 |
32 | ### alpaca
33 |
34 | Default template used for generic LoRA fine tunes so far.
35 |
36 | ### alpaca_legacy
37 |
38 | Legacy template used by the original alpaca repo, with no `\n` after the response field. Kept for reference and experiments.
39 |
40 | ### alpaca_short
41 |
42 | A trimmed down alpaca template which seems to perform just as well and spare some tokens. Models created with the default template seem to be queryable by the short template as well. More experiments are welcome.
43 |
44 | ### vigogne
45 |
46 | The default alpaca template, translated to french. This template was used to train the "Vigogne" LoRA and is to be used to query it, or for extra fine tuning.
47 |
--------------------------------------------------------------------------------
/templates/alpaca.json:
--------------------------------------------------------------------------------
1 | {
2 | "description": "Template used by Alpaca-LoRA.",
3 | "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
4 | "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
5 | "response_split": "### Response:"
6 | }
7 |
--------------------------------------------------------------------------------
/templates/alpaca_legacy.json:
--------------------------------------------------------------------------------
1 | {
2 | "description": "Legacy template, used by Original Alpaca repository.",
3 | "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:",
4 | "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:",
5 | "response_split": "### Response:"
6 | }
7 |
--------------------------------------------------------------------------------
/templates/alpaca_short.json:
--------------------------------------------------------------------------------
1 | {
2 | "description": "A shorter template to experiment with.",
3 | "prompt_input": "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
4 | "prompt_no_input": "### Instruction:\n{instruction}\n\n### Response:\n",
5 | "response_split": "### Response:"
6 | }
7 |
--------------------------------------------------------------------------------
/templates/korwkv.json:
--------------------------------------------------------------------------------
1 | {
2 | "prompt_input": "### 명령어:\n{instruction}\n\n### 질문:\n{input}\n\n### 답변:\n",
3 | "prompt_no_input": "### 명령어:\n{instruction}\n\n### 답변:\n",
4 | "response_split": "### 답변:"
5 | }
--------------------------------------------------------------------------------
/templates/kullm.json:
--------------------------------------------------------------------------------
1 | {
2 | "description": "Alpaca-LoRA에서 사용하는 템플릿입니다.",
3 | "prompt_input": "아래는 작업을 설명하는 명령어와 추가 컨텍스트를 제공하는 입력이 짝을 이루는 예제입니다. 요청을 적절히 완료하는 응답을 작성하세요.\n\n### 명령어:\n{instruction}\n\n### 입력:\n{input}\n\n### 응답:\n",
4 | "prompt_no_input": "아래는 작업을 설명하는 명령어입니다. 요청을 적절히 완료하는 응답을 작성하세요.\n\n### 명령어:\n{instruction}\n\n### 응답:\n",
5 | "response_split": "### 응답:"
6 | }
7 |
--------------------------------------------------------------------------------
/templates/vigogne.json:
--------------------------------------------------------------------------------
1 | {
2 | "description": "French template, used by Vigogne for finetuning.",
3 | "prompt_input": "Ci-dessous se trouve une instruction qui décrit une tâche, associée à une entrée qui fournit un contexte supplémentaire. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Entrée:\n{input}\n\n### Réponse:\n",
4 | "prompt_no_input": "Ci-dessous se trouve une instruction qui décrit une tâche. Écrivez une réponse qui complète correctement la demande.\n\n### Instruction:\n{instruction}\n\n### Réponse:\n",
5 | "response_split": "### Réponse:"
6 | }
7 |
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyeonsangjeon/AWS-LLM-SageMaker/6949934df9fa40cc8ff2afda07a3250b7a126f11/utils/__init__.py
--------------------------------------------------------------------------------
/utils/callbacks.py:
--------------------------------------------------------------------------------
1 | """
2 | Helpers to support streaming generate output.
3 | Borrowed from https://github.com/oobabooga/text-generation-webui/blob/ad37f396fc8bcbab90e11ecf17c56c97bfbd4a9c/modules/callbacks.py
4 | """
5 |
6 | import gc
7 | import traceback
8 | from queue import Queue
9 | from threading import Thread
10 |
11 | import torch
12 | import transformers
13 |
14 |
15 | class Stream(transformers.StoppingCriteria):
16 | def __init__(self, callback_func=None):
17 | self.callback_func = callback_func
18 |
19 | def __call__(self, input_ids, scores) -> bool:
20 | if self.callback_func is not None:
21 | self.callback_func(input_ids[0])
22 | return False
23 |
24 |
25 | class Iteratorize:
26 |
27 | """
28 | Transforms a function that takes a callback
29 | into a lazy iterator (generator).
30 | """
31 |
32 |     def __init__(self, func, kwargs=None, callback=None):
33 | self.mfunc = func
34 | self.c_callback = callback
35 | self.q = Queue()
36 | self.sentinel = object()
37 |         self.kwargs = kwargs if kwargs is not None else {}
38 | self.stop_now = False
39 |
40 | def _callback(val):
41 | if self.stop_now:
42 | raise ValueError
43 | self.q.put(val)
44 |
45 |         def gentask():
46 |             ret = None  # keep ret defined even if mfunc raises before returning
47 |             try:
48 |                 ret = self.mfunc(callback=_callback, **self.kwargs)
49 |             except ValueError:
50 |                 pass
51 |             except Exception:
52 |                 traceback.print_exc()
53 |
54 |             self.q.put(self.sentinel)
55 |             if self.c_callback:
56 |                 self.c_callback(ret)
57 |
58 | self.thread = Thread(target=gentask)
59 | self.thread.start()
60 |
61 | def __iter__(self):
62 | return self
63 |
64 | def __next__(self):
65 | obj = self.q.get(True, None)
66 | if obj is self.sentinel:
67 | raise StopIteration
68 | else:
69 | return obj
70 |
71 | def __enter__(self):
72 | return self
73 |
74 | def __exit__(self, exc_type, exc_val, exc_tb):
75 | self.stop_now = True
76 |
--------------------------------------------------------------------------------
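For reference, a common way to wire Stream and Iteratorize into Hugging Face generation (the pattern used by the projects this file was borrowed from) is sketched below. The model id and prompt are placeholders, not values taken from this repository:

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils.callbacks import Iteratorize, Stream

model_id = "nlpai-lab/kullm-polyglot-5.8b-v2"   # hypothetical model id for illustration
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16)

def generate_with_callback(callback=None, **kwargs):
    # Stream invokes `callback` with the partial output ids after every decoding step.
    kwargs.setdefault("stopping_criteria", transformers.StoppingCriteriaList())
    kwargs["stopping_criteria"].append(Stream(callback_func=callback))
    with torch.no_grad():
        model.generate(**kwargs)

def generate_with_streaming(**kwargs):
    # Iteratorize turns the callback-style function above into a lazy generator.
    return Iteratorize(generate_with_callback, kwargs, callback=None)

input_ids = tokenizer(
    "### Instruction:\nSay hello.\n\n### Response:\n", return_tensors="pt"
).input_ids.to(model.device)

with generate_with_streaming(input_ids=input_ids, max_new_tokens=64) as generator:
    for output_ids in generator:
        print(tokenizer.decode(output_ids, skip_special_tokens=True), end="\r")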
/utils/common_lib.py:
--------------------------------------------------------------------------------
1 | def check_packages():
2 | try:
3 | import langchain
4 | _has_packages = True
5 | except (ImportError, AttributeError):
6 | _has_packages = False
7 |
8 | if _has_packages:
9 | print("Proceed.")
10 | else:
11 | print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
12 |         print("[ERROR] Please run the module 0 notebook (0_setup.ipynb) first.")
13 | print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
--------------------------------------------------------------------------------
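This guard is presumably meant to be called at the top of the later notebooks; it only probes for langchain as a proxy for the environment prepared in 0_setup.ipynb. A minimal usage sketch (the import path assumes the notebook runs from the repository root):

from utils.common_lib import check_packages

check_packages()   # prints "Proceed." when langchain is importable, otherwise asks to run 0_setup.ipynb first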
/utils/inference_lib.py:
--------------------------------------------------------------------------------
1 | import boto3
2 | import time
3 | import json
4 | import os.path as osp
5 | from typing import Union
6 | import pprint
7 |
8 | def parse_response(query_response):
9 |
10 | def traverse(o, tree_types=(list, tuple)):
11 | if isinstance(o, tree_types):
12 | for value in o:
13 | for subvalue in traverse(value, tree_types):
14 | yield subvalue
15 | else:
16 | yield o
17 |
18 |     data = json.loads(query_response)  # parse the JSON body instead of eval() for safety
19 |
20 | listRes = []
21 | for value in traverse(data):
22 | listRes.append(value["generated_text"])
23 |
24 | if len(listRes) >= 2: return listRes
25 | else: return listRes[0].strip()
26 |
27 | # def invoke_inference(endpoint_name, prompt):
28 | # '''
29 | #     Invoke the endpoint with a KoAlpaca prompt
30 | # '''
31 | # client = boto3.client("sagemaker-runtime")
32 |
33 | # content_type = "text/plain"
34 | # response = client.invoke_endpoint(
35 | # EndpointName=endpoint_name, ContentType=content_type, Body=prompt
36 | # )
37 | # #print(response["Body"].read())
38 | # res = response["Body"].read().decode()
39 | # print (eval(res)[0]['generated_text'])
40 |
41 | # def invoke_inference_DJ(endpoint_name, prompt):
42 |
43 | # client = boto3.client("sagemaker-runtime")
44 |
45 | # content_type = "application/json"
46 | # response = client.invoke_endpoint(
47 | # EndpointName=endpoint_name,
48 | # ContentType=content_type,
49 | # Body=json.dumps(prompt)
50 | # )
51 |
52 | # res = response["Body"].read().decode()
53 | # return res
54 |
55 | # def query_endpoint_with_text_payload(plain_text, endpoint_name, content_type="text/plain"):
56 | # '''
57 | #     Used when content_type is text/plain
58 | # '''
59 | # client = boto3.client("runtime.sagemaker")
60 | # response = client.invoke_endpoint(
61 | # EndpointName=endpoint_name, ContentType=content_type, Body=plain_text
62 | # )
63 | # return response
64 |
65 |
66 | # def parse_response_text_model(query_response):
67 | # '''
68 | # content_type 이 text/plain 인 경우 사용
69 | #     Used when content_type is text/plain
70 |
71 | # model_predictions = json.loads(query_response["Body"].read())
72 | # # print("model_predictions: \n", model_predictions)
73 | # generated_text = model_predictions[0]["generated_text"]
74 | # return generated_text
75 |
76 |
77 | """
78 | A dedicated helper to manage templates and prompt building.
79 | """
80 |
81 | class Prompter(object):
82 | __slots__ = ("template", "_verbose")
83 |
84 | def __init__(self, template_name: str = "", verbose: bool = False):
85 | self._verbose = verbose
86 | if not template_name:
87 | # Enforce the default here, so the constructor can be called with '' and will not break.
88 | template_name = "alpaca"
89 | file_name = osp.join("../templates", f"{template_name}.json")
90 | if not osp.exists(file_name):
91 | raise ValueError(f"Can't read {file_name}")
92 | with open(file_name) as fp:
93 | self.template = json.load(fp)
94 | if self._verbose:
95 | print(
96 | f"Using prompt template {template_name}: {self.template['description']}"
97 | )
98 |
99 | def generate_prompt(
100 | self,
101 | instruction: str,
102 | input: Union[None, str] = None,
103 | label: Union[None, str] = None,
104 | ) -> str:
105 | # returns the full prompt from instruction and optional input
106 | # if a label (=response, =output) is provided, it's also appended.
107 | if input:
108 | res = self.template["prompt_input"].format(
109 | instruction=instruction, input=input
110 | )
111 | else:
112 | res = self.template["prompt_no_input"].format(
113 | instruction=instruction
114 | )
115 | if label:
116 | res = f"{res}{label}"
117 | if self._verbose:
118 | print(res)
119 | return res
120 |
121 | def get_response(self, output: str) -> str:
122 | return output.split(self.template["response_split"])[1].strip()
123 |
124 |
125 | def describe_endpoint(endpoint_name):
126 | '''
127 |     Check whether the endpoint exists; while it is still being created, wait.
128 | '''
129 | sm_client = boto3.client("sagemaker")
130 |
131 | while(True):
132 | response = sm_client.describe_endpoint(
133 | EndpointName= endpoint_name
134 | )
135 | status = response['EndpointStatus']
136 | if status == 'Creating':
137 | print("Endpoint is ", status)
138 | time.sleep(60)
139 | else:
140 | print("Endpoint is ", status)
141 | break
142 |
143 |
144 | class KoLLMSageMakerEndpoint(object):
145 | def __init__(self, endpoint_name):
146 | self.endpoint_name = endpoint_name
147 | self.prompter = Prompter("kullm")
148 | self.smr_client = boto3.client('sagemaker-runtime')
149 |
150 | def get_payload(self, instruction, input_text, params):
151 | prompt = self.prompter.generate_prompt(instruction, input_text)
152 | payload = {
153 | 'inputs': prompt,
154 | 'parameters': params
155 | }
156 | payload_str = json.dumps(payload)
157 | return payload_str.encode("utf-8")
158 |
159 | def infer(self, payload, content_type="application/json", verbose=True):
160 | response = self.smr_client.invoke_endpoint(
161 | EndpointName=self.endpoint_name,
162 | ContentType=content_type,
163 | Body=payload
164 | )
165 |
166 | res = json.loads(response['Body'].read().decode("utf-8"))
167 | generated_text = res[0]["generated_text"]
168 | #generated_text = self.prompter.get_response(generated_text)
169 |
170 | generated_text = generated_text.split('###')[0]
171 | if verbose:
172 | pprint.pprint(f'Response: {generated_text}')
173 | return generated_text
174 |
175 |
176 | ################################################
177 | # Embedding Handler
178 | ################################################
179 |
180 | # from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
181 | # from langchain.embeddings import SagemakerEndpointEmbeddings
182 | # from langchain.llms.sagemaker_endpoint import ContentHandlerBase
183 | # from typing import Any, Dict, List, Optional
184 |
185 | # class SagemakerEndpointEmbeddingsJumpStart(SagemakerEndpointEmbeddings):
186 | # def embed_documents(self, texts: List[str], chunk_size: int = 5) -> List[List[float]]:
187 | # """Compute doc embeddings using a SageMaker Inference Endpoint.
188 |
189 | # Args:
190 | # texts: The list of texts to embed.
191 | # chunk_size: The chunk size defines how many input texts will
192 | # be grouped together as request. If None, will use the
193 | # chunk size specified by the class.
194 |
195 | # Returns:
196 | # List of embeddings, one for each text.
197 | # """
198 | # results = []
199 | # _chunk_size = len(texts) if chunk_size > len(texts) else chunk_size
200 |
201 | # # print("text size: ", len(texts))
202 | # # print("_chunk_size: ", _chunk_size)
203 |
204 | # for i in range(0, len(texts), _chunk_size):
205 | # response = self._embedding_func(texts[i : i + _chunk_size])
206 | # print
207 | # results.extend(response)
208 | # return results
209 |
210 | # import numpy as np
211 |
212 | # class KoSimCSERobertaContentHandler(EmbeddingsContentHandler):
213 | # content_type = "application/json"
214 | # accepts = "application/json"
215 |
216 | # def transform_input(self, prompt: str, model_kwargs={}) -> bytes:
217 | # input_str = json.dumps({"inputs": prompt, **model_kwargs})
218 | # return input_str.encode("utf-8")
219 |
220 | # def transform_output(self, output: bytes) -> str:
221 | # response_json = json.loads(output.read().decode("utf-8"))
222 | # ndim = np.array(response_json).ndim
223 | # # print("response_json ndim: \n", ndim)
224 | # # print("response_json shape: \n", np.array(response_json).shape)
225 | # if ndim == 4:
226 | # # Original shape (1, 1, n, 768)
227 | # emb = response_json[0][0][0]
228 | # emb = np.expand_dims(emb, axis=0).tolist()
229 | # # print("emb shape: ", np.array(emb).shape)
230 | # # print("emb TYPE: ", type(emb))
231 | # elif ndim == 2:
232 | # # Original shape (n, 1)
233 | # # print(response_json[0])
234 | # emb = []
235 | # for ele in response_json:
236 | # # print(np.array(response_json[0]).shape)
237 | # e = ele[0][0]
238 | # #emb = np.expand_dims(emb, axis=0).tolist()
239 | # # print("emb shape: ", np.array(emb).shape)
240 | # # print("emb TYPE: ", type(emb))
241 | # emb.append(e)
242 | # # print("emb_list shape: ", np.array(emb).shape)
243 | # # print("emb_list TYPE: ", type(emb))
244 | # else:
245 | # print(f"Other # of dimension: {ndim}")
246 | # emb = None
247 | # return emb
248 |
249 |
250 | # ################################################
251 | # # LLM Handler
252 | # ################################################
253 | # from langchain.llms.sagemaker_endpoint import LLMContentHandler
254 | # import json
255 |
256 | # class KoAlpacaContentHandler(LLMContentHandler):
257 | # content_type = "application/json"
258 | # accepts = "application/json"
259 |
260 | # def transform_input(self, prompt: str, model_kwargs={}) -> bytes:
261 | # input_str = json.dumps({"text_inputs": prompt, **model_kwargs})
262 | # return input_str.encode("utf-8")
263 |
264 | # def transform_output(self, output: bytes) -> str:
265 | # print("In KoAlpacaContentHandler")
266 | # # print("output: ", output)
267 | # response_json = json.loads(output.read().decode("utf-8"))
268 | # print("response_json: ", response_json)
269 | # # return response_json["generated_texts"][0]
270 | # doc = response_json[0]['generated_text']
271 | # doc = json.loads(doc)
272 | # doc = doc['text_inputs']
273 | # return doc
--------------------------------------------------------------------------------
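Putting the pieces of this module together, a typical call sequence from a notebook might look like the sketch below. The endpoint name and generation parameters are illustrative placeholders; the parameters actually accepted depend on the serving container deployed in the earlier notebooks:

from utils.inference_lib import describe_endpoint, KoLLMSageMakerEndpoint

endpoint_name = "kullm-polyglot-5-8b-v2-endpoint"   # hypothetical endpoint name
describe_endpoint(endpoint_name)                     # polls every 60s while the endpoint is "Creating"

ep = KoLLMSageMakerEndpoint(endpoint_name)           # loads ../templates/kullm.json via Prompter
params = {                                           # illustrative generation parameters
    "do_sample": False,
    "max_new_tokens": 128,
    "temperature": 0.4,
}
payload = ep.get_payload(
    instruction="Answer the customer's question politely.",        # hypothetical instruction
    input_text="What documents do I need to open an account?",     # hypothetical input
    params=params,
)
answer = ep.infer(payload)   # prints and returns the text before the first '###' marker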