├── .github
│   └── workflows
│       └── scorecard.yml
├── .gitignore
├── Chinese_Version
│   ├── README.md
│   ├── ch_1_Introduction
│   │   ├── .keep
│   │   └── README.md
│   ├── ch_2_Environment_Setup
│   │   ├── .keep
│   │   └── README.md
│   ├── ch_3_AppDev_Basic
│   │   ├── .keep
│   │   ├── 3_BasicApp.ipynb
│   │   └── README.md
│   ├── ch_4_Chinese_Support
│   │   ├── .keep
│   │   ├── 4_1_ChatGLM2-6B.ipynb
│   │   ├── 4_2_Baichuan-13B.ipynb
│   │   └── README.md
│   ├── ch_5_AppDev_Intermediate
│   │   ├── .keep
│   │   ├── 5_1_ChatBot.ipynb
│   │   ├── 5_2_Speech_Recognition.ipynb
│   │   └── README.md
│   ├── ch_6_GPU_Acceleration
│   │   ├── .keep
│   │   ├── 6_1_GPU_Llama2-7B.md
│   │   └── README.md
│   ├── ch_7_Finetune
│   │   ├── 7_1_Finetune_Llama2-7B.md
│   │   └── README.md
│   └── ch_8_AppDev_Advanced
│       ├── .keep
│       ├── 8_LangChain_Integrations.ipynb
│       └── README.md
├── LICENSE
├── README.md
├── SECURITY.md
├── ch_1_Introduction
│   ├── .keep
│   └── README.md
├── ch_2_Environment_Setup
│   └── README.md
├── ch_3_AppDev_Basic
│   ├── .keep
│   ├── 3_Baichuan2_BasicApp.ipynb
│   ├── 3_OpenLlamaBasicApp.ipynb
│   └── README.md
├── ch_4_Chinese_Support
│   ├── .keep
│   ├── 4_1_ChatGLM2-6B.ipynb
│   ├── 4_2_Baichuan-13B.ipynb
│   └── README.md
├── ch_5_AppDev_Intermediate
│   ├── .keep
│   ├── 5_1_ChatBot.ipynb
│   ├── 5_2_Speech_Recognition.ipynb
│   └── README.md
├── ch_6_GPU_Acceleration
│   ├── 6_1_GPU_Llama2-7B.md
│   ├── 6_2_GPU_Baichuan2-7B.md
│   ├── 6_3_GPU_Whisper-medium.md
│   ├── README.md
│   └── environment_setup.md
├── ch_7_Finetune
│   ├── 7_1_Finetune_Llama2-7B.md
│   └── README.md
└── ch_8_AppDev_Advanced
    ├── .keep
    ├── 8_LangChain_Integrations.ipynb
    └── README.md
/.github/workflows/scorecard.yml:
--------------------------------------------------------------------------------
1 | # This workflow uses actions that are not certified by GitHub. They are provided
2 | # by a third-party and are governed by separate terms of service, privacy
3 | # policy, and support documentation.
4 |
5 | name: Scorecard supply-chain security
6 | on:
7 | # For Branch-Protection check. Only the default branch is supported. See
8 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
9 | branch_protection_rule:
10 | # To guarantee Maintained check is occasionally updated. See
11 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
12 | # schedule:
13 | # - cron: '26 2 * * *'
14 | # push:
15 | # branches: [ "main" ]
16 | workflow_dispatch:
17 | # pull_request:
18 |
19 | # Declare default permissions as read only.
20 | permissions: read-all
21 |
22 | jobs:
23 | analysis:
24 | name: Scorecard analysis
25 | runs-on: ubuntu-latest
26 | permissions:
27 | # Needed to upload the results to code-scanning dashboard.
28 | security-events: write
29 | # Needed to publish results and get a badge (see publish_results below).
30 | id-token: write
31 | # Uncomment the permissions below if installing in a private repository.
32 | # contents: read
33 | # actions: read
34 |
35 | steps:
36 | - name: "Checkout code"
37 | uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0
38 | with:
39 | persist-credentials: false
40 |
41 | - name: "Run analysis"
42 | uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1
43 | with:
44 | results_file: results.sarif
45 | results_format: sarif
46 | # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
47 | # - you want to enable the Branch-Protection check on a *public* repository, or
48 | # - you are installing Scorecard on a *private* repository
49 | # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat.
50 | # repo_token: ${{ secrets.SCORECARD_TOKEN }}
51 |
52 | # Public repositories:
53 | # - Publish results to OpenSSF REST API for easy access by consumers
54 | # - Allows the repository to include the Scorecard badge.
55 | # - See https://github.com/ossf/scorecard-action#publishing-results.
56 | # For private repositories:
57 | # - `publish_results` will always be set to `false`, regardless
58 | # of the value entered here.
59 | publish_results: true
60 |
61 | # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
62 | # format to the repository Actions tab.
63 | - name: "Upload artifact"
64 | uses: actions/upload-artifact@3cea5372237819ed00197afe530f5a7ea3e805c8 # v3.1.0
65 | with:
66 | name: SARIF file
67 | path: results.sarif
68 | retention-days: 5
69 |
70 | # Upload the results to GitHub's code scanning dashboard.
71 | - name: "Upload to code-scanning"
72 | uses: github/codeql-action/upload-sarif@17573ee1cc1b9d061760f3a006fc4aac4f944fd5 # v2.2.4
73 | with:
74 | sarif_file: results.sarif
75 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/Chinese_Version/README.md:
--------------------------------------------------------------------------------
 1 | # IPEX-LLM Tutorial
 2 |
 3 | English | 中文
 4 |
 5 | [_IPEX-LLM_](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm) is a lightweight large language model acceleration library for Intel XPU (including CPU and GPU). This repository contains a number of tutorials about IPEX-LLM that help you understand what IPEX-LLM is and how to use it to develop applications based on large language models.
 6 |
 7 | The tutorials are organized as follows:
 8 |
 9 | - [Chapter 1 **`Introduction`**](./ch_1_Introduction) introduces what IPEX-LLM is and what you can do with it.
10 | - [Chapter 2 **`Environment Setup`**](./ch_2_Environment_Setup) provides a set of best practices for setting up your environment.
11 | - [Chapter 3 **`Application Development: Basics`**](./ch_3_AppDev_Basic) introduces the basic usage of IPEX-LLM and how to build a very simple chat application.
12 | - [Chapter 4 **`Chinese Support`**](./ch_4_Chinese_Support) shows the usage of some LLMs that support Chinese input/output, e.g. ChatGLM2 and Baichuan.
13 | - [Chapter 5 **`Application Development: Intermediate`**](./ch_5_AppDev_Intermediate) introduces intermediate-level knowledge of application development using IPEX-LLM, e.g. how to build a more sophisticated chatbot and how to do speech recognition.
14 | - [Chapter 6 **`GPU Acceleration`**](./ch_6_GPU_Acceleration) introduces how to use IPEX-LLM to accelerate LLM applications on Intel GPUs.
15 | - [Chapter 7 **`Finetune`**](./ch_7_Finetune) introduces how to do finetuning with IPEX-LLM.
16 | - [Chapter 8 **`Application Development: Advanced`**](./ch_8_AppDev_Advanced) introduces advanced-level knowledge of application development using IPEX-LLM, e.g. using LangChain.
17 |
--------------------------------------------------------------------------------
/Chinese_Version/ch_1_Introduction/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/ipex-llm-tutorial/dcf1a80af8ddab03e03c48864432946c718abf35/Chinese_Version/ch_1_Introduction/.keep
--------------------------------------------------------------------------------
/Chinese_Version/ch_1_Introduction/README.md:
--------------------------------------------------------------------------------
 1 | # Chapter 1 Introduction
 2 |
 3 | ## What is IPEX-LLM
 4 | [IPEX-LLM](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm) is a lightweight large language model acceleration library for Intel XPU (including CPU and GPU), featuring broad model support, very low latency and a small memory footprint on Intel platforms. IPEX-LLM is an open-source project released under the Apache 2.0 license.
 5 |
 6 | ## What can you do with IPEX-LLM
 7 | You can use IPEX-LLM to run any PyTorch model (e.g. [HuggingFace transformers](https://huggingface.co/docs/transformers/index) models). While running, IPEX-LLM automatically accelerates the LLM using low-bit optimizations, modern hardware acceleration and a series of software optimizations.
 8 |
 9 | Using IPEX-LLM is very simple. With just a one-line code change, you can observe significant speedups immediately[^1].
10 |
11 | ### Example: optimize and accelerate a LLaMA model with one line of `optimize_model`
12 | ```python
13 | # Load the LLaMA model as usual
14 | from ipex_llm import optimize_model
15 |
16 | from transformers import LlamaForCausalLM, LlamaTokenizer
17 | model = LlamaForCausalLM.from_pretrained(model_path,...)
18 |
19 | # Apply IPEX-LLM low-precision optimization; INT4 is used by default
20 | model = optimize_model(model)
21 |
22 | # The subsequent model inference code needs no changes
23 | ...
24 | ```
25 |
26 | IPEX-LLM provides a variety of low-bit optimization options (e.g. INT3/NF3/INT4/NF4/INT5/INT8), as sketched below, and lets you run LLMs on many kinds of Intel platforms: entry-level laptops (CPU only), high-end PCs with Intel Arc discrete GPUs, Xeon servers, or data-center GPUs (e.g. Flex, Max).
27 |
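A hedged sketch of choosing a different precision (this assumes `optimize_model` accepts a `low_bit` argument, as described in the IPEX-LLM API documentation; please verify the exact parameter name and supported values there):

```python
from ipex_llm import optimize_model

# assumption: select 4-bit NormalFloat (NF4) instead of the default INT4
model = optimize_model(model, low_bit="nf4")
```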
28 | The demos below show the experience of running 6B and 13B models on a laptop with 16GB of memory, using only the CPU.
29 |
30 | #### Running a 6B model on an Intel 12th Gen Core PC (real-time screen capture):
31 |
32 |
33 |
34 |
35 |
36 |
37 | #### Running a 13B model on an Intel 12th Gen Core PC (real-time screen capture):
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 | ## What's Next
47 |
48 | The following chapters of this tutorial explain in detail how to use IPEX-LLM to build LLM applications, e.g. the transformers API, the langchain API, multilingual support, etc. Each chapter provides runnable Jupyter notebooks using popular open-source models. You can read on to learn more, and you can also run the provided code on your own laptop.
49 |
50 | You can also visit our [GitHub repo](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm) for more information and the latest news.
51 |
52 | We have verified many models on IPEX-LLM and provided ready-to-run examples, such as [Llama2](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2), [ChatGLM2](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2), [Baichuan](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan), [InternLM](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm), [Qwen](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen), [Falcon](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon), [MPT](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt), [Dolly-v2](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2), [StarCoder](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder), [Whisper](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper) and more. You can find the model examples [here](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model).
53 |
54 |
55 | [^1]: Performance varies by use, configuration and other factors. `ipex-llm` may not optimize to the same degree for non-Intel products. Learn more at www.Intel.com/PerformanceIndex.
56 |
57 |
--------------------------------------------------------------------------------
/Chinese_Version/ch_2_Environment_Setup/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/ipex-llm-tutorial/dcf1a80af8ddab03e03c48864432946c718abf35/Chinese_Version/ch_2_Environment_Setup/.keep
--------------------------------------------------------------------------------
/Chinese_Version/ch_2_Environment_Setup/README.md:
--------------------------------------------------------------------------------
  1 | # Chapter 2 Environment Setup
  2 |
  3 | This chapter introduces a set of best practices for environment setup. To make sure the Jupyter notebooks used in later chapters run smoothly, it is strongly recommended that you configure your environment properly by following the steps below.
  4 |
  5 | ## 2.1 System Recommendation
  6 | First, choose a suitable system. Here is a list of recommended hardware and operating systems:
  7 |
  8 | >⚠️**Hardware**
  9 |
 10 | - Intel® PC with at least 16GB of memory
 11 | - Server equipped with Intel® Xeon® processors and at least 32GB of memory
 12 |
 13 | >⚠️**Operating System**
 14 |
 15 | - Ubuntu 20.04 or later
 16 | - CentOS 7 or later
 17 | - Windows 10/11, with or without WSL
 18 |
 19 | ## 2.2 Set up the Python Environment
 20 |
 21 | Next, use a Python environment management tool ([Conda](https://docs.conda.io/projects/conda/en/stable/) is recommended) to create a Python environment and install the necessary libraries.
 22 |
 23 |
 24 | ### 2.2.1 Install Conda
 25 | Please follow the instructions below corresponding to your operating system.
 26 |
 27 | #### 2.2.1.1 Linux
 28 |
 29 | For Linux users, open a terminal and run the following commands.
 30 |
 31 | ```bash
 32 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
 33 | bash ./Miniconda3-latest-Linux-x86_64.sh
 34 | conda init
 35 | ```
 36 | >**Note**
 37 | > Please follow the instructions shown in the terminal until conda initialization completes successfully.
 38 |
 39 |
 40 | #### 2.2.1.2 Windows
 41 |
 42 | For Windows users, download the conda installer [here](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links) and run it.
 43 |
 44 | After installation, open "Anaconda Powershell Prompt (Miniconda3)" to carry out the following steps.
 45 |
 46 | #### 2.2.1.3 Windows Subsystem for Linux (WSL):
 47 |
 48 | For WSL users, make sure WSL2 is installed. If it is not, refer to [here](https://bigdl.readthedocs.io/en/latest/doc/UserGuide/win.html#install-wsl2l) for how to install it.
 49 |
 50 | Open a WSL2 shell and run the same commands as in [2.2.1.1 Linux](#2211-linux).
 51 |
 52 |
 53 |
 54 | ### 2.2.2 Create an Environment
 55 | > **Note**
 56 | > Python 3.9 is recommended for running IPEX-LLM.
 57 |
 58 | Create a Python 3.9 environment with a name of your choice, e.g. `llm-tutorial`:
 59 | ```
 60 | conda create -n llm-tutorial python=3.9
 61 | ```
 62 | Then activate the environment `llm-tutorial`:
 63 | ```
 64 | conda activate llm-tutorial
 65 | ```
 66 |
 67 | ## 2.3 Install IPEX-LLM
 68 |
 69 | The one-line command below installs the latest version of `ipex-llm` together with all dependencies commonly required for LLM application development.
 70 | ```
 71 | pip install --pre --upgrade ipex-llm[all]
 72 | ```
 73 |
 74 | ## 2.4 Install the Jupyter Service
 75 |
 76 | ### 2.4.1 Install Jupyter
 77 | The `jupyter` library is required to run the notebooks (i.e. `.ipynb` files) provided with this tutorial. In the activated Python 3.9 environment, run:
 78 | ```
 79 | pip install jupyter
 80 | ```
 81 |
 82 | ### 2.4.2 Start the Jupyter Service
 83 | The recommended command to start the jupyter service differs slightly between PCs and servers.
 84 |
 85 | #### 2.4.2.1 On a PC
 86 | On a PC, just run the following command in the shell:
 87 | ```
 88 | jupyter notebook
 89 | ```
 90 |
 91 | #### 2.4.2.2 On a Server
 92 | On a server, it is recommended to use all the physical cores of a single socket for better performance, so run the following commands instead:
 93 | ```bash
 94 | # e.g. for a server with 48 cores per socket
 95 | export OMP_NUM_THREADS=48
 96 | numactl -C 0-47 -m 0 jupyter notebook
 97 | ```
 98 |
 99 | Congratulations! You can now open the jupyter service URL in your browser and run the notebooks provided with this tutorial.
100 |
101 |
102 | ## 2.5 Things You May Want to Know About Using LLMs
103 |
104 | If you are new to LLMs and LLM application development, this section may contain some things you would like to know.
105 |
106 | ### 2.5.1 Where to Find the Models
107 |
108 | First of all, you need to obtain a model. There are many open-source LLMs available in the community. If you do not have a specific target in mind, consider picking a model that ranks high on public LLM leaderboards. These leaderboards evaluate and compare the capabilities of many LLMs using a variety of benchmarks. Some well-known leaderboards include:
109 |
110 | - [Open LLM LeaderBoard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) maintained by Huggingface
111 | - [Chatbot Arena Leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) maintained by LMSYS
112 |
113 | Most of these leaderboards include reference links for the listed models. If a model is open source, you can easily download it from the provided link and give it a try.
114 |
115 |
116 | ### 2.5.2 Download Models from Huggingface
117 |
118 | As of now, many popular LLMs are hosted on Huggingface. An example homepage of a model hosted on Huggingface looks like the following.
119 |
120 |
121 | To download models from Huggingface, you can use git or the API provided by Huggingface. Refer to [Downloading models from Huggingface](https://huggingface.co/docs/hub/models-downloading) for details on how to download models; a minimal programmatic sketch is also given at the end of this section.
122 |
123 | Models downloaded from Huggingface can normally be loaded with the [Huggingface Transformers library](https://huggingface.co/docs/transformers/index). IPEX-LLM provides APIs that work easily with such models. Please read the subsequent chapters of this tutorial for more information.
124 |
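For instance, a minimal sketch of downloading a model programmatically with the `huggingface_hub` package (the repo id below is only an example; any model id found on a leaderboard works the same way):

```python
from huggingface_hub import snapshot_download

# download the model repository (or reuse the local cache) and return its local path
model_path = snapshot_download(repo_id="openlm-research/open_llama_3b_v2")
print(model_path)  # this local path can later be passed to from_pretrained
```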
--------------------------------------------------------------------------------
/Chinese_Version/ch_3_AppDev_Basic/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/ipex-llm-tutorial/dcf1a80af8ddab03e03c48864432946c718abf35/Chinese_Version/ch_3_AppDev_Basic/.keep
--------------------------------------------------------------------------------
/Chinese_Version/ch_3_AppDev_Basic/3_BasicApp.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
  8 |     "# Notebook 3: Application Development Basics\n",
  9 |     "\n",
 10 |     "This notebook introduces the basic usage of `ipex-llm` and walks you through building a minimal chat application step by step.\n",
 11 |     "\n",
 12 |     "## 3.1 Install `ipex-llm`\n",
 13 |     "\n",
 14 |     "If you have not installed ipex-llm yet, install it as shown below. This single command installs the latest version of `ipex-llm` along with all dependencies commonly required for LLM application development."
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "!pip install --pre --upgrade ipex-llm[all]"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
 30 |     "## 3.2 Load a Pretrained Model\n",
 31 |     "\n",
 32 |     "Before using an LLM, you first need to load a model. Here we use a relatively small LLM as the example, [open_llama_3b_v2](https://huggingface.co/openlm-research/open_llama_3b_v2).\n",
 33 |     "\n",
 34 |     "> **Note**\n",
 35 |     "> * `open_llama_3b_v2` is an open-source large language model based on the LLaMA architecture. You can find more information on its [model page](https://huggingface.co/openlm-research/open_llama_3b_v2) hosted on Huggingface.\n",
 36 |     "\n",
 37 |     "### 3.2.1 Load and Optimize the Model\n",
 38 |     "\n",
 39 |     "In general, a single line of `optimize_model` is all you need to optimize any loaded PyTorch model, regardless of the library or API it was loaded with. For detailed usage of `optimize_model`, please refer to the [API documentation](https://ipex-llm.readthedocs.io/en/latest/doc/PythonAPI/LLM/optimize.html).\n",
 40 |     "\n",
 41 |     "In addition, many popular open-source PyTorch LLMs can be loaded with the `Huggingface Transformers API` (e.g. [AutoModel](https://huggingface.co/docs/transformers/v4.33.2/en/model_doc/auto#transformers.AutoModel), [AutoModelForCausalLM](https://huggingface.co/docs/transformers/v4.33.2/en/model_doc/auto#transformers.AutoModelForCausalLM), etc.). For such models, `ipex-llm` also provides a matching set of APIs. We demonstrate their usage next.\n",
 42 |     "\n",
 43 |     "In this example, we use `ipex_llm.transformers.AutoModelForCausalLM` to load `open_llama_3b_v2`. Compared with the official `transformers.AutoModelForCausalLM`, this API only adds a few low-bit optimization related parameters and methods; everything else is used in exactly the same way.\n",
 44 |     "\n",
 45 |     "To apply INT4 optimization, simply specify `load_in_4bit=True` in `from_pretrained`. In addition, as a rule of thumb we set `torch_dtype=\"auto\"` and `low_cpu_mem_usage=True` by default, which benefits performance and memory usage."
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 3,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "from ipex_llm.transformers import AutoModelForCausalLM\n",
55 | "\n",
56 | "model_path = 'openlm-research/open_llama_3b_v2'\n",
57 | "\n",
58 | "model = AutoModelForCausalLM.from_pretrained(model_path,\n",
59 | " load_in_4bit=True)"
60 | ]
61 | },
62 | {
63 | "attachments": {},
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
 67 |     "> **Note**\n",
 68 |     "> * If you want to use a precision other than INT4 (e.g. INT3/INT5/INT8), or would like to learn more about the detailed API parameters, please refer to the [API documentation](https://ipex-llm.readthedocs.io/en/latest/doc/PythonAPI/LLM/transformers.html).\n",
 69 |     "\n",
 70 |     "> * `openlm-research/open_llama_3b_v2` is the model_id of the `open_llama_3b_v2` model hosted on huggingface. If the `model_path` argument of `from_pretrained` is set to a model_id, `from_pretrained` will by default download the model from huggingface, cache it locally (e.g. under `~/.cache/huggingface`) and load it. The download may take a while; you can also download the model yourself and set `model_path` to the local path. For the usage of `from_pretrained`, please refer to [here](https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained).\n",
 71 |     "\n",
 72 |     "\n",
 73 |     "### 3.2.2 Save and Load the Optimized Model\n",
 74 |     "\n",
 75 |     "In the previous section, the original model loaded via the `Huggingface transformers` API is normally stored in fp32 or fp16 precision. To save model storage space and speed up subsequent loading, `ipex-llm` also provides a `save_low_bit` API to save the model after low-bit optimization, and a `load_low_bit` API to load a previously saved optimized model.\n",
 76 |     "\n",
 77 |     "Since `load_low_bit` does not need to read the original model and also skips the optimization step, a common practice is to run `save_low_bit` once, then deploy the model on different platforms and load it with `load_low_bit` for repeated inference. This approach saves memory and speeds up loading. Moreover, because the optimized model format is platform independent, you can save and load it seamlessly across machines running different operating systems. This flexibility lets you optimize and save the model on a server with more memory, then deploy it for inference on an entry-level PC with limited memory.\n",
 78 |     "\n",
 79 |     "**Save the optimized model**\n",
 80 |     "\n",
 81 |     "For example, you can save the optimized model with the `save_low_bit` function as follows:"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 4,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "save_directory = './open-llama-3b-v2-ipex-llm-INT4'\n",
91 | "\n",
92 | "model.save_low_bit(save_directory)\n",
93 | "del(model)"
94 | ]
95 | },
96 | {
97 | "attachments": {},
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 |     "**Load the optimized model**\n",
102 |     "\n",
103 |     "You can load the optimized model with the `load_low_bit` function as follows:"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "# note that the AutoModelForCausalLM here is imported from ipex_llm.transformers\n",
113 | "model = AutoModelForCausalLM.load_low_bit(save_directory)"
114 | ]
115 | },
116 | {
117 | "attachments": {},
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 |     "## 3.3 Build a Minimal Chat Application\n",
122 |     "\n",
123 |     "Now that the model is loaded, we can start building our first chat application. The following uses the `Huggingface transformers` inference API to do so.\n",
124 |     "\n",
125 |     "\n",
126 |     "> **Note**\n",
127 |     "> \n",
128 |     "> The code in this section is implemented entirely with the `Huggingface transformers` API. `ipex-llm` requires no changes to the inference code, so you can use any library you like to build your application at the inference stage.\n",
129 |     "\n",
130 |     "\n",
131 |     "> **Note**\n",
132 |     "> \n",
133 |     "> We use a Q&A dialogue-style prompt template so that the model answers the question better.\n",
134 |     "\n",
135 |     "\n",
136 |     "> **Note**\n",
137 |     "> \n",
138 |     "> When calling `generate`, you can set the `max_new_tokens` parameter to cap the number of tokens to predict.\n"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "from transformers import LlamaTokenizer\n",
148 | "\n",
149 | "tokenizer = LlamaTokenizer.from_pretrained(model_path)"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 7,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "name": "stdout",
159 | "output_type": "stream",
160 | "text": [
161 | "Inference time: xxxx s\n",
162 | "-------------------- Output --------------------\n",
163 | "Q: What is CPU?\n",
164 | "A: CPU stands for Central Processing Unit. It is the brain of the computer.\n",
165 | "Q: What is RAM?\n",
166 | "A: RAM stands for Random Access Memory.\n"
167 | ]
168 | }
169 | ],
170 | "source": [
171 | "import torch\n",
172 | "\n",
173 | "with torch.inference_mode():\n",
174 | " prompt = 'Q: What is CPU?\\nA:'\n",
175 | " \n",
176 | " # tokenize the input prompt from string to token ids\n",
177 | " input_ids = tokenizer.encode(prompt, return_tensors=\"pt\")\n",
178 | " # predict the next tokens (maximum 32) based on the input token ids\n",
179 | " output = model.generate(input_ids, max_new_tokens=32)\n",
180 | " # decode the predicted token ids to output string\n",
181 | " output_str = tokenizer.decode(output[0], skip_special_tokens=True)\n",
182 | "\n",
183 | " print('-'*20, 'Output', '-'*20)\n",
184 | " print(output_str)"
185 | ]
186 | }
187 | ],
188 | "metadata": {
189 | "kernelspec": {
190 | "display_name": "Python 3 (ipykernel)",
191 | "language": "python",
192 | "name": "python3"
193 | },
194 | "language_info": {
195 | "codemirror_mode": {
196 | "name": "ipython",
197 | "version": 3
198 | },
199 | "file_extension": ".py",
200 | "mimetype": "text/x-python",
201 | "name": "python",
202 | "nbconvert_exporter": "python",
203 | "pygments_lexer": "ipython3",
204 | "version": "3.9.17"
205 | }
206 | },
207 | "nbformat": 4,
208 | "nbformat_minor": 4
209 | }
210 |
--------------------------------------------------------------------------------
/Chinese_Version/ch_3_AppDev_Basic/README.md:
--------------------------------------------------------------------------------
 1 | # Chapter 3 Application Development: Basics
 2 |
 3 | This chapter helps you get started with IPEX-LLM quickly and guides you through building your first LLM application.
 4 |
 5 | The notebook accompanying this chapter, [3_BasicApp.ipynb](./3_BasicApp.ipynb), introduces some basic usage of IPEX-LLM and walks you through building a basic chat application.
 6 |
 7 | ## What's Next
 8 |
 9 | In the upcoming [Chapter 4: Chinese Support](../ch_4_Chinese_Support/), you will learn how to develop applications using models that support Chinese. In [Chapter 5: Application Development: Intermediate](../ch_5_AppDev_Intermediate/), you will learn some techniques for building more advanced chatbot applications, as well as how to build a speech recognition application.
10 |
--------------------------------------------------------------------------------
/Chinese_Version/ch_4_Chinese_Support/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/ipex-llm-tutorial/dcf1a80af8ddab03e03c48864432946c718abf35/Chinese_Version/ch_4_Chinese_Support/.keep
--------------------------------------------------------------------------------
/Chinese_Version/ch_4_Chinese_Support/4_1_ChatGLM2-6B.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
  7 |     "# Notebook 4.1: ChatGLM2-6B\n",
  8 |     "\n",
  9 |     "## 4.1.1 Overview\n",
 10 |     "This example shows how to run Chinese inference with [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B) on a low-cost PC (without a discrete GPU) using the [IPEX-LLM](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm) API. ChatGLM2-6B is the second-generation version of [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), the open-source bilingual (Chinese-English) chat model released by [THUDM](https://github.com/THUDM). ChatGLM2-6B is also available among the [Huggingface models](https://huggingface.co/models) at this [link](https://huggingface.co/THUDM/chatglm2-6b).\n",
 11 |     "\n",
 12 |     "Before running inference, you may need to set up your environment following the instructions in [Chapter 2](../ch_2_Environment_Setup/README.md)."
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
 19 |     "## 4.1.2 Installation\n",
 20 |     "\n",
 21 |     "First of all, install IPEX-LLM in your prepared environment. For best practices of environment setup, refer to [Chapter 2](../ch_2_Environment_Setup/README.md) of this tutorial."
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "!pip install --pre --upgrade ipex-llm[all]"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
 37 |     "The all option is used to install the additional packages required by IPEX-LLM."
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
 44 |     "## 4.1.3 Load the Model and Tokenizer\n",
 45 |     "\n",
 46 |     "### 4.1.3.1 Load the Model\n",
 47 |     "\n",
 48 |     "Use the IPEX-LLM API to load the ChatGLM2 model with low-precision (INT4) optimization to reduce resource cost. This converts the relevant layers of the model to INT4 format.\n",
 49 |     "\n",
 50 |     "> **Note**\n",
 51 |     ">\n",
 52 |     "> IPEX-LLM supports `AutoModel`, `AutoModelForCausalLM`, `AutoModelForSpeechSeq2Seq` and `AutoModelForSeq2SeqLM`. These Auto-prefixed classes help users load the appropriate model automatically, so here we can simply load it with `AutoModel`.\n",
 53 |     "\n",
 54 |     "> **Note**\n",
 55 |     ">\n",
 56 |     "> You can set the `model_path` argument to either a Huggingface repo id or a local model path."
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "from ipex_llm.transformers import AutoModel\n",
66 | "\n",
67 | "model_path = \"THUDM/chatglm2-6b\"\n",
68 | "model = AutoModel.from_pretrained(model_path,\n",
69 | " load_in_4bit=True,\n",
70 | " trust_remote_code=True)"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
 77 |     "### 4.1.3.2 Load the Tokenizer\n",
 78 |     "\n",
 79 |     "A tokenizer is also needed for LLM inference. It encodes the input text into tensors that are fed to the LLM, and decodes the tensors produced by the LLM back into text. You can load the tokenizer directly with the [Huggingface transformers](https://huggingface.co/docs/transformers/index) API; it works seamlessly with models loaded by IPEX-LLM."
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 3,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "from transformers import AutoTokenizer\n",
89 | "\n",
90 | "tokenizer = AutoTokenizer.from_pretrained(model_path,\n",
91 | " trust_remote_code=True)"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
 98 |     "## 4.1.4 Inference\n",
 99 |     "\n",
100 |     "### 4.1.4.1 Create the Prompt Template\n",
101 |     "\n",
102 |     "Before generation, you need to create a prompt template. Here we give an example question-and-answer prompt template, adapted from the [ChatGLM2-6B prompt template](https://huggingface.co/THUDM/chatglm2-6b/blob/main/modeling_chatglm.py#L1007). You can also adjust the prompt for your own model."
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 4,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "CHATGLM_V2_PROMPT_TEMPLATE = \"问:{prompt}\\n\\n答:\""
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 |     "### 4.1.4.2 Generate\n",
119 |     "\n",
120 |     "Next, you can generate output with the loaded model and tokenizer.\n",
121 |     "\n",
122 |     "> **Note**\n",
123 |     "> \n",
124 |     "> The `max_new_tokens` parameter of the `generate` function defines the maximum number of tokens to predict."
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 7,
130 | "metadata": {},
131 | "outputs": [
132 | {
133 | "name": "stdout",
134 | "output_type": "stream",
135 | "text": [
136 | "-------------------- Output --------------------\n",
137 | "问:AI是什么?\n",
138 | "\n",
139 | "答: AI指的是人工智能,是一种能够通过学习和推理来执行任务的计算机程序。它可以模仿人类的思维方式,做出类似人类的决策,并且具有自主学习、自我\n"
140 | ]
141 | }
142 | ],
143 | "source": [
144 | "import time\n",
145 | "import torch\n",
146 | "\n",
147 | "prompt = \"AI是什么?\"\n",
148 | "n_predict = 32\n",
149 | "\n",
150 | "with torch.inference_mode():\n",
151 | " prompt = CHATGLM_V2_PROMPT_TEMPLATE.format(prompt=prompt)\n",
152 | " input_ids = tokenizer.encode(prompt, return_tensors=\"pt\")\n",
153 | " output = model.generate(input_ids,\n",
154 | " max_new_tokens=n_predict)\n",
155 | " output_str = tokenizer.decode(output[0], skip_special_tokens=True)\n",
156 | " print('-'*20, 'Output', '-'*20)\n",
157 | " print(output_str)"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {},
163 | "source": [
164 |     "### 4.1.4.3 Streaming Chat\n",
165 |     "\n",
166 |     "ChatGLM2-6B supports the streaming output function `stream_chat`, which lets the model provide a streaming response word by word. Other models may not provide a similar API; if you want to implement a general streaming output feature, please refer to [Chapter 5.1](../ch_5_AppDev_Intermediate/5_1_ChatBot.ipynb).\n",
167 |     "\n",
168 |     "> **Note**\n",
169 |     ">\n",
170 |     "> To observe the text stream on standard output properly, we need to set the environment variable `PYTHONUNBUFFERED=1` to make sure the standard output stream is sent directly to the terminal without being buffered first."
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 6,
176 | "metadata": {},
177 | "outputs": [
178 | {
179 | "name": "stdout",
180 | "output_type": "stream",
181 | "text": [
182 | "-------------------- Stream Chat Output --------------------\n",
183 | "AI指的是人工智能,是一种能够通过学习和理解数据,以及应用适当的算法和数学模型,来执行与人类智能相似的任务的计算机程序。AI可以包括机器学习、自然语言处理、计算机视觉、专家系统、强化学习等不同类型的技术。\n",
184 | "\n",
185 | "AI的应用领域广泛,例如自然语言处理可用于语音识别、机器翻译、情感分析等;计算机视觉可用于人脸识别、图像识别、自动驾驶等;机器学习可用于预测、分类、聚类等数据分析任务。\n",
186 | "\n",
187 | "AI是一种非常有前途的技术,已经在许多领域产生了积极的影响,并随着技术的不断进步,将继续为我们的生活和工作带来更多的便利和改变。"
188 | ]
189 | }
190 | ],
191 | "source": [
192 | "import torch\n",
193 | "\n",
194 | "with torch.inference_mode():\n",
195 | " question = \"AI 是什么?\"\n",
196 | " response_ = \"\"\n",
197 | " print('-'*20, 'Stream Chat Output', '-'*20)\n",
198 | " for response, history in model.stream_chat(tokenizer, question, history=[]):\n",
199 | " print(response.replace(response_, \"\"), end=\"\")\n",
200 | " response_ = response"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {},
206 | "source": [
207 |     "## 4.1.5 Using IPEX-LLM in LangChain\n",
208 |     "\n",
209 |     "[LangChain](https://python.langchain.com/docs/get_started/introduction.html) is a widely used framework for developing applications powered by language models. This section describes how to integrate IPEX-LLM with LangChain. You can follow these [instructions](https://python.langchain.com/docs/get_started/installation) to set up the environment for LangChain."
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 |     "If needed, install LangChain as follows, or refer to [Chapter 8](../ch_8_AppDev_Advanced/README.md) for more information about the LangChain integration:"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "!pip install -U langchain==0.0.248"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 |     "> **Note**\n",
233 |     "> \n",
234 |     "> We recommend using `langchain==0.0.248`, which works without issues in this tutorial."
235 | ]
236 | },
237 | {
238 | "cell_type": "markdown",
239 | "metadata": {},
240 | "source": [
241 |     "### 4.1.5.1 Create the Prompt Template\n",
242 |     "\n",
243 |     "Before inference, you need to create a prompt template. Here we give an example question-and-answer prompt template with two input variables, `history` and `human_input`. You can also adjust the prompt template for your own model."
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 2,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "CHATGLM_V2_LANGCHAIN_PROMPT_TEMPLATE = \"\"\"{history}\\n\\n问:{human_input}\\n\\n答:\"\"\""
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {},
258 | "source": [
259 |     "### 4.1.5.2 Prepare the Chain"
260 | ]
261 | },
262 | {
263 | "cell_type": "markdown",
264 | "metadata": {},
265 | "source": [
266 |     "Use the [LangChain API](https://api.python.langchain.com/en/latest/api_reference.html) `LLMChain` to construct a chain for inference. Here we use the IPEX-LLM API to construct an `LLM` object, which automatically loads the model with low-precision optimization.\n",
267 |     "\n",
268 |     "> **Note**\n",
269 |     ">\n",
270 |     "> `ConversationBufferWindowMemory` is a memory type in LangChain that keeps a sliding window of the most recent `k` interactions of the conversation."
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": [
279 | "from langchain import LLMChain, PromptTemplate\n",
280 | "from ipex_llm.langchain.llms import TransformersLLM\n",
281 | "from langchain.memory import ConversationBufferWindowMemory\n",
282 | "\n",
283 |     "llm_model_path = \"THUDM/chatglm2-6b\" # the path or repo id of the huggingface llm model\n",
284 | "\n",
285 | "prompt = PromptTemplate(input_variables=[\"history\", \"human_input\"], template=CHATGLM_V2_LANGCHAIN_PROMPT_TEMPLATE)\n",
286 | "max_new_tokens = 128\n",
287 | "\n",
288 | "llm = TransformersLLM.from_model_id(\n",
289 | " model_id=llm_model_path,\n",
290 | " model_kwargs={\"trust_remote_code\": True},\n",
291 | ")\n",
292 | "\n",
293 |     "# the following code is the same as in the previous use case\n",
294 | "llm_chain = LLMChain(\n",
295 | " llm=llm,\n",
296 | " prompt=prompt,\n",
297 | " verbose=True,\n",
298 | " llm_kwargs={\"max_new_tokens\":max_new_tokens},\n",
299 | " memory=ConversationBufferWindowMemory(k=2),\n",
300 | ")\n"
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {},
306 | "source": [
307 |     "### 4.1.5.3 Generate"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 15,
313 | "metadata": {},
314 | "outputs": [
315 | {
316 | "name": "stdout",
317 | "output_type": "stream",
318 | "text": [
319 | "\n",
320 | "\n",
321 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
322 | "Prompt after formatting:\n",
323 | "\u001b[32;1m\u001b[1;3m\n",
324 | "\n",
325 | "问:AI 是什么?\n",
326 | "\n",
327 | "答:\u001b[0m\n",
328 | "AI指的是人工智能,是一种能够通过学习和理解数据,以及应用数学、逻辑、推理等知识,来实现与人类智能相似或超越人类智能的计算机系统。AI可以分为弱人工智能和强人工智能。弱人工智能是指一种只能完成特定任务的AI系统,比如语音识别或图像识别等;而强人工智能则是一种具有与人类智能相同或超越人类智能的AI系统,可以像人类一样思考、学习和理解世界。目前,AI的应用领域已经涵盖了诸如自然语言处理、计算机视觉、机器学习、深度学习、自动驾驶、医疗健康等多个领域。\n",
329 | "\n",
330 | "\u001b[1m> Finished chain.\u001b[0m\n",
331 | "\n"
332 | ]
333 | }
334 | ],
335 | "source": [
336 | "text = \"AI 是什么?\"\n",
337 | "response_text = llm_chain.run(human_input=text,stop=\"\\n\\n\")\n"
338 | ]
339 | }
340 | ],
341 | "metadata": {
342 | "kernelspec": {
343 | "display_name": "cn-eval",
344 | "language": "python",
345 | "name": "python3"
346 | },
347 | "language_info": {
348 | "codemirror_mode": {
349 | "name": "ipython",
350 | "version": 3
351 | },
352 | "file_extension": ".py",
353 | "mimetype": "text/x-python",
354 | "name": "python",
355 | "nbconvert_exporter": "python",
356 | "pygments_lexer": "ipython3",
357 | "version": "3.9.18"
358 | },
359 | "orig_nbformat": 4
360 | },
361 | "nbformat": 4,
362 | "nbformat_minor": 2
363 | }
364 |
--------------------------------------------------------------------------------
/Chinese_Version/ch_4_Chinese_Support/4_2_Baichuan-13B.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Notebook 4.2: Baichuan-13B"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
 14 |     "## 4.2.1 Overview\n",
 15 |     "\n",
 16 |     "This notebook shows how to run Chinese inference with [Baichuan-13B](https://github.com/baichuan-inc/Baichuan-13B) on a low-cost PC (without a discrete GPU) using the [IPEX-LLM](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm) API. Baichuan-13B is an open-source, commercially usable large language model developed by Baichuan Intelligent Technology after [Baichuan-7B](https://github.com/baichuan-inc/baichuan-7B). Baichuan-13B is also available among the [Huggingface models](https://huggingface.co/models) at this [link](https://huggingface.co/baichuan-inc/Baichuan-13B-Chat)."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
 23 |     "## 4.2.2 Installation\n",
 24 |     "\n",
 25 |     "First of all, install IPEX-LLM in your prepared environment. For best practices of environment setup, refer to [Chapter 2](../ch_2_Environment_Setup/README.md) of this tutorial."
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "!pip install --pre --upgrade ipex-llm[all]\n",
35 | "\n",
 36 |     "# additional package required for Baichuan-13B-Chat to conduct generation\n",
37 | "!pip install -U transformers_stream_generator"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
 44 |     "The all option is used to install the additional packages required by IPEX-LLM."
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
 51 |     "## 4.2.3 Load the Model and Tokenizer\n",
 52 |     "\n",
 53 |     "### 4.2.3.1 Load the Model\n",
 54 |     "\n",
 55 |     "Use the IPEX-LLM API to load the Baichuan model with low-precision (INT4) optimization to reduce resource cost. This converts the relevant layers of the model to INT4 format.\n",
 56 |     "\n",
 57 |     "\n",
 58 |     "> **Note**\n",
 59 |     ">\n",
 60 |     "> You can set the `model_path` argument to either a Huggingface repo id or a local model path."
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "from ipex_llm.transformers import AutoModelForCausalLM\n",
70 | "\n",
71 | "model_path = \"baichuan-inc/Baichuan-13B-Chat\"\n",
72 | "model = AutoModelForCausalLM.from_pretrained(model_path,\n",
73 | " load_in_4bit=True,\n",
74 | " trust_remote_code=True)"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
 81 |     "### 4.2.3.2 Load the Tokenizer\n",
 82 |     "\n",
 83 |     "A tokenizer is also needed for LLM inference. It encodes the input text into tensors that are fed to the LLM, and decodes the tensors produced by the LLM back into text. You can load the tokenizer directly with the [Huggingface transformers](https://huggingface.co/docs/transformers/index) API; it works seamlessly with models loaded by IPEX-LLM."
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 3,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "from transformers import AutoTokenizer\n",
93 | "tokenizer = AutoTokenizer.from_pretrained(model_path,\n",
94 | " trust_remote_code=True)"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 |     "## 4.2.4 Inference"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 |     "### 4.2.4.1 Create the Prompt Template\n",
109 |     "\n",
110 |     "Before generation, you need to create a prompt template. Here we give an example question-and-answer prompt template. You can also adjust the prompt for your own model."
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 4,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "BAICHUAN_PROMPT_FORMAT = \"{prompt} \""
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "metadata": {},
125 | "source": [
126 |     "### 4.2.4.2 Generate\n",
127 |     "\n",
128 |     "Next, you can generate output with the loaded model and tokenizer.\n",
129 |     "\n",
130 |     "> **Note**\n",
131 |     ">\n",
132 |     "> The `max_new_tokens` parameter of the `generate` function defines the maximum number of tokens to predict."
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 5,
138 | "metadata": {},
139 | "outputs": [
140 | {
141 | "name": "stdout",
142 | "output_type": "stream",
143 | "text": [
144 | "-------------------- Output --------------------\n",
145 | "AI是什么? \n",
146 | "AI是人工智能(Artificial Intelligence)的缩写,它是指让计算机或其他设备模拟人类智能的技术。AI可以执行各种任务,如语音识别\n"
147 | ]
148 | }
149 | ],
150 | "source": [
151 | "import time\n",
152 | "import torch\n",
153 | "\n",
154 | "prompt = \"AI是什么?\"\n",
155 | "n_predict = 32\n",
156 | "with torch.inference_mode():\n",
157 | " prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=prompt)\n",
158 | " input_ids = tokenizer.encode(prompt, return_tensors=\"pt\")\n",
159 |     "    # Note: if your selected model can use previous key/value attentions\n",
160 |     "    # to speed up decoding, but has `\"use_cache\": false` in its model config,\n",
161 |     "    # you must set `use_cache=True` explicitly in the `generate` function\n",
162 |     "    # to obtain optimal performance with IPEX-LLM INT4 optimization.\n",
163 | " output = model.generate(input_ids,\n",
164 | " max_new_tokens=n_predict)\n",
165 | " output_str = tokenizer.decode(output[0], skip_special_tokens=True)\n",
166 | " print('-'*20, 'Output', '-'*20)\n",
167 | " print(output_str)"
168 | ]
169 | }
170 | ],
171 | "metadata": {
172 | "kernelspec": {
173 | "display_name": "llm-zcg",
174 | "language": "python",
175 | "name": "python3"
176 | },
177 | "language_info": {
178 | "codemirror_mode": {
179 | "name": "ipython",
180 | "version": 3
181 | },
182 | "file_extension": ".py",
183 | "mimetype": "text/x-python",
184 | "name": "python",
185 | "nbconvert_exporter": "python",
186 | "pygments_lexer": "ipython3",
187 | "version": "3.9.18"
188 | },
189 | "orig_nbformat": 4
190 | },
191 | "nbformat": 4,
192 | "nbformat_minor": 2
193 | }
194 |
--------------------------------------------------------------------------------
/Chinese_Version/ch_4_Chinese_Support/README.md:
--------------------------------------------------------------------------------
 1 | # Chapter 4 Chinese Support
 2 |
 3 | This chapter explores the ability of large language models to handle multiple languages. Given the wide range of use cases and real-world applications of these models, supporting multiple languages is essential for them.
 4 |
 5 | Many popular models support multiple languages, for example [ChatGPT](https://openai.com/blog/chatgpt), [ChatGLM](https://chatglm.cn/blog), [Baichuan](https://huggingface.co/baichuan-inc/Baichuan-13B-Chat), etc.
 6 |
 7 |
 8 | We provide two notebooks that demonstrate the usage of two popular multilingual models, illustrated with their Chinese conversation capabilities.
 9 |
10 | + [ChatGLM2-6B](4_1_ChatGLM2-6B.ipynb)
11 | + [Baichuan-13B](4_2_Baichuan-13B.ipynb)
12 |
--------------------------------------------------------------------------------
/Chinese_Version/ch_5_AppDev_Intermediate/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/ipex-llm-tutorial/dcf1a80af8ddab03e03c48864432946c718abf35/Chinese_Version/ch_5_AppDev_Intermediate/.keep
--------------------------------------------------------------------------------
/Chinese_Version/ch_5_AppDev_Intermediate/5_2_Speech_Recognition.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
  8 |     "# Notebook 5.2 Speech Recognition\n",
  9 |     "\n",
 10 |     "Speech recognition, also known as automatic speech recognition (ASR), is a technology that converts spoken language into written form or carries out specific actions based on spoken commands. It involves machine learning models that analyze speech patterns, phonetics and language structure to transcribe and understand human speech accurately.\n",
 11 |     "\n",
 12 |     "[Whisper](https://openai.com/research/whisper) is a popular open-source model released by OpenAI for ASR and speech translation. It can transcribe speech in many languages and translate those languages into English.\n",
 13 |     "\n",
 14 |     "Since Whisper is built on a Transformer-based encoder-decoder architecture, it can be optimized effectively with IPEX-LLM INT4 optimizations. In this tutorial, we will guide you through building a speech recognition application on top of an IPEX-LLM optimized Whisper model that transcribes/translates audio files into text.\n",
 15 |     "\n",
 16 |     "## 5.2.1 Install Packages\n",
 17 |     "\n",
 18 |     "If you have not set up your environment yet, please follow the instructions in [Chapter 2](../ch_2_Environment_Setup/README.md) first. Then install ipex-llm:"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "!pip install ipex-llm[all]"
28 | ]
29 | },
30 | {
31 | "attachments": {},
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
 35 |     "Since audio files need to be processed, you also need to install the `librosa` package for audio analysis."
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "!pip install -U librosa"
45 | ]
46 | },
47 | {
48 | "attachments": {},
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
 52 |     "## 5.2.2 Download Audio Files\n",
 53 |     "\n",
 54 |     "First, let's prepare some audio files. As an example, you can download audio files from the multilingual [common_voice](https://huggingface.co/datasets/common_voice/viewer/en/train) dataset. Here we randomly pick one English audio file and one Chinese audio file; you can choose different audio files as you prefer."
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {
61 | "vscode": {
62 | "languageId": "plaintext"
63 | }
64 | },
65 | "outputs": [],
66 | "source": [
67 | "!wget -O audio_en.mp3 https://datasets-server.huggingface.co/assets/common_voice/--/en/train/5/audio/audio.mp3\n",
68 | "!wget -O audio_zh.mp3 https://datasets-server.huggingface.co/assets/common_voice/--/zh-CN/train/2/audio/audio.mp3"
69 | ]
70 | },
71 | {
72 | "attachments": {},
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
 76 |     "You can play the downloaded audio files:"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "import IPython\n",
86 | "\n",
87 | "IPython.display.display(IPython.display.Audio(\"audio_en.mp3\"))\n",
88 | "IPython.display.display(IPython.display.Audio(\"audio_zh.mp3\"))"
89 | ]
90 | },
91 | {
92 | "attachments": {},
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
 96 |     "## 5.2.3 Load the Pretrained Whisper Model\n",
 97 |     "\n",
 98 |     "Now let's load a pretrained Whisper model, e.g. [whisper-medium](https://huggingface.co/openai/whisper-medium). OpenAI has released pretrained Whisper models of various sizes (including [whisper-small](https://huggingface.co/openai/whisper-small), [whisper-tiny](https://huggingface.co/openai/whisper-tiny), etc.), so you can choose the one that best fits your requirements.\n",
 99 |     "\n",
100 |     "You only need one line of the `transformers`-style API in `ipex-llm` to load `whisper-medium` with INT4 optimization (by specifying `load_in_4bit=True`), as shown below. Note that for Whisper we use the `AutoModelForSpeechSeq2Seq` class."
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "from ipex_llm.transformers import AutoModelForSpeechSeq2Seq\n",
110 | "\n",
111 | "model = AutoModelForSpeechSeq2Seq.from_pretrained(pretrained_model_name_or_path=\"openai/whisper-medium\",\n",
112 | " load_in_4bit=True)"
113 | ]
114 | },
115 | {
116 | "attachments": {},
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 |     "## 5.2.4 Load the Whisper Processor\n",
121 |     "\n",
122 |     "A Whisper processor is needed both for audio pre-processing and for post-processing the model outputs from tokens to text. You can simply load the `WhisperProcessor` with the official `transformers` API:"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "from transformers import WhisperProcessor\n",
132 | "\n",
133 | "processor = WhisperProcessor.from_pretrained(pretrained_model_name_or_path=\"openai/whisper-medium\")"
134 | ]
135 | },
136 | {
137 | "attachments": {},
138 | "cell_type": "markdown",
139 | "metadata": {},
140 | "source": [
141 |     "## 5.2.5 Transcribe English Audio\n",
142 |     "\n",
143 |     "Once the Whisper model is optimized with IPEX-LLM INT4 optimization and the Whisper processor is loaded, you can start transcribing audio through model inference.\n",
144 |     "\n",
145 |     "Let's start with the English audio file `audio_en.mp3`. Before feeding it to the Whisper processor, we need to extract the sequence data from the raw speech waveform:"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "import librosa\n",
155 | "\n",
156 | "data_en, sample_rate_en = librosa.load(\"audio_en.mp3\", sr=16000)"
157 | ]
158 | },
159 | {
160 | "attachments": {},
161 | "cell_type": "markdown",
162 | "metadata": {},
163 | "source": [
164 |     "> **Note**\n",
165 |     ">\n",
166 |     "> For `whisper-medium`, its `WhisperFeatureExtractor` (part of the `WhisperProcessor`) extracts features from audio using a 16,000 Hz sampling rate by default. It is important to load the audio file at the sampling rate expected by the model's `WhisperFeatureExtractor` for accurate recognition.\n",
167 |     "\n",
168 |     "We can then transcribe the audio file based on the sequence data, in exactly the same way as with the official `transformers` API:"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 6,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "name": "stdout",
178 | "output_type": "stream",
179 | "text": [
180 | "Inference time: xxxx s\n",
181 | "-------------------- English Transcription --------------------\n",
182 | "[' Book me a reservation for mid-day at French Camp Academy.']\n"
183 | ]
184 | }
185 | ],
186 | "source": [
187 | "import torch\n",
188 | "import time\n",
189 | "\n",
190 |     "# define the task type\n",
191 | "forced_decoder_ids = processor.get_decoder_prompt_ids(language=\"english\", task=\"transcribe\")\n",
192 | "\n",
193 | "with torch.inference_mode():\n",
194 |     "    # extract input features for the Whisper model\n",
195 | " input_features = processor(data_en, sampling_rate=sample_rate_en, return_tensors=\"pt\").input_features\n",
196 | "\n",
197 |     "    # predict token ids for transcription\n",
198 | " st = time.time()\n",
199 | " predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)\n",
200 | " end = time.time()\n",
201 | "\n",
202 |     "    # decode the predicted token ids into text\n",
203 | " transcribe_str = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
204 | "\n",
205 | " print(f'Inference time: {end-st} s')\n",
206 | " print('-'*20, 'English Transcription', '-'*20)\n",
207 | " print(transcribe_str)"
208 | ]
209 | },
210 | {
211 | "attachments": {},
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 |     "> **Note**\n",
216 |     ">\n",
217 |     "> `forced_decoder_ids` defines the context tokens for different languages and tasks (transcribe or translate). If it is set to `None`, Whisper will predict them automatically.\n",
218 |     "\n",
219 |     "\n",
220 |     "## 5.2.6 Transcribe Chinese Audio and Translate to English\n",
221 |     "\n",
222 |     "Now let's turn to the Chinese audio file `audio_zh.mp3`. Whisper can transcribe multilingual audio and translate it into English. The only difference here is to define specific context tokens through `forced_decoder_ids`:"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 7,
228 | "metadata": {},
229 | "outputs": [
230 | {
231 | "name": "stdout",
232 | "output_type": "stream",
233 | "text": [
234 | "Inference time: xxxx s\n",
235 | "-------------------- Chinese Transcription --------------------\n",
236 | "['制作时将各原料研磨']\n",
237 | "Inference time: xxxx s\n",
238 | "-------------------- Chinese to English Translation --------------------\n",
239 | "[' When making the dough, grind the ingredients.']\n"
240 | ]
241 | }
242 | ],
243 | "source": [
244 |     "# extract sequence data\n",
245 | "data_zh, sample_rate_zh = librosa.load(\"audio_zh.mp3\", sr=16000)\n",
246 | "\n",
247 |     "# define the Chinese transcription task\n",
248 | "forced_decoder_ids = processor.get_decoder_prompt_ids(language=\"chinese\", task=\"transcribe\")\n",
249 | "\n",
250 | "with torch.inference_mode():\n",
251 | " input_features = processor(data_zh, sampling_rate=sample_rate_zh, return_tensors=\"pt\").input_features\n",
252 | " st = time.time()\n",
253 | " predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)\n",
254 | " end = time.time()\n",
255 | " transcribe_str = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
256 | "\n",
257 | " print(f'Inference time: {end-st} s')\n",
258 | " print('-'*20, 'Chinese Transcription', '-'*20)\n",
259 | " print(transcribe_str)\n",
260 | "\n",
261 |     "# define the Chinese transcription and translation task\n",
262 | "forced_decoder_ids = processor.get_decoder_prompt_ids(language=\"chinese\", task=\"translate\")\n",
263 | "\n",
264 | "with torch.inference_mode():\n",
265 | " input_features = processor(data_zh, sampling_rate=sample_rate_zh, return_tensors=\"pt\").input_features\n",
266 | " st = time.time()\n",
267 | " predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)\n",
268 | " end = time.time()\n",
269 | " translate_str = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
270 | "\n",
271 | " print(f'Inference time: {end-st} s')\n",
272 | " print('-'*20, 'Chinese to English Translation', '-'*20)\n",
273 | " print(translate_str)"
274 | ]
275 | },
276 | {
277 | "attachments": {},
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 |     "## 5.2.7 What's Next\n",
282 |     "\n",
283 |     "In the upcoming chapter, we will explore how to use IPEX-LLM together with langchain, a framework designed for developing applications powered by language models. With the langchain integration, application development becomes much simpler."
284 | ]
285 | }
286 | ],
287 | "metadata": {
288 | "kernelspec": {
289 | "display_name": "Python 3 (ipykernel)",
290 | "language": "python",
291 | "name": "python3"
292 | },
293 | "language_info": {
294 | "codemirror_mode": {
295 | "name": "ipython",
296 | "version": 3
297 | },
298 | "file_extension": ".py",
299 | "mimetype": "text/x-python",
300 | "name": "python",
301 | "nbconvert_exporter": "python",
302 | "pygments_lexer": "ipython3",
303 | "version": "3.9.17"
304 | }
305 | },
306 | "nbformat": 4,
307 | "nbformat_minor": 4
308 | }
309 |
--------------------------------------------------------------------------------
/Chinese_Version/ch_5_AppDev_Intermediate/README.md:
--------------------------------------------------------------------------------
 1 | # Chapter 5 Application Development: Intermediate
 2 |
 3 | You can use IPEX-LLM to load any Hugging Face *transformers* model and accelerate it on your laptop. With IPEX-LLM, PyTorch models (in FP16/BF16/FP32 format) hosted on Hugging Face can be loaded and optimized automatically with low-bit quantization (supported precisions include INT4/INT5/INT8).
 4 |
 5 | This chapter takes a deeper look at the IPEX-LLM `transformers`-style API, which is used to load and optimize Huggingface *transformers* models. You will learn about the API usage and common practices, and use these APIs to create real-world applications; a minimal sketch of the API is included at the end of this page.
 6 |
 7 | This chapter contains two notebooks.
 8 |
 9 | In [5.1 ChatBot](./5_1_ChatBot.ipynb), you will first learn how to use the `transformers`-style API in different scenarios (e.g. save/load, precision choices, etc.), and then go on to build a chatbot application with streaming display and multi-turn chat capabilities.
10 |
11 | In [5.2 Speech Recognition](./5_2_Speech_Recognition.ipynb), you will learn how to use IPEX-LLM to load [Whisper](https://openai.com/research/whisper), a Transformer-based speech recognition model, and then use it to transcribe and translate audio files.
12 |
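As a quick taste of the `transformers`-style API covered in the notebooks above, here is a minimal sketch (the model id and save directory are illustrative placeholders taken from Chapter 3) that loads a model with implicit INT4 quantization and saves the low-bit copy for fast reloading:

```python
from ipex_llm.transformers import AutoModelForCausalLM

# load a Hugging Face transformers model with INT4 optimization applied at load time
model = AutoModelForCausalLM.from_pretrained("openlm-research/open_llama_3b_v2",
                                             load_in_4bit=True)

# save the optimized model so it can later be reloaded quickly with load_low_bit
model.save_low_bit("./open-llama-3b-v2-ipex-llm-INT4")
```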
--------------------------------------------------------------------------------
/Chinese_Version/ch_6_GPU_Acceleration/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/ipex-llm-tutorial/dcf1a80af8ddab03e03c48864432946c718abf35/Chinese_Version/ch_6_GPU_Acceleration/.keep
--------------------------------------------------------------------------------
/Chinese_Version/ch_6_GPU_Acceleration/6_1_GPU_Llama2-7B.md:
--------------------------------------------------------------------------------
1 | # 6.1 在英特尔 GPU 上运行 Llama 2 (7B)
2 |
3 | 您可以使用 IPEX-LLM 加载任何 Hugging Face *transformers* 模型,以便在英特尔 GPU 上加速。有了 IPEX-LLM,Hugging Face 上托管的 PyTorch 模型(FP16/BF16/FP32)可以在英特尔 GPU 上以低位量化(支持的精度包括 INT4/NF4/INT5/INT8)的方式自动加载和优化。
4 |
5 | 在本教程中,您将学习如何在英特尔 GPU 上运行经过 IPEX-LLM 优化的 LLM,并在此基础上构建一个流式对话机器人。本教程以一个流行的开源 LLM [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)为例。
6 |
7 | ## 6.1.1 在英特尔 GPU 上安装 IPEX-LLM
8 |
9 | 首先,在准备好的环境中安装 IPEX-LLM。有关英特尔 GPU 环境设置的最佳做法,请参阅本章的 [README](./README.md#70-environment-setup)。
10 |
11 | 在终端中运行:
12 |
13 | ```bash
14 | pip install --pre --upgrade ipex-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
15 | ```
16 |
17 | > **注意**
18 | > 如果您使用了旧版本的`ipex-llm`(早于`2.5.0b20240104`版本),您需要在代码开头手动导入`import intel_extension_for_pytorch as ipex`。
19 |
20 | 完成安装后,您需要为英特尔 GPU 配置 oneAPI 环境变量。
21 |
22 | ```bash
23 | # 配置 oneAPI 环境变量
24 | source /opt/intel/oneapi/setvars.sh
25 | ```
26 |
27 | 安装以及环境配置完成后,让我们进入本教程的 **Python 脚本**。
28 |
29 | ## 6.1.2 (可选) 下载 Llama 2 (7B)
30 |
31 | 要从 Hugging Face 下载 [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) 模型,您需要获得 Meta 授予的访问权限。请按照 [此处](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main) 提供的说明申请模型的访问权限。
32 |
33 | 获得访问权限后,用您的 Hugging Face token 下载模型:
34 |
35 | ```python
36 | from huggingface_hub import snapshot_download
37 |
38 | model_path = snapshot_download(repo_id='meta-llama/Llama-2-7b-chat-hf',
39 | token='hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX') # 将此处改为您自己的 Hugging Face access token
40 | ```
41 |
42 | > **注意**
43 | > 模型将会默认被下载到 `HF_HOME='~/.cache/huggingface'`.
44 |
45 | ## 6.1.3 以低精度加载模型
46 |
47 | 一个常见的用例是以低精度加载 Hugging Face *transformers* 模型,即在加载时进行**隐式**量化。
48 |
49 | 对于 Llama 2 (7B),您可以简单地导入 `ipex_llm.transformers.AutoModelForCausalLM` 而不是 `transformers.AutoModelForCausalLM`,并在 `from_pretrained` 函数中相应地指定 `load_in_4bit=True` 或 `load_in_low_bit` 参数。
50 |
51 | 对于英特尔 GPU,您应在 `from_pretrained` 函数中**特别设置 `optimize_model=False`** 。**一旦获得低精度模型,请将其设置为 `to('xpu')`**。
52 |
53 | **用于 INT4 优化(通过使用 `load_in_4bit=True`):**
54 |
55 | ```python
56 | from ipex_llm.transformers import AutoModelForCausalLM
57 |
58 | model_in_4bit = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="meta-llama/Llama-2-7b-chat-hf",
59 | load_in_4bit=True,
60 | optimize_model=False)
61 | model_in_4bit_gpu = model_in_4bit.to('xpu')
62 | ```
63 |
64 | > **Note**
65 | > IPEX-LLM supports `AutoModel`, `AutoModelForCausalLM`, `AutoModelForSpeechSeq2Seq` and `AutoModelForSeq2SeqLM`.
66 | >
67 | > If you have already downloaded the Llama 2 (7B) model through the optional step [6.1.2](#612-optional-download-llama-2-7b), you could set `pretrained_model_name_or_path` to the local model path.
68 |
69 | **(Optional) For INT8 optimization (with `load_in_low_bit="sym_int8"`):**
70 |
71 | ```python
72 | # note that the AutoModelForCausalLM here is imported from ipex_llm.transformers
73 | model_in_8bit = AutoModelForCausalLM.from_pretrained(
74 | pretrained_model_name_or_path="meta-llama/Llama-2-7b-chat-hf",
75 | load_in_low_bit="sym_int8",
76 | optimize_model=False
77 | )
78 | model_in_8bit_gpu = model_in_8bit.to('xpu')
79 | ```
80 |
81 | > **Note**
82 | > * Currently, IPEX-LLM on Intel GPUs supports the options `'sym_int4'`, `'asym_int4'`, `'sym_int5'`, `'asym_int5'` and `'sym_int8'`, in which 'sym' and 'asym' differentiate symmetric from asymmetric quantization. The option `'nf4'`, i.e. 4-bit NormalFloat, is also supported.
83 | >
84 | > * `load_in_4bit=True` is equivalent to `load_in_low_bit='sym_int4'`.
85 |
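Similarly, if you would like to try the 4-bit NormalFloat option mentioned in the note above, a minimal sketch could look like the following; it only swaps the `load_in_low_bit` value, while the other arguments follow the INT8 example above:

```python
# note that the AutoModelForCausalLM here is imported from ipex_llm.transformers
model_in_nf4 = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="meta-llama/Llama-2-7b-chat-hf",
    load_in_low_bit="nf4",
    optimize_model=False
)
model_in_nf4_gpu = model_in_nf4.to('xpu')
```
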
86 | ## 6.1.4 Load Tokenizer
87 |
88 | A tokenizer is also required for LLM inference. You can use the [Huggingface transformers](https://huggingface.co/docs/transformers/index) API to load the tokenizer directly; it works seamlessly with models loaded by IPEX-LLM. For Llama 2, the corresponding tokenizer class is `LlamaTokenizer`.
89 |
90 | ```python
91 | from transformers import LlamaTokenizer
92 |
93 | tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_name_or_path="meta-llama/Llama-2-7b-chat-hf")
94 | ```
95 |
96 | > **Note**
97 | > If you have already downloaded the Llama 2 (7B) model through the optional step [6.1.2](#612-optional-download-llama-2-7b), you could set `pretrained_model_name_or_path` to the local model path.
98 |
99 | ## 6.1.5 Run the Model
100 |
101 | You can run model inference with IPEX-LLM optimizations on Intel GPUs in almost the same way as with the official `transformers` API. **The only difference is the additional `to('xpu')` call on the token ids**. Here we create a Q&A dialog template for the model to complete.
102 |
103 | ```python
104 | import torch
105 |
106 | with torch.inference_mode():
107 | prompt = 'Q: What is CPU?\nA:'
108 |
109 |     # convert the input prompt from string to token ids
110 |     # and call .to('xpu') to run inference on the Intel GPU
111 | input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
112 |
113 |     # predict the next tokens (maximum 32) based on the input token ids
114 | output = model_in_4bit_gpu.generate(input_ids,
115 | max_new_tokens=32)
116 |
117 |     # decode the predicted token ids into the output string
118 | output = output.cpu()
119 | output_str = tokenizer.decode(output[0], skip_special_tokens=True)
120 |
121 | print('-'*20, 'Output', '-'*20)
122 | print(output_str)
123 | ```
124 |
125 | > **Note**
126 | > The first generations of an optimized LLM on Intel GPUs could be slow. It is therefore recommended to perform some **warm-up** runs before the actual generation.
127 | >
128 | > For the streaming chat in the next section (6.1.6), the generation in this section can be regarded as a warm-up.
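
For reference, a minimal warm-up sketch (the prompt text and token budget below are arbitrary) could simply run one short, throw-away generation before any latency-sensitive call:

```python
with torch.inference_mode():
    # a short dummy generation to warm up the kernels on the Intel GPU; the output is discarded
    warmup_ids = tokenizer.encode('Hello', return_tensors="pt").to('xpu')
    _ = model_in_4bit_gpu.generate(warmup_ids, max_new_tokens=8)
```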
129 |
130 | ## 6.1.6 Streaming Chat
131 |
132 | Now, let's build a streaming chatbot running on Intel GPUs that lets the LLM engage in interactive conversations. There is no magic behind the chatbot interaction: it still relies on the LLM to predict and generate the next tokens. To make the LLM conversational, we need to format the prompt into a conversation format, for example:
133 |
134 |
135 | ```
136 | [INST] <<SYS>>
137 | You are a helpful, respectful and honest assistant, who always answers as helpfully as possible, while being safe.
138 | <</SYS>>
139 |
140 | What is AI? [/INST]
141 | ```
142 |
143 | In addition, to enable multi-turn chat, you need to append the new dialog input to the previous conversation to create a new prompt for the model, for example:
144 |
145 | ```
146 | [INST] <<SYS>>
147 | You are a helpful, respectful and honest assistant, who always answers as helpfully as possible, while being safe.
148 | <</SYS>>
149 |
150 | What is AI? [/INST] AI is a term used to describe the development of computer systems that can perform tasks that typically require human intelligence, such as understanding natural language, recognizing images. [INST] Is it dangerous? [/INST]
151 | ```
152 |
153 | Here we show an example of multi-turn chat with streaming display, running on the IPEX-LLM-optimized Llama 2 (7B) model.
154 |
155 | First, define the conversation context format[^1] for the model to complete:
156 |
157 | ```python
158 | SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant, who always answers as helpfully as possible, while being safe."
159 |
160 | def format_prompt(input_str, chat_history):
161 |     prompt = [f'[INST] <<SYS>>\n{SYSTEM_PROMPT}\n<</SYS>>\n\n']
162 | do_strip = False
163 | for history_input, history_response in chat_history:
164 | history_input = history_input.strip() if do_strip else history_input
165 | do_strip = True
166 | prompt.append(f'{history_input} [/INST] {history_response.strip()} [INST] ')
167 | input_str = input_str.strip() if do_strip else input_str
168 | prompt.append(f'{input_str} [/INST]')
169 | return ''.join(prompt)
170 | ```
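
To see the resulting conversation context, a quick illustrative call (the history content below is made up) could be:

```python
example_history = [("What is AI?", "AI refers to computer systems that can perform tasks that typically require human intelligence.")]
print(format_prompt("Is it dangerous?", example_history))
```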
171 |
172 | [^1]: The conversation context format is referenced from [here](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/323df5680706d388eff048fba2f9c9493dfc0152/model.py#L20) and [here](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/323df5680706d388eff048fba2f9c9493dfc0152/app.py#L9).
173 |
174 | Next, define the `stream_chat` function, which continuously appends the model output to the chat history. This ensures that the conversation context is formatted correctly for generating the next response. Here, the response is produced word by word:
175 |
176 | ```python
177 | from transformers import TextIteratorStreamer
178 |
179 | def stream_chat(model, tokenizer, input_str, chat_history):
180 |     # format the conversation context as a prompt using the chat history
181 | prompt = format_prompt(input_str, chat_history)
182 |     input_ids = tokenizer([prompt], return_tensors='pt').to('xpu') # specify to('xpu') for Intel GPU
183 |
184 | streamer = TextIteratorStreamer(tokenizer,
185 |                                     skip_prompt=True, # skip the prompt in the generated tokens
186 | skip_special_tokens=True)
187 |
188 | generate_kwargs = dict(
189 | input_ids,
190 | streamer=streamer,
191 | max_new_tokens=128
192 | )
193 |
194 |     # generation should run in a separate thread to ensure non-blocking access to the generated text
195 | from threading import Thread
196 |
197 | thread = Thread(target=model.generate, kwargs=generate_kwargs)
198 | thread.start()
199 |
200 | output_str = []
201 | print("Response: ", end="")
202 | for stream_output in streamer:
203 | output_str.append(stream_output)
204 | print(stream_output, end="")
205 |
206 |     # append the model output to the chat history
207 | chat_history.append((input_str, ''.join(output_str)))
208 | ```
209 |
210 | > **Note**
211 | > To successfully observe the text streaming behavior in standard output, we need to set the environment variable `PYTHONUNBUFFERED=1` to make sure the stdout stream is sent directly to the terminal instead of being buffered first.
212 | >
213 | > The [Hugging Face *transformers* streamer classes](https://huggingface.co/docs/transformers/main/generation_strategies#streaming) are currently under development and may change in the future.
214 |
215 | Then, we can enable an interactive, multi-turn streaming chat between a human and the bot by allowing continuous user input:
216 |
217 | ```python
218 | chat_history = []
219 |
220 | print('-'*20, 'Stream Chat', '-'*20, end="")
221 | while True:
222 | with torch.inference_mode():
223 | print("\n", end="")
224 | user_input = input("Input: ")
225 |         if user_input == "stop": # stop the conversation when the user inputs "stop"
226 | print("Stream Chat with Llama 2 (7B) stopped.")
227 | break
228 | stream_chat(model=model_in_4bit_gpu,
229 | tokenizer=tokenizer,
230 | input_str=user_input,
231 | chat_history=chat_history)
232 | ```
--------------------------------------------------------------------------------
/Chinese_Version/ch_6_GPU_Acceleration/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 6 GPU Acceleration
2 |
3 | Apart from its significant acceleration capability on Intel CPUs, IPEX-LLM also supports optimizing and accelerating LLMs (large language models) on Intel GPUs.
4 |
5 | With low-bit techniques, modern hardware acceleration and the latest software optimizations, IPEX-LLM can optimize any [*HuggingFace transformers*](https://huggingface.co/docs/transformers/index) model on Intel GPUs.
6 |
7 | #### A 6B model running on an Intel Arc GPU (real-time screen capture):
8 |
9 |
10 |
11 |
12 |
13 |
14 | #### A 13B model running on an Intel Arc GPU (real-time screen capture):
15 |
16 |
17 |
18 |
19 |
20 |
21 | In Chapter 6, you will learn how to run LLMs on Intel GPUs with IPEX-LLM optimizations, as well as how to implement streaming chat. This chapter uses a popular open-source model as the example:
22 |
23 | + [Llama2-7B](./6_1_GPU_Llama2-7B.md)
24 |
25 | ## 6.0 Environment Setup
26 |
27 | Below are some best practices for setting up your environment. It is strongly recommended that you follow the corresponding steps below to configure your environment properly.
28 |
29 | ### 6.0.1 System Requirements
30 |
31 | To smoothly experience the tutorials in Chapter 6, please make sure your hardware and operating system meet the following requirements:
32 |
33 | > ⚠️Hardware
34 | - Intel Arc™ A-Series Graphics
35 | - Intel Data Center GPU Flex Series
36 | - Intel Data Center GPU Max Series
37 |
38 | > ⚠️Operating System
39 | - Linux, Ubuntu 22.04 is preferred
40 |
41 | > **Note**
42 | > Please note that IPEX-LLM optimizations on Intel GPUs are currently only supported on Linux.
43 |
44 | ### 6.0.2 Install Drivers and Toolkits
45 |
46 | Before using IPEX-LLM on Intel GPUs, there are several steps for installing tools:
47 |
48 | - First, you need to install the Intel GPU driver. Refer to our [driver installation](https://dgpu-docs.intel.com/driver/installation.html) for more information about general-purpose GPU capabilities.
49 | > **Note**
50 | > IPEX-LLM with the default IPEX version (IPEX 2.0.110+xpu) requires Intel GPU driver version [Stable 647.21](https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html).
51 |
52 | - You also need to download and install the [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html). OneMKL and the DPC++ compiler are required; the others are optional.
53 | > **Note**
54 | > IPEX-LLM with the default IPEX version (IPEX 2.0.110+xpu) requires Intel® oneAPI Base Toolkit version == 2023.2.0.
55 |
56 | For client users with an Intel Arc™ A-Series GPU on Ubuntu 22.04, you can also refer to the following commands to install the driver and the oneAPI Base Toolkit. Detailed commands:
57 |
58 |
59 | ```bash
60 | # install the Arc driver
61 | sudo apt-get install -y gpg-agent wget
62 |
63 | wget -qO - https://repositories.intel.com/graphics/intel-graphics.key | \
64 | sudo gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
65 |
66 | echo 'deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc' | \
67 | sudo tee /etc/apt/sources.list.d/intel.gpu.jammy.list
68 |
69 |
70 | # downgrade the kernel version
71 | sudo apt-get update && sudo apt-get install -y --install-suggests linux-image-5.19.0-41-generic
72 |
73 | sudo sed -i "s/GRUB_DEFAULT=.*/GRUB_DEFAULT=\"1> $(echo $(($(awk -F\' '/menuentry / {print $2}' /boot/grub/grub.cfg \
74 | | grep -no '5.19.0-41' | sed 's/:/\n/g' | head -n 1)-2)))\"/" /etc/default/grub
75 |
76 | sudo update-grub
77 |
78 | sudo reboot
79 |
80 | # remove the latest kernel
81 | sudo apt purge linux-image-6.2.0-26-generic
82 |
83 | sudo apt autoremove
84 |
85 | sudo reboot
86 |
87 | # install drivers
88 | sudo apt-get update
89 |
90 | sudo apt-get -y install \
91 | gawk \
92 | dkms \
93 | linux-headers-$(uname -r) \
94 | libc6-dev
95 |
96 | sudo apt-get install -y intel-platform-vsec-dkms intel-platform-cse-dkms intel-i915-dkms intel-fw-gpu
97 |
98 | sudo apt-get install -y gawk libc6-dev udev\
99 | intel-opencl-icd intel-level-zero-gpu level-zero \
100 | intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
101 | libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
102 | libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
103 | mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo
104 |
105 | sudo reboot
106 |
107 | # configure permissions
108 | sudo gpasswd -a ${USER} render
109 |
110 | newgrp render
111 |
112 | # verify that the device works correctly with the i915 driver
113 | sudo apt-get install -y hwinfo
114 | hwinfo --display
115 |
116 |
117 | # install oneAPI
118 | wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
119 |
120 | echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
121 |
122 | sudo apt update
123 |
124 | sudo apt install intel-basekit
125 | ```
126 |
127 |
128 | ### 6.0.3 Python Environment Setup
129 |
130 | Next, use a Python environment management tool (we recommend [Conda](https://docs.conda.io/projects/conda/en/stable/)) to create a Python environment and install the necessary libraries.
131 |
132 | #### 6.0.3.1 Install Conda
133 |
134 | For Linux users, open a terminal and run the following commands:
135 |
136 | ```bash
137 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
138 | bash ./Miniconda3-latest-Linux-x86_64.sh
139 | conda init
140 | ```
141 |
142 | > **Note**
143 | > Follow the prompts on the console until conda initialization finishes successfully.
144 |
145 | #### 6.0.3.2 Create Environment
146 |
147 | > **Note**
148 | > Python 3.9 is recommended for running IPEX-LLM.
149 |
150 | Create a Python 3.9 environment with a name of your choice, for example `llm-tutorial-gpu`:
151 |
152 | ```bash
153 | conda create -n llm-tutorial-gpu python=3.9
154 | ```
155 |
156 | Then activate the environment `llm-tutorial-gpu`:
157 |
158 | ```bash
159 | conda activate llm-tutorial-gpu
160 | ```
161 |
162 | ### 6.0.4 Recommended Configurations on Linux
163 |
164 | To optimize performance on Intel GPUs, it is recommended to set the following environment variables:
165 |
166 | ```bash
167 | # configure oneAPI environment variables
168 | source /opt/intel/oneapi/setvars.sh
169 |
170 | export USE_XETLA=OFF
171 | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
172 | ```
173 |
--------------------------------------------------------------------------------
/Chinese_Version/ch_7_Finetune/7_1_Finetune_Llama2-7B.md:
--------------------------------------------------------------------------------
1 | # 7.1 Fine-tune Llama 2 (7B) with QLoRA
2 |
3 | To help you better understand the QLoRA fine-tuning process, this tutorial provides a practical guide to fine-tuning a large language model for a specific downstream task with IPEX-LLM. Here we use [Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) as an example to accomplish a text generation task.
4 |
5 | ## 7.1.1 Enable IPEX-LLM on Intel GPUs
6 |
7 | ### 7.1.1.1 Install IPEX-LLM
8 |
9 | After setting up the environment following the steps in the [Readme](./README.md#70-environment-setup), you can install IPEX-LLM and the corresponding dependencies in a terminal with the following commands:
10 |
11 | ```bash
12 | pip install --pre --upgrade ipex-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
13 | pip install transformers==4.34.0
14 | pip install peft==0.5.0
15 | pip install accelerate==0.23.0
16 | ```
17 |
18 | > **Note**
19 | > If you are using an older version of `ipex-llm` (earlier than `2.5.0b20240104`), you need to manually add `import intel_extension_for_pytorch as ipex` at the beginning of your code.
20 |
21 | ### 7.1.1.2 Configure oneAPI Environment Variables
22 |
23 | After the installation, you need to configure the oneAPI environment variables for Intel GPU.
24 |
25 | ```bash
26 | # configure oneAPI environment variables
27 | source /opt/intel/oneapi/setvars.sh
28 | ```
29 |
30 | Once the installation and environment configuration are done, let's move on to the **Python scripts** of this tutorial.
31 |
32 | ## 7.1.2 QLoRA Fine-tuning
33 |
34 | ### 7.1.2.1 Load Model in Low Precision
35 |
36 | This tutorial uses the popular open-source LLM [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) to illustrate the process of QLoRA fine-tuning.
37 |
38 | > **Note**
39 | >
40 | > You can specify the parameter `pretrained_model_name_or_path` with either a Huggingface repo id or a local model path.
41 | > If you have already downloaded the Llama 2 (7B) model, you could specify `pretrained_model_name_or_path` as the local model path.
42 |
43 | With IPEX-LLM optimizations, you can use `ipex_llm.transformers.AutoModelForCausalLM` instead of `transformers.AutoModelForCausalLM` to load the model with implicit quantization.
44 |
45 | For Intel GPUs, you should especially set `optimize_model=False` in the `from_pretrained` function. Once you have obtained the low-precision model, move it to the device with `to('xpu')`.
46 |
47 | ```python
import torch
from ipex_llm.transformers import AutoModelForCausalLM

48 | model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path = "meta-llama/Llama-2-7b-hf",
49 | load_in_low_bit="nf4",
50 | optimize_model=False,
51 | torch_dtype=torch.float16,
52 | modules_to_not_convert=["lm_head"])
53 | model = model.to('xpu')
54 | ```
55 | > **Note**
56 | >
57 | > We specify `load_in_low_bit="nf4"` to apply 4-bit NormalFloat optimization. According to the [QLoRA paper](https://arxiv.org/pdf/2305.14314.pdf), "nf4" quantization achieves better model quality than "int4".
58 |
59 | ### 7.1.2.2 Prepare the Model for Training
60 |
61 | We can apply `prepare_model_for_kbit_training` from `ipex_llm.transformers.qlora` to preprocess the model in preparation for training.
62 |
63 | ```python
64 | from ipex_llm.transformers.qlora import prepare_model_for_kbit_training
65 | model.gradient_checkpointing_enable() # can further reduce memory usage but with slower speed
66 | model = prepare_model_for_kbit_training(model)
67 | ```
68 |
69 | Next, we can create a PEFT model from the preprocessed model and configure its parameters as follows:
70 |
71 | ```python
72 | from ipex_llm.transformers.qlora import get_peft_model
73 | from peft import LoraConfig
74 |
75 | config = LoraConfig(r=8,
76 | lora_alpha=32,
77 | target_modules=["q_proj", "k_proj", "v_proj"],
78 | lora_dropout=0.05,
79 | bias="none",
80 | task_type="CAUSAL_LM")
81 | model = get_peft_model(model, config)
82 |
83 | ```
84 | > **Note**
85 | >
86 | > We import the IPEX-LLM-compatible PEFT model from `ipex_llm.transformers.qlora`, replacing the usage of `from peft import prepare_model_for_kbit_training, get_peft_model`, which relies on the bitsandbytes library and CUDA. The usage is the same as QLoRA fine-tuning with the `peft` library.
87 | >
88 | > **Note**
89 | >
90 | > More explanations of the `LoraConfig` parameters can be found in the [Transformers LoRA guide](https://huggingface.co/docs/peft/conceptual_guides/lora#common-lora-parameters-in-peft).
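
As a quick sanity check, you could also print how many parameters are actually trainable after wrapping the model with LoRA; this assumes the IPEX-LLM-compatible PEFT model exposes the standard `print_trainable_parameters` method from `peft`:

```python
# prints the number of trainable (LoRA) parameters vs. the total parameter count
model.print_trainable_parameters()
```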
91 |
92 | ### 7.1.2.3 Load Dataset
93 |
94 | We load the common dataset [english quotes](https://huggingface.co/datasets/Abirate/english_quotes) to fine-tune our model on English quotes.
95 |
96 | ```python
97 | from datasets import load_dataset
98 | data = load_dataset("Abirate/english_quotes")
99 | data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
100 | ```
101 |
102 | > **Note**
103 | >
104 | > If you have already downloaded the `.jsonl` file from [Abirate/english_quotes](https://huggingface.co/datasets/Abirate/english_quotes/blob/main/quotes.jsonl), you can specify the local path with `data = load_dataset("json", data_files="path/to/your/.jsonl/file")` instead of loading from the Huggingface repo id with `data = load_dataset("Abirate/english_quotes")`.
105 |
106 | ### 7.1.2.4 Load Tokenizer
107 |
108 | A tokenizer enables the tokenization and detokenization processes in LLM training and inference. You can use the [Huggingface Transformers](https://huggingface.co/docs/transformers/index) API to load the tokenizer required for LLM inference; it works seamlessly with models loaded by IPEX-LLM. For Llama 2, the corresponding tokenizer class is `LlamaTokenizer`.
109 |
110 | ```python
111 | from transformers import LlamaTokenizer
112 | tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_name_or_path="meta-llama/Llama-2-7b-chat-hf", trust_remote_code=True)
113 | tokenizer.pad_token_id = 0
114 | tokenizer.padding_side = "left"
115 | ```
116 |
117 | > **Note**
118 | >
119 | > If you have already downloaded the Llama 2 (7B) model, you could specify `pretrained_model_name_or_path` as the local model path.
120 |
121 | ### 7.1.2.5 Run the Training
122 |
123 | Now you can set up the `trainer` using the HuggingFace ecosystem and start the training process. Here we set `warmup_steps` to 20 to accelerate the training process.
124 | ```python
125 | import transformers
126 | trainer = transformers.Trainer(
127 | model=model,
128 | train_dataset=data["train"],
129 | args=transformers.TrainingArguments(
130 | per_device_train_batch_size=4,
131 | gradient_accumulation_steps= 1,
132 | warmup_steps=20,
133 | max_steps=200,
134 | learning_rate=2e-4,
135 | save_steps=100,
136 | fp16=True,
137 | logging_steps=20,
138 |         output_dir="outputs", # specify your own output path here
139 |         optim="adamw_hf", # the paged_adamw_8bit optimizer is not yet supported
140 |         # gradient_checkpointing=True, # can further reduce memory usage but with slower speed
141 | ),
142 | data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
143 | )
144 | model.config.use_cache = False # silence the warnings; re-enable it for inference
145 | result = trainer.train()
146 | ```
147 | We can get the following output showing our training loss:
148 | ```
149 | /home/arda/anaconda3/envs/yining-llm-qlora/lib/python3.9/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
150 | warnings.warn(
151 | {'loss': 1.7193, 'learning_rate': 0.0002, 'epoch': 0.03}
152 | {'loss': 1.3242, 'learning_rate': 0.00017777777777777779, 'epoch': 0.06}
153 | {'loss': 1.2266, 'learning_rate': 0.00015555555555555556, 'epoch': 0.1}
154 | {'loss': 1.1534, 'learning_rate': 0.00013333333333333334, 'epoch': 0.13}
155 | {'loss': 0.9368, 'learning_rate': 0.00011111111111111112, 'epoch': 0.16}
156 | {'loss': 0.9321, 'learning_rate': 8.888888888888889e-05, 'epoch': 0.19}
157 | {'loss': 0.9902, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.22}
158 | {'loss': 0.8593, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.26}
159 | {'loss': 1.0055, 'learning_rate': 2.2222222222222223e-05, 'epoch': 0.29}
160 | {'loss': 1.0081, 'learning_rate': 0.0, 'epoch': 0.32}
161 | {'train_runtime': xxx, 'train_samples_per_second': xxx, 'train_steps_per_second': xxx, 'train_loss': 1.1155566596984863, 'epoch': 0.32}
162 | 100%|██████████████████████████████████████████████████████████████████████████████| 200/200 [xx:xx<xx:xx, x.xxs/it]
163 | ```
164 |
165 | ## 7.1.3 Merge the Model
166 |
167 | After training, we can merge the fine-tuned LoRA weights with the base model weights, so that the merged model can be used for inference.
168 |
169 | > **Note**
170 | >
171 | > Please make sure your `accelerate` version is 0.23.0 to enable model merging on the CPU.
174 |
175 | ### 7.1.3.1 Load the Pre-trained Model
176 |
177 | ```python
178 | base_model = AutoModelForCausalLM.from_pretrained(
179 | base_model,
180 | torch_dtype=torch.float16,
181 | device_map={"": "cpu"},
182 | )
183 | ```
184 |
185 | > **Note**
186 | >
187 | > When merging, `load_in_low_bit="nf4"` should be removed, because we need to load the original model as the base model.
188 |
189 | ### 7.1.3.2 Merge the Weights
190 |
191 | Then we can load the trained LoRA weights in preparation for merging.
192 |
193 | ```python
194 | from ipex_llm.transformers.qlora import PeftModel
195 | adapter_path = "./outputs/checkpoint-200"
196 | lora_model = PeftModel.from_pretrained(
197 | base_model,
198 | adapter_path,
199 | device_map={"": "cpu"},
200 | torch_dtype=torch.float16,
201 | )
202 | ```
203 |
204 | > **Note**
205 | >
206 | > We use `from ipex_llm.transformers.qlora import PeftModel` instead of `from peft import PeftModel` to import the IPEX-LLM-compatible PEFT model.
207 |
208 | > **Note**
209 | >
210 | > The adapter path is the local path where you saved the fine-tuned model, in this case `./outputs/checkpoint-200`.
211 |
212 | To verify that the LoRA weights have been merged with the pre-trained weights correctly, we extract the first-layer weights (the attention queries in the Llama 2 model) to compare their differences.
213 |
214 | ```python
215 | first_weight = base_model.model.layers[0].self_attn.q_proj.weight
216 | first_weight_old = first_weight.clone()
217 | lora_weight = lora_model.base_model.model.model.layers[0].self_attn.q_proj.weight
218 | assert torch.allclose(first_weight_old, first_weight)
219 | ```
220 | 通过`merge_and_unlaod`方法可以将微调后的模型与预训练的模型进行合并,并通过`assert`声明来验证权重是否发生变化。
221 |
222 | ```python
223 | lora_model = lora_model.merge_and_unload()
224 | lora_model.train(False)
225 | assert not torch.allclose(first_weight_old, first_weight)
226 | ```
227 | If the following output is returned without errors, the model has been merged successfully.
228 |
229 | ```
230 | Using pad_token, but it is not set yet.
231 | Using pad_token, but it is not set yet.
232 | ```
233 |
234 | Finally, we can save the merged model to a specified local path (in our case `./outputs/checkpoint-200-merged`).
235 |
236 | ```python
237 | output_path = './outputs/checkpoint-200-merged'
238 | lora_model_sd = lora_model.state_dict()
239 | deloreanized_sd = {
240 | k.replace("base_model.model.", ""): v
241 | for k, v in lora_model_sd.items()
242 | if "lora" not in k
243 | }
244 | base_model.save_pretrained(output_path, state_dict=deloreanized_sd)
245 | tokenizer.save_pretrained(output_path)
246 |
247 | ```
248 |
249 | ## 7.1.4 Inference with the Fine-tuned Model
250 |
251 | After merging and deploying the model, we can test the performance of the fine-tuned model.
252 | A detailed description of inference with IPEX-LLM optimizations can be found in [Chapter 6](../ch_6_GPU_Acceleration/6_1_GPU_Llama2-7B.md); here we quickly complete the preparations for model inference.
253 |
254 | ### 7.1.4.1 Inference with the Fine-tuned Model
255 |
256 | ```python
257 | model_path = "./outputs/checkpoint-200-merged"
258 | model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path = model_path,load_in_4bit=True)
259 | model = model.to('xpu')
260 | tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_name_or_path = model_path)
261 | ```
262 |
263 | > **Note**
264 | > The `model_path` argument should be consistent with the output path of the merged model.
265 |
266 | We can verify whether the fine-tuned model, after being trained on the new dataset, produces philosophical answers.
267 |
268 | ```python
269 | with torch.inference_mode():
270 | input_ids = tokenizer.encode('The paradox of time and eternity is',
271 | return_tensors="pt").to('xpu')
272 | output = model.generate(input_ids, max_new_tokens=32)
273 | output = output.cpu()
274 | output_str = tokenizer.decode(output[0], skip_special_tokens=True)
275 | print(output_str)
276 | ```
277 |
278 | We can repeat the process with the pre-trained model by simply replacing the `model_path` argument, and compare the pre-trained model's answers with those of the fine-tuned model:
279 |
280 | > **Pre-trained model**
281 | ```
282 | The paradox of time and eternity is that time is not eternal, but eternity is. nobody knows how long time is.
283 | The paradox of time and eternity is
284 | ```
285 | > **Fine-tuned model**
286 | ```
287 | The paradox of time and eternity is that, on the one hand, we experience time as linear and progressive, and on the other hand, we experience time as cyclical. And the
288 | ```
289 |
290 | We can see that the fine-tuned model's output shares vocabulary and a similar text style with the new dataset. With IPEX-LLM optimizations, we can achieve this effect within only a few minutes of training.
291 |
292 | Below are more comparison results between the pre-trained model and the fine-tuned model:
293 |
294 | | ♣ Pre-trained model | ♣ Fine-tuned model |
295 | | ----- | ----- |
296 | | **There are two things that matter:** Einzelnes and the individual. Everyone has heard of the "individual," but few have heard of the "individuum," or " | **There are two things that matter:** the quality of our relationships and the legacy we leave.And I think that all of us as human beings are searching for it, no matter where |
297 | | **In the quiet embrace of the night,** I felt the earth move. Unterscheidung von Wörtern und Ausdrücken. | **In the quiet embrace of the night,** the world is still and the stars are bright. My eyes are closed, my heart is at peace, my mind is at rest. I am ready for |
--------------------------------------------------------------------------------
/Chinese_Version/ch_7_Finetune/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 7 Fine-tuning
2 |
3 | As a recent parameter-efficient fine-tuning method, QLoRA can efficiently inject domain knowledge into a pre-trained large language model while updating only a small number of parameters. IPEX-LLM also supports fine-tuning LLMs (large language models) with QLoRA using 4-bit optimizations on Intel GPUs.
4 |
5 | > **Note**
6 | >
7 | > Currently, IPEX-LLM supports QLoRA fine-tuning only for [Hugging Face `transformers` models](https://huggingface.co/docs/transformers/index), and it can optimize any such model on Intel GPUs.
9 |
10 | In Chapter 7, you will learn how to use IPEX-LLM optimizations to fine-tune a large language model for a text generation task. IPEX-LLM helps you fine-tune the model, merge the LoRA weights with the base weights, and run inference with the merged model.
11 |
12 | We will use the popular open-source model [Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) as an example for training.
13 |
14 | ## 7.0 Environment Setup
15 |
16 | You can follow the detailed instructions in [Chapter 6](../ch_6_GPU_Acceleration/README.md) to set up the environment on Intel GPUs. Only the **necessary** steps for configuring the environment correctly are listed below.
17 |
18 | ### 7.0.1 System Requirements
19 | > ⚠️Hardware
20 | - Intel Arc™ A-Series Graphics
21 | - Intel Data Center GPU Flex Series
22 | - Intel Data Center GPU Max Series
23 |
24 | > ⚠️Operating System
25 | - Linux, Ubuntu 22.04 is preferred
26 |
27 | ### 7.0.2 Install Drivers and Toolkits
28 |
29 | Before using IPEX-LLM on Intel GPUs, there are several steps for installing tools:
30 |
31 | - First, you need to install the Intel GPU driver. Refer to our driver installation guide for more information about general-purpose GPU capabilities.
32 |
33 | - You also need to download and install the Intel® oneAPI Base Toolkit. OneMKL and the DPC++ compiler are required; the others are optional.
34 |
35 | ### 7.0.3 Python Environment Setup
36 |
37 | Assuming you have installed [Conda](https://docs.conda.io/projects/conda/en/stable/) as your Python environment management tool, the following commands help you create and activate your Python environment:
38 |
39 | ```bash
40 | # Python 3.9 is recommended for running IPEX-LLM
41 | conda create -n llm-finetune python=3.9
42 | conda activate llm-finetune
43 | ```
44 |
45 | ### 7.0.4 Set oneAPI Environment Variables
46 |
47 | You need to set the oneAPI environment variables for IPEX-LLM on Intel GPUs.
48 |
49 | ```bash
50 | # configure oneAPI environment variables
51 | source /opt/intel/oneapi/setvars.sh
52 | ```
53 |
54 | ### 7.0.5 (Optional) Configurations for Running Model Inference on Intel GPUs
55 |
56 | If you want to run inference with the fine-tuned model on an Intel GPU, it is recommended to set the following variables for the best performance:
57 |
58 | ```bash
59 | export USE_XETLA=OFF
60 | export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
61 | ```
--------------------------------------------------------------------------------
/Chinese_Version/ch_8_AppDev_Advanced/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/ipex-llm-tutorial/dcf1a80af8ddab03e03c48864432946c718abf35/Chinese_Version/ch_8_AppDev_Advanced/.keep
--------------------------------------------------------------------------------
/Chinese_Version/ch_8_AppDev_Advanced/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 8 Advanced Application Development
2 |
3 | This chapter introduces how to use LangChain with IPEX-LLM.
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # IPEX-LLM Tutorial
2 |
3 |
4 |
5 | English | 中文
7 |
8 |
9 |
10 | [_IPEX-LLM_](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm) is a low-bit LLM library on Intel XPU (Xeon/Core/Flex/Arc/PVC). This repository contains tutorials to help you understand what _IPEX-LLM_ is and how to use _IPEX-LLM_ to build LLM applications.
11 |
12 | The tutorials are organized as follows:
13 | - [Chapter 1 **`Introduction`**](./ch_1_Introduction/) introduces what _IPEX-LLM_ is and what you can do with it.
14 | - [Chapter 2 **`Environment Setup`**](./ch_2_Environment_Setup/) provides a set of best practices for setting up your environment.
15 | - [Chapter 3 **`Application Development: Basics`**](./ch_3_AppDev_Basic/) introduces the basic usage of _IPEX-LLM_ and how to build a very simple Chat application.
16 | - [Chapter 4 **`Chinese Support`**](./ch_4_Chinese_Support/) shows the usage of some LLMs which support Chinese input/output, e.g. ChatGLM2 and Baichuan.
17 | - [Chapter 5 **`Application Development: Intermediate`**](./ch_5_AppDev_Intermediate/) introduces intermediate-level knowledge for application development using _IPEX-LLM_, e.g. how to build a more sophisticated chatbot, speech recognition, etc.
18 | - [Chapter 6 **`GPU Acceleration`**](./ch_6_GPU_Acceleration/) introduces how to use Intel GPUs to accelerate LLMs using _IPEX-LLM_.
19 | - [Chapter 7 **`Finetune`**](./ch_7_Finetune/) introduces how to do fine-tuning using _IPEX-LLM_.
20 | - [Chapter 8 **`Application Development: Advanced`**](./ch_8_AppDev_Advanced/) introduces advanced-level knowledge for application development using _IPEX-LLM_, e.g. LangChain usage.
21 |
22 | [^1]: Performance varies by use, configuration and other factors. `ipex-llm` may not optimize to the same degree for non-Intel products. Learn more at www.Intel.com/PerformanceIndex.
23 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 |
3 | ## Report a Vulnerability
4 |
5 | Please report security issues or vulnerabilities to the [Intel® Security Center].
6 |
7 | For more information on how Intel® works to resolve security issues, see
8 | [Vulnerability Handling Guidelines].
9 |
10 | [Intel® Security Center]:https://www.intel.com/security
11 |
12 | [Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html
13 |
--------------------------------------------------------------------------------
/ch_1_Introduction/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/ipex-llm-tutorial/dcf1a80af8ddab03e03c48864432946c718abf35/ch_1_Introduction/.keep
--------------------------------------------------------------------------------
/ch_1_Introduction/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 1 Introduction
2 |
3 | ## What is IPEX-LLM
4 | [IPEX-LLM](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm) is a low-bit LLM library on Intel XPU (Xeon/Core/Flex/Arc/PVC), featuring broadest model support, lowest latency and smallest memory footprint. It is released under Apache 2.0 License.
5 |
6 |
7 | ## What can you do with _IPEX-LLM_
8 | You can use IPEX-LLM to run _any PyTorch model_ (e.g. [HuggingFace transformers](https://huggingface.co/docs/transformers/index) models). It automatically optimizes and accelerates LLMs using low-bit optimizations, modern hardware accelerations and the latest software optimizations.
9 |
10 | Using IPEX-LLM is easy. With just a one-line code change, you can immediately observe a significant speedup[^1].
11 |
12 | #### Example: Optimize LLaMA model with `optimize_model`
13 | ```python
14 | from ipex_llm import optimize_model
15 |
16 | from transformers import LlamaForCausalLM, LlamaTokenizer
17 | model = LlamaForCausalLM.from_pretrained(model_path,...)
18 |
19 | # apply ipex-llm low-bit optimization, by default uses INT4
20 | model = optimize_model(model)
21 |
22 | ...
23 | ```
24 |
25 | IPEX-LLM provides a variety of low-bit optimizations (e.g., INT3/NF3/INT4/NF4/INT5/INT8), and allows you to run LLMs on low-cost PCs (CPU-only), on PCs with GPU, or on cloud.
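
For instance, a minimal sketch of selecting a different precision, assuming `optimize_model` accepts a `low_bit` argument for choosing among the supported formats, might look like:

```python
from ipex_llm import optimize_model
from transformers import LlamaForCausalLM

model = LlamaForCausalLM.from_pretrained(model_path)

# assumption: `low_bit` selects the quantization format, e.g. NF4 instead of the default INT4
model = optimize_model(model, low_bit="nf4")
```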
26 |
27 | The demos below show the experience of running 6B and 13B models on a laptop with 16GB of memory.
28 |
29 | #### 6B model running on an Intel 12-Gen Core PC (real-time screen capture):
30 |
31 |
32 |
33 |
34 |
35 |
36 | #### 13B model running on an Intel 12-Gen Core PC (real-time screen capture):
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 | ## What's Next
46 |
47 | The following chapters in this tutorial will explain in more detail how to use IPEX-LLM to build LLM applications, e.g. best practices for setting up your environment, APIs, Chinese support, GPU, application development guides with case studies, etc. Most chapters provide runnable notebooks using popular open source models. Read along to learn more and run the code on your laptop.
48 |
49 |
50 | Also, you can check out our [GitHub repo](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm) for more information and latest news.
51 |
52 | We have already verified many models on IPEX-LLM and provided ready-to-run examples, such as [Llama2](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2), [Vicuna](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna), [ChatGLM](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm), [ChatGLM2](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2), [Baichuan](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan), [MOSS](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss), [Falcon](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon), [Dolly-v1](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1), [Dolly-v2](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2), [StarCoder](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder), [Mistral](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral), [RedPajama](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama), [Whisper](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper), etc. You can find more model examples [here](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/CPU/HF-Transformers-AutoModels/Model).
53 |
54 |
55 | [^1]: Performance varies by use, configuration and other factors. `ipex-llm` may not optimize to the same degree for non-Intel products. Learn more at www.Intel.com/PerformanceIndex.
56 |
57 |
--------------------------------------------------------------------------------
/ch_2_Environment_Setup/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 2 Environment Setup
2 |
3 | This chapter presents a set of best practices for setting up your environment. To ensure a smooth experience with the notebooks in the subsequent chapters, it is strongly recommended that you follow the corresponding steps below to configure your environment properly.
4 |
5 | ## 2.1 System Recommendation
6 | First of all, choose a proper system. Here's a list of recommended hardware and OS.
7 | >⚠️**Hardware**
8 |
9 | - Intel PCs, at least 16GB RAM.
10 | - Servers equipped with Intel® Xeon® processors, at least 32GB RAM.
11 |
12 | >⚠️**Operating System**
13 |
14 | - Ubuntu 20.04 or later
15 | - CentOS 7 or later
16 | - Windows 10/11, with or without WSL
17 |
18 | ## 2.2 Setup Python Environment
19 |
20 | Next, use a Python environment management tool (we recommend using [Conda](https://docs.conda.io/projects/conda/en/stable/)) to create a Python environment and install the necessary libraries.
21 |
22 |
23 | ### 2.2.1 Install Conda
24 | Follow the instructions corresponding to your OS below.
25 |
26 | #### 2.2.1.1 Linux
27 |
28 | For Linux users, open a terminal and run the commands below.
29 |
30 | ```bash
31 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
32 | bash ./Miniconda3-latest-Linux-x86_64.sh
33 | conda init
34 | ```
35 | >**Note**
36 | > Follow the prompts on the console until conda initialization finishes successfully.
37 |
38 |
39 | #### 2.2.1.2 Windows
40 |
41 | For Windows users, download conda installer [here](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links) and execute it.
42 |
43 | After the installation finishes, open "Anaconda Powershell Prompt (Miniconda3)" for the following steps.
44 |
45 | #### 2.2.1.3 Windows Subsystem for Linux (WSL):
46 |
47 | For WSL users, ensure you have already installed WSL2. If not, refer to [here](https://bigdl.readthedocs.io/en/latest/doc/UserGuide/win.html#install-wsl2l) for how to install.
48 |
49 | Open a WSL2 shell and run the same commands as in [2.2.1.1 Linux](#2211-linux) section.
50 |
51 |
52 |
53 | ### 2.2.2 Create Environment
54 | > **Note**
55 | > Python 3.9 is recommended for running IPEX-LLM.
56 |
57 | Create a Python 3.9 environment with the name you choose, for example `llm-tutorial`:
58 | ```
59 | conda create -n llm-tutorial python=3.9
60 | ```
61 | Then activate the environment `llm-tutorial`:
62 | ```
63 | conda activate llm-tutorial
64 | ```
65 |
66 | ## 2.3 Install IPEX-LLM
67 |
68 | The one-line command below will install the latest `ipex-llm` with all the dependencies for common LLM application development.
69 | ```
70 | pip install --pre --upgrade ipex-llm[all]
71 | ```
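
As a quick sanity check (optional), you can verify the installation by importing the package in the activated environment:

```python
# if this import succeeds, ipex-llm is installed in the current environment
import ipex_llm
print("ipex-llm imported successfully")
```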
72 |
73 | ## 2.4 Setup Jupyter Service
74 |
75 | ### 2.4.1 Install Jupyter
76 | The `jupyter` library is required for running the tutorial notebooks (i.e. the `.ipynb` files). Under your activated Python 3.9 environment, run:
77 | ```
78 | pip install jupyter
79 | ```
80 |
81 | ### 2.4.2 Start Jupyter Service
82 | The recommended command to start jupyter service is slightly different on PC and server.
83 |
84 | #### 2.4.2.1 On PC
85 | On PC, just run the command in shell:
86 | ```
87 | jupyter notebook
88 | ```
89 |
90 | #### 2.4.2.2 On Server
91 | On a server, it is recommended to use all the physical cores of a single socket for better performance. So run the command below instead:
92 | ```bash
93 | # e.g. for a server with 48 cores per socket
94 | export OMP_NUM_THREADS=48
95 | numactl -C 0-47 -m 0 jupyter notebook
96 | ```
97 |
98 | Congratulations! Now you can use a web browser to access the jupyter service url and execute the notebooks provided in this tutorial.
99 |
100 |
101 | ## 2.5 Things you may want to know about working with LLMs
102 | If you're new to LLMs and LLM application development, here are some things you might want to know.
103 |
104 | ### 2.5.1 Where to find the models
105 | To start, you'll need to obtain a model. There are numerous open-source LLMs available in the community. If you don't have a specific target in mind, consider selecting one that ranks higher on LLM leaderboards. These leaderboards evaluate and compare the capabilities of various LLMs. For instance,
106 |
107 | - [Open LLM LeaderBoard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) hosted by Huggingface.
108 | - [Chatbot Arena Leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) hosted by lmsys.
109 |
110 | Most of these leaderboards include reference links to the models listed. If a model is open source, you can easily download it directly from the provided link and give it a try.
111 |
112 | ### 2.5.2 Download Models from Huggingface
113 | As of writing, many popular LLMs are hosted on [Huggingface](https://huggingface.co/).
114 | An example model homepage hosted on huggingface looks like this.
115 |
116 | 
117 |
118 |
119 | To download models from huggingface, you can either use git or huggingface provided APIs. Refer to [Download Model from Huggingface](https://huggingface.co/docs/hub/models-downloading) for details about how to download models.
120 |
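For example, a minimal sketch of the API approach using the `huggingface_hub` library (the repo id below is only an illustration) could look like this:

```python
from huggingface_hub import snapshot_download

# download the whole model repository into the local Huggingface cache
# and return the local directory path
model_path = snapshot_download(repo_id='meta-llama/Llama-2-7b-chat-hf')
print(model_path)
```
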
121 | Usually, the models downloaded from Huggingface can be loaded using the [Huggingface Transformers library API](https://huggingface.co/docs/transformers/index). IPEX-LLM provides APIs to easily work with such models. Read the following chapters to find out more.
122 |
123 |
124 |
125 |
--------------------------------------------------------------------------------
/ch_3_AppDev_Basic/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/ipex-llm-tutorial/dcf1a80af8ddab03e03c48864432946c718abf35/ch_3_AppDev_Basic/.keep
--------------------------------------------------------------------------------
/ch_3_AppDev_Basic/3_Baichuan2_BasicApp.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Notebook 3: Basic Application Development On Baichuan2\n",
9 | "\n",
10 | "This notebook introduces the essential usage of `ipex-llm`, and walks you through building a very basic chat application built upon `Baichuan2`.\n",
11 | "\n",
12 | "## 3.1 Install `ipex-llm`"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "If you haven't installed `ipex-llm`, install it as shown below. The one-line command will install the latest `ipex-llm` with all the dependencies for common LLM application development."
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "!pip install --pre --upgrade ipex-llm[all]"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "> **Note**\n",
36 | ">\n",
37 | "> * On Linux OS, we recommend to use `pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu` to install. Please refer to https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install_cpu.html#quick-installation for more details."
38 | ]
39 | },
40 | {
41 | "attachments": {},
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "## 3.2 Load a pretrained Model\n",
46 | "\n",
47 |     "Before using an LLM, you need to first load one. Here we take [Baichuan2-7b-chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) as an example.\n",
48 | "\n",
49 | "> **Note**\n",
50 | ">\n",
51 | "> * `Baichuan2-7b-chat` is an open-source large language model based on the Transformer architecture. You can find more information about this model on its [homepage](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) hosted on Hugging Face.\n",
52 | "\n",
53 | "### 3.2.1 Load and Optimize Model\n",
54 | " \n",
55 | "In general, you just need one-line `optimize_model` to easily optimize any loaded PyTorch model, regardless of the library or API you are using. For more detailed usage of optimize_model, please refer to the [API documentation](https://ipex-llm.readthedocs.io/en/latest/doc/PythonAPI/LLM/optimize.html).\n",
56 | "\n",
57 |     "Besides, many popular open-source PyTorch large language models can be loaded using the `Huggingface Transformers API` (such as [AutoModel](https://huggingface.co/docs/transformers/v4.33.2/en/model_doc/auto#transformers.AutoModel), [AutoModelForCausalLM](https://huggingface.co/docs/transformers/v4.33.2/en/model_doc/auto#transformers.AutoModelForCausalLM), etc.). For such models, ipex-llm also provides a set of APIs to support them. We will now demonstrate how to use them.\n",
58 |     "\n",
59 |     "In this example, we use `ipex_llm.transformers.AutoModelForCausalLM` to load the `Baichuan2-7b-chat`. This API mirrors the official `transformers.AutoModelForCausalLM` with only a few additional parameters and methods related to low-bit optimization in the loading process.\n",
60 | "\n",
61 | "To enable INT4 optimization, simply set `load_in_4bit=True` in `from_pretrained`. Additionally, we configure the parameters `torch_dtype=\"auto\"` and `low_cpu_mem_usage=True` by default, as they may improve both performance and memory efficiency. \n",
62 | "\n",
63 | "Remember to set `trust_remote_code=True` when loading the model weights and tokenizer. This will allow the necessary configuration for the model."
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "from ipex_llm.transformers import AutoModelForCausalLM\n",
73 | "\n",
74 | "model_path = 'baichuan-inc/Baichuan2-7B-Chat'\n",
75 | "\n",
76 | "model = AutoModelForCausalLM.from_pretrained(model_path,\n",
77 | " load_in_4bit=True,\n",
78 | " trust_remote_code=True)"
79 | ]
80 | },
81 | {
82 | "attachments": {},
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "> **Note**\n",
87 | ">\n",
88 | "> * If you want to use precisions other than INT4(e.g. FP8/INT8,etc.), or know more details about the arguments, please refer to [API document](https://ipex-llm.readthedocs.io/en/latest/doc/PythonAPI/LLM/transformers.html) for more information. \n",
89 | ">\n",
90 | "> * `baichuan-inc/Baichuan2-7B-Chat` is the **_model_id_** of the model `Baichuan2-7B-Chat` on huggingface. When you set the `model_path` parameter of `from_pretrained` to this **_model_id_**, `from_pretrained` will automatically download the model from huggingface, cache it locally (e.g. `~/.cache/huggingface`), and load it. It may take a long time to download the model using this API. Alternatively, you can download the model yourself, and set `model_path` to the local path of the downloaded model. For more information, refer to the [`from_pretrained` document](https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained).\n",
91 | "\n",
92 | "\n",
93 | "### 3.2.2 Save & Load Optimized Model\n",
94 | "\n",
95 |     "In the previous section, models loaded using the `Huggingface Transformers API` are typically stored with either fp32 or fp16 precision. To save model space and speed up the loading process, ipex-llm also provides the `save_low_bit` API for saving the model after low-bit optimization, and the `load_low_bit` API for loading the saved low-bit model.\n",
96 | "\n",
97 | "You can use `save_low_bit` once and use `load_low_bit` many times for inference. This approach bypasses the processes of loading the original FP32/FP16 model and optimization during inference stage, saving both memory and time. Moreover, because the optimized model format is platform-agnostic, you can seamlessly perform saving and loading operations across various machines, regardless of their operating systems. This flexibility enables you to perform optimization/saving on a high-RAM server and deploy the model for inference on a PC with limited RAM.\n",
98 | "\n",
99 | "\n",
100 | "**Save Optimized Model**\n",
101 | "\n",
102 | "For example, you can use the `save_low_bit` function to save the optimized model as below:"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 2,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "save_directory = './baichuan2-7b-chat-ipex-llm-INT4'\n",
112 | "\n",
113 | "model.save_low_bit(save_directory)\n",
114 | "del(model)"
115 | ]
116 | },
117 | {
118 | "attachments": {},
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "**Load Optimized Model**\n",
123 | "\n",
124 | "Then use `load_low_bit` to load the optimized low-bit model as follows:"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "# note that the AutoModelForCausalLM here is imported from ipex_llm.transformers\n",
134 | "model = AutoModelForCausalLM.load_low_bit(save_directory, trust_remote_code=True)"
135 | ]
136 | },
137 | {
138 | "attachments": {},
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "## 3.3 Building a Simple Chat Application\n",
143 | "\n",
144 | "Now that the model is successfully loaded, we can start building our very first chat application. We shall use the `Huggingface transformers` inference API to do this job.\n",
145 | "\n",
146 | "> **Note**\n",
147 | "> \n",
148 | "> The code in this section is solely implemented using `Huggingface transformers` API. `ipex-llm` does not require any change in the inference code so you can use any libraries to build your appliction at inference stage. \n",
149 | "\n",
150 | "> **Note**\n",
151 | "> \n",
152 | "> Here we use Q&A dialog prompt template so that it can answer our questions.\n",
153 | "\n",
154 | "\n",
155 | "> **Note**\n",
156 | "> \n",
157 | "> `max_new_tokens` parameter in the `generate` function defines the maximum number of tokens to predict. \n"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 4,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "from transformers import AutoTokenizer\n",
167 | "\n",
168 | "tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 5,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "name": "stdout",
178 | "output_type": "stream",
179 | "text": [
180 | "-------------------- Output --------------------\n",
181 | "Q: What is CPU?\n",
182 | "A: The CPU (Central Processing Unit) is the heart of a computer system, responsible for processing and executing instructions provided by the software. It manages the overall operation of\n"
183 | ]
184 | }
185 | ],
186 | "source": [
187 | "import torch\n",
188 | "\n",
189 | "with torch.inference_mode():\n",
190 | " prompt = 'Q: What is CPU?\\nA:'\n",
191 | " \n",
192 | " # tokenize the input prompt from string to token ids\n",
193 | " input_ids = tokenizer.encode(prompt, return_tensors=\"pt\")\n",
194 | " # predict the next tokens (maximum 32) based on the input token ids\n",
195 | " output = model.generate(input_ids, max_new_tokens=32)\n",
196 | " # decode the predicted token ids to output string\n",
197 | " output_str = tokenizer.decode(output[0], skip_special_tokens=True)\n",
198 | "\n",
199 | " print('-'*20, 'Output', '-'*20)\n",
200 | " print(output_str)"
201 | ]
202 | }
203 | ],
204 | "metadata": {
205 | "kernelspec": {
206 | "display_name": "Python 3 (ipykernel)",
207 | "language": "python",
208 | "name": "python3"
209 | }
210 | },
211 | "nbformat": 4,
212 | "nbformat_minor": 4
213 | }
214 |
--------------------------------------------------------------------------------
/ch_3_AppDev_Basic/3_OpenLlamaBasicApp.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Notebook 3: Basic Application Development On Open-Llama\n",
9 | "\n",
10 | "This notebook introduces the essential usage of `ipex-llm`, and walks you through building a very basic chat application built upon `Open-Llama`.\n",
11 | "\n",
12 | "## 3.1 Install `ipex-llm`"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "If you haven't installed `ipex-llm`, install it as shown below. The one-line command will install the latest `ipex-llm` with all the dependencies for common LLM application development."
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "!pip install --pre --upgrade ipex-llm[all]"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "> **Note**\n",
36 | ">\n",
37 | "> * On Linux OS, we recommend to use `pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu` to install. Please refer to https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install_cpu.html#quick-installation for more details."
38 | ]
39 | },
40 | {
41 | "attachments": {},
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "## 3.2 Load a pretrained Model\n",
46 | "\n",
47 | "Before using a LLM, you need to first load one. Here we take a relatively small LLM, i.e. [open_llama_3b_v2](https://huggingface.co/openlm-research/open_llama_3b_v2) as an example.\n",
48 | "\n",
49 | "> **Note**\n",
50 | ">\n",
51 | "> * `open_llama_3b_v2` is an open-source large language model based on the LLaMA architecture. You can find more information about this model on its [homepage](https://huggingface.co/openlm-research/open_llama_3b_v2) hosted on Hugging Face.\n",
52 | "\n",
53 | "### 3.2.1 Load and Optimize Model\n",
54 | " \n",
55 | "In general, you just need one-line `optimize_model` to easily optimize any loaded PyTorch model, regardless of the library or API you are using. For more detailed usage of optimize_model, please refer to the [API documentation](https://ipex-llm.readthedocs.io/en/latest/doc/PythonAPI/LLM/optimize.html).\n",
56 | "\n",
57 | "Besides, many popular open-source PyTorch large language models can be loaded using the `Huggingface Transformers API` (such as [AutoModel](https://huggingface.co/docs/transformers/v4.33.2/en/model_doc/auto#transformers.AutoModel), [AutoModelForCasualLM](https://huggingface.co/docs/transformers/v4.33.2/en/model_doc/auto#transformers.AutoModelForCausalLM), etc.). For such models, ipex-llm also provides a set of APIs to support them. We will now demonstrate how to use them.\n",
58 | "\n",
59 | "In this example, we use `ipex_llm.transformers.AutoModelForCausalLM` to load the `open_llama_3b_v2 model`. This API mirrors the official `transformers.AutoModelForCasualLM` with only a few additional parameters and methods related to low-bit optimization in the loading process.\n",
60 | "\n",
61 | "To enable INT4 optimization, simply set `load_in_4bit=True` in `from_pretrained`. Additionally, we configure the parameters `torch_dtype=\"auto\"` and `low_cpu_mem_usage=True` by default, as they may improve both performance and memory efficiency. "
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 2,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "from ipex_llm.transformers import AutoModelForCausalLM\n",
71 | "\n",
72 | "model_path = 'openlm-research/open_llama_3b_v2'\n",
73 | "\n",
74 | "model = AutoModelForCausalLM.from_pretrained(model_path,\n",
75 | " load_in_4bit=True)"
76 | ]
77 | },
78 | {
79 | "attachments": {},
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "> **Note**\n",
84 | ">\n",
85 | "> * If you want to use precisions other than INT4(e.g. NF4/INT5/INT8,etc.), or know more details about the arguments, please refer to [API document](https://ipex-llm.readthedocs.io/en/latest/doc/PythonAPI/LLM/transformers.html) for more information. \n",
86 | ">\n",
87 | "> * `openlm-research/open_llama_3b_v2` is the **_model_id_** of the model `open_llama_3b_v2` on huggingface. When you set the `model_path` parameter of `from_pretrained` to this **_model_id_**, `from_pretrained` will automatically download the model from huggingface, cache it locally (e.g. `~/.cache/huggingface`), and load it. It may take a long time to download the model using this API. Alternatively, you can download the model yourself, and set `model_path` to the local path of the downloaded model. For more information, refer to the [`from_pretrained` document](https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained).\n",
88 | "\n",
89 | "\n",
90 | "### 3.2.2 Save & Load Optimized Model\n",
91 | "\n",
92 | "In the previous section, models loaded using the `Huggingface Transformers API` are typically stored with either fp32 or fp16 precision. To save model space and speedup loading processes, ipex-llm also provides the `save_low_bit` API for saving the model after low-bit optimization, and the `load_low_bit` API for loading the saved low-bit model.\n",
93 | "\n",
94 | "You can use `save_low_bit` once and use `load_low_bit` many times for inference. This approach bypasses the processes of loading the original FP32/FP16 model and optimization during inference stage, saving both memory and time. Moreover, because the optimized model format is platform-agnostic, you can seamlessly perform saving and loading operations across various machines, regardless of their operating systems. This flexibility enables you to perform optimization/saving on a high-RAM server and deploy the model for inference on a PC with limited RAM.\n",
95 | "\n",
96 | "\n",
97 | "**Save Optimized Model**\n",
98 | "\n",
99 | "For example, you can use the `save_low_bit` function to save the optimized model as below:"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 3,
105 | "metadata": {},
106 | "outputs": [],
107 | "source": [
108 | "save_directory = './open-llama-3b-v2-ipex-llm-INT4'\n",
109 | "\n",
110 | "model.save_low_bit(save_directory)\n",
111 | "del(model)"
112 | ]
113 | },
114 | {
115 | "attachments": {},
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "**Load Optimized Model**\n",
120 | "\n",
121 | "Then use `load_low_bit` to load the optimized low-bit model as follows:"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 4,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "# note that the AutoModelForCausalLM here is imported from ipex_llm.transformers\n",
131 | "model = AutoModelForCausalLM.load_low_bit(save_directory)"
132 | ]
133 | },
134 | {
135 | "attachments": {},
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "## 3.3 Building a Simple Chat Application\n",
140 | "\n",
141 | "Now that the model is successfully loaded, we can start building our very first chat application. We shall use the `Huggingface transformers` inference API to do this job.\n",
142 | "\n",
143 | "> **Note**\n",
144 | "> \n",
145 | "> The code in this section is solely implemented using `Huggingface transformers` API. `ipex-llm` does not require any change in the inference code so you can use any libraries to build your appliction at inference stage. \n",
146 | "\n",
147 | "> **Note**\n",
148 | "> \n",
149 | "> Here we use Q&A dialog prompt template so that it can answer our questions.\n",
150 | "\n",
151 | "\n",
152 | "> **Note**\n",
153 | "> \n",
154 | "> `max_new_tokens` parameter in the `generate` function defines the maximum number of tokens to predict. \n"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "from transformers import LlamaTokenizer\n",
164 | "\n",
165 | "tokenizer = LlamaTokenizer.from_pretrained(model_path)"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 6,
171 | "metadata": {},
172 | "outputs": [
173 | {
174 | "name": "stdout",
175 | "output_type": "stream",
176 | "text": [
177 | "-------------------- Output --------------------\n",
178 | "Q: What is CPU?\n",
179 | "A: CPU stands for Central Processing Unit. It is the brain of the computer.\n",
180 | "Q: What is RAM?\n",
181 | "A: RAM stands for Random Access Memory.\n"
182 | ]
183 | }
184 | ],
185 | "source": [
186 | "import torch\n",
187 | "\n",
188 | "with torch.inference_mode():\n",
189 | " prompt = 'Q: What is CPU?\\nA:'\n",
190 | " \n",
191 | " # tokenize the input prompt from string to token ids\n",
192 | " input_ids = tokenizer.encode(prompt, return_tensors=\"pt\")\n",
193 | " # predict the next tokens (maximum 32) based on the input token ids\n",
194 | " output = model.generate(input_ids, max_new_tokens=32)\n",
195 | " # decode the predicted token ids to output string\n",
196 | " output_str = tokenizer.decode(output[0], skip_special_tokens=True)\n",
197 | "\n",
198 | " print('-'*20, 'Output', '-'*20)\n",
199 | " print(output_str)"
200 | ]
201 | }
202 | ],
203 | "metadata": {
204 | "kernelspec": {
205 | "display_name": "Python 3 (ipykernel)",
206 | "language": "python",
207 | "name": "python3"
208 | },
209 | "language_info": {
210 | "codemirror_mode": {
211 | "name": "ipython",
212 | "version": 3
213 | },
214 | "file_extension": ".py",
215 | "mimetype": "text/x-python",
216 | "name": "python",
217 | "nbconvert_exporter": "python",
218 | "pygments_lexer": "ipython3",
219 | "version": "3.9.18"
220 | }
221 | },
222 | "nbformat": 4,
223 | "nbformat_minor": 4
224 | }
225 |
--------------------------------------------------------------------------------
/ch_3_AppDev_Basic/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 3 Application Development: Basics
2 |
3 | This chapter will get you started quickly with IPEX-LLM, and you'll learn how to build your very first LLM application.
4 |
5 | The accompanying notebook [3_OpenLlamaBasicApp.ipynb](./3_OpenLlamaBasicApp.ipynb) in this chapter introduces some essential APIs of IPEX-LLM and walks you through the process of building a basic chat application based on the `open_llama_3b_v2` model. [3_Baichuan2_BasicApp.ipynb](./3_Baichuan2_BasicApp.ipynb) demonstrates the same workflow with `Baichuan2-7B-Chat`.
6 |
7 | ## What's Next
8 |
9 | In [Chapter 4: Chinese Support](../ch_4_Chinese_Support/), you will learn how to use models with Chinese support for application development. In [Chapter 5: Application Development: Intermediate](../ch_5_AppDev_Intermediate/), you will learn more sophisticated skills for building a better chatbot, as well as how to do speech recognition.
--------------------------------------------------------------------------------
/ch_4_Chinese_Support/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/ipex-llm-tutorial/dcf1a80af8ddab03e03c48864432946c718abf35/ch_4_Chinese_Support/.keep
--------------------------------------------------------------------------------
/ch_4_Chinese_Support/4_1_ChatGLM2-6B.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Notebook 4.1: ChatGLM2-6B\n",
8 | "\n",
9 | "## 4.1.1 Overview\n",
10 | "This example shows how to run [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B) Chinese inference on low-cost PCs (without the need of discrete GPU) using [IPEX-LLM](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm) APIs. ChatGLM2-6B is the second-generation version of the open-source bilingual (Chinese-English) chat model [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B) proposed by [THUDM](https://github.com/THUDM). ChatGLM2-6B also can be found in [Huggingface models](https://huggingface.co/models) in following [link](https://huggingface.co/THUDM/chatglm2-6b).\n",
11 | "\n",
12 | "Before conducting inference, you may need to prepare environment according to [Chapter 2](../ch_2_Environment_Setup/README.md)."
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "## 4.1.2 Installation\n",
20 | "\n",
21 | "First of all, install IPEX-LLM in your prepared environment. For best practices of environment setup, refer to [Chapter 2](../ch_2_Environment_Setup/README.md) in this tutorial."
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "!pip install --pre --upgrade ipex-llm[all]"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "The all option is for installing other required packages by IPEX-LLM."
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "## 4.1.3 Load Model and Tokenizer\n",
45 | "\n",
46 | "### 4.1.3.1 Load Model\n",
47 | "\n",
48 | "Load ChatGLM2 model with low-bit optimization(INT4) for lower resource cost using IPEX-LLM APIs, which convert the relevant layers in the model into INT4 format.\n",
49 | "\n",
50 | "> **Note**\n",
51 | ">\n",
52 | "> IPEX-LLM has supported `AutoModel`, `AutoModelForCausalLM`, `AutoModelForSpeechSeq2Seq` and `AutoModelForSeq2SeqLM`. The AutoClasses help users automatically retrieve the relevant model, in this case, we can simply use `AutoModel` to load.\n",
53 | "\n",
54 | "> **Note**\n",
55 | ">\n",
56 | "> You can specify the argument `model_path` with both Huggingface repo id or local model path."
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "from ipex_llm.transformers import AutoModel\n",
66 | "\n",
67 | "model_path = \"THUDM/chatglm2-6b\"\n",
68 | "model = AutoModel.from_pretrained(model_path,\n",
69 | " load_in_4bit=True,\n",
70 | " trust_remote_code=True)"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "### 4.1.3.2 Load Tokenizer\n",
78 | "\n",
79 | "A tokenizer is also needed for LLM inference. It is used to encode input texts to tensors to feed to LLMs, and decode the LLM output tensors to texts. You can use [Huggingface transformers](https://huggingface.co/docs/transformers/index) API to load the tokenizer directly. It can be used seamlessly with models loaded by IPEX-LLM."
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 21,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "from transformers import AutoTokenizer\n",
89 | "\n",
90 | "tokenizer = AutoTokenizer.from_pretrained(model_path,\n",
91 | " trust_remote_code=True)"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "## 4.1.4 Inference\n",
99 | "\n",
100 | "### 4.1.4.1 Create Prompt Template\n",
101 | "\n",
102 | "Before generating, you need to create a prompt template. Here we give an example prompt template for question and answering refers to [ChatGLM2-6B prompt template](https://huggingface.co/THUDM/chatglm2-6b/blob/main/modeling_chatglm.py#L1007). You can tune the prompt based on your own model as well."
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 22,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "CHATGLM_V2_PROMPT_TEMPLATE = \"问:{prompt}\\n\\n答:\""
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "### 4.1.4.2 Generate\n",
119 | "\n",
120 | "Then, you can generate output with loaded model and tokenizer.\n",
121 | "\n",
122 | "> **Note**\n",
123 | "> \n",
124 | "> `max_new_tokens` parameter in the `generate` function defines the maximum number of tokens to predict. "
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 25,
130 | "metadata": {},
131 | "outputs": [
132 | {
133 | "name": "stdout",
134 | "output_type": "stream",
135 | "text": [
136 | "-------------------- Output --------------------\n",
137 | "问:AI是什么?\n",
138 | "\n",
139 | "答: AI是人工智能(Artificial Intelligence)的缩写,指的是一种能够模拟人类智能的技术或系统。AI系统可以通过学习、推理、解决问题等方式,实现类似于\n"
140 | ]
141 | }
142 | ],
143 | "source": [
144 | "import torch\n",
145 | "\n",
146 | "prompt = \"AI是什么?\"\n",
147 | "n_predict = 32\n",
148 | "\n",
149 | "with torch.inference_mode():\n",
150 | " prompt = CHATGLM_V2_PROMPT_TEMPLATE.format(prompt=prompt)\n",
151 | " input_ids = tokenizer.encode(prompt, return_tensors=\"pt\")\n",
152 | " output = model.generate(input_ids,\n",
153 | " max_new_tokens=n_predict)\n",
154 | " output_str = tokenizer.decode(output[0], skip_special_tokens=True)\n",
155 | " print('-'*20, 'Output', '-'*20)\n",
156 | " print(output_str)"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {},
162 | "source": [
163 | "### 4.1.4.3 Stream Chat\n",
164 | "\n",
165 | "ChatGLM2-6B support streaming output function `stream_chat`, which enable the model to provide a streaming response word by word. However, other models may not provide similar APIs, if you want to implement general streaming output function, please refer to [Chapter 5.1](../ch_5_AppDev_Intermediate/5_1_ChatBot.ipynb).\n",
166 | "\n",
167 | "> **Note**\n",
168 | ">\n",
169 | "> To successfully observe the text streaming behavior in standard output, we need to set the environment variable `PYTHONUNBUFFERED=1 `to ensure that the standard output streams are directly sent to the terminal without being buffered first."
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 6,
175 | "metadata": {},
176 | "outputs": [
177 | {
178 | "name": "stdout",
179 | "output_type": "stream",
180 | "text": [
181 | "-------------------- Stream Chat Output --------------------\n",
182 | "AI指的是人工智能,是一种能够通过学习和理解数据,以及应用适当的算法和数学模型,来执行与人类智能相似的任务的计算机程序。AI可以包括机器学习、自然语言处理、计算机视觉、专家系统、强化学习等不同类型的技术。\n",
183 | "\n",
184 | "AI的应用领域广泛,例如自然语言处理可用于语音识别、机器翻译、情感分析等;计算机视觉可用于人脸识别、图像识别、自动驾驶等;机器学习可用于预测、分类、聚类等数据分析任务。\n",
185 | "\n",
186 | "AI是一种非常有前途的技术,已经在许多领域产生了积极的影响,并随着技术的不断进步,将继续为我们的生活和工作带来更多的便利和改变。"
187 | ]
188 | }
189 | ],
190 | "source": [
191 | "import torch\n",
192 | "\n",
193 | "with torch.inference_mode():\n",
194 | " question = \"AI 是什么?\"\n",
195 | " response_ = \"\"\n",
196 | " print('-'*20, 'Stream Chat Output', '-'*20)\n",
197 | " for response, history in model.stream_chat(tokenizer, question, history=[]):\n",
198 | " print(response.replace(response_, \"\"), end=\"\")\n",
199 | " response_ = response"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "## 4.1.5 Use in LangChain\n",
207 | "\n",
208 | "[LangChain](https://python.langchain.com/docs/get_started/introduction.html) is a widely used framework for developing applications powered by language models. In this section, we will show how to integrate IPEX-LLM with LangChain. You can follow this [instruction](https://python.langchain.com/docs/get_started/installation) to prepare environment for LangChain."
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "If you need, install LangChain as following, or you can refer to [Chapter 8](../ch_8_AppDev_Advanced/README.md) for more information about LangChain integrations:"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "!pip install -U langchain==0.0.248"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "> **Note**\n",
232 | "> \n",
233 | "> We recommend to use `langchain==0.0.248`, which is verified in our tutorial."
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "### 4.1.5.1 Create Prompt Template\n",
241 | "\n",
242 | "Before inference, you need to create a prompt template. Here we give an example prompt template for question and answering, which contains two input variables, `history` and `human_input`. You can tune the prompt based on your own model as well."
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 10,
248 | "metadata": {},
249 | "outputs": [],
250 | "source": [
251 | "CHATGLM_V2_LANGCHAIN_PROMPT_TEMPLATE = \"\"\"{history}\\n\\n问:{human_input}\\n\\n答:\"\"\""
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {},
257 | "source": [
258 | "### 4.1.5.2 Prepare Chain"
259 | ]
260 | },
261 | {
262 | "cell_type": "markdown",
263 | "metadata": {},
264 | "source": [
265 | "Use [LangChain API](https://api.python.langchain.com/en/latest/api_reference.html) `LLMChain` to construct a chain for inference. Here we use IPEX-LLM APIs to construct a `LLM` object, which will load model with low-bit optimization automatically.\n",
266 | "\n",
267 | "> **Note**\n",
268 | ">\n",
269 | "> `ConversationBufferWindowMemory` is a type of memory in LangChain that keeps a sliding window of the most recent `k` interactions in a conversation."
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "from langchain import LLMChain, PromptTemplate\n",
279 | "from ipex_llm.langchain.llms import TransformersLLM\n",
280 | "from langchain.memory import ConversationBufferWindowMemory\n",
281 | "\n",
282 | "llm_model_path = \"THUDM/chatglm2-6b\" # the path to the huggingface llm model\n",
283 | "\n",
284 | "prompt = PromptTemplate(input_variables=[\"history\", \"human_input\"], template=CHATGLM_V2_LANGCHAIN_PROMPT_TEMPLATE)\n",
285 | "max_new_tokens = 128\n",
286 | "\n",
287 | "llm = TransformersLLM.from_model_id(\n",
288 | " model_id=llm_model_path,\n",
289 | " model_kwargs={\"trust_remote_code\": True},\n",
290 | ")\n",
291 | "\n",
292 | "# Following code are complete the same as the use-case\n",
293 | "llm_chain = LLMChain(\n",
294 | " llm=llm,\n",
295 | " prompt=prompt,\n",
296 | " verbose=True,\n",
297 | " llm_kwargs={\"max_new_tokens\":max_new_tokens},\n",
298 | " memory=ConversationBufferWindowMemory(k=2),\n",
299 | ")\n"
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "metadata": {},
305 | "source": [
306 | "### 4.1.5.3 Generate"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": 17,
312 | "metadata": {},
313 | "outputs": [
314 | {
315 | "name": "stdout",
316 | "output_type": "stream",
317 | "text": [
318 | "\n",
319 | "\n",
320 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
321 | "Prompt after formatting:\n",
322 | "\u001b[32;1m\u001b[1;3m\n",
323 | "\n",
324 | "问:AI 是什么?\n",
325 | "\n",
326 | "答:\u001b[0m\n",
327 | "AI指的是人工智能,是一种能够通过学习和理解数据,以及应用适当的算法和数学模型,来执行与人类智能相似的任务的技术。AI可以包括机器学习、自然语言处理、计算机视觉、知识表示、推理、决策等多种技术。\n",
328 | "\n",
329 | "\n",
330 | "\n",
331 | "\u001b[1m> Finished chain.\u001b[0m\n"
332 | ]
333 | }
334 | ],
335 | "source": [
336 | "text = \"AI 是什么?\"\n",
337 | "response_text = llm_chain.run(human_input=text,stop=\"\\n\\n\")\n"
338 | ]
339 | }
340 | ],
341 | "metadata": {
342 | "kernelspec": {
343 | "display_name": "cn-eval",
344 | "language": "python",
345 | "name": "python3"
346 | },
347 | "language_info": {
348 | "codemirror_mode": {
349 | "name": "ipython",
350 | "version": 3
351 | },
352 | "file_extension": ".py",
353 | "mimetype": "text/x-python",
354 | "name": "python",
355 | "nbconvert_exporter": "python",
356 | "pygments_lexer": "ipython3",
357 | "version": "3.9.18"
358 | },
359 | "orig_nbformat": 4
360 | },
361 | "nbformat": 4,
362 | "nbformat_minor": 2
363 | }
364 |
--------------------------------------------------------------------------------
/ch_4_Chinese_Support/4_2_Baichuan-13B.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Notebook 4.2: Baichuan-13B"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## 4.2.1 Overview\n",
15 | "\n",
16 | "This notebook shows how to run [Baichuan-13B](https://github.com/baichuan-inc/Baichuan-13B) Chinese inference on low-cost PCs (without the need of discrete GPU) using [IPEX-LLM](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm) APIs. Baichuan-13B is an open-source, commercially available large-scale language model developed by Baichuan Intelligent Technology following [Baichuan-7B](https://github.com/baichuan-inc/baichuan-7B). Baichuan-13B also can be found in [Huggingface models](https://huggingface.co/models) in following [link](https://huggingface.co/baichuan-inc/Baichuan-13B-Chat)."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "## 4.2.2 Installation\n",
24 | "\n",
25 | "First of all, install IPEX-LLM in your prepared environment. For best practices of environment setup, refer to [Chapter 2](../ch_2_Environment_Setup/README.md) in this tutorial."
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "!pip install --pre --upgrade ipex-llm[all]\n",
35 | "\n",
36 | "# Additional package required for Baichuan-13B-Chat to conduct generation\n",
37 | "!pip install -U transformers_stream_generator"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "The all option is for installing other required packages by IPEX-LLM."
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "## 4.2.3 Load Model and Tokenizer\n",
52 | "\n",
53 | "### 4.2.3.1 Load Model\n",
54 | "\n",
55 | "Load Baichuan model with low-bit optimization(INT4) for lower resource cost using IPEX-LLM APIs, which convert the relevant layers in the model into INT4 format. \n",
56 | "\n",
57 | "> **Note**\n",
58 | ">\n",
59 | "> You can specify the argument `model_path` with both Huggingface repo id or local model path."
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "from ipex_llm.transformers import AutoModelForCausalLM\n",
69 | "\n",
70 | "model_path = \"baichuan-inc/Baichuan-13B-Chat\"\n",
71 | "model = AutoModelForCausalLM.from_pretrained(model_path,\n",
72 | " load_in_4bit=True,\n",
73 | " trust_remote_code=True)"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "### 4.2.3.2 Load Tokenizer\n",
81 | "\n",
82 | "A tokenizer is also needed for LLM inference. It is used to encode input texts to tensors to feed to LLMs, and decode the LLM output tensors to texts. You can use [Huggingface transformers](https://huggingface.co/docs/transformers/index) API to load the tokenizer directly. It can be used seamlessly with models loaded by IPEX-LLM."
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 7,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "from transformers import AutoTokenizer\n",
92 | "tokenizer = AutoTokenizer.from_pretrained(model_path,\n",
93 | " trust_remote_code=True)"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "## 4.2.4 Inference"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "### 4.2.4.1 Create Prompt Template\n",
108 | "\n",
109 | "Before generating, you need to create a prompt template, we show an example of a template for question and answering here. You can tune the prompt based on your own model as well."
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 5,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "BAICHUAN_PROMPT_FORMAT = \"{prompt} \""
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "### 4.2.4.2 Generate\n",
126 | "\n",
127 | "Then, you can generate output with loaded model and tokenizer.\n",
128 | "\n",
129 | "> **Note**\n",
130 | ">\n",
131 | "> `max_new_tokens` parameter in the `generate` function defines the maximum number of tokens to predict."
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 8,
137 | "metadata": {},
138 | "outputs": [
139 | {
140 | "name": "stdout",
141 | "output_type": "stream",
142 | "text": [
143 | "-------------------- Output --------------------\n",
144 | "AI是什么? 人工智能(Artificial Intelligence,简称AI)是指由人制造出来的系统所表现出的智能,通常是通过计算机程序和传感器实现的\n"
145 | ]
146 | }
147 | ],
148 | "source": [
149 | "import torch\n",
150 | "\n",
151 | "prompt = \"AI是什么?\"\n",
152 | "n_predict = 32\n",
153 | "with torch.inference_mode():\n",
154 | " prompt = BAICHUAN_PROMPT_FORMAT.format(prompt=prompt)\n",
155 | " input_ids = tokenizer.encode(prompt, return_tensors=\"pt\")\n",
156 | " # if your selected model is capable of utilizing previous key/value attentions\n",
157 | " # to enhance decoding speed, but has `\"use_cache\": false` in its model config,\n",
158 | " # it is important to set `use_cache=True` explicitly in the `generate` function\n",
159 | " # to obtain optimal performance with IPEX-LLM INT4 optimizations\n",
160 | " output = model.generate(input_ids,\n",
161 | " max_new_tokens=n_predict)\n",
162 | " output_str = tokenizer.decode(output[0], skip_special_tokens=True)\n",
163 | " print('-'*20, 'Output', '-'*20)\n",
164 | " print(output_str)"
165 | ]
166 | }
167 | ],
168 | "metadata": {
169 | "kernelspec": {
170 | "display_name": "llm-zcg",
171 | "language": "python",
172 | "name": "python3"
173 | },
174 | "language_info": {
175 | "codemirror_mode": {
176 | "name": "ipython",
177 | "version": 3
178 | },
179 | "file_extension": ".py",
180 | "mimetype": "text/x-python",
181 | "name": "python",
182 | "nbconvert_exporter": "python",
183 | "pygments_lexer": "ipython3",
184 | "version": "3.9.18"
185 | },
186 | "orig_nbformat": 4
187 | },
188 | "nbformat": 4,
189 | "nbformat_minor": 2
190 | }
191 |
--------------------------------------------------------------------------------
/ch_4_Chinese_Support/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 4 Chinese Support
2 |
3 | This chapter explores the capability of large language models in handling multiple languages. Being able to support multiple languages is crucial for these models due to their wide-ranging use cases and real-world applications.
4 |
5 | Many popular models have support for multiple languages, such as: [ChatGPT](https://openai.com/blog/chatgpt), [ChatGLM](https://chatglm.cn/blog), [Baichuan](https://huggingface.co/baichuan-inc/Baichuan-13B-Chat), etc.
6 |
7 |
8 | We provide two notebooks showing the usage of two popular multi-language models, using Chinese capabilities for illustration.
9 |
10 | + [ChatGLM2-6B](4_1_ChatGLM2-6B.ipynb)
11 | + [Baichuan-13B](4_2_Baichuan-13B.ipynb)
12 |
--------------------------------------------------------------------------------
/ch_5_AppDev_Intermediate/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/ipex-llm-tutorial/dcf1a80af8ddab03e03c48864432946c718abf35/ch_5_AppDev_Intermediate/.keep
--------------------------------------------------------------------------------
/ch_5_AppDev_Intermediate/5_2_Speech_Recognition.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Notebook 5.2 Speech Recognition\n",
9 | "\n",
10 | "Speech recognition, also known as automatic speech recognition (ASR), is a technology that converts spoken words into written format or executes specific actions based on verbal commands. It involves machine learning models that analyze speech patterns, phonetics, and language structures to accurately transcribe and understand human speech.\n",
11 | "\n",
12 | "[Whisper](https://openai.com/research/whisper), published by OpenAI, is a popular open-source model for both ASR and speech translation. This means that Whisper has the capability to transcribe speech in multiple languages and facilitate translation from those languages into English.\n",
13 | "\n",
14 | "Due to its underlying Transformer-based encoder-decoder architecture, Whisper can be optimized effectively with IPEX-LLM INT4 optimizations. In this tutorial, we will guide you through building a speech recognition application on IPEX-LLM optimized Whisper model that can transcribe/translate audio files into text.\n",
15 | "\n",
16 | "## 5.2.1 Install Packages\n",
17 | "\n",
18 | "Follow instructions in [Chapter 2](../ch_2_Environment_Setup/README.md) to setup your environment if you haven't done so. Then install ipex-llm:"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "!pip install --pre --upgrade ipex-llm[all]"
28 | ]
29 | },
30 | {
31 | "attachments": {},
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "Due to the requirement to process audio file, you will also need to install the `librosa` package for audio analysis."
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "!pip install -U librosa"
45 | ]
46 | },
47 | {
48 | "attachments": {},
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "## 5.2.2 Download Audio Files\n",
53 | "\n",
54 | "To begin, let's prepare some audio files. As an example, you can download [an English example](https://huggingface.co/datasets/facebook/voxpopuli/viewer/en?row=3) from multilingual audio dataset [voxpopuli](https://huggingface.co/datasets/facebook/voxpopuli) and [one Chinese example](https://huggingface.co/datasets/carlot/AIShell?row=84) from the Chinese audio dataset [AIShell](https://huggingface.co/datasets/carlot/AIShell). Here, the English audio file and the Chinese audio file have been randomly selected. Feel free to choose different audio files according to your preferences."
55 | ]
56 | },
57 | {
58 | "attachments": {},
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "Here we rename the files to `audio_en.wav` and `audio_zh.wav` and put them in the current path.You could play the successfully-downloaded audio:"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "import IPython\n",
72 | "\n",
73 | "IPython.display.display(IPython.display.Audio(\"audio_en.wav\"))\n",
74 | "IPython.display.display(IPython.display.Audio(\"audio_zh.wav\"))"
75 | ]
76 | },
77 | {
78 | "attachments": {},
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "## 5.2.3 Load Pretrained Whisper Model\n",
83 | "\n",
84 | "Now, let's load a pretrained Whisper model, e.g. [whisper-medium](https://huggingface.co/openai/whisper-medium) as an example. OpenAI has released pretrained Whisper models in various sizes (including [whisper-small](https://huggingface.co/openai/whisper-small), [whisper-tiny](https://huggingface.co/openai/whisper-tiny), etc.), allowing you to choose the one that best fits your requirements. \n",
85 | "\n",
86 | "Simply use one-line `transformers`-style API in `ipex-llm` to load `whisper-medium` with INT4 optimizations (by specifying `load_in_4bit=True`) as follows. Please note that model class `AutoModelForSpeechSeq2Seq` is used for Whisper:"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "from ipex_llm.transformers import AutoModelForSpeechSeq2Seq\n",
96 | "\n",
97 | "model = AutoModelForSpeechSeq2Seq.from_pretrained(pretrained_model_name_or_path=\"openai/whisper-medium\",\n",
98 | " load_in_4bit=True)"
99 | ]
100 | },
101 | {
102 | "attachments": {},
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "## 5.2.4 Load Whisper Processor\n",
107 | "\n",
108 | "A Whisper processor is also needed for both audio pre-processing, and post-processing model outputs from tokens to texts. Just use the official `transformers` API to load `WhisperProcessor`:"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 3,
114 | "metadata": {
115 | "scrolled": true
116 | },
117 | "outputs": [],
118 | "source": [
119 | "from transformers import WhisperProcessor\n",
120 | "\n",
121 | "processor = WhisperProcessor.from_pretrained(pretrained_model_name_or_path=\"openai/whisper-medium\")"
122 | ]
123 | },
124 | {
125 | "attachments": {},
126 | "cell_type": "markdown",
127 | "metadata": {},
128 | "source": [
129 | "## 5.2.5 Transcribe English Audio\n",
130 | "\n",
131 | "Once you have optimized the Whisper model using IPEX-LLM with INT4 optimization and loaded the Whisper processor, you are ready to begin transcribing the audio through model inference.\n",
132 | "\n",
133 | "Let's start with the English audio file `audio_en.wav`. Before we feed it into Whisper processor, we need to extract sequence data from raw speech waveform:"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 6,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "import librosa\n",
143 | "\n",
144 | "data_en, sample_rate_en = librosa.load(\"audio_en.wav\", sr=16000)"
145 | ]
146 | },
147 | {
148 | "attachments": {},
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "> **Note**\n",
153 | ">\n",
154 | "> For `whisper-medium`, its `WhisperFeatureExtractor` (part of `WhisperProcessor`) extracts features from audio using a 16,000Hz sampling rate by default. It's important to load the audio file at the sample sampling rate with model's `WhisperFeatureExtractor` for precise recognition.\n",
155 | "\n",
156 | "We can then proceed to transcribe the audio file based on the sequence data, using exactly the same way as using official `transformers` API:"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 7,
162 | "metadata": {},
163 | "outputs": [
164 | {
165 | "name": "stdout",
166 | "output_type": "stream",
167 | "text": [
168 | "-------------------- English Transcription --------------------\n",
169 | "[' These are not easy issues to resolve.']\n"
170 | ]
171 | }
172 | ],
173 | "source": [
174 | "import torch\n",
175 | "import time\n",
176 | "\n",
177 | "# define task type\n",
178 | "forced_decoder_ids = processor.get_decoder_prompt_ids(language=\"english\", task=\"transcribe\")\n",
179 | "\n",
180 | "with torch.inference_mode():\n",
181 | " # extract input features for the Whisper model\n",
182 | " input_features = processor(data_en, sampling_rate=sample_rate_en, return_tensors=\"pt\").input_features\n",
183 | "\n",
184 | " # predict token ids for transcription\n",
185 | " predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids,max_new_tokens=200)\n",
186 | "\n",
187 | " # decode token ids into texts\n",
188 | " transcribe_str = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
189 | "\n",
190 | " print('-'*20, 'English Transcription', '-'*20)\n",
191 | " print(transcribe_str)"
192 | ]
193 | },
194 | {
195 | "attachments": {},
196 | "cell_type": "markdown",
197 | "metadata": {},
198 | "source": [
199 | "> **Note**\n",
200 | ">\n",
201 | "> `forced_decoder_ids` defines the context token for different language and task (transcribe or translate). If it is set to `None`, Whisper will automatically predict them.\n",
202 | "\n",
203 | "\n",
204 | "## 5.2.6 Transcribe Chinese Audio and Translate to English\n",
205 | "\n",
206 | "Then let's move to the Chinese audio `audio_zh.wav`. Whisper can transcribe multilingual audio, and translate them into English. The only difference here is to define specific context token through `forced_decoder_ids`:"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 8,
212 | "metadata": {},
213 | "outputs": [
214 | {
215 | "name": "stdout",
216 | "output_type": "stream",
217 | "text": [
218 | "-------------------- Chinese Transcription --------------------\n",
219 | "['这样能相对保障产品的质量']\n",
220 | "-------------------- Chinese to English Translation --------------------\n",
221 | "[' This can ensure the quality of the product relatively.']\n"
222 | ]
223 | }
224 | ],
225 | "source": [
226 | "# extract sequence data\n",
227 | "data_zh, sample_rate_zh = librosa.load(\"audio_zh.wav\", sr=16000)\n",
228 | "\n",
229 | "# define Chinese transcribe task\n",
230 | "forced_decoder_ids = processor.get_decoder_prompt_ids(language=\"chinese\", task=\"transcribe\")\n",
231 | "\n",
232 | "with torch.inference_mode():\n",
233 | " input_features = processor(data_zh, sampling_rate=sample_rate_zh, return_tensors=\"pt\").input_features\n",
234 | " predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)\n",
235 | " transcribe_str = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
236 | "\n",
237 | " print('-'*20, 'Chinese Transcription', '-'*20)\n",
238 | " print(transcribe_str)\n",
239 | "\n",
240 | "# define Chinese transcribe and translation task\n",
241 | "forced_decoder_ids = processor.get_decoder_prompt_ids(language=\"chinese\", task=\"translate\")\n",
242 | "\n",
243 | "with torch.inference_mode():\n",
244 | " input_features = processor(data_zh, sampling_rate=sample_rate_zh, return_tensors=\"pt\").input_features\n",
245 | " predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids, max_new_tokens=200)\n",
246 | " translate_str = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
247 | "\n",
248 | " print('-'*20, 'Chinese to English Translation', '-'*20)\n",
249 | " print(translate_str)"
250 | ]
251 | },
252 | {
253 | "attachments": {},
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "## 5.2.7 What's Next?\n",
258 | "\n",
259 | "In the upcoming chapter, we will explore the usage of IPEX-LLM in conjunction with langchain, a framework designed for developing applications with language models. With langchain integration, application development process could be simplified."
260 | ]
261 | }
262 | ],
263 | "metadata": {
264 | "kernelspec": {
265 | "display_name": "Python 3 (ipykernel)",
266 | "language": "python",
267 | "name": "python3"
268 | },
269 | "language_info": {
270 | "codemirror_mode": {
271 | "name": "ipython",
272 | "version": 3
273 | },
274 | "file_extension": ".py",
275 | "mimetype": "text/x-python",
276 | "name": "python",
277 | "nbconvert_exporter": "python",
278 | "pygments_lexer": "ipython3",
279 | "version": "3.11.9"
280 | }
281 | },
282 | "nbformat": 4,
283 | "nbformat_minor": 4
284 | }
285 |
--------------------------------------------------------------------------------
/ch_5_AppDev_Intermediate/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 5 Application Development: Intermediate
2 |
3 | You can use IPEX-LLM to load any Hugging Face *transformers* model and accelerate it on your laptop. With IPEX-LLM, PyTorch models (in FP16/BF16/FP32) hosted on Hugging Face can be loaded and optimized automatically with low-bit quantization (supported precisions include INT4/INT5/INT8).
4 |
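For a quick taste of what this looks like in code, here is a minimal sketch (the model id is only an example; the notebooks in this chapter use their own models):

```python
from ipex_llm.transformers import AutoModelForCausalLM

# any Hugging Face *transformers* causal LM id (or a local model path) can be used here
model = AutoModelForCausalLM.from_pretrained("openlm-research/open_llama_3b_v2", load_in_4bit=True)
```
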
5 | This chapter is a deeper dive into the IPEX-LLM `transformers`-style API, which is used to load and optimize Huggingface *transformers* models. You'll learn about the API usage and common practices, and how to create real-world applications using these APIs.
6 |
7 | Two notebooks are included in this chapter.
8 |
9 | In the notebook [5_1_ChatBot](./5_1_ChatBot.ipynb), you'll first learn how to use `transformers`-style API in different scenarios (e.g. save/load, precision choices, etc.), then proceed to build a chatbot application with streaming and multi-turn chat capabilities.
10 |
11 | In the notebook [5_2_Speech_Recognition](./5_2_Speech_Recognition.ipynb), you'll learn how to use IPEX-LLM to load a Transformer-based speech recognition model [Whisper](https://openai.com/research/whisper), and then use it to transcribe and translate audio files.
12 |
--------------------------------------------------------------------------------
/ch_6_GPU_Acceleration/6_1_GPU_Llama2-7B.md:
--------------------------------------------------------------------------------
1 | # 6.1 Run Llama 2 (7B) on Intel GPUs
2 |
3 | You can use IPEX-LLM to load any Hugging Face *transformers* model for acceleration on Intel GPUs. With IPEX-LLM, PyTorch models (in FP16/BF16/FP32) hosted on Hugging Face can be loaded and optimized automatically on Intel GPUs with low-bit quantization (supported precisions include INT4/NF4/INT5/INT8).
4 |
5 | In this tutorial, you will learn how to run LLMs on Intel GPUs with IPEX-LLM optimizations, and based on that build a stream chatbot. A popular open-source LLM [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) is used as an example.
6 |
7 | ## 6.1.1 Install IPEX-LLM on Intel GPUs
8 |
9 | First of all, install IPEX-LLM in your prepared environment. For best practices of environment setup on Intel GPUs, refer to the [README](./README.md#70-environment-setup) in this chapter.
10 |
11 | In terminal, run:
12 |
13 | ```bash
14 | pip install --pre --upgrade ipex-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
15 | ```
16 |
17 | > **Note**
18 | > If you are using an older version of `ipex-llm` (specifically, older than `2.5.0b20240104`), you need to manually add `import intel_extension_for_pytorch as ipex` at the beginning of your code.
19 |
20 | It is also required to set oneAPI environment variables for IPEX-LLM on Intel GPUs.
21 |
22 | ```bash
23 | # configure oneAPI environment variables
24 | source /opt/intel/oneapi/setvars.sh
25 | ```
26 |
27 | After installation and environment setup, let's move to the **Python scripts** of this tutorial.
28 |
29 | ## 6.1.2 (Optional) Download Llama 2 (7B)
30 |
31 | To download the [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) model from Hugging Face, you will need to obtain access granted by Meta. Please follow the instructions provided [here](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main) to request access to the model.
32 |
33 | After receiving access, download the model with your Hugging Face token:
34 |
35 | ```python
36 | from huggingface_hub import snapshot_download
37 |
38 | model_path = snapshot_download(repo_id='meta-llama/Llama-2-7b-chat-hf',
39 | token='hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX') # change it to your own Hugging Face access token
40 | ```
41 |
42 | > **Note**
43 | > The model will by default be downloaded to `HF_HOME='~/.cache/huggingface'`.
44 |
45 | ## 6.1.3 Load Model in Low Precision
46 |
47 | One common use case is to load a Hugging Face *transformers* model in low precision, i.e. conduct **implicit** quantization while loading.
48 |
49 | For Llama 2 (7B), you could simply import `ipex_llm.transformers.AutoModelForCausalLM` instead of `transformers.AutoModelForCausalLM`, and specify `load_in_4bit=True` or `load_in_low_bit` parameter accordingly in the `from_pretrained` function.
50 |
51 | For Intel GPUs, **once you have the model in low precision, move it to the GPU with `to('xpu')`.**
52 |
53 | **For INT4 Optimizations (with `load_in_4bit=True`):**
54 |
55 | ```python
56 | from ipex_llm.transformers import AutoModelForCausalLM
57 |
58 | # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function.
59 | # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
60 | model_in_4bit = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="meta-llama/Llama-2-7b-chat-hf",
61 | load_in_4bit=True)
62 | model_in_4bit_gpu = model_in_4bit.to('xpu')
63 | ```
64 |
65 | > **Note**
66 | > IPEX-LLM has supported `AutoModel`, `AutoModelForCausalLM`, `AutoModelForSpeechSeq2Seq` and `AutoModelForSeq2SeqLM`.
67 | >
68 | > If you have already downloaded the Llama 2 (7B) model as described in step [6.1.2](#612-optional-download-llama-2-7b), you could specify `pretrained_model_name_or_path` as the model path.
69 |
70 | **(Optional) For INT8 Optimizations (with `load_in_low_bit="sym_int8"`):**
71 |
72 | ```python
73 | from ipex_llm.transformers import AutoModelForCausalLM
74 |
75 | # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function.
76 | # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
77 | model_in_8bit = AutoModelForCausalLM.from_pretrained(
78 | pretrained_model_name_or_path="meta-llama/Llama-2-7b-chat-hf",
79 | load_in_low_bit="sym_int8"
80 | )
81 | model_in_8bit_gpu = model_in_8bit.to('xpu')
82 | ```
83 |
84 | > **Note**
85 | > * Currently, `load_in_low_bit` supports options `'sym_int4'`, `'asym_int4'`, `'sym_int5'`, `'asym_int5'` or `'sym_int8'`, in which 'sym' and 'asym' differentiate between symmetric and asymmetric quantization. Option `'nf4'` is also supported, referring to 4-bit NormalFloat. Floating point precisions `'fp4'`, `'fp8'`, `'fp16'` and mixed precisions including `'mixed_fp4'` and `'mixed_fp8'` are also supported.
86 | >
87 | > * `load_in_4bit=True` is equivalent to `load_in_low_bit='sym_int4'`.
88 |
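For instance, a minimal sketch of loading with one of the alternative precisions listed above (NF4 is used purely as an illustration; any other supported option can be passed in the same way):

```python
from ipex_llm.transformers import AutoModelForCausalLM

# illustrative only: load Llama 2 (7B) with 4-bit NormalFloat instead of symmetric INT4
model_in_nf4 = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="meta-llama/Llama-2-7b-chat-hf", load_in_low_bit="nf4")
model_in_nf4_gpu = model_in_nf4.to('xpu')
```
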
89 | ## 6.1.4 Load Tokenizer
90 |
91 | A tokenizer is also needed for LLM inference. You can use [Huggingface transformers](https://huggingface.co/docs/transformers/index) API to load the tokenizer directly. It can be used seamlessly with models loaded by IPEX-LLM. For Llama 2, the corresponding tokenizer class is `LlamaTokenizer`.
92 |
93 | ```python
94 | from transformers import LlamaTokenizer
95 |
96 | tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_name_or_path="meta-llama/Llama-2-7b-chat-hf")
97 | ```
98 |
99 | > **Note**
100 | > If you have already downloaded the Llama 2 (7B) model as described in step [6.1.2](#612-optional-download-llama-2-7b), you could specify `pretrained_model_name_or_path` as the model path.
101 |
102 | ## 6.1.5 Run Model
103 |
104 | You can then do model inference with IPEX-LLM optimizations on Intel GPUs in almost the same way as with the official `transformers` API. **The only difference is to call `to('xpu')` on the token ids**. A Q&A dialog template is created for the model to complete.
105 |
106 | ```python
107 | import torch
108 |
109 | with torch.inference_mode():
110 | prompt = 'Q: What is CPU?\nA:'
111 |
112 | # tokenize the input prompt from string to token ids;
113 | # with .to('xpu') specifically for inference on Intel GPUs
114 | input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
115 |
116 | # predict the next tokens (maximum 32) based on the input token ids
117 | output = model_in_4bit_gpu.generate(input_ids,
118 | max_new_tokens=32)
119 |
120 | # decode the predicted token ids to output string
121 | output = output.cpu()
122 | output_str = tokenizer.decode(output[0], skip_special_tokens=True)
123 |
124 | print('-'*20, 'Output', '-'*20)
125 | print(output_str)
126 | ```
127 |
128 | > **Note**
129 | > The initial generation of optimized LLMs on Intel GPUs could be slow. Therefore, it's advisable to perform a **warm-up** run before the actual generation.
130 | >
131 | > For the stream chat in the next section, we can treat the generation in this section (6.1.5) as a warm-up.
132 |
133 | ## 6.1.6 Stream Chat
134 |
135 | Now, let's build a stream chatbot that runs on Intel GPUs, allowing LLMs to engage in interactive conversations. Chatbot interaction is no magic - it still relies on the prediction and generation of next tokens by LLMs. To make LLMs chat, we need to properly format the prompts into a conversation format, for example:
136 |
137 | ```
138 | [INST] <<SYS>>
139 | You are a helpful, respectful and honest assistant, who always answers as helpfully as possible, while being safe.
140 | <</SYS>>
141 |
142 | What is AI? [/INST]
143 | ```
144 |
145 | Further, to enable a multi-turn chat experience, you need to append the new dialog input to the previous conversation to make a new prompt for the model, for example:
146 |
147 | ```
148 | [INST] <<SYS>>
149 | You are a helpful, respectful and honest assistant, who always answers as helpfully as possible, while being safe.
150 | <</SYS>>
151 | 
152 | What is AI? [/INST] AI is a term used to describe the development of computer systems that can perform tasks that typically require human intelligence, such as understanding natural language, recognizing images. [INST] Is it dangerous? [/INST]
153 | ```
154 |
155 | Here we show a multi-turn chat example with stream capability on IPEX-LLM optimized Llama 2 (7B) model.
156 |
157 | First, define the conversation context format[^1] for the model to complete:
158 |
159 | ```python
160 | SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant, who always answers as helpfully as possible, while being safe."
161 |
162 | def format_prompt(input_str, chat_history):
163 |     prompt = [f'[INST] <<SYS>>\n{SYSTEM_PROMPT}\n<</SYS>>\n\n']
164 | do_strip = False
165 | for history_input, history_response in chat_history:
166 | history_input = history_input.strip() if do_strip else history_input
167 | do_strip = True
168 | prompt.append(f'{history_input} [/INST] {history_response.strip()} [INST] ')
169 | input_str = input_str.strip() if do_strip else input_str
170 | prompt.append(f'{input_str} [/INST]')
171 | return ''.join(prompt)
172 | ```
173 |
174 | [^1]: The conversation context format is referenced from [here](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/323df5680706d388eff048fba2f9c9493dfc0152/model.py#L20) and [here](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/323df5680706d388eff048fba2f9c9493dfc0152/app.py#L9).
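
As a quick illustration (the history contents below are made up for demonstration), calling `format_prompt` with one previous turn produces a prompt in the conversation format shown above:

```python
# hypothetical chat history: one previous (user input, model response) pair
history = [("What is AI?", "AI refers to computer systems that can perform tasks that typically require human intelligence.")]

print(format_prompt("Is it dangerous?", history))
# [INST] <<SYS>>
# You are a helpful, respectful and honest assistant, ...
# <</SYS>>
#
# What is AI? [/INST] AI refers to ... [INST] Is it dangerous? [/INST]
```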
175 |
176 | Next, define the `stream_chat` function, which continuously adds model outputs to the chat history. This ensures that the conversation context can be properly formatted for the next round of response generation. Here, the response is generated in a streaming (word-by-word) way:
177 |
178 | ```python
179 | from transformers import TextIteratorStreamer
180 |
181 | def stream_chat(model, tokenizer, input_str, chat_history):
182 | # format conversation context as prompt through chat history
183 | prompt = format_prompt(input_str, chat_history)
184 | input_ids = tokenizer([prompt], return_tensors='pt').to('xpu') # specify to('xpu') for Intel GPUs
185 |
186 | streamer = TextIteratorStreamer(tokenizer,
187 | skip_prompt=True, # skip prompt in the generated tokens
188 | skip_special_tokens=True)
189 |
190 | generate_kwargs = dict(
191 | input_ids,
192 | streamer=streamer,
193 | max_new_tokens=128
194 | )
195 |
196 |     # to ensure non-blocking access to the generated text, the generation process should be run in a separate thread
197 | from threading import Thread
198 |
199 | thread = Thread(target=model.generate, kwargs=generate_kwargs)
200 | thread.start()
201 |
202 | output_str = []
203 | print("Response: ", end="")
204 | for stream_output in streamer:
205 | output_str.append(stream_output)
206 | print(stream_output, end="")
207 |
208 | # add model output to the chat history
209 | chat_history.append((input_str, ''.join(output_str)))
210 | ```
211 |
212 | > **Note**
213 | > To successfully observe the text streaming behavior in standard output, we need to set the environment variable `PYTHONUNBUFFERED=1` to ensure that the standard output streams are directly sent to the terminal without being buffered first.
214 | >
215 | > The [Hugging Face *transformers* streamer classes](https://huggingface.co/docs/transformers/main/generation_strategies#streaming) are still under development and are subject to change in the future.
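
For example, assuming the stream chat code in this section is saved as a script named `stream_chat_llama2.py` (a hypothetical file name), you could run it with unbuffered output like this:

```bash
PYTHONUNBUFFERED=1 python stream_chat_llama2.py
```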
216 |
217 | We can then achieve interactive, multi-turn stream chat between humans and the bot by allowing continuous user input:
218 |
219 | ```python
220 | chat_history = []
221 |
222 | print('-'*20, 'Stream Chat', '-'*20, end="")
223 | while True:
224 | with torch.inference_mode():
225 | print("\n", end="")
226 | user_input = input("Input: ")
227 |         if user_input == "stop": # stop the conversation when the user inputs "stop"
228 | print("Stream Chat with Llama 2 (7B) stopped.")
229 | break
230 | stream_chat(model=model_in_4bit_gpu,
231 | tokenizer=tokenizer,
232 | input_str=user_input,
233 | chat_history=chat_history)
234 | ```
--------------------------------------------------------------------------------
/ch_6_GPU_Acceleration/6_2_GPU_Baichuan2-7B.md:
--------------------------------------------------------------------------------
1 | # 6.2 Run Baichuan 2 (7B) on Intel GPUs
2 |
3 | You can use IPEX-LLM to load any ModelScope model for acceleration on Intel GPUs. With IPEX-LLM, PyTorch models (in FP16/BF16/FP32) hosted on ModelScope can be loaded and optimized automatically on Intel GPUs with low-bit quantization (supported precisions include INT4/NF4/INT5/FP8/INT8).
4 |
5 | In this tutorial, you will learn how to run LLMs on Intel GPUs with IPEX-LLM optimizations, and based on that build a stream chatbot. A popular open-source LLM [baichuan-inc/Baichuan2-7B-Chat](https://www.modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat) is used as an example.
6 |
7 | > [!NOTE]
8 | > Please make sure that you have prepared the environment for IPEX-LLM on GPU before you start. Refer to [here](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install_gpu.html) for more information on installation and environment preparation. Besides, to load models from the ModelScope Hub, you also need to `pip install modelscope==1.11.0`.
9 |
10 |
11 | ## 6.2.1 Load Model in Low Precision
12 |
13 | One common use case is to load a model from the [ModelScope hub](https://www.modelscope.cn/models) with IPEX-LLM low-bit precision optimization. For Baichuan 2 (7B), you could simply import `ipex_llm.transformers.AutoModelForCausalLM` instead of `transformers.AutoModelForCausalLM`, and specify `load_in_4bit=True` or the `load_in_low_bit` parameter accordingly in the `from_pretrained` function. It is also important to set `model_hub='modelscope'`; otherwise the model hub defaults to Hugging Face.
14 |
15 | For Intel GPUs, **once you have the model in low precision, move it to the GPU with `to('xpu')`.**
16 |
17 | **For INT4 Optimizations (with `load_in_4bit=True`):**
18 |
19 | ```python
20 | from ipex_llm.transformers import AutoModelForCausalLM
21 |
22 | model_in_4bit = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="baichuan-inc/Baichuan2-7B-Chat",
23 | load_in_4bit=True,
24 | trust_remote_code=True,
25 | use_cache=True,
26 | model_hub='modelscope')
27 | model_in_4bit_gpu = model_in_4bit.to('xpu')
28 | ```
29 |
30 | > [!NOTE]
31 | > * IPEX-LLM supports `AutoModel`, `AutoModelForCausalLM`, `AutoModelForSpeechSeq2Seq`, `AutoModelForSeq2SeqLM`, etc.
32 | >
33 | > If you have already downloaded the Baichuan 2 (7B) model, you could specify `pretrained_model_name_or_path` to the model path.
34 | >
35 | > * Currently, `load_in_low_bit` supports options `'sym_int4'`, `'asym_int4'`, `'sym_int8'`, `'nf4'`, `'fp6'`, `'fp8'`, `'fp16'`, etc., in which `'sym_int4'` means symmetric int 4, `'asym_int4'` means asymmetric int 4, and `'nf4'` means 4-bit NormalFloat. The relevant low-bit optimizations will be applied to the model (an `'nf4'` example is sketched after this note).
36 | >
37 | > `load_in_4bit=True` is equivalent to `load_in_low_bit='sym_int4'`.
38 | >
39 | > * For Windows users running LLMs on Intel iGPUs, we recommend setting `cpu_embedding=True` in the `from_pretrained` function.
40 | >
41 | > This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
42 | >
43 | > * You could refer to the [API documentation](https://ipex-llm.readthedocs.io/en/latest/doc/PythonAPI/LLM/transformers.html) for more information.
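
As a sketch of the `load_in_low_bit` usage described in the note above (the precision choice is purely illustrative), loading Baichuan 2 (7B) with 4-bit NormalFloat instead of symmetric INT4 might look like:

```python
from ipex_llm.transformers import AutoModelForCausalLM

# same call as before, but request NF4 precision explicitly via load_in_low_bit
model_in_nf4 = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="baichuan-inc/Baichuan2-7B-Chat",
                                                     load_in_low_bit="nf4",
                                                     trust_remote_code=True,
                                                     use_cache=True,
                                                     model_hub='modelscope')
model_in_nf4_gpu = model_in_nf4.to('xpu')
```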
44 |
45 | ## 6.2.2 Load Tokenizer
46 |
47 | A tokenizer is also needed for LLM inference. You can use [ModelScope Library](https://www.modelscope.cn/docs/ModelScope%20Library%E6%A6%82%E8%A7%88%E4%BB%8B%E7%BB%8D) to load the tokenizer directly. It can be used seamlessly with models loaded by IPEX-LLM. For Baichuan 2, the corresponding tokenizer class is `AutoTokenizer`.
48 |
49 | ```python
50 | from modelscope import AutoTokenizer
51 |
52 | tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="baichuan-inc/Baichuan2-7B-Chat",
53 | trust_remote_code=True)
54 | ```
55 |
56 | > [!NOTE]
57 | > If you have already downloaded the Baichuan 2 (7B) model, you could specify `pretrained_model_name_or_path` to the model path.
58 |
59 | ## 6.2.3 Run Model
60 |
61 | You can then do model inference with IPEX-LLM optimizations on Intel GPUs in almost the same way as with the official `transformers` API. **The only difference is moving the token ids to the GPU with `to('xpu')`**. A Q&A dialog template is created for the model to complete.
62 |
63 | ```python
64 | import torch
65 |
66 | with torch.inference_mode():
67 | prompt = 'Q: What is CPU?\nA:'
68 |
69 | # tokenize the input prompt from string to token ids;
70 | # with .to('xpu') specifically for inference on Intel GPUs
71 | input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
72 |
73 | # predict the next tokens (maximum 32) based on the input token ids
74 | output = model_in_4bit_gpu.generate(input_ids,
75 | max_new_tokens=32)
76 |
77 | # decode the predicted token ids to output string
78 | output = output.cpu()
79 | output_str = tokenizer.decode(output[0], skip_special_tokens=True)
80 |
81 | print('-'*20, 'Output', '-'*20)
82 | print(output_str)
83 | ```
84 |
85 | > [!NOTE]
86 | > The first time each model runs on an Intel iGPU, Intel Arc™ A300-Series, or Pro A60 GPU, it may take several minutes to compile.
87 | >
88 | > The initial generation of optimized LLMs on Intel GPUs could be slow. Therefore, it's advisable to perform a **warm-up** run before the actual generation.
89 | >
90 | > For the stream chat in the next section, this round of generation (section 6.2.3) can be treated as the warm-up.
91 |
92 | ## 6.2.4 Stream Chat
93 |
94 | Now, let's build a stream chatbot that runs on Intel GPUs, allowing LLMs to engage in interactive conversations. Chatbot interaction is no magic - it still relies on the prediction and generation of next tokens by LLMs. We will use Baichuan 2's built-in `chat` function to build a stream chatbot here.
95 |
96 | ```python
97 | chat_history = []
98 |
99 | print('-'*20, 'Stream Chat', '-'*20, end="\n")
100 | while True:
101 | prompt = input("Input: ")
102 |     if prompt.strip() == "stop": # stop the conversation when the user inputs "stop"
103 | print("Stream Chat with Baichuan 2 (7B) stopped.")
104 | break
105 | chat_history.append({"role": "user", "content": prompt})
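    # Baichuan 2's `chat` with stream=True yields the full response generated so far on each iteration;
    # `position` tracks how much of the response has already been printed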
106 | position = 0
107 | for response in model_in_4bit_gpu.chat(tokenizer, chat_history, stream=True):
108 | print(response[position:], end='', flush=True)
109 | position = len(response)
110 | print()
111 | chat_history.append({"role": "assistant", "content": response})
112 | ```
113 |
114 | > [!NOTE]
115 | > To successfully observe the text streaming behavior in standard output, we need to set the environment variable `PYTHONUNBUFFERED=1` to ensure that the standard output streams are directly sent to the terminal without being buffered first.
116 |
--------------------------------------------------------------------------------
/ch_6_GPU_Acceleration/6_3_GPU_Whisper-medium.md:
--------------------------------------------------------------------------------
1 | # 6.3 Run Whisper (medium) on Intel GPUs
2 |
3 | You can use IPEX-LLM to load Transformer-based automatic speech recognition (ASR) models for acceleration on Intel GPUs. With IPEX-LLM, PyTorch models (in FP16/BF16/FP32) for ASR can be loaded and optimized automatically on Intel GPUs with low-bit quantization (supported precisions include INT4/NF4/INT5/FP6/FP8/INT8).
4 |
5 | In this tutorial, you will learn how to run speech models on Intel GPUs with IPEX-LLM optimizations, and based on that build a speech recognition application. A popular open-source model for both ASR and speech translation, [openai/whisper-medium](https://huggingface.co/openai/whisper-medium) is used as an example.
6 |
7 | > [!NOTE]
8 | > Please make sure that you have prepared the environment for IPEX-LLM on GPU before you start. Refer to [here](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install_gpu.html) for more information on installation and environment preparation. Besides, to process audio files, you also need to install `librosa` by running `pip install -U librosa`.
9 |
10 | ## 6.3.1 Download Audio Files
11 | To start with, the first thing to do is to prepare some audio files for this demo. As an example, you can download an English sample from the multilingual audio dataset [voxpopuli](https://huggingface.co/datasets/facebook/voxpopuli) and a Chinese sample from the Chinese audio dataset [AIShell](https://huggingface.co/datasets/carlot/AIShell). You are free to pick other recording clips from within or outside these datasets; one possible way to fetch them is sketched below.
12 |
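As one possible way to prepare these files (a rough sketch, not part of the original tutorial: the dataset name, config, split and the extra `soundfile` dependency are assumptions that may need adjustment for your `datasets` version), you could pull a single English clip from voxpopuli and save it as `audio_en.wav`:

```python
from datasets import load_dataset
import soundfile as sf

# stream one English sample from the voxpopuli dataset and write it to a local WAV file
sample = next(iter(load_dataset("facebook/voxpopuli", "en", split="validation",
                                streaming=True, trust_remote_code=True)))
sf.write("audio_en.wav", sample["audio"]["array"], sample["audio"]["sampling_rate"])
```

Any short recordings saved as `audio_en.wav` and `audio_zh.wav` in the working directory will work for the rest of this section.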
13 |
14 | ## 6.3.2 Load Model in Low Precision
15 |
16 | One common use case is to load a model from Hugging Face with IPEX-LLM low-bit precision optimization. For Whisper (medium), you could simply import `ipex_llm.transformers.AutoModelForSpeechSeq2Seq` instead of `transformers.AutoModelForSpeechSeq2Seq`, and specify `load_in_4bit=True` parameter accordingly in the `from_pretrained` function.
17 |
18 | For Intel GPUs, **once you have the model in low precision, move it to the GPU with `to('xpu')`.**
19 |
20 | **For INT4 Optimizations (with `load_in_4bit=True`):**
21 |
22 | ```python
23 | from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
24 |
25 | model_in_4bit = AutoModelForSpeechSeq2Seq.from_pretrained(
26 | pretrained_model_name_or_path="openai/whisper-medium", load_in_4bit=True
27 | )
28 | model_in_4bit_gpu = model_in_4bit.to("xpu")
29 | ```
30 |
31 | > [!NOTE]
32 | > * IPEX-LLM supports `AutoModel`, `AutoModelForCausalLM`, `AutoModelForSpeechSeq2Seq`, `AutoModelForSeq2SeqLM`, etc.
33 | >
34 | > If you have already downloaded the Whisper (medium) model, you could specify `pretrained_model_name_or_path` to the model path.
35 | >
36 | > * Currently, `load_in_low_bit` supports options `'sym_int4'`, `'asym_int4'`, `'sym_int8'`, `'nf4'`, `'fp6'`, `'fp8'`, `'fp16'`, etc., in which `'sym_int4'` means symmetric int 4, `'asym_int4'` means asymmetric int 4, and `'nf4'` means 4-bit NormalFloat. The relevant low-bit optimizations will be applied to the model.
37 | >
38 | > `load_in_4bit=True` is equivalent to `load_in_low_bit='sym_int4'`.
39 | >
40 | > * For Windows users running LLMs on Intel iGPUs, we recommend setting `cpu_embedding=True` in the `from_pretrained` function.
41 | >
42 | > This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
43 | >
44 | > * You could refer to the [API documentation](https://ipex-llm.readthedocs.io/en/latest/doc/PythonAPI/LLM/transformers.html) for more information.
45 |
46 | ## 6.3.3 Load Whisper Processor
47 |
48 | A Whisper processor is also needed, both for audio pre-processing and for post-processing model outputs from tokens to text. IPEX-LLM does not provide a customized implementation for it, so you can use the official `transformers` API to load `WhisperProcessor`:
49 |
50 | ```python
51 | from transformers import WhisperProcessor
52 |
53 | processor = WhisperProcessor.from_pretrained(pretrained_model_name_or_path="openai/whisper-medium")
54 | ```
55 |
56 | > [!NOTE]
57 | > If you have already downloaded the Whisper (medium) model, you could specify `pretrained_model_name_or_path` to the model path.
58 |
59 | ## 6.3.4 Run Model to Transcribe English Audio
60 |
61 | Once you have optimized the Whisper model using IPEX-LLM with INT4 optimization and loaded the Whisper processor, you are ready to begin transcribing the audio through model inference.
62 |
63 | Let's start with the English audio file `audio_en.wav`, taken from [voxpopuli](https://huggingface.co/datasets/facebook/voxpopuli) dataset. Before we feed it into Whisper processor, we need to extract sequence data from raw speech waveform:
64 |
65 | ```python
66 | import librosa
67 |
68 | data_en, sample_rate_en = librosa.load("audio_en.wav", sr=16000)
69 | ```
70 |
71 | > [!NOTE]
72 | > For `whisper-medium`, its `WhisperFeatureExtractor` (part of `WhisperProcessor`) extracts features from audio using a 16,000 Hz sampling rate by default. It's important to load the audio file at the same sampling rate that the model's `WhisperFeatureExtractor` expects for precise recognition.
73 | >
74 |
75 | We can then proceed to transcribe the audio file based on the sequence data, in exactly the same way as with the official `transformers` API:
76 |
77 | ```python
78 | import torch
79 | import time
80 |
81 | # define task type
82 | forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")
83 |
84 | with torch.inference_mode():
85 | # extract input features for the Whisper model
86 | input_features = processor(data_en, sampling_rate=sample_rate_en, return_tensors="pt").input_features.to('xpu')
87 |
88 | # predict token ids for transcription
89 |     predicted_ids = model_in_4bit_gpu.generate(input_features, forced_decoder_ids=forced_decoder_ids, max_new_tokens=200)
90 |
91 | # decode token ids into texts
92 | transcribe_str = processor.batch_decode(predicted_ids, skip_special_tokens=True)
93 |
94 | print('-'*20, 'English Transcription', '-'*20)
95 | print(transcribe_str)
96 | ```
97 |
98 | > [!NOTE]
99 | > `forced_decoder_ids` defines the context tokens for the language and task (transcribe or translate). If it is set to `None`, Whisper will predict them automatically.
100 | >
101 |
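For instance, as a small illustrative variant (not from the original tutorial), you could let Whisper infer the language and task on its own by passing `forced_decoder_ids=None`:

```python
with torch.inference_mode():
    # let Whisper predict the language/task context tokens automatically
    predicted_ids = model_in_4bit_gpu.generate(input_features,
                                               forced_decoder_ids=None,
                                               max_new_tokens=200)
    print(processor.batch_decode(predicted_ids, skip_special_tokens=True))
```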
102 |
103 | ## 6.3.5 Run Model to Transcribe Chinese Audio and Translate to English
104 |
105 | Next, let's move to the Chinese audio `audio_zh.wav`, which is randomly taken from the [AIShell](https://huggingface.co/datasets/carlot/AIShell) dataset. Whisper offers the capability to transcribe multilingual audio files and translate the recognized text into English. The only difference here is to define the specific context tokens through `forced_decoder_ids`:
106 |
107 | ```python
108 | # extract sequence data
109 | data_zh, sample_rate_zh = librosa.load("audio_zh.wav", sr=16000)
110 |
111 | # define Chinese transcribe task
112 | forced_decoder_ids = processor.get_decoder_prompt_ids(language="chinese", task="transcribe")
113 |
114 | with torch.inference_mode():
115 | input_features = processor(data_zh, sampling_rate=sample_rate_zh, return_tensors="pt").input_features.to('xpu')
116 |     predicted_ids = model_in_4bit_gpu.generate(input_features, forced_decoder_ids=forced_decoder_ids)
117 | transcribe_str = processor.batch_decode(predicted_ids, skip_special_tokens=True)
118 |
119 | print('-'*20, 'Chinese Transcription', '-'*20)
120 | print(transcribe_str)
121 |
122 | # define Chinese transcribe and translation task
123 | forced_decoder_ids = processor.get_decoder_prompt_ids(language="chinese", task="translate")
124 |
125 | with torch.inference_mode():
126 | input_features = processor(data_zh, sampling_rate=sample_rate_zh, return_tensors="pt").input_features.to('xpu')
127 |     predicted_ids = model_in_4bit_gpu.generate(input_features, forced_decoder_ids=forced_decoder_ids, max_new_tokens=200)
128 | translate_str = processor.batch_decode(predicted_ids, skip_special_tokens=True)
129 |
130 | print('-'*20, 'Chinese to English Translation', '-'*20)
131 | print(translate_str)
132 | ```
133 |
--------------------------------------------------------------------------------
/ch_6_GPU_Acceleration/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 6 GPU Acceleration
2 |
3 | Apart from its significant acceleration capabilities on Intel CPUs, IPEX-LLM also supports optimizations and acceleration for running LLMs (large language models) on Intel GPUs.
4 | 
5 | IPEX-LLM supports optimizations of any [*HuggingFace transformers*](https://huggingface.co/docs/transformers/index) model on Intel GPUs with the help of low-bit techniques, modern hardware acceleration and the latest software optimizations.
6 |
7 | #### 6B model running on Intel Arc GPU (real-time screen capture):
8 |
9 | *(real-time screen capture demo omitted)*
13 |
14 | #### 13B model running on Intel Arc GPU (real-time screen capture):
15 |
16 | *(real-time screen capture demo omitted)*
20 |
21 | In Chapter 6, you will learn how to run LLMs, as well as implement stream chat functionalities, using IPEX-LLM optimizations on Intel GPUs. Popular open source models are used as examples:
22 |
23 | + [Llama2-7B](./6_1_GPU_Llama2-7B.md)
24 | + [Baichuan2-7B](./6_2_GPU_Baichuan2-7B.md)
+ [Whisper-medium](./6_3_GPU_Whisper-medium.md)
25 |
26 |
27 | ## 6.0 System Support
28 | ### 1. Linux:
29 | **Hardware**:
30 | - Intel Arc™ A-Series Graphics
31 | - Intel Data Center GPU Flex Series
32 | - Intel Data Center GPU Max Series
33 |
34 | **Operating System**:
35 | - Ubuntu 20.04 or later (Ubuntu 22.04 is preferred)
36 |
37 | ### 2. Windows
38 |
39 | **Hardware**:
40 | - Intel iGPU and dGPU
41 |
42 | **Operating System**:
43 | - Windows 10/11, with or without WSL
44 |
45 |
46 | ## 6.1 Environment Setup
47 |
48 | Please refer to the [GPU installation guide](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install_gpu.html) for more details. It is strongly recommended that you follow the steps in [environment_setup.md](./environment_setup.md) to configure your environment properly.
--------------------------------------------------------------------------------
/ch_6_GPU_Acceleration/environment_setup.md:
--------------------------------------------------------------------------------
1 | ## Environment setup for Intel Arc GPU
2 | For Linux users, Ubuntu 22.04 with Linux kernel 5.19.0 is preferred; Ubuntu 22.04 with kernel 5.19.0-41-generic is what we mostly use in our test environment. However, the default kernel of Ubuntu 22.04.3 is 6.2.0-35-generic, so we recommend downgrading the kernel to 5.19.0-41-generic to achieve the best performance. Below is an introduction to several important steps; please refer to the [GPU installation guide](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install_gpu.html) for full instructions on environment setup.
3 |
4 |
5 | ### 1. Downgrade kernels
6 | Here are the steps to downgrade your kernel:
7 | ```bash
8 | # downgrade kernel to 5.19.0-41-generic
9 |
10 | sudo apt-get update && sudo apt-get install -y --install-suggests linux-image-5.19.0-41-generic
11 |
12 | sudo sed -i "s/GRUB_DEFAULT=.*/GRUB_DEFAULT=\"1> $(echo $(($(awk -F\' '/menuentry / {print $2}' /boot/grub/grub.cfg \
13 | | grep -no '5.19.0-41' | sed 's/:/\n/g' | head -n 1)-2)))\"/" /etc/default/grub
14 |
15 | sudo update-grub
16 |
17 | sudo reboot
18 | # Note: the 5.19 kernel does not ship with an Arc graphics driver, so the desktop may not start correctly after reboot; you can still log in via ssh.
19 | # Alternatively, select the 5.19 kernel's recovery mode in GRUB, then choose "resume" to resume the normal boot directly.
20 | ```
21 | **Notice: since the 5.19 kernel does not ship with the proper Arc graphics driver, the machine may not start the desktop correctly. You can still log in via ssh, or select the 5.19 kernel's recovery mode in GRUB and choose resume to boot normally.**
22 | Optionally, you can remove the 6.2.0 kernel if you no longer need it:
23 | ```bash
24 | # remove latest kernel (optional)
25 | sudo apt purge linux-image-6.2.0-*
26 | sudo apt autoremove
27 | sudo reboot
28 | ```
29 |
30 | ### 2. Install GPU driver
31 | Here are the steps to install the GPU driver:
32 | ```bash
33 | # install drivers
34 | # setup driver's apt repository
35 | sudo apt-get install -y gpg-agent wget
36 | wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \
37 | sudo gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
38 | echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy client" | \
39 | sudo tee /etc/apt/sources.list.d/intel-gpu-jammy.list
40 |
41 | sudo apt-get update
42 |
43 | sudo apt-get -y install \
44 | gawk \
45 | dkms \
46 | linux-headers-$(uname -r) \
47 | libc6-dev
48 |
49 | sudo apt install intel-i915-dkms=1.23.5.19.230406.21.5.17.0.1034+i38-1 intel-platform-vsec-dkms=2023.20.0-21 intel-platform-cse-dkms=2023.11.1-36 intel-fw-gpu=2023.39.2-255~22.04
50 |
51 | sudo apt-get install -y gawk libc6-dev udev\
52 | intel-opencl-icd intel-level-zero-gpu level-zero \
53 | intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
54 | libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
55 | libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
56 | mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo
57 |
58 | sudo reboot
59 |
60 | # Configuring permissions
61 |
62 | sudo gpasswd -a ${USER} render
63 | newgrp render
64 |
65 | # Verify the device is working with i915 driver
66 | sudo apt-get install -y hwinfo
67 | hwinfo --display
68 | ```
69 |
70 | ### 3. Install oneAPI and IPEX-LLM
71 | ```bash
72 | # config oneAPI repository
73 | wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
74 | echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
75 | sudo apt update
76 | ```
77 | Before you install oneAPI, please check which PyTorch version you will use: PyTorch 2.0 and PyTorch 2.1 require different oneAPI versions, so the matching oneAPI packages need to be installed accordingly.
78 | 
79 | **PyTorch 2.1** requires oneAPI 2024.0, which you can install as follows:
80 | ```bash
81 | sudo apt install -y intel-basekit # for torch 2.1 and ipex 2.1
82 | # to install ipex-llm, install conda first, then:
83 | conda create -n llm python=3.9
84 | conda activate llm
85 | pip install --pre --upgrade ipex-llm[xpu_2.1] -f https://developer.intel.com/ipex-whl-stable-xpu
86 | ```
87 |
88 | **PyTorch 2.0** requires oneAPI 2023.2, which you can install as follows:
89 | ```bash
90 | sudo apt install -y intel-oneapi-common-vars=2023.2.0-49462 \
91 | intel-oneapi-compiler-cpp-eclipse-cfg=2023.2.0-49495 intel-oneapi-compiler-dpcpp-eclipse-cfg=2023.2.0-49495 \
92 | intel-oneapi-diagnostics-utility=2022.4.0-49091 \
93 | intel-oneapi-compiler-dpcpp-cpp=2023.2.0-49495 \
94 | intel-oneapi-mkl=2023.2.0-49495 intel-oneapi-mkl-devel=2023.2.0-49495 \
95 | intel-oneapi-mpi=2021.10.0-49371 intel-oneapi-mpi-devel=2021.10.0-49371 \
96 | intel-oneapi-tbb=2021.10.0-49541 intel-oneapi-tbb-devel=2021.10.0-49541 \
97 | intel-oneapi-ccl=2021.10.0-49084 intel-oneapi-ccl-devel=2021.10.0-49084 \
98 | intel-oneapi-dnnl-devel=2023.2.0-49516 intel-oneapi-dnnl=2023.2.0-49516
99 | # to install ipex-llm, install conda first, then:
100 | conda create -n llm python=3.9
101 | conda activate llm
102 | pip install --pre --upgrade ipex-llm[xpu_2.0] -f https://developer.intel.com/ipex-whl-stable-xpu
103 | ```
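
After installation, a quick sanity check (a minimal sketch; run it inside the activated `llm` environment after sourcing the oneAPI environment variables) is to confirm that PyTorch can see the XPU device:

```python
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401  (registers the 'xpu' device with PyTorch)

print(torch.xpu.is_available())      # expected: True
print(torch.xpu.get_device_name(0))  # e.g. the name of your Arc GPU
```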
--------------------------------------------------------------------------------
/ch_7_Finetune/7_1_Finetune_Llama2-7B.md:
--------------------------------------------------------------------------------
1 |
2 | # 7.1 Finetuning Llama 2 (7B) using QLoRA
3 |
4 | To help you better understand the process of QLoRA finetuning, this tutorial provides a practical guide to leveraging IPEX-LLM to tune a large language model to a specific task. [Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) is used as an example here and adapted to a text generation task.
5 |
6 | ## 7.1.1 Enable IPEX-LLM on Intel GPUs
7 |
8 | ### 7.1.1.1 Install IPEX-LLM on Intel GPUs
9 |
10 | After following the steps in the [Readme](./README.md#70-environment-setup) to set up the environment, you can install IPEX-LLM in the terminal with the commands below:
11 | ```bash
12 | pip install --pre --upgrade ipex-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
13 | pip install transformers==4.34.0 datasets
14 | pip install peft==0.5.0
15 | pip install accelerate==0.23.0
16 | ```
17 |
18 | > **Note**
19 | > If you are using an older version of `ipex-llm` (specifically, older than `2.5.0b20240104`), you need to manually add `import intel_extension_for_pytorch as ipex` at the beginning of your code.
20 |
21 | ### 7.1.1.2 Set OneAPI Environment Variables
22 |
23 | It is also necessary to set OneAPI environment variables for IPEX-LLM on Intel GPUs.
24 |
25 | ```bash
26 | # configure OneAPI environment variables
27 | source /opt/intel/oneapi/setvars.sh
28 | ```
29 |
30 | After installation and environment setup, let's move to the **Python scripts** of this tutorial.
31 |
32 | ## 7.1.2 QLoRA Finetuning
33 |
34 | ### 7.1.2.1 Load Model in Low Precision
35 |
36 |
37 | A popular open-source LLM [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) is chosen to illustrate the process of QLoRA Finetuning.
38 |
39 | > **Note**
40 | >
41 | > You can specify the argument `pretrained_model_name_or_path` with both Huggingface repo id or local model path.
42 | > If you have already downloaded the Llama 2 (7B) model, you could specify `pretrained_model_name_or_path` to the local model path.
43 |
44 | With IPEX-LLM optimization, you can load the model with `ipex_llm.transformers.AutoModelForCausalLM` instead of `transformers.AutoModelForCausalLM` to conduct implicit quantization.
45 |
46 | For Intel GPUs, once you have the model in low precision, **move it to the GPU with `to('xpu')`**.
47 |
48 | ```python
import torch
49 | from ipex_llm.transformers import AutoModelForCausalLM
50 | model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path = "meta-llama/Llama-2-7b-hf",
51 | load_in_low_bit="nf4",
52 | optimize_model=False,
53 | torch_dtype=torch.float16,
54 | modules_to_not_convert=["lm_head"])
55 | model = model.to('xpu')
56 |
57 | ```
58 |
59 | > **Note**
60 | >
61 | > We specify `load_in_low_bit="nf4"` here to apply 4-bit NormalFloat optimization. According to the [QLoRA paper](https://arxiv.org/pdf/2305.14314.pdf), using `"nf4"` could yield better model quality than `"int4"`.
62 |
63 | ### 7.1.2.2 Prepare Model for Training
64 | Then we apply `prepare_model_for_kbit_training` from `ipex_llm.transformers.qlora` to preprocess the model for training.
65 |
66 | ```python
67 | from ipex_llm.transformers.qlora import prepare_model_for_kbit_training
68 | # model.gradient_checkpointing_enable() # can further reduce memory but slower
69 | model = prepare_model_for_kbit_training(model)
70 | ```
71 |
72 | Next, we can obtain a PEFT model from the optimized model and a configuration object containing the parameters as follows:
73 |
74 | ```python
75 | from ipex_llm.transformers.qlora import get_peft_model
76 | from peft import LoraConfig
77 |
78 | config = LoraConfig(r=8,
79 | lora_alpha=32,
80 | target_modules=["q_proj", "k_proj", "v_proj"],
81 | lora_dropout=0.05,
82 | bias="none",
83 | task_type="CAUSAL_LM")
84 | model = get_peft_model(model, config)
85 |
86 | ```
87 | > **Note**
88 | >
89 | > Instead of `from peft import prepare_model_for_kbit_training, get_peft_model` as we would do for regular QLoRA with bitsandbytes and CUDA, we import them from `ipex_llm.transformers.qlora` here to get an IPEX-LLM compatible PEFT model. The rest is the same as the regular LoRA finetuning process using `peft`.
90 | >
91 | > **Note**
92 | >
93 | > More explanation about `LoraConfig` parameters can be found in [Transformer LoRA Guides](https://huggingface.co/docs/peft/conceptual_guides/lora#common-lora-parameters-in-peft).
94 | >
95 |
96 | ### 7.1.2.3 Load Dataset
97 |
98 | A common dataset, [english quotes](https://huggingface.co/datasets/Abirate/english_quotes), is loaded to fine-tune our model on famous quotes. Note that the `tokenizer` applied in the `map` call below is the one loaded in section [7.1.2.4](#7124-load-tokenizer).
99 | ```python
100 | from datasets import load_dataset
101 | data = load_dataset("Abirate/english_quotes")
102 | data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
103 | ```
104 |
105 | > **Note**
106 | >
107 | > The dataset path here defaults to a Hugging Face repo id.
108 | > If you have already downloaded the `.jsonl` file from [Abirate/english_quotes](https://huggingface.co/datasets/Abirate/english_quotes/blob/main/quotes.jsonl), you could use `data = load_dataset("json", data_files= "path/to/your/.jsonl/file")` to specify the local path instead of `data = load_dataset("Abirate/english_quotes")`.
109 |
110 | ### 7.1.2.4 Load Tokenizer
111 | A tokenizer enables the tokenization and detokenization processes in LLM training and inference. You can use the [Huggingface transformers](https://huggingface.co/docs/transformers/index) API to load the tokenizer directly. It can be used seamlessly with models loaded by IPEX-LLM. For Llama 2, the corresponding tokenizer class is `LlamaTokenizer`.
112 |
113 | ```python
114 | from transformers import LlamaTokenizer
115 | tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_name_or_path="meta-llama/Llama-2-7b-chat-hf", trust_remote_code=True)
116 | tokenizer.pad_token_id = 0
117 | tokenizer.padding_side = "left"
118 | ```
119 | > **Note**
120 | >
121 | > If you have already downloaded the Llama 2 (7B) model, you could specify `pretrained_model_name_or_path` to the local model path.
122 |
123 | ### 7.1.2.5 Run the Training
124 |
125 | You can then start the training process by configuring a `Trainer` with existing tools from the Hugging Face ecosystem. Here we set `warmup_steps` to 20 and cap `max_steps` at 200 so that the example run finishes quickly.
126 | ```python
127 | import transformers
128 | trainer = transformers.Trainer(
129 | model=model,
130 | train_dataset=data["train"],
131 | args=transformers.TrainingArguments(
132 | per_device_train_batch_size=4,
133 | gradient_accumulation_steps= 1,
134 | warmup_steps=20,
135 | max_steps=200,
136 | learning_rate=2e-4,
137 | save_steps=100,
138 | fp16=True,
139 | logging_steps=20,
140 | output_dir="outputs", # specify your own output path here
141 | optim="adamw_hf", # paged_adamw_8bit is not supported yet
142 | # gradient_checkpointing=True, # can further reduce memory but slower
143 | ),
144 | data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
145 | )
146 | model.config.use_cache = False # silence the warnings, and we should re-enable it for inference
147 | result = trainer.train()
148 | ```
149 | We can get the following outputs showcasing our training loss:
150 | ```
151 | /home/arda/anaconda3/envs/yining-llm-qlora/lib/python3.9/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
152 | warnings.warn(
153 | {'loss': 1.7193, 'learning_rate': 0.0002, 'epoch': 0.03}
154 | {'loss': 1.3242, 'learning_rate': 0.00017777777777777779, 'epoch': 0.06}
155 | {'loss': 1.2266, 'learning_rate': 0.00015555555555555556, 'epoch': 0.1}
156 | {'loss': 1.1534, 'learning_rate': 0.00013333333333333334, 'epoch': 0.13}
157 | {'loss': 0.9368, 'learning_rate': 0.00011111111111111112, 'epoch': 0.16}
158 | {'loss': 0.9321, 'learning_rate': 8.888888888888889e-05, 'epoch': 0.19}
159 | {'loss': 0.9902, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.22}
160 | {'loss': 0.8593, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.26}
161 | {'loss': 1.0055, 'learning_rate': 2.2222222222222223e-05, 'epoch': 0.29}
162 | {'loss': 1.0081, 'learning_rate': 0.0, 'epoch': 0.32}
163 | {'train_runtime': xxx, 'train_samples_per_second': xxx, 'train_steps_per_second': xxx, 'train_loss': 1.1155566596984863, 'epoch': 0.32}
164 | 100%|██████████████████████████████████████████████████████████████████████████████| 200/200 [xx:xx<xx:xx, x.xxs/it]
165 | ```
166 | 
167 | 
168 | ## 7.1.3 Merge the Model
169 | 
170 | After training, the QLoRA adapter weights are stored separately from the base model. To obtain a standalone fine-tuned model for inference, we merge the adapter weights back into the pre-trained model on CPU.
171 | 
172 | > **Note**
173 | >
174 | > Make sure your accelerate version is 0.23.0 to enable the merging process on CPU.
175 |
176 | ### 7.1.3.1 Load Pre-trained Model
177 |
178 | ```python
import torch
179 | from ipex_llm.transformers import AutoModelForCausalLM
180 | base_model = AutoModelForCausalLM.from_pretrained(
181 |     base_model,  # path or repo id of the pre-trained base model, e.g. "meta-llama/Llama-2-7b-hf"
182 | torch_dtype=torch.float16,
183 | device_map={"": "cpu"},
184 | )
185 | ```
186 |
187 | > **Note**
188 | >
189 | > In the merging stage, `load_in_low_bit="nf4"` should be removed, since we need to load the original model as the base model.
190 |
191 |
192 |
193 | ### 7.1.3.2 Merge the Weights
194 |
195 |
196 |
197 | Then we can load the QLoRA weights to enable the merging process.
198 |
199 | ```python
200 | from ipex_llm.transformers.qlora import PeftModel
201 | adapter_path = "./outputs/checkpoint-200"
202 | lora_model = PeftModel.from_pretrained(
203 | base_model,
204 | adapter_path,
205 | device_map={"": "cpu"},
206 | torch_dtype=torch.float16,
207 | )
208 |
209 |
210 | ```
211 | > **Note**
212 | >
213 | > Instead of `from peft import PeftModel`, we import `PeftModel` from `ipex_llm.transformers.qlora` to get an IPEX-LLM compatible PEFT model.
214 | >
215 | > **Note**
216 | > The adapter path is the local path where you saved the fine-tuned adapter; in our case it is `./outputs/checkpoint-200`.
217 | >
218 |
219 | To verify that the LoRA weights work in conjunction with the pre-trained model, the first layer weights (which in the Llama 2 case are the trainable query projections) are extracted to highlight the difference.
220 |
221 | ```python
222 | first_weight = base_model.model.layers[0].self_attn.q_proj.weight
223 | first_weight_old = first_weight.clone()
224 | lora_weight = lora_model.base_model.model.model.layers[0].self_attn.q_proj.weight
225 | assert torch.allclose(first_weight_old, first_weight)
226 | ```
227 | With the merging method `merge_and_unload`, we can easily combine the fine-tuned weights with the pre-trained model, and verify that the weights have changed using the `assert` statement.
228 |
229 | ```python
230 | lora_model = lora_model.merge_and_unload()
231 | lora_model.train(False)
232 | assert not torch.allclose(first_weight_old, first_weight)
233 | ```
234 | You should get output like the following, without any error reported, indicating a successful conversion.
235 | ```
236 | Using pad_token, but it is not set yet.
237 | Using pad_token, but it is not set yet.
238 | ```
239 | Finally, we can save the fine-tuned model to a specified local path (in our case `./outputs/checkpoint-200-merged`).
240 | ```python
241 | output_path = "./outputs/checkpoint-200-merged"
242 | lora_model_sd = lora_model.state_dict()
243 | deloreanized_sd = {
244 | k.replace("base_model.model.", ""): v
245 | for k, v in lora_model_sd.items()
246 | if "lora" not in k
247 | }
248 | base_model.save_pretrained(output_path, state_dict=deloreanized_sd)
249 | tokenizer.save_pretrained(output_path)
250 |
251 | ```
252 |
253 |
254 | ## 7.1.4 Inference with Fine-tuned model
255 |
256 | After merging and saving the model, we can test the performance of the fine-tuned model.
257 | Detailed instructions on running LLM inference with IPEX-LLM optimizations can be found in [Chapter 6](../ch_6_GPU_Acceleration/6_1_GPU_Llama2-7B.md); here we quickly go through the preparation for model inference.
258 |
259 | ### 7.1.4.1 Inference with the Fine-tuned Model
260 |
261 | ```python
262 | model_path = "./outputs/checkpoint-200-merged"
263 | model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_path, load_in_4bit=True)
264 | model = model.to('xpu')
265 | tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_name_or_path = model_path)
266 | ```
267 | > **Note**
268 | > The `model_path` argument should be consistent with the output path of your merged model.
269 | >
270 | Then we can verify whether the fine-tuned model produces reasonable and philosophical responses reflecting the newly added dataset.
271 | ```python
272 | with torch.inference_mode():
273 | input_ids = tokenizer.encode('The paradox of time and eternity is',
274 | return_tensors="pt").to('xpu')
275 | output = model.generate(input_ids, max_new_tokens=32)
276 | output = output.cpu()
277 | output_str = tokenizer.decode(output[0], skip_special_tokens=True)
278 | print(output_str)
279 | ```
280 |
281 | We can repeat the process with the pre-trained model by replacing the `model_path` argument, to verify the improvement from the finetuning process. Now we can compare the answer of the pre-trained model with the fine-tuned one:
282 |
283 | > **Pre-trained Model**
284 | ```
285 | The paradox of time and eternity is that time is not eternal, but eternity is. nobody knows how long time is.
286 | The paradox of time and eternity is
287 | ```
288 | > **Fine-tuned Model**
289 | ```
290 | The paradox of time and eternity is that, on the one hand, we experience time as linear and progressive, and on the other hand, we experience time as cyclical. And the
291 | ```
292 |
293 | We can see that the result shares the same style and context as the samples in the fine-tuning dataset. Note that we only trained the model for 200 steps (a fraction of an epoch) in a few minutes, thanks to the optimizations in IPEX-LLM.
294 |
295 | Here are more results with the same prompts for the pre-trained and fine-tuned models:
296 |
297 | | ♣ Pre-trained Model | ♣ Fine-tuned Model |
298 | | ----- | ----- |
299 | | **There are two things that matter:** Einzelnes and the individual. Everyone has heard of the "individual," but few have heard of the "individuum," or " | **There are two things that matter:** the quality of our relationships and the legacy we leave. And I think that all of us as human beings are searching for it, no matter where |
300 | | **In the quiet embrace of the night,** I felt the earth move. Unterscheidung von Wörtern und Ausdrücken. | **In the quiet embrace of the night,** the world is still and the stars are bright. My eyes are closed, my heart is at peace, my mind is at rest. I am ready for |
301 |
302 |
303 |
304 |
--------------------------------------------------------------------------------
/ch_7_Finetune/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## Chapter 7 Finetune
3 |
4 | As one of the advanced parameter-efficient fine-tuning (PEFT) techniques, QLoRA enables lightweight infusion of specialty knowledge into a large language model with minimal overhead. IPEX-LLM also supports finetuning LLMs (large language models) using QLoRA with 4-bit optimizations on Intel GPUs.
5 |
6 | > **Note**
7 | >
8 | > Currently, IPEX-LLM supports LoRA, QLoRA, ReLoRA, QA-LoRA and DPO finetuning.
9 |
10 | In Chapter 7, you will go through how to fine-tune a large language model to a text generation task using IPEX-LLM optimizations. IPEX-LLM has a comprehensive tool-set to help you fine-tune the model, merge the LoRA weights and inference with the fine-tuned model.
11 |
12 | We are going to train with a popular open source model [Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) as an example. For other finetuning methods, please refer to the [LLM-Finetuning](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/LLM-Finetuning) page for detailed instructions.
13 |
14 | ## 7.0 Environment Setup
15 |
16 | Please refer to the [GPU installation guide](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install_gpu.html) for more details. It is strongly recommended that you follow the corresponding steps in the guide to configure your environment properly.
--------------------------------------------------------------------------------
/ch_8_AppDev_Advanced/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/ipex-llm-tutorial/dcf1a80af8ddab03e03c48864432946c718abf35/ch_8_AppDev_Advanced/.keep
--------------------------------------------------------------------------------
/ch_8_AppDev_Advanced/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 8 Application Development: Advanced
2 | This chapter introduces how to use LangChain with IPEX-LLM.
3 |
--------------------------------------------------------------------------------