├── .gitignore
├── .github
    ├── workflows
    │   ├── upload_pr_documentation.yml
    │   ├── build_documentation.yml
    │   ├── check_notebooks_lowercase.yml
    │   └── build_pr_documentation.yml
    └── pull_request_template.md
├── notebooks
    ├── zh-CN
    │   ├── _toctree.yml
    │   ├── index.md
    │   ├── rag_zephyr_langchain.ipynb
    │   ├── automatic_embedding_tei_inference_endpoints.ipynb
    │   └── fine_tuning_code_llm_on_single_gpu.ipynb
    └── en
    │   ├── _toctree.yml
    │   ├── index.md
    │   ├── rag_llamaindex_librarian.ipynb
    │   ├── rag_zephyr_langchain.ipynb
    │   ├── tgi_messages_api_demo.ipynb
    │   └── automatic_embedding_tei_inference_endpoints.ipynb
├── README.md
└── LICENSE


/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | .idea/
3 | .venv/
4 | 
5 | **/.ipynb_checkpoints
6 | **/.DS_Store
7 | 


--------------------------------------------------------------------------------
/.github/workflows/upload_pr_documentation.yml:
--------------------------------------------------------------------------------
 1 | name: Upload PR Documentation
 2 | 
 3 | on:
 4 |   workflow_run:
 5 |     workflows: ["Build PR Documentation"]
 6 |     types:
 7 |       - completed
 8 | 
 9 | jobs:
10 |   build:
11 |     uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
12 |     with:
13 |       package_name: cookbook
14 |     secrets:
15 |       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
16 |       comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
 1 | # What does this PR do?
 2 | 
 3 | <!--
 4 | Thank you for submitting a PR to the Open-source AI cookbook!
 5 | 
 6 | Someone will review your PR shortly. They may suggest changes to further improve your contribution. 
 7 | If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same 
 8 | persons---sometimes notifications get lost.
 9 | 
10 | -->
11 | 
12 | <!-- Remove if not applicable -->
13 | 
14 | Fixes # (issue)
15 | 
16 | ## Who can review?
17 | 
18 | Feel free to tag members/contributors who may be interested in your PR.
19 | 


--------------------------------------------------------------------------------
/.github/workflows/build_documentation.yml:
--------------------------------------------------------------------------------
 1 | name: Build documentation
 2 | 
 3 | on:
 4 |   push:
 5 |     paths:
 6 |       - "notebooks/**"
 7 |       - ".github/workflows/build_documentation.yml"
 8 |     branches:
 9 |       - main
10 | 
11 | jobs:
12 |   build:
13 |     uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
14 |     with:
15 |       commit_sha: ${{ github.sha }}
16 |       package: cookbook
17 |       package_name: cookbook
18 |       path_to_docs: cookbook/notebooks/
19 |       additional_args: --not_python_module
20 |       languages: en zh-CN 
21 |       convert_notebooks: true
22 |     secrets:
23 |       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}


--------------------------------------------------------------------------------
/notebooks/zh-CN/_toctree.yml:
--------------------------------------------------------------------------------
 1 | - title: 开源 AI 指南 (Cookbook)
 2 |   sections:
 3 |   - local: index
 4 |     title: 开源 AI 指南 (Cookbook)
 5 |   - local: automatic_embedding_tei_inference_endpoints
 6 |     title: 通过推理端点使用 TEI 自动嵌入
 7 |   - local: faiss_with_hf_datasets_and_clip
 8 |     title: 用 🤗 transformers, 🤗 datasets 和 FAISS 嵌入多模态数据进行相似度搜索
 9 |   - local: fine_tuning_code_llm_on_single_gpu
10 |     title: 在单个 GPU 上针对自定义代码微调代码 LLM
11 |   - local: rag_zephyr_langchain
12 |     title: 用 Hugging Face Zephyr 和 LangChain 针对 Github issues 构建简单的 RAG
13 |   - local: advanced_rag
14 |     title: 使用 LangChain 在 HuggingFace 文档上构建高级 RAG
15 |   - local: rag_evaluation
16 |     title: 使用合成数据和 LLM 作为裁判评估 RAG
17 | 


--------------------------------------------------------------------------------
/.github/workflows/check_notebooks_lowercase.yml:
--------------------------------------------------------------------------------
 1 | name: Check notebook names are lowercase
 2 | 
 3 | on: [push]
 4 | 
 5 | jobs:
 6 |   check-lowercase:
 7 |     runs-on: ubuntu-latest
 8 |     steps:
 9 |     - name: Checkout code
10 |       uses: actions/checkout@v2
11 | 
12 |     - name: Check filenames in 'notebooks/'
13 |       run: |
14 |         # Change to the directory where you want to check filenames
15 |         cd notebooks
16 |         # Find files with uppercase characters in their names
17 |         files=$(find . -type f | grep '[A-Z]' || true)
18 |         if [ -n "$files" ]; then
19 |             echo "The following files are not lowercase:"
20 |             echo "$files"
21 |             exit 1
22 |         else
23 |             echo "All filenames are lowercase."
24 |         fi


--------------------------------------------------------------------------------
/.github/workflows/build_pr_documentation.yml:
--------------------------------------------------------------------------------
 1 | name: Build PR Documentation
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     paths:
 6 |       - "notebooks/**"
 7 |       - ".github/workflows/build_pr_documentation.yml"
 8 | 
 9 | concurrency:
10 |   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
11 |   cancel-in-progress: true
12 | 
13 | jobs:
14 |   build:
15 |     uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
16 |     with:
17 |       commit_sha: ${{ github.event.pull_request.head.sha }}
18 |       pr_number: ${{ github.event.number }}
19 |       package: cookbook
20 |       package_name: cookbook
21 |       path_to_docs: cookbook/notebooks/
22 |       additional_args: --not_python_module
23 |       languages: en zh-CN 
24 |       convert_notebooks: true


--------------------------------------------------------------------------------
/notebooks/zh-CN/index.md:
--------------------------------------------------------------------------------
 1 | # 开源 AI 指南 (Cookbook)
 2 | 
 3 | 开源 AI 指南 (Cookbook) 是一系列 Notebook 的合集，里面展示了如何利用开源工具和模型来开发 AI 应用和解决各种机器学习问题的实际技巧和方法。
 4 | 
 5 | ## 最新 Notebook
 6 | 
 7 | 查看最近添加的 Notebook：
 8 | - [通过推理端点使用 TEI 自动嵌入](automatic_embedding_tei_inference_endpoints)
 9 | - [用 Hugging Face Zephyr 和 LangChain 针对 Github issues 构建简单的 RAG](rag_zephyr_langchain)
10 | - [用 🤗 transformers, 🤗 datasets 和 FAISS 嵌入多模态数据进行相似度搜索](faiss_with_hf_datasets_and_clip)
11 | - [在单个 GPU 上针对自定义代码微调代码 LLM](fine_tuning_code_llm_on_single_gpu)
12 | - [使用合成数据和 LLM 作为裁判评估 RAG](rag_evaluation)
13 | - [使用 LangChain 在 HuggingFace 文档上构建高级 RAG](advanced_rag)
14 | 
15 | 你还可以在指南 (Cookbook) 的[Github 仓库](https://github.com/huggingface/cookbook)中查看 Notebook。
16 | 
17 | ## 贡献
18 | 
19 | 开源 AI 指南 (Cookbook) 是社区和大家共同努力的成果，我们非常欢迎每个人都来参与贡献！
20 | 
21 | 
22 | 查看指南 (Cookbook) 的[贡献指引](https://github.com/huggingface/cookbook/blob/main/README.md)了解如何添加你的“食谱(教程)”。
23 | 


--------------------------------------------------------------------------------
/notebooks/en/_toctree.yml:
--------------------------------------------------------------------------------
 1 | - title: Open-Source AI Cookbook
 2 |   sections:
 3 |   - local: index
 4 |     title: Open-Source AI Cookbook
 5 |   - local: issues_in_text_dataset
 6 |     title: Detecting Issues in a Text Dataset with Cleanlab
 7 |   - local: stable_diffusion_interpolation
 8 |     title: Stable Diffusion Interpolation
 9 |   - local: rag_with_hugging_face_gemma_mongodb
10 |     title: Building A RAG System with Gemma, MongoDB and Open Source Models
11 |   - local: tgi_messages_api_demo
12 |     title: Migrating from OpenAI to Open LLMs Using TGI's Messages API
13 |   - local: automatic_embedding_tei_inference_endpoints
14 |     title: Automatic Embeddings with TEI through Inference Endpoints
15 |   - local: faiss_with_hf_datasets_and_clip
16 |     title: Embedding multimodal data for similarity search
17 |   - local: fine_tuning_code_llm_on_single_gpu
18 |     title: Fine-tuning a Code LLM on Custom Code on a single GPU
19 |   - local: rag_zephyr_langchain
20 |     title: Simple RAG using Hugging Face Zephyr and LangChain
21 |   - local: rag_llamaindex_librarian
22 |     title: RAG "Librarian" Using LlamaIndex
23 |   - local: advanced_rag
24 |     title: Advanced RAG on HuggingFace documentation using LangChain
25 |   - local: rag_evaluation
26 |     title: RAG Evaluation
27 |   - local: prompt_tuning_peft
28 |     title: Prompt tuning with PEFT
29 |   - local: labelling_feedback_setfit
30 |     title: Suggestions for Data Annotation with SetFit in Zero-shot Text Classification
31 |   - local: pipeline_notus_instructions_preferences_legal
32 |     title: Create a legal preference dataset
33 |   - local: semantic_cache_chroma_vector_database
34 |     title: Implementing semantic cache to improve a RAG system.
35 |   - local: annotate_text_data_transformers_via_active_learning
36 |     title: Annotate text data using Active Learning with Cleanlab
37 |   - local: llm_judge
38 |     title: Using LLM-as-a-judge for an automated and versatile evaluation
39 | 


--------------------------------------------------------------------------------
/notebooks/en/index.md:
--------------------------------------------------------------------------------
 1 | # Open-Source AI Cookbook
 2 | 
 3 | The Open-Source AI Cookbook is a collection of notebooks illustrating practical aspects of building AI
 4 | applications and solving various machine learning tasks using open-source tools and models.
 5 | 
 6 | ## Latest notebooks
 7 | 
 8 | Check out the recently added notebooks:
 9 | 
10 | - [Using LLM-as-a-judge 🧑‍⚖️ for an automated and versatile evaluation](llm_judge)
11 | - [Create a legal preference dataset](pipeline_notus_instructions_preferences_legal)
12 | - [Suggestions for Data Annotation with SetFit in Zero-shot Text Classification](labelling_feedback_setfit)
13 | - [Implementing semantic cache to improve a RAG system](semantic_cache_chroma_vector_database)
14 | - [Building A RAG Ebook "Librarian" Using LlamaIndex](rag_llamaindex_librarian)
15 | - [Stable Diffusion Interpolation](stable_diffusion_interpolation)
16 | - [Building A RAG System with Gemma, MongoDB and Open Source Models](rag_with_hugging_face_gemma_mongodb)
17 | - [Prompt Tuning with PEFT Library](prompt_tuning_peft)
18 | - [Migrating from OpenAI to Open LLMs Using TGI's Messages API](tgi_messages_api_demo)
19 | - [Automatic Embeddings with TEI through Inference Endpoints](automatic_embedding_tei_inference_endpoints)
20 | - [Simple RAG for GitHub issues using Hugging Face Zephyr and LangChain](rag_zephyr_langchain)
21 | - [Embedding multimodal data for similarity search using 🤗 transformers, 🤗 datasets and FAISS](faiss_with_hf_datasets_and_clip)
22 | - [Fine-tuning a Code LLM on Custom Code on a single GPU](fine_tuning_code_llm_on_single_gpu)
23 | - [RAG Evaluation Using Synthetic data and LLM-As-A-Judge](rag_evaluation)
24 | - [Advanced RAG on HuggingFace documentation using LangChain](advanced_rag)
25 | - [Detecting Issues in a Text Dataset with Cleanlab](issues_in_text_dataset)
26 | - [Annotate text data using Active Learning with Cleanlab](annotate_text_data_transformers_via_active_learning)
27 | 
28 | You can also check out the notebooks in the cookbook's [GitHub repo](https://github.com/huggingface/cookbook).
29 | 
30 | ## Contributing
31 | 
32 | The Open-Source AI Cookbook is a community effort, and we welcome contributions from everyone!
33 | Check out the cookbook's [Contribution guide](https://github.com/huggingface/cookbook/blob/main/README.md) to learn
34 | how you can add your "recipe".
35 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Open-Source AI Cookbook
 2 | 
 3 | This repo contains community-driven practical examples of building AI applications and solving various tasks with AI 
 4 | using open-source tools and models. 
 5 | 
 6 | ## Contributing to the cookbook
 7 | 
 8 | Everyone is welcome to contribute, and we value everybody's contribution! There are several ways you can contribute to 
 9 | the [Open-Source AI Cookbook](https://huggingface.co/learn/cookbook/index):
10 | 
11 | * Submit an idea for a desired example/guide via [GitHub Issues](https://github.com/huggingface/cookbook/issues).
12 | * Contribute a new notebook with a practical example.
13 | * Improve existing examples by fixing issues/typos. 
14 | 
15 | Before contributing, check currently [open issues](https://github.com/huggingface/cookbook/issues) and
16 | [pull requests](https://github.com/huggingface/cookbook/pulls) to avoid working on something that someone else is
17 | already working on.
18 | 
19 | ### What makes a good Cookbook notebook?
20 | 
21 | We believe that the Cookbook will be the most beneficial for everyone in the community if the Jupyter notebooks have the 
22 | following qualities: 
23 | 
24 | * *Practical*: Your notebook should provide an illustration of an end-to-end project or a specific aspect of AI development. Aim for real-world applications, but try to avoid overcomplicating. Clearly explain the objectives, challenges and steps involved.
25 | * *Build with open-source tools and models*: Utilize open source libraries, datasets, and pre-trained models available under permissive licenses. Include links to all resources used within the notebook.
26 | * *Clearly written*: Ensure your writing is clear, concise, and free from grammatical errors. Maintain a friendly and approachable tone throughout the notebook. Explain the steps you take to solve a problem, challenges, alternative approaches.
27 | * *Executes without errors*: Test your notebook to avoid runtime errors. 
28 | * *Adds to existing "recipes"*: Before submitting, review existing notebooks to confirm that the subject hasn't been covered yet. We welcome diverse use cases, modalities, techniques, and approaches! 
29 | 
30 | ### Creating a pull request
31 | 
32 | To contribute a new example/guide, open a pull request.
33 | 
34 | Here are some tips:
35 | 
36 | * Make sure that your notebook's file name is in lowercase.
37 | * Don't forget to add your notebook to the `_toctree.yml` and to `index.md`.
38 | * Right after the notebook's first header, add yourself as an author like this: `_Authored by: [Aymeric Roucher](https://huggingface.co/m-ric)_`. You can link to your Hugging Face profile, or to your GitHub profile.
39 | * Remove non-informative code cell outputs (e.g. from `pip install`). Make sure the notebook doesn't contain any empty code cells.
40 | * If using any images in the markdown, upload them to the [huggingface/cookbook-images](https://huggingface.co/datasets/huggingface/cookbook-images) dataset. Then use the link to the image in your markdown, e.g.:
41 | ```![RAG diagram](https://huggingface.co/datasets/huggingface/cookbook-images/resolve/main/rag-diagram.png)```
42 | 
43 | Once your pull request is merged, the notebook will show up in the [Open-Source AI Cookbook](https://hf.co/learn/cookbook).
44 | 
45 | ### Translating the Cookbook into your language
46 | 
47 | We'd love to have the Cookbook to be available in many more languages! Please follow the steps below if you'd like to 
48 | help translate the notebooks into your language 🙏.
49 | 
50 | If some of the notebooks have already been translated into your language, add new translated notebooks 
51 | under `notebooks/your_language`. Don't forget to add the new translated notebook to `notebooks/your_language/_toctree.yml`,
52 | and to `notebooks/your_language/index.md`.
53 | 
54 | If the notebooks have not yet been translated to your language, create a directory under `notebooks` with your `LANG-ID` 
55 | (e.g. see `en` for English, `zh-CN` for Chinese). The `LANG-ID` should be ISO 639-1 (two lower case letters) language 
56 | code -- see [here](https://www.loc.gov/standards/iso639-2/php/code_list.php) for reference. Alternatively, 
57 | `{two lowercase letters}-{two uppercase letters}` format is also supported, e.g. `zh-CN`.
58 | 
59 | Create the `notebooks/LANG-ID/_toctree.yml`, and `notebooks/LANG-ID/index.md`, and add the translated notebook.
60 | 
61 | Finally, add your language code (the exact same `LANG-ID`) to the `build_documentation.yml` and `build_pr_documentation.yml` 
62 | files in the `.github/workflows` folder.
63 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/notebooks/en/rag_llamaindex_librarian.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Building A RAG Ebook \"Librarian\" Using LlamaIndex\n",
  8 |     "\n",
  9 |     "_Authored by: [Jonathan Jin](https://huggingface.co/jinnovation)_"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "## Introduction\n",
 17 |     "\n",
 18 |     "This notebook demonstrates how to quickly build a RAG-based \"librarian\" for your\n",
 19 |     "local ebook library.\n",
 20 |     "\n",
 21 |     "Think about the last time you visited a library and took advantage of the\n",
 22 |     "expertise of the knowledgeable staff there to help you find what you need out of\n",
 23 |     "the troves of textbooks, novels, and other resources at the library. Our RAG\n",
 24 |     "\"librarian\" will do the same for us, except for our own local collection of\n",
 25 |     "ebooks.\n",
 26 |     "\n",
 27 |     "## Requirements\n",
 28 |     "\n",
 29 |     "We'd like our librarian to be **lightweight** and **run locally as much as\n",
 30 |     "possible** with **minimal dependencies**. This means that we will leverage\n",
 31 |     "open-source to the fullest extent possible, as well as bias towards models that\n",
 32 |     "can be **executed locally on typical hardware, e.g. M1 Macbooks**.\n",
 33 |     "\n",
 34 |     "## Components\n",
 35 |     "\n",
 36 |     "Our solution will consist of the following components:\n",
 37 |     "\n",
 38 |     "- [LlamaIndex], a data framework for LLM-based applications that's, unlike\n",
 39 |     "  [LangChain], designed specifically for RAG;\n",
 40 |     "- [Ollama], a user-friendly solution for running LLMs such as Llama 2 locally;\n",
 41 |     "- The [`BAAI/bge-base-en-v1.5`](https://huggingface.co/BAAI/bge-base-en-v1.5)\n",
 42 |     "  embedding model, which performs [reasonably well and is reasonably lightweight\n",
 43 |     "  in size](https://huggingface.co/spaces/mteb/leaderboard);\n",
 44 |     "- [Llama 2], which we'll run via [Ollama].\n",
 45 |     "\n",
 46 |     "[LlamaIndex]: https://docs.llamaindex.ai/en/stable/index.html\n",
 47 |     "[LangChain]: https://python.langchain.com/docs/get_started/introduction\n",
 48 |     "[Ollama]: https://ollama.com/\n",
 49 |     "[Llama 2]: https://ollama.com/library/llama2"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "## Dependencies\n",
 57 |     "\n",
 58 |     "First let's install our dependencies."
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": null,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "%pip install -q \\\n",
 68 |     "    llama-index \\\n",
 69 |     "    EbookLib \\\n",
 70 |     "    html2text \\\n",
 71 |     "    llama-index-embeddings-huggingface \\\n",
 72 |     "    llama-index-llms-ollama"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "!brew install ollama"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "markdown",
 86 |    "metadata": {},
 87 |    "source": [
 88 |     "## Test Library Setup\n",
 89 |     "\n",
 90 |     "Next, let's create our test \"library.\"\n",
 91 |     "\n",
 92 |     "For simplicity's sake, let's say that our \"library\" is simply a **nested directory of `.epub` files**. We can easily see this solution generalizing to, say, a Calibre library with a `metadata.db` database file. We'll leave that extension as an exercise for the reader. 😇\n",
 93 |     "\n",
 94 |     "Let's pull two `.epub` files from [Project Gutenberg](https://www.gutenberg.org/) for our library."
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": [
103 |     "!mkdir -p \".test/library/jane-austen\"\n",
104 |     "!mkdir -p \".test/library/victor-hugo\"\n",
105 |     "!wget https://www.gutenberg.org/ebooks/1342.epub.noimages -O \".test/library/jane-austen/pride-and-prejudice.epub\"\n",
106 |     "!wget https://www.gutenberg.org/ebooks/135.epub.noimages -O \".test/library/victor-hugo/les-miserables.epub\""
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "markdown",
111 |    "metadata": {},
112 |    "source": [
113 |     "## RAG with LlamaIndex\n",
114 |     "\n",
115 |     "RAG with LlamaIndex, at its core, consists of the following broad phases:\n",
116 |     "\n",
117 |     "1. **Loading**, in which you tell LlamaIndex where your data lives and how to\n",
118 |     "   load it;\n",
119 |     "2. **Indexing**, in which you augment your loaded data to facilitate querying, e.g. with vector embeddings;\n",
120 |     "3. **Querying**, in which you configure an LLM to act as the query interface for\n",
121 |     "   your indexed data.\n",
122 |     "\n",
123 |     "This explanation only scratches at the surface of what's possible with\n",
124 |     "LlamaIndex. For more in-depth details, I highly recommend reading the\n",
125 |     "[\"High-Level Concepts\" page of the LlamaIndex\n",
126 |     "documentation](https://docs.llamaindex.ai/en/stable/getting_started/concepts.html)."
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "markdown",
131 |    "metadata": {},
132 |    "source": [
133 |     "### Loading\n",
134 |     "\n",
135 |     "Naturally, let's start with the **loading** phase.\n",
136 |     "\n",
137 |     "I mentioned before that LlamaIndex is designed specifically for RAG. This\n",
138 |     "immediately becomes obvious from its\n",
139 |     "[`SimpleDirectoryReader`](https://docs.llamaindex.ai/en/stable/module_guides/loading/simpledirectoryreader.html)\n",
140 |     "construct, which ✨ **magically** ✨ supports a whole host of multi-model file\n",
141 |     "types for free. Conveniently for us, `.epub` is in the supported set."
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": null,
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "from llama_index.core import SimpleDirectoryReader\n",
151 |     "\n",
152 |     "loader = SimpleDirectoryReader(\n",
153 |     "    input_dir=\"./.test/\",\n",
154 |     "    recursive=True,\n",
155 |     "    required_exts=[\".epub\"],\n",
156 |     ")\n",
157 |     "\n",
158 |     "documents = loader.load_data()"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "markdown",
163 |    "metadata": {},
164 |    "source": [
165 |     "`SimpleDirectoryReader.load_data()` converts our ebooks into a set of [`Document`s](https://docs.llamaindex.ai/en/stable/api/llama_index.core.schema.Document.html) for LlamaIndex to work with.\n",
166 |     "\n",
167 |     "One important thing to note here is that the documents **have not been chunked at this stage** -- that will happen during indexing. Read on..."
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "markdown",
172 |    "metadata": {},
173 |    "source": [
174 |     "### Indexing\n",
175 |     "\n",
176 |     "Next up after **loading** the data is to **index** it. This will allow our RAG pipeline to look up the relevant context for our query to pass to our LLM to **augment** their generated response. This is also where document chunking will take place.\n",
177 |     "\n",
178 |     "[`VectorStoreIndex`](https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index.html)\n",
179 |     "is a \"default\" entrypoint for indexing in LlamaIndex. By default,\n",
180 |     "`VectorStoreIndex` uses a simple, in-memory dictionary to store the indices, but\n",
181 |     "LlamaIndex also supports [a wide variety of vector storage\n",
182 |     "solutions](https://docs.llamaindex.ai/en/stable/module_guides/storing/vector_stores.html)\n",
183 |     "for you to graduate to as you scale.\n",
184 |     "\n",
185 |     "<Tip> \n",
186 |     "By default, LlamaIndex uses a chunk size of 1024 and a chunk overlap of\n",
187 |     "20. For more details, see the [LlamaIndex\n",
188 |     "documentation](https://docs.llamaindex.ai/en/stable/optimizing/basic_strategies/basic_strategies.html#chunk-sizes).\n",
189 |     "</Tip>\n",
190 |     "\n",
191 |     "\n",
192 |     "Like mentioned before, we'll use the\n",
193 |     "[`BAAI/bge-small-en-v1.5`](https://huggingface.co/BAAI/bge-base-en-v1.5) to\n",
194 |     "generate our embeddings. By default, [LlamaIndex uses\n",
195 |     "OpenAI](https://docs.llamaindex.ai/en/stable/getting_started/starter_example.html)\n",
196 |     "(specifically `gpt-3.5-turbo`), which we'd like to avoid given our desire for a lightweight, locally-runnable end-to-end solution.\n",
197 |     "\n",
198 |     "Thankfully, LlamaIndex supports retrieving embedding models from Hugging Face through the convenient `HuggingFaceEmbedding` class, so we'll use that here."
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": 2,
204 |    "metadata": {},
205 |    "outputs": [],
206 |    "source": [
207 |     "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
208 |     "\n",
209 |     "embedding_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "markdown",
214 |    "metadata": {},
215 |    "source": [
216 |     "We'll pass that in to `VectorStoreIndex` as our embedding model to circumvent the OpenAI default behavior."
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": 3,
222 |    "metadata": {},
223 |    "outputs": [],
224 |    "source": [
225 |     "from llama_index.core import VectorStoreIndex\n",
226 |     "\n",
227 |     "index = VectorStoreIndex.from_documents(\n",
228 |     "    documents,\n",
229 |     "    embed_model=embedding_model,\n",
230 |     ")"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "markdown",
235 |    "metadata": {},
236 |    "source": [
237 |     "### Querying\n",
238 |     "\n",
239 |     "Now for the final piece of the RAG puzzle -- wiring up the query layer.\n",
240 |     "\n",
241 |     "We'll use Llama 2 for the purposes of this recipe, but I encourage readers to play around with different models to see which produces the \"best\" responses here.\n",
242 |     "\n",
243 |     "First let's start up the Ollama server. Unfortunately, there is no support in the [Ollama Python client](https://github.com/ollama/ollama-python) for actually starting and stopping the server itself, so we'll have to pop out of Python land for this.\n",
244 |     "\n",
245 |     "In a separate terminal, run: `ollama serve`. Remember to terminate this after we're done here!"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "markdown",
250 |    "metadata": {},
251 |    "source": [
252 |     "Now let's hook Llama 2 up to LlamaIndex and use it as the basis of our query engine."
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": 4,
258 |    "metadata": {},
259 |    "outputs": [],
260 |    "source": [
261 |     "from llama_index.llms.ollama import Ollama\n",
262 |     "\n",
263 |     "llama = Ollama(\n",
264 |     "    model=\"llama2\",\n",
265 |     "    request_timeout=40.0,\n",
266 |     ")\n",
267 |     "\n",
268 |     "query_engine = index.as_query_engine(llm=llama)"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "markdown",
273 |    "metadata": {},
274 |    "source": [
275 |     "## Final Result\n",
276 |     "\n",
277 |     "With that, our basic RAG librarian is set up and we can start asking questions about our library. For example:"
278 |    ]
279 |   },
280 |   {
281 |    "cell_type": "code",
282 |    "execution_count": 29,
283 |    "metadata": {},
284 |    "outputs": [
285 |     {
286 |      "name": "stdout",
287 |      "output_type": "stream",
288 |      "text": [
289 |       "Based on the context provided, there are two books available:\n",
290 |       "\n",
291 |       "1. \"Pride and Prejudice\" by Jane Austen\n",
292 |       "2. \"Les Misérables\" by Victor Hugo\n",
293 |       "\n",
294 |       "The context used to derive this answer includes:\n",
295 |       "\n",
296 |       "* The file path for each book, which provides information about the location of the book files on the computer.\n",
297 |       "* The titles of the books, which are mentioned in the context as being available for reading.\n",
298 |       "* A list of words associated with each book, such as \"epub\" and \"notebooks\", which provide additional information about the format and storage location of each book.\n"
299 |      ]
300 |     }
301 |    ],
302 |    "source": [
303 |     "print(query_engine.query(\"What are the titles of all the books available? Show me the context used to derive your answer.\"))"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": 31,
309 |    "metadata": {},
310 |    "outputs": [
311 |     {
312 |      "name": "stdout",
313 |      "output_type": "stream",
314 |      "text": [
315 |       "The main character of 'Pride and Prejudice' is Elizabeth Bennet.\n"
316 |      ]
317 |     }
318 |    ],
319 |    "source": [
320 |     "print(query_engine.query(\"Who is the main character of 'Pride and Prejudice'?\"))"
321 |    ]
322 |   },
323 |   {
324 |    "cell_type": "markdown",
325 |    "metadata": {},
326 |    "source": [
327 |     "## Conclusion and Future Improvements\n",
328 |     "\n",
329 |     "We've demonstrated how to build a basic RAG-based \"librarian\" that runs entirely locally, even on Apple silicon Macs. In doing so, we've also carried out a \"grand tour\" of LlamaIndex and how it streamlines the process of setting up RAG-based applications.\n",
330 |     "\n",
331 |     "That said, we've really only scratched the surface of what's possible here. Here are some ideas of how to refine and build upon this foundation.\n",
332 |     "\n",
333 |     "### Forcing Citations\n",
334 |     "\n",
335 |     "To guard against the risk of our librarian hallucinating, how might we require that it provide citations for everything that it says?\n",
336 |     "\n",
337 |     "### Using Extended Metadata\n",
338 |     "\n",
339 |     "Ebook library management solutions like [Calibre](https://calibre-ebook.com/) create additional metadata for ebooks in a library. This can provide information such as publisher or edition that might not be readily available in the text of the book itself. How could we extend our RAG pipeline to account for additional sources of information that aren't `.epub` files?\n",
340 |     "\n",
341 |     "### Efficient Indexing\n",
342 |     "\n",
343 |     "If we were to collect everything we built here into a script/executable, the resulting script would re-index our library on each invocation. For our tiny test library of two files, this is \"fine,\" but for any library of non-trivial size this will very quickly become annoying for users. How could we persist the embedding indices and only update them when the contents of the library have meaningfully changed, e.g. new books have been added?"
344 |    ]
345 |   }
346 |  ],
347 |  "metadata": {
348 |   "kernelspec": {
349 |    "display_name": "Python 3",
350 |    "language": "python",
351 |    "name": "python3"
352 |   },
353 |   "language_info": {
354 |    "codemirror_mode": {
355 |     "name": "ipython",
356 |     "version": 3
357 |    },
358 |    "file_extension": ".py",
359 |    "mimetype": "text/x-python",
360 |    "name": "python",
361 |    "nbconvert_exporter": "python",
362 |    "pygments_lexer": "ipython3",
363 |    "version": "3.11.8"
364 |   }
365 |  },
366 |  "nbformat": 4,
367 |  "nbformat_minor": 2
368 | }
369 | 


--------------------------------------------------------------------------------
/notebooks/zh-CN/rag_zephyr_langchain.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "metadata": {
  6 |         "id": "Kih21u1tyr-I"
  7 |       },
  8 |       "source": [
  9 |         "# 用 Hugging Face Zephyr 和 LangChain 针对 Github issues 构建简单的 RAG\n",
 10 |         "\n",
 11 |         "_作者: [Maria Khalusova](https://github.com/MKhalusova)_\n",
 12 |         "\n",
 13 |         "本 notebook 展示了如何使用 [`HuggingFaceH4/zephyr-7b-beta`](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) 模型和 LangChain 快速构建一个针对项目 GitHub issues 的简单 RAG。\n",
 14 |         "\n",
 15 |         "\n",
 16 |         "\n",
 17 |         "**什么是 RAG**\n",
 18 |         "\n",
 19 |         "RAG 是一个很流行的方法，用来解决强大的 LLM 不知道具体内容的问题，因为具体内容不在其训练数据中，或者当它看到它之前时产生幻觉。这样的具体内容可能是专有的、敏感的，或者，就像这个例子中一样，是最近的和更新的。\n",
 20 |         "\n",
 21 |         "如果你的数据集是静态的和不需要定期更新的，那么你可能会考虑微调一个大模型。但在大多数情况下，微调模型花费巨大并且重复去微调的话(比如，处理数据漂移的时候)，可能会导致“模型偏移”。这种情况模型行为的变换就不是设计的那样了。\n",
 22 |         "\n",
 23 |         "**RAG (检索增强生成)** 并不需要模型微调。相反， RAG 通过提供检索到的额外的相关内容喂给 LLM 以此来获得更好的回答。\n",
 24 |         "\n",
 25 |         "这里是一个简单说明：\n",
 26 |         "\n",
 27 |         "![RAG diagram](https://huggingface.co/datasets/huggingface/cookbook-images/resolve/main/rag-diagram.png)\n",
 28 |         "\n",
 29 |         "* 额外的数据通过独立的嵌入模型会被转化为嵌入向量，这些向量会储存在向量数据库里。嵌入模型通常都比较小，因此在常规偏差上更新嵌入向量相比于微调模型会更快，便宜，和简单。\n",
 30 |         "\n",
 31 |         "* 与此同时，由于不需要微调，给了你极大的自由度去切换选择你自己的更强的 LLM，或者对于更快速的推理去切换更小的蒸馏模型。\n",
 32 |         "\n",
 33 |         "让我们用开源的 LLM ，嵌入模型，和 LangChain 快速构建一个针对项目 GitHub issues 的简单 RAG。\n",
 34 |         "\n",
 35 |         "\n",
 36 |         "首先安装相关依赖："
 37 |       ]
 38 |     },
 39 |     {
 40 |       "cell_type": "code",
 41 |       "execution_count": null,
 42 |       "metadata": {
 43 |         "id": "lC9frDOlyi38"
 44 |       },
 45 |       "outputs": [],
 46 |       "source": [
 47 |         "!pip install -q torch transformers accelerate bitsandbytes transformers sentence-transformers faiss-gpu"
 48 |       ]
 49 |     },
 50 |     {
 51 |       "cell_type": "code",
 52 |       "execution_count": 2,
 53 |       "metadata": {
 54 |         "id": "-aYENQwZ-p_c"
 55 |       },
 56 |       "outputs": [],
 57 |       "source": [
 58 |         "# If running in Google Colab, you may need to run this cell to make sure you're using UTF-8 locale to install LangChain\n",
 59 |         "import locale\n",
 60 |         "locale.getpreferredencoding = lambda: \"UTF-8\""
 61 |       ]
 62 |     },
 63 |     {
 64 |       "cell_type": "code",
 65 |       "execution_count": null,
 66 |       "metadata": {
 67 |         "id": "W5HhMZ2c-NfU"
 68 |       },
 69 |       "outputs": [],
 70 |       "source": [
 71 |         "!pip install -q langchain"
 72 |       ]
 73 |     },
 74 |     {
 75 |       "cell_type": "markdown",
 76 |       "metadata": {
 77 |         "id": "R8po01vMWzXL"
 78 |       },
 79 |       "source": [
 80 |         "## 准备数据\n"
 81 |       ]
 82 |     },
 83 |     {
 84 |       "cell_type": "markdown",
 85 |       "metadata": {
 86 |         "id": "3cCmQywC04x6"
 87 |       },
 88 |       "source": [
 89 |         "在这个例子中，我们会从[PEFT 库的仓库](https://github.com/huggingface/peft)加载所有的 issues（包括现在开放的和已经关闭的）。\n",
 90 |         "\n",
 91 |         "首先，你需要获取一个 [GitHub 个人权限 token](https://github.com/settings/tokens?type=beta) 来访问 GitHub API。"
 92 |       ]
 93 |     },
 94 |     {
 95 |       "cell_type": "code",
 96 |       "execution_count": null,
 97 |       "metadata": {
 98 |         "id": "8MoD7NbsNjlM"
 99 |       },
100 |       "outputs": [],
101 |       "source": [
102 |         "from getpass import getpass\n",
103 |         "ACCESS_TOKEN = getpass(\"YOUR_GITHUB_PERSONAL_TOKEN\")"
104 |       ]
105 |     },
106 |     {
107 |       "cell_type": "markdown",
108 |       "metadata": {
109 |         "id": "fccecm3a10N6"
110 |       },
111 |       "source": [
112 |         "下一步，我们将会加载  [huggingface/peft](https://github.com/huggingface/peft) 仓库中所有的 issues:\n",
113 |         "- 默认情况下， PR 也被认定为 issues，这里我们要设置 `include_prs=False` 来排除 PR。\n",
114 |         "- 设置 `state = \"all\"` 意味着我们会把开放和已经关闭的 issues 都加载了。"
115 |       ]
116 |     },
117 |     {
118 |       "cell_type": "code",
119 |       "execution_count": 5,
120 |       "metadata": {
121 |         "id": "8EKMit4WNDY8"
122 |       },
123 |       "outputs": [],
124 |       "source": [
125 |         "from langchain.document_loaders import GitHubIssuesLoader\n",
126 |         "\n",
127 |         "loader = GitHubIssuesLoader(\n",
128 |         "    repo=\"huggingface/peft\",\n",
129 |         "    access_token=ACCESS_TOKEN,\n",
130 |         "    include_prs=False,\n",
131 |         "    state=\"all\"\n",
132 |         ")\n",
133 |         "\n",
134 |         "docs = loader.load()"
135 |       ]
136 |     },
137 |     {
138 |       "cell_type": "markdown",
139 |       "metadata": {
140 |         "id": "CChTrY-k2qO5"
141 |       },
142 |       "source": [
143 |         "个人仓库的 issues 内容可能会长于一个嵌入模型可以最为输入处理的长度。如果我们想要嵌入所有可用的内容，我们需要把文档分割成适当大小的块。\n",
144 |         "\n",
145 |         "最普通直接的切块方法就是定义一个固定的块大小，以及判断块之间是否加入重叠。保存一些块之间的重叠允许我们去保存一些语义上下文。\n",
146 |         "\n",
147 |         "其他方法通常更复杂，会考虑到文档的结构和上下文。例如，人们可能希望根据句子或段落来分割文档，然而，固定大小的分块在大多数常见情况下都表现得很好，所以我们将在这里采用这种方法。\n"
148 |       ]
149 |     },
150 |     {
151 |       "cell_type": "code",
152 |       "execution_count": null,
153 |       "metadata": {
154 |         "id": "OmsXOf59Pmm-"
155 |       },
156 |       "outputs": [],
157 |       "source": [
158 |         "from langchain.text_splitter import CharacterTextSplitter\n",
159 |         "\n",
160 |         "splitter = CharacterTextSplitter(chunk_size=512, chunk_overlap=30)\n",
161 |         "\n",
162 |         "chunked_docs = splitter.split_documents(docs)"
163 |       ]
164 |     },
165 |     {
166 |       "cell_type": "markdown",
167 |       "metadata": {
168 |         "id": "DAt_zPVlXOn7"
169 |       },
170 |       "source": [
171 |         "## 创建嵌入和检索器"
172 |       ]
173 |     },
174 |     {
175 |       "cell_type": "markdown",
176 |       "metadata": {
177 |         "id": "-mvat6JQl4yp"
178 |       },
179 |       "source": [
180 |         "现在所有的文档都设置成立合适的大小，我们可以用他们的嵌入创建一个数据集了。\n",
181 |         "\n",
182 |         "为了创建文档块嵌入，我们将会使用 `HuggingFaceEmbeddings` 和 [`BAAI/bge-base-en-v1.5`](https://huggingface.co/BAAI/bge-base-en-v1.5) 嵌入模型。在 Hub 上有许多其他的嵌入模型可用，你也可以查看 [Massive Text Embedding Benchmark (MTEB) Leaderboard](https://huggingface.co/spaces/mteb/leaderboard) 关注表现最好的模型。\n",
183 |         "\n",
184 |         "为了创建向量数据库，我们将会使用 `FAISS` 库。这个库提供高效的相似度搜索和稠密向量的聚类，正是我们需要的。FAISS 目前是大规模数据集上 NN 搜索最常用的库之一。\n",
185 |         "\n",
186 |         "我们通过 LangChain 的 API 来获取嵌入模型和 FAISS 向量数据库。"
187 |       ]
188 |     },
189 |     {
190 |       "cell_type": "code",
191 |       "execution_count": null,
192 |       "metadata": {
193 |         "id": "ixmCdRzBQ5gu"
194 |       },
195 |       "outputs": [],
196 |       "source": [
197 |         "from langchain.vectorstores import FAISS\n",
198 |         "from langchain.embeddings import HuggingFaceEmbeddings\n",
199 |         "\n",
200 |         "db = FAISS.from_documents(chunked_docs,\n",
201 |         "                          HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5'))"
202 |       ]
203 |     },
204 |     {
205 |       "cell_type": "markdown",
206 |       "metadata": {
207 |         "id": "2iCgEPi0nnN6"
208 |       },
209 |       "source": [
210 |         "我们需要一种方式，来返回给定无结构的查询所需要的文档。针对这个，我们会使用 `as_retriever` 方法，使用 `db` 作为支柱：\n",
211 |         "- `search_type=\"similarity\"` 意味着我们会执行查询和文档之间的相似度搜索\n",
212 |         "- `search_kwargs={'k': 4}` 指示我们指定返回的最高的 4 个结果\n"
213 |       ]
214 |     },
215 |     {
216 |       "cell_type": "code",
217 |       "execution_count": 8,
218 |       "metadata": {
219 |         "id": "mBTreCQ9noHK"
220 |       },
221 |       "outputs": [],
222 |       "source": [
223 |         "retriever = db.as_retriever(\n",
224 |         "    search_type=\"similarity\",\n",
225 |         "    search_kwargs={'k': 4}\n",
226 |         ")"
227 |       ]
228 |     },
229 |     {
230 |       "cell_type": "markdown",
231 |       "metadata": {
232 |         "id": "WgEhlISJpTgj"
233 |       },
234 |       "source": [
235 |         "向量数据库和检索器现在设置好了，下一步我们需要设置好链中的下一块 - 模型。"
236 |       ]
237 |     },
238 |     {
239 |       "cell_type": "markdown",
240 |       "metadata": {
241 |         "id": "tzQxx0HkXVFU"
242 |       },
243 |       "source": [
244 |         "## 加载量化模型"
245 |       ]
246 |     },
247 |     {
248 |       "cell_type": "markdown",
249 |       "metadata": {
250 |         "id": "9jy1cC65p_GD"
251 |       },
252 |       "source": [
253 |         "针对本例，我们选择 [`HuggingFaceH4/zephyr-7b-beta`](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta), 一个小而强大的模型。\n",
254 |         "\n",
255 |         "随着每周都会出好多模型，你可能会想要替换这个模型到最新的最好的模型。最好的方式是查看 [Open-source LLM leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)。\n",
256 |         "\n",
257 |         "为了推理更快，我们将加载模型的量化版本："
258 |       ]
259 |     },
260 |     {
261 |       "cell_type": "code",
262 |       "execution_count": null,
263 |       "metadata": {
264 |         "id": "L-ggaa763VRo"
265 |       },
266 |       "outputs": [],
267 |       "source": [
268 |         "import torch\n",
269 |         "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
270 |         "\n",
271 |         "model_name = 'HuggingFaceH4/zephyr-7b-beta'\n",
272 |         "\n",
273 |         "bnb_config = BitsAndBytesConfig(\n",
274 |         "    load_in_4bit=True,\n",
275 |         "    bnb_4bit_use_double_quant=True,\n",
276 |         "    bnb_4bit_quant_type=\"nf4\",\n",
277 |         "    bnb_4bit_compute_dtype=torch.bfloat16\n",
278 |         ")\n",
279 |         "\n",
280 |         "model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)\n",
281 |         "tokenizer = AutoTokenizer.from_pretrained(model_name)"
282 |       ]
283 |     },
284 |     {
285 |       "cell_type": "markdown",
286 |       "metadata": {
287 |         "id": "hVNRJALyXYHG"
288 |       },
289 |       "source": [
290 |         "## 设置 LLM 链"
291 |       ]
292 |     },
293 |     {
294 |       "cell_type": "markdown",
295 |       "metadata": {
296 |         "id": "RUUNneJ1smhl"
297 |       },
298 |       "source": [
299 |         "最后，我们有了所有的需要设置的 LLM 链的部分。\n",
300 |         "\n",
301 |         "首先，使用加载的模型和他的tokenizer创建一个文本生成的流水线(pipeline)\n",
302 |         "\n",
303 |         "下一步，创建一个提示模板-这个应该遵循模型的格式，所以如果你替换了模型检查点，确保使用合适的格式。\n"
304 |       ]
305 |     },
306 |     {
307 |       "cell_type": "code",
308 |       "execution_count": 15,
309 |       "metadata": {
310 |         "id": "cR0k1cRWz8Pm"
311 |       },
312 |       "outputs": [],
313 |       "source": [
314 |         "from langchain.llms import HuggingFacePipeline\n",
315 |         "from langchain.prompts import PromptTemplate\n",
316 |         "from transformers import pipeline\n",
317 |         "from langchain_core.output_parsers import StrOutputParser\n",
318 |         "\n",
319 |         "text_generation_pipeline = pipeline(\n",
320 |         "    model=model,\n",
321 |         "    tokenizer=tokenizer,\n",
322 |         "    task=\"text-generation\",\n",
323 |         "    temperature=0.2,\n",
324 |         "    do_sample=True,\n",
325 |         "    repetition_penalty=1.1,\n",
326 |         "    return_full_text=True,\n",
327 |         "    max_new_tokens=400,\n",
328 |         ")\n",
329 |         "\n",
330 |         "llm = HuggingFacePipeline(pipeline=text_generation_pipeline)\n",
331 |         "\n",
332 |         "prompt_template = \"\"\"\n",
333 |         "<|system|>\n",
334 |         "Answer the question based on your knowledge. Use the following context to help:\n",
335 |         "\n",
336 |         "{context}\n",
337 |         "\n",
338 |         "</s>\n",
339 |         "<|user|>\n",
340 |         "{question}\n",
341 |         "</s>\n",
342 |         "<|assistant|>\n",
343 |         "\n",
344 |         " \"\"\"\n",
345 |         "\n",
346 |         "prompt = PromptTemplate(\n",
347 |         "    input_variables=[\"context\", \"question\"],\n",
348 |         "    template=prompt_template,\n",
349 |         ")\n",
350 |         "\n",
351 |         "llm_chain = prompt | llm | StrOutputParser()"
352 |       ]
353 |     },
354 |     {
355 |       "cell_type": "markdown",
356 |       "metadata": {
357 |         "id": "l19UKq5HXfSp"
358 |       },
359 |       "source": [
360 |         "注意：你也可以使用 `tokenizer.apply_chat_template` 转换列表消息为合适聊天格式的字符串（字典也行  `{'role': 'user', 'content': '(...)'}`）\n",
361 |         "\n",
362 |         "最后，我们需要将 LLM 链与检索器(retriever)结合起来创建一个 RAG 链。我们将原始问题以及检索到的文档上下文传递到最后生成步骤："
363 |       ]
364 |     },
365 |     {
366 |       "cell_type": "code",
367 |       "execution_count": 17,
368 |       "metadata": {
369 |         "id": "_rI3YNp9Xl4s"
370 |       },
371 |       "outputs": [],
372 |       "source": [
373 |         "from langchain_core.runnables import RunnablePassthrough\n",
374 |         "\n",
375 |         "retriever = db.as_retriever()\n",
376 |         "\n",
377 |         "rag_chain = (\n",
378 |         " {\"context\": retriever, \"question\": RunnablePassthrough()}\n",
379 |         "    | llm_chain\n",
380 |         ")\n"
381 |       ]
382 |     },
383 |     {
384 |       "cell_type": "markdown",
385 |       "metadata": {
386 |         "id": "UsCOhfDDXpaS"
387 |       },
388 |       "source": [
389 |         "## 比较结果\n",
390 |         "\n",
391 |         "让我们看看对于特定领域库的问题不同的 RAG 的生成的回答。"
392 |       ]
393 |     },
394 |     {
395 |       "cell_type": "code",
396 |       "execution_count": 18,
397 |       "metadata": {
398 |         "id": "W7F07fQLXusU"
399 |       },
400 |       "outputs": [],
401 |       "source": [
402 |         "question = \"How do you combine multiple adapters?\""
403 |       ]
404 |     },
405 |     {
406 |       "cell_type": "markdown",
407 |       "metadata": {
408 |         "id": "KC0rJYU1x1ir"
409 |       },
410 |       "source": [
411 |         "首先，让我们看看仅仅通过模型自身不加检索内容能得到什么答案:"
412 |       ]
413 |     },
414 |     {
415 |       "cell_type": "code",
416 |       "execution_count": 20,
417 |       "metadata": {
418 |         "colab": {
419 |           "base_uri": "https://localhost:8080/",
420 |           "height": 125
421 |         },
422 |         "id": "GYh-HG1l0De5",
423 |         "outputId": "277d8e89-ce9b-4e04-c11b-639ad2645759"
424 |       },
425 |       "outputs": [
426 |         {
427 |           "data": {
428 |             "application/vnd.google.colaboratory.intrinsic+json": {
429 |               "type": "string"
430 |             },
431 |             "text/plain": [
432 |               "\" To combine multiple adapters, you need to ensure that they are compatible with each other and the devices you want to connect. Here's how you can do it:\\n\\n1. Identify the adapters you need: Determine which adapters you require to connect the devices you want to use together. For example, if you want to connect a USB-C device to an HDMI monitor, you may need a USB-C to HDMI adapter and a USB-C to USB-A adapter (if your computer only has USB-A ports).\\n\\n2. Connect the first adapter: Plug in the first adapter into the device you want to connect. For instance, if you're connecting a USB-C laptop to an HDMI monitor, plug the USB-C to HDMI adapter into the laptop's USB-C port.\\n\\n3. Connect the second adapter: Next, connect the second adapter to the first one. In this case, connect the USB-C to USB-A adapter to the USB-C port of the USB-C to HDMI adapter.\\n\\n4. Connect the final device: Finally, connect the device you want to use to the second adapter. For example, connect the HDMI cable from the monitor to the HDMI port on the USB-C to HDMI adapter.\\n\\n5. Test the connection: Turn on both devices and check whether everything is working correctly. If necessary, adjust the settings on your devices to ensure optimal performance.\\n\\nBy combining multiple adapters, you can connect a variety of devices together, even if they don't have the same type of connector. Just be sure to choose adapters that are compatible with all the devices you want to connect and test the connection thoroughly before relying on it for critical tasks.\""
433 |             ]
434 |           },
435 |           "execution_count": 20,
436 |           "metadata": {},
437 |           "output_type": "execute_result"
438 |         }
439 |       ],
440 |       "source": [
441 |         "llm_chain.invoke({\"context\":\"\", \"question\": question})"
442 |       ]
443 |     },
444 |     {
445 |       "cell_type": "markdown",
446 |       "metadata": {
447 |         "id": "i-TIWr3wx9w8"
448 |       },
449 |       "source": [
450 |         "可以看到，模型将这个问题解释为关于物理电脑适配器的问题，而在 PEFT 的背景下，“适配器”指的是 LoRA 适配器。\n",
451 |         "让我们看看添加 GitHub issues 的上下文是否有助于模型给出更相关的答案：\n"
452 |       ]
453 |     },
454 |     {
455 |       "cell_type": "code",
456 |       "execution_count": 21,
457 |       "metadata": {
458 |         "colab": {
459 |           "base_uri": "https://localhost:8080/",
460 |           "height": 125
461 |         },
462 |         "id": "FZpNA3o10H10",
463 |         "outputId": "31f9aed3-3dd7-4ff8-d1a8-866794fefe80"
464 |       },
465 |       "outputs": [
466 |         {
467 |           "data": {
468 |             "application/vnd.google.colaboratory.intrinsic+json": {
469 |               "type": "string"
470 |             },
471 |             "text/plain": [
472 |               "\" Based on the provided context, it seems that combining multiple adapters is still an open question in the community. Here are some possibilities:\\n\\n  1. Save the output from the base model and pass it to each adapter separately, as described in the first context snippet. This allows you to run multiple adapters simultaneously and reuse the output from the base model. However, this approach requires loading and running each adapter separately.\\n\\n  2. Export everything into a single PyTorch model, as suggested in the second context snippet. This would involve saving all the adapters and their weights into a single model, potentially making it larger and more complex. The advantage of this approach is that it would allow you to run all the adapters simultaneously without having to load and run them separately.\\n\\n  3. Merge multiple Lora adapters, as mentioned in the third context snippet. This involves adding multiple distinct, independent behaviors to a base model by merging multiple Lora adapters. It's not clear from the context how this would be done, but it suggests that there might be a recommended way of doing it.\\n\\n  4. Combine adapters through a specific architecture, as proposed in the fourth context snippet. This involves merging multiple adapters into a single architecture, potentially creating a more complex model with multiple behaviors. Again, it's not clear from the context how this would be done.\\n\\n   Overall, combining multiple adapters is still an active area of research, and there doesn't seem to be a widely accepted solution yet. If you're interested in exploring this further, it might be worth reaching out to the Hugging Face community or checking out their documentation for more information.\""
473 |             ]
474 |           },
475 |           "execution_count": 21,
476 |           "metadata": {},
477 |           "output_type": "execute_result"
478 |         }
479 |       ],
480 |       "source": [
481 |         "rag_chain.invoke(question)"
482 |       ]
483 |     },
484 |     {
485 |       "cell_type": "markdown",
486 |       "metadata": {
487 |         "id": "hZQedZKSyrwO"
488 |       },
489 |       "source": [
490 |         "我们可以看到，加入检索的信息后，同一个模型能够对于特定库的问题给出更准确、更相关的答案。\n",
491 |         "\n",
492 |         "值得注意的是，将多个适配器结合用于推理的功能已经被添加到库中，人们可以在文档中找到这些信息，因此在下一个迭代的RAG中，包含文档嵌入可能是有价值的。"
493 |       ]
494 |     }
495 |   ],
496 |   "metadata": {
497 |     "accelerator": "GPU",
498 |     "colab": {
499 |       "gpuType": "T4",
500 |       "provenance": []
501 |     },
502 |     "kernelspec": {
503 |       "display_name": "Python 3",
504 |       "name": "python3"
505 |     },
506 |     "language_info": {
507 |       "codemirror_mode": {
508 |         "name": "ipython",
509 |         "version": 3
510 |       },
511 |       "file_extension": ".py",
512 |       "mimetype": "text/x-python",
513 |       "name": "python",
514 |       "nbconvert_exporter": "python",
515 |       "pygments_lexer": "ipython3",
516 |       "version": "3.11.3"
517 |     }
518 |   },
519 |   "nbformat": 4,
520 |   "nbformat_minor": 0
521 | }
522 | 


--------------------------------------------------------------------------------
/notebooks/en/rag_zephyr_langchain.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "metadata": {
  6 |         "id": "Kih21u1tyr-I"
  7 |       },
  8 |       "source": [
  9 |         "# Simple RAG for GitHub issues using Hugging Face Zephyr and LangChain\n",
 10 |         "\n",
 11 |         "_Authored by: [Maria Khalusova](https://github.com/MKhalusova)_\n",
 12 |         "\n",
 13 |         "This notebook demonstrates how you can quickly build a RAG (Retrieval Augmented Generation) for a project's GitHub issues using [`HuggingFaceH4/zephyr-7b-beta`](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) model, and LangChain.\n",
 14 |         "\n",
 15 |         "\n",
 16 |         "**What is RAG?**\n",
 17 |         "\n",
 18 |         "RAG is a popular approach to address the issue of a powerful LLM not being aware of specific content due to said content not being in its training data, or hallucinating even when it has seen it before. Such specific content may be proprietary, sensitive, or, as in this example, recent and updated often.\n",
 19 |         "\n",
 20 |         "If your data is static and doesn't change regularly, you may consider fine-tuning a large model. In many cases, however, fine-tuning can be costly, and, when done repeatedly (e.g. to address data drift), leads to \"model shift\". This is when the model's behavior changes in ways that are not desirable.\n",
 21 |         "\n",
 22 |         "**RAG (Retrieval Augmented Generation)** does not require model fine-tuning. Instead, RAG works by providing an LLM with additional context that is retrieved from relevant data so that it can generate a better-informed response.\n",
 23 |         "\n",
 24 |         "Here's a quick illustration:\n",
 25 |         "\n",
 26 |         "![RAG diagram](https://huggingface.co/datasets/huggingface/cookbook-images/resolve/main/rag-diagram.png)\n",
 27 |         "\n",
 28 |         "* The external data is converted into embedding vectors with a separate embeddings model, and the vectors are kept in a database. Embeddings models are typically small, so updating the embedding vectors on a regular basis is faster, cheaper, and easier than fine-tuning a model.\n",
 29 |         "\n",
 30 |         "* At the same time, the fact that fine-tuning is not required gives you the freedom to swap your LLM for a more powerful one when it becomes available, or switch to a smaller distilled version, should you need faster inference.\n",
 31 |         "\n",
 32 |         "Let's illustrate building a RAG using an open-source LLM, embeddings model, and LangChain.\n",
 33 |         "\n",
 34 |         "First, install the required dependencies:"
 35 |       ]
 36 |     },
 37 |     {
 38 |       "cell_type": "code",
 39 |       "execution_count": null,
 40 |       "metadata": {
 41 |         "id": "lC9frDOlyi38"
 42 |       },
 43 |       "outputs": [],
 44 |       "source": [
 45 |         "!pip install -q torch transformers accelerate bitsandbytes transformers sentence-transformers faiss-gpu"
 46 |       ]
 47 |     },
 48 |     {
 49 |       "cell_type": "code",
 50 |       "execution_count": 2,
 51 |       "metadata": {
 52 |         "id": "-aYENQwZ-p_c"
 53 |       },
 54 |       "outputs": [],
 55 |       "source": [
 56 |         "# If running in Google Colab, you may need to run this cell to make sure you're using UTF-8 locale to install LangChain\n",
 57 |         "import locale\n",
 58 |         "locale.getpreferredencoding = lambda: \"UTF-8\""
 59 |       ]
 60 |     },
 61 |     {
 62 |       "cell_type": "code",
 63 |       "execution_count": null,
 64 |       "metadata": {
 65 |         "id": "W5HhMZ2c-NfU"
 66 |       },
 67 |       "outputs": [],
 68 |       "source": [
 69 |         "!pip install -q langchain"
 70 |       ]
 71 |     },
 72 |     {
 73 |       "cell_type": "markdown",
 74 |       "metadata": {
 75 |         "id": "R8po01vMWzXL"
 76 |       },
 77 |       "source": [
 78 |         "## Prepare the data\n"
 79 |       ]
 80 |     },
 81 |     {
 82 |       "cell_type": "markdown",
 83 |       "metadata": {
 84 |         "id": "3cCmQywC04x6"
 85 |       },
 86 |       "source": [
 87 |         "In this example, we'll load all of the issues (both open and closed) from [PEFT library's repo](https://github.com/huggingface/peft).\n",
 88 |         "\n",
 89 |         "First, you need to acquire a [GitHub personal access token](https://github.com/settings/tokens?type=beta) to access the GitHub API."
 90 |       ]
 91 |     },
 92 |     {
 93 |       "cell_type": "code",
 94 |       "execution_count": null,
 95 |       "metadata": {
 96 |         "id": "8MoD7NbsNjlM"
 97 |       },
 98 |       "outputs": [],
 99 |       "source": [
100 |         "from getpass import getpass\n",
101 |         "ACCESS_TOKEN = getpass(\"YOUR_GITHUB_PERSONAL_TOKEN\")"
102 |       ]
103 |     },
104 |     {
105 |       "cell_type": "markdown",
106 |       "metadata": {
107 |         "id": "fccecm3a10N6"
108 |       },
109 |       "source": [
110 |         "Next, we'll load all of the issues in the [huggingface/peft](https://github.com/huggingface/peft) repo:\n",
111 |         "- By default, pull requests are considered issues as well, here we chose to exclude them from data with by setting `include_prs=False`\n",
112 |         "- Setting `state = \"all\"` means we will load both open and closed issues."
113 |       ]
114 |     },
115 |     {
116 |       "cell_type": "code",
117 |       "execution_count": 5,
118 |       "metadata": {
119 |         "id": "8EKMit4WNDY8"
120 |       },
121 |       "outputs": [],
122 |       "source": [
123 |         "from langchain.document_loaders import GitHubIssuesLoader\n",
124 |         "\n",
125 |         "loader = GitHubIssuesLoader(\n",
126 |         "    repo=\"huggingface/peft\",\n",
127 |         "    access_token=ACCESS_TOKEN,\n",
128 |         "    include_prs=False,\n",
129 |         "    state=\"all\"\n",
130 |         ")\n",
131 |         "\n",
132 |         "docs = loader.load()"
133 |       ]
134 |     },
135 |     {
136 |       "cell_type": "markdown",
137 |       "metadata": {
138 |         "id": "CChTrY-k2qO5"
139 |       },
140 |       "source": [
141 |         "The content of individual GitHub issues may be longer than what an embedding model can take as input. If we want to embed all of the available content, we need to chunk the documents into appropriately sized pieces.\n",
142 |         "\n",
143 |         "The most common and straightforward approach to chunking is to define a fixed size of chunks and whether there should be any overlap between them. Keeping some overlap between chunks allows us to preserve some semantic context between the chunks. The recommended splitter for generic text is the [RecursiveCharacterTextSplitter](https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter), and that's what we'll use here. "
144 |       ]
145 |     },
146 |     {
147 |       "cell_type": "code",
148 |       "execution_count": null,
149 |       "metadata": {
150 |         "id": "OmsXOf59Pmm-"
151 |       },
152 |       "outputs": [],
153 |       "source": [
154 |         "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
155 |         "\n",
156 |         "splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=30)\n",
157 |         "\n",
158 |         "chunked_docs = splitter.split_documents(docs)"
159 |       ]
160 |     },
161 |     {
162 |       "cell_type": "markdown",
163 |       "metadata": {
164 |         "id": "DAt_zPVlXOn7"
165 |       },
166 |       "source": [
167 |         "## Create the embeddings + retriever"
168 |       ]
169 |     },
170 |     {
171 |       "cell_type": "markdown",
172 |       "metadata": {
173 |         "id": "-mvat6JQl4yp"
174 |       },
175 |       "source": [
176 |         "Now that the docs are all of the appropriate size, we can create a database with their embeddings.\n",
177 |         "\n",
178 |         "To create document chunk embeddings we'll use the `HuggingFaceEmbeddings` and the [`BAAI/bge-base-en-v1.5`](https://huggingface.co/BAAI/bge-base-en-v1.5) embeddings model. There are many other embeddings models available on the Hub, and you can keep an eye on the best performing ones by checking the [Massive Text Embedding Benchmark (MTEB) Leaderboard](https://huggingface.co/spaces/mteb/leaderboard).\n",
179 |         "\n",
180 |         "\n",
181 |         "To create the vector database, we'll use `FAISS`, a library developed by Facebook AI. This library offers efficient similarity search and clustering of dense vectors, which is what we need here. FAISS is currently one of the most used libraries for NN search in massive datasets.\n",
182 |         "\n",
183 |         "We'll access both the embeddings model and FAISS via LangChain API."
184 |       ]
185 |     },
186 |     {
187 |       "cell_type": "code",
188 |       "execution_count": null,
189 |       "metadata": {
190 |         "id": "ixmCdRzBQ5gu"
191 |       },
192 |       "outputs": [],
193 |       "source": [
194 |         "from langchain.vectorstores import FAISS\n",
195 |         "from langchain.embeddings import HuggingFaceEmbeddings\n",
196 |         "\n",
197 |         "db = FAISS.from_documents(chunked_docs,\n",
198 |         "                          HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5'))"
199 |       ]
200 |     },
201 |     {
202 |       "cell_type": "markdown",
203 |       "metadata": {
204 |         "id": "2iCgEPi0nnN6"
205 |       },
206 |       "source": [
207 |         "We need a way to return(retrieve) the documents given an unstructured query. For that, we'll use the `as_retriever` method using the `db` as a backbone:\n",
208 |         "- `search_type=\"similarity\"` means we want to perform similarity search between the query and documents\n",
209 |         "- `search_kwargs={'k': 4}` instructs the retriever to return top 4 results.\n"
210 |       ]
211 |     },
212 |     {
213 |       "cell_type": "code",
214 |       "execution_count": 8,
215 |       "metadata": {
216 |         "id": "mBTreCQ9noHK"
217 |       },
218 |       "outputs": [],
219 |       "source": [
220 |         "retriever = db.as_retriever(\n",
221 |         "    search_type=\"similarity\",\n",
222 |         "    search_kwargs={'k': 4}\n",
223 |         ")"
224 |       ]
225 |     },
226 |     {
227 |       "cell_type": "markdown",
228 |       "metadata": {
229 |         "id": "WgEhlISJpTgj"
230 |       },
231 |       "source": [
232 |         "The vector database and retriever are now set up, next we need to set up the next piece of the chain - the model."
233 |       ]
234 |     },
235 |     {
236 |       "cell_type": "markdown",
237 |       "metadata": {
238 |         "id": "tzQxx0HkXVFU"
239 |       },
240 |       "source": [
241 |         "## Load quantized model"
242 |       ]
243 |     },
244 |     {
245 |       "cell_type": "markdown",
246 |       "metadata": {
247 |         "id": "9jy1cC65p_GD"
248 |       },
249 |       "source": [
250 |         "For this example, we chose [`HuggingFaceH4/zephyr-7b-beta`](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta), a small but powerful model.\n",
251 |         "\n",
252 |         "With many models being released every week, you may want to substitute this model to the latest and greatest. The best way to keep track of open source LLMs is to check the [Open-source LLM leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).\n",
253 |         "\n",
254 |         "To make inference faster, we will load the quantized version of the model:"
255 |       ]
256 |     },
257 |     {
258 |       "cell_type": "code",
259 |       "execution_count": null,
260 |       "metadata": {
261 |         "id": "L-ggaa763VRo"
262 |       },
263 |       "outputs": [],
264 |       "source": [
265 |         "import torch\n",
266 |         "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
267 |         "\n",
268 |         "model_name = 'HuggingFaceH4/zephyr-7b-beta'\n",
269 |         "\n",
270 |         "bnb_config = BitsAndBytesConfig(\n",
271 |         "    load_in_4bit=True,\n",
272 |         "    bnb_4bit_use_double_quant=True,\n",
273 |         "    bnb_4bit_quant_type=\"nf4\",\n",
274 |         "    bnb_4bit_compute_dtype=torch.bfloat16\n",
275 |         ")\n",
276 |         "\n",
277 |         "model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)\n",
278 |         "tokenizer = AutoTokenizer.from_pretrained(model_name)"
279 |       ]
280 |     },
281 |     {
282 |       "cell_type": "markdown",
283 |       "metadata": {
284 |         "id": "hVNRJALyXYHG"
285 |       },
286 |       "source": [
287 |         "## Setup the LLM chain"
288 |       ]
289 |     },
290 |     {
291 |       "cell_type": "markdown",
292 |       "metadata": {
293 |         "id": "RUUNneJ1smhl"
294 |       },
295 |       "source": [
296 |         "Finally, we have all the pieces we need to set up the LLM chain.\n",
297 |         "\n",
298 |         "First, create a text_generation pipeline using the loaded model and its tokenizer.\n",
299 |         "\n",
300 |         "Next, create a prompt template - this should follow the format of the model, so if you substitute the model checkpoint, make sure to use the appropriate formatting."
301 |       ]
302 |     },
303 |     {
304 |       "cell_type": "code",
305 |       "execution_count": 15,
306 |       "metadata": {
307 |         "id": "cR0k1cRWz8Pm"
308 |       },
309 |       "outputs": [],
310 |       "source": [
311 |         "from langchain.llms import HuggingFacePipeline\n",
312 |         "from langchain.prompts import PromptTemplate\n",
313 |         "from transformers import pipeline\n",
314 |         "from langchain_core.output_parsers import StrOutputParser\n",
315 |         "\n",
316 |         "text_generation_pipeline = pipeline(\n",
317 |         "    model=model,\n",
318 |         "    tokenizer=tokenizer,\n",
319 |         "    task=\"text-generation\",\n",
320 |         "    temperature=0.2,\n",
321 |         "    do_sample=True,\n",
322 |         "    repetition_penalty=1.1,\n",
323 |         "    return_full_text=True,\n",
324 |         "    max_new_tokens=400,\n",
325 |         ")\n",
326 |         "\n",
327 |         "llm = HuggingFacePipeline(pipeline=text_generation_pipeline)\n",
328 |         "\n",
329 |         "prompt_template = \"\"\"\n",
330 |         "<|system|>\n",
331 |         "Answer the question based on your knowledge. Use the following context to help:\n",
332 |         "\n",
333 |         "{context}\n",
334 |         "\n",
335 |         "</s>\n",
336 |         "<|user|>\n",
337 |         "{question}\n",
338 |         "</s>\n",
339 |         "<|assistant|>\n",
340 |         "\n",
341 |         " \"\"\"\n",
342 |         "\n",
343 |         "prompt = PromptTemplate(\n",
344 |         "    input_variables=[\"context\", \"question\"],\n",
345 |         "    template=prompt_template,\n",
346 |         ")\n",
347 |         "\n",
348 |         "llm_chain = prompt | llm | StrOutputParser()"
349 |       ]
350 |     },
351 |     {
352 |       "cell_type": "markdown",
353 |       "metadata": {
354 |         "id": "l19UKq5HXfSp"
355 |       },
356 |       "source": [
357 |         "Note: _You can also use `tokenizer.apply_chat_template` to convert a list of messages (as dicts: `{'role': 'user', 'content': '(...)'}`) into a string with the appropriate chat format._\n",
358 |         "\n",
359 |         "\n",
360 |         "Finally, we need to combine the `llm_chain` with the retriever to create a RAG chain. We pass the original question through to the final generation step, as well as the retrieved context docs:"
361 |       ]
362 |     },
363 |     {
364 |       "cell_type": "code",
365 |       "execution_count": 17,
366 |       "metadata": {
367 |         "id": "_rI3YNp9Xl4s"
368 |       },
369 |       "outputs": [],
370 |       "source": [
371 |         "from langchain_core.runnables import RunnablePassthrough\n",
372 |         "\n",
373 |         "retriever = db.as_retriever()\n",
374 |         "\n",
375 |         "rag_chain = (\n",
376 |         " {\"context\": retriever, \"question\": RunnablePassthrough()}\n",
377 |         "    | llm_chain\n",
378 |         ")\n"
379 |       ]
380 |     },
381 |     {
382 |       "cell_type": "markdown",
383 |       "metadata": {
384 |         "id": "UsCOhfDDXpaS"
385 |       },
386 |       "source": [
387 |         "## Compare the results\n",
388 |         "\n",
389 |         "Let's see the difference RAG makes in generating answers to the library-specific questions."
390 |       ]
391 |     },
392 |     {
393 |       "cell_type": "code",
394 |       "execution_count": 18,
395 |       "metadata": {
396 |         "id": "W7F07fQLXusU"
397 |       },
398 |       "outputs": [],
399 |       "source": [
400 |         "question = \"How do you combine multiple adapters?\""
401 |       ]
402 |     },
403 |     {
404 |       "cell_type": "markdown",
405 |       "metadata": {
406 |         "id": "KC0rJYU1x1ir"
407 |       },
408 |       "source": [
409 |         "First, let's see what kind of answer we can get with just the model itself, no context added:"
410 |       ]
411 |     },
412 |     {
413 |       "cell_type": "code",
414 |       "execution_count": 20,
415 |       "metadata": {
416 |         "colab": {
417 |           "base_uri": "https://localhost:8080/",
418 |           "height": 125
419 |         },
420 |         "id": "GYh-HG1l0De5",
421 |         "outputId": "277d8e89-ce9b-4e04-c11b-639ad2645759"
422 |       },
423 |       "outputs": [
424 |         {
425 |           "data": {
426 |             "application/vnd.google.colaboratory.intrinsic+json": {
427 |               "type": "string"
428 |             },
429 |             "text/plain": [
430 |               "\" To combine multiple adapters, you need to ensure that they are compatible with each other and the devices you want to connect. Here's how you can do it:\\n\\n1. Identify the adapters you need: Determine which adapters you require to connect the devices you want to use together. For example, if you want to connect a USB-C device to an HDMI monitor, you may need a USB-C to HDMI adapter and a USB-C to USB-A adapter (if your computer only has USB-A ports).\\n\\n2. Connect the first adapter: Plug in the first adapter into the device you want to connect. For instance, if you're connecting a USB-C laptop to an HDMI monitor, plug the USB-C to HDMI adapter into the laptop's USB-C port.\\n\\n3. Connect the second adapter: Next, connect the second adapter to the first one. In this case, connect the USB-C to USB-A adapter to the USB-C port of the USB-C to HDMI adapter.\\n\\n4. Connect the final device: Finally, connect the device you want to use to the second adapter. For example, connect the HDMI cable from the monitor to the HDMI port on the USB-C to HDMI adapter.\\n\\n5. Test the connection: Turn on both devices and check whether everything is working correctly. If necessary, adjust the settings on your devices to ensure optimal performance.\\n\\nBy combining multiple adapters, you can connect a variety of devices together, even if they don't have the same type of connector. Just be sure to choose adapters that are compatible with all the devices you want to connect and test the connection thoroughly before relying on it for critical tasks.\""
431 |             ]
432 |           },
433 |           "execution_count": 20,
434 |           "metadata": {},
435 |           "output_type": "execute_result"
436 |         }
437 |       ],
438 |       "source": [
439 |         "llm_chain.invoke({\"context\":\"\", \"question\": question})"
440 |       ]
441 |     },
442 |     {
443 |       "cell_type": "markdown",
444 |       "metadata": {
445 |         "id": "i-TIWr3wx9w8"
446 |       },
447 |       "source": [
448 |         "As you can see, the model interpreted the question as one about physical computer adapters, while in the context of PEFT, \"adapters\" refer to LoRA adapters.\n",
449 |         "Let's see if adding context from GitHub issues helps the model give a more relevant answer:"
450 |       ]
451 |     },
452 |     {
453 |       "cell_type": "code",
454 |       "execution_count": 21,
455 |       "metadata": {
456 |         "colab": {
457 |           "base_uri": "https://localhost:8080/",
458 |           "height": 125
459 |         },
460 |         "id": "FZpNA3o10H10",
461 |         "outputId": "31f9aed3-3dd7-4ff8-d1a8-866794fefe80"
462 |       },
463 |       "outputs": [
464 |         {
465 |           "data": {
466 |             "application/vnd.google.colaboratory.intrinsic+json": {
467 |               "type": "string"
468 |             },
469 |             "text/plain": [
470 |               "\" Based on the provided context, it seems that combining multiple adapters is still an open question in the community. Here are some possibilities:\\n\\n  1. Save the output from the base model and pass it to each adapter separately, as described in the first context snippet. This allows you to run multiple adapters simultaneously and reuse the output from the base model. However, this approach requires loading and running each adapter separately.\\n\\n  2. Export everything into a single PyTorch model, as suggested in the second context snippet. This would involve saving all the adapters and their weights into a single model, potentially making it larger and more complex. The advantage of this approach is that it would allow you to run all the adapters simultaneously without having to load and run them separately.\\n\\n  3. Merge multiple Lora adapters, as mentioned in the third context snippet. This involves adding multiple distinct, independent behaviors to a base model by merging multiple Lora adapters. It's not clear from the context how this would be done, but it suggests that there might be a recommended way of doing it.\\n\\n  4. Combine adapters through a specific architecture, as proposed in the fourth context snippet. This involves merging multiple adapters into a single architecture, potentially creating a more complex model with multiple behaviors. Again, it's not clear from the context how this would be done.\\n\\n   Overall, combining multiple adapters is still an active area of research, and there doesn't seem to be a widely accepted solution yet. If you're interested in exploring this further, it might be worth reaching out to the Hugging Face community or checking out their documentation for more information.\""
471 |             ]
472 |           },
473 |           "execution_count": 21,
474 |           "metadata": {},
475 |           "output_type": "execute_result"
476 |         }
477 |       ],
478 |       "source": [
479 |         "rag_chain.invoke(question)"
480 |       ]
481 |     },
482 |     {
483 |       "cell_type": "markdown",
484 |       "metadata": {
485 |         "id": "hZQedZKSyrwO"
486 |       },
487 |       "source": [
488 |         "As we can see, the added context, really helps the exact same model, provide a much more relevant and informed answer to the library-specific question.\n",
489 |         "\n",
490 |         "Notably, combining multiple adapters for inference has been added to the library, and one can find this information in the documentation, so for the next iteration of this RAG it may be worth including documentation embeddings."
491 |       ]
492 |     }
493 |   ],
494 |   "metadata": {
495 |     "accelerator": "GPU",
496 |     "colab": {
497 |       "gpuType": "T4",
498 |       "provenance": []
499 |     },
500 |     "kernelspec": {
501 |       "display_name": "Python 3",
502 |       "name": "python3"
503 |     },
504 |     "language_info": {
505 |       "codemirror_mode": {
506 |         "name": "ipython",
507 |         "version": 3
508 |       },
509 |       "file_extension": ".py",
510 |       "mimetype": "text/x-python",
511 |       "name": "python",
512 |       "nbconvert_exporter": "python",
513 |       "pygments_lexer": "ipython3",
514 |       "version": "3.11.3"
515 |     }
516 |   },
517 |   "nbformat": 4,
518 |   "nbformat_minor": 0
519 | }
520 | 


--------------------------------------------------------------------------------
/notebooks/zh-CN/automatic_embedding_tei_inference_endpoints.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "5d9aca72-957a-4ee2-862f-e011b9cd3a62",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# 怎么使用推理端点去嵌入文档\n",
  9 |     "\n",
 10 |     "_作者: [Derek Thomas](https://huggingface.co/derek-thomas)_\n",
 11 |     "\n",
 12 |     "## 目标\n",
 13 |     "\n",
 14 |     "我有一个数据集，我想为其嵌入语义搜索（或问答，或 RAG），我希望以最简单的方式嵌入这个数据集并将其放入一个新的数据集中。\n",
 15 |     "\n",
 16 |     "## 方法\n",
 17 |     "\n",
 18 |     "我将使用我最喜欢的 subreddit [r/bestofredditorupdates](https://www.reddit.com/r/bestofredditorupdates/) 中的数据集。因为它有很长的条目，同时使用新的 [jinaai/jina-embeddings-v2-base-en](https://huggingface.co/jinaai/jina-embeddings-v2-base-en) 嵌入模型，因为它有 8k 的上下文长度。还将使用 [推理端点](https://huggingface.co/inference-endpoints) 部署这个，以节省时间和金钱。要跟随这个教程，你需要**已经添加了支付方式**。如果你还没有添加，可以在 [账单](https://huggingface.co/docs/hub/billing#billing) 中添加。为了使操作更加简单，我将完全基于 API 进行操作。\n",
 19 |     "\n",
 20 |     "为了使这个过程更快，我将使用 [Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference) 镜像。这有许多好处，比如：\n",
 21 |     "- 无需模型图编译步骤\n",
 22 |     "- Docker 镜像小，启动时间快。真正的无服务器！\n",
 23 |     "- 基于 token 的动态批处理\n",
 24 |     "- 使用 Flash 注意力机制、Candle 和 cuBLASLt 优化的 transformers 代码进行推理\n",
 25 |     "- Safetensors 权重加载\n",
 26 |     "- 生产就绪（使用 Open Telemetry 进行分布式跟踪，Prometheus 指标）\n",
 27 |     "\n",
 28 |     "\n",
 29 |     "![img](https://media.githubusercontent.com/media/huggingface/text-embeddings-inference/main/assets/bs1-tp.png)"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "markdown",
 34 |    "id": "3c830114-dd88-45a9-81b9-78b0e3da7384",
 35 |    "metadata": {},
 36 |    "source": [
 37 |     "## 环境(Requirements)"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": null,
 43 |    "id": "35386f72-32cb-49fa-a108-3aa504e20429",
 44 |    "metadata": {
 45 |     "tags": []
 46 |    },
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "!pip install -q aiohttp==3.8.3 datasets==2.14.6 pandas==1.5.3 requests==2.31.0 tqdm==4.66.1 huggingface-hub>=0.20"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "id": "b6f72042-173d-4a72-ade1-9304b43b528d",
 55 |    "metadata": {},
 56 |    "source": [
 57 |     "## 导入包"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 3,
 63 |    "id": "e2beecdd-d033-4736-bd45-6754ec53b4ac",
 64 |    "metadata": {
 65 |     "tags": []
 66 |    },
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "import asyncio\n",
 70 |     "from getpass import getpass\n",
 71 |     "import json\n",
 72 |     "from pathlib import Path\n",
 73 |     "import time\n",
 74 |     "from typing import Optional\n",
 75 |     "\n",
 76 |     "from aiohttp import ClientSession, ClientTimeout\n",
 77 |     "from datasets import load_dataset, Dataset, DatasetDict\n",
 78 |     "from huggingface_hub import notebook_login, create_inference_endpoint, list_inference_endpoints, whoami\n",
 79 |     "import numpy as np\n",
 80 |     "import pandas as pd\n",
 81 |     "import requests\n",
 82 |     "from tqdm.auto import tqdm"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "id": "5eece903-64ce-435d-a2fd-096c0ff650bf",
 88 |    "metadata": {},
 89 |    "source": [
 90 |     "## 设置(Config)\n",
 91 |     "`DATASET_IN` 你文本数据的位置\n",
 92 |     "`DATASET_OUT` 你的嵌入储存的位置\n",
 93 |     "\n",
 94 |     "注意：我将 `MAX_WORKERS` 设置为 5，因为 `jina-embeddings-v2` 对内存的需求较大。"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": 4,
100 |    "id": "df2f79f0-9f28-46e6-9fc7-27e9537ff5be",
101 |    "metadata": {
102 |     "tags": []
103 |    },
104 |    "outputs": [],
105 |    "source": [
106 |     "DATASET_IN = 'derek-thomas/dataset-creator-reddit-bestofredditorupdates'\n",
107 |     "DATASET_OUT = \"processed-subset-bestofredditorupdates\"\n",
108 |     "ENDPOINT_NAME = \"boru-jina-embeddings-demo-ie\"\n",
109 |     "\n",
110 |     "MAX_WORKERS = 5  # This is for how many async workers you want. Choose based on the model and hardware \n",
111 |     "ROW_COUNT = 100  # Choose None to use all rows, Im using 100 just for a demo"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "id": "1e680f3d-4900-46cc-8b49-bb6ba3e27e2b",
117 |    "metadata": {},
118 |    "source": [
119 |     "Hugging Face 在推理端点中提供了多种 GPU 供选择。下面以表格形式呈现：\n",
120 |     "\n",
121 |     "| GPU                 | 实例类型   | 实例大小 | vRAM  |\n",
122 |     "|---------------------|----------------|--------------|-------|\n",
123 |     "| 1x Nvidia Tesla T4 | g4dn.xlarge | small | 16GB |\n",
124 |     "| 4x Nvidia Tesla T4 | g4dn.12xlarge | large | 64GB |\n",
125 |     "| 1x Nvidia A10G | g5.2xlarge | medium | 24GB |\n",
126 |     "| 4x Nvidia A10G | g5.12xlarge | xxlarge | 96GB |\n",
127 |     "| 1x Nvidia A100* | p4de | xlarge | 80GB |\n",
128 |     "| 2x Nvidia A100* | p4de | 2xlarge | 160GB |\n",
129 |     "\n",
130 |     "\\*注意，对于 A100 的机型你需要发邮件给我们来获取权限。"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": 4,
136 |    "id": "3c2106c1-2e5a-443a-9ea8-a3cd0e9c5a94",
137 |    "metadata": {
138 |     "tags": []
139 |    },
140 |    "outputs": [],
141 |    "source": [
142 |     "# GPU Choice\n",
143 |     "VENDOR=\"aws\"\n",
144 |     "REGION=\"us-east-1\"\n",
145 |     "INSTANCE_SIZE=\"medium\"\n",
146 |     "INSTANCE_TYPE=\"g5.2xlarge\""
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": 5,
152 |    "id": "0ca1140c-3fcc-4b99-9210-6da1505a27b7",
153 |    "metadata": {
154 |     "tags": []
155 |    },
156 |    "outputs": [
157 |     {
158 |      "data": {
159 |       "application/vnd.jupyter.widget-view+json": {
160 |        "model_id": "ee80821056e147fa9cabf30f64dc85a8",
161 |        "version_major": 2,
162 |        "version_minor": 0
163 |       },
164 |       "text/plain": [
165 |        "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
166 |       ]
167 |      },
168 |      "metadata": {},
169 |      "output_type": "display_data"
170 |     }
171 |    ],
172 |    "source": [
173 |     "notebook_login()"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "markdown",
178 |    "id": "5f4ba0a8-0a6c-4705-a73b-7be09b889610",
179 |    "metadata": {},
180 |    "source": [
181 |     "有些用户可能会在组织中注册支付信息。这肯能会使你的支付方式链接组织。\n",
182 |     "\n",
183 |     "如果你想使用你自己的用户名，请将其留空。"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": 6,
189 |    "id": "88cdbd73-5923-4ae9-9940-b6be935f70fa",
190 |    "metadata": {
191 |     "tags": []
192 |    },
193 |    "outputs": [
194 |     {
195 |      "name": "stdout",
196 |      "output_type": "stream",
197 |      "text": [
198 |       "What is your Hugging Face 🤗 username or organization? (with an added payment method) ········\n"
199 |      ]
200 |     }
201 |    ],
202 |    "source": [
203 |     "who = whoami()\n",
204 |     "organization = getpass(prompt=\"What is your Hugging Face 🤗 username or organization? (with an added payment method)\")\n",
205 |     "\n",
206 |     "namespace = organization or who['name']"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "markdown",
211 |    "id": "b972a719-2aed-4d2e-a24f-fae7776d5fa4",
212 |    "metadata": {},
213 |    "source": [
214 |     "## 获取数据"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 7,
220 |    "id": "27835fa4-3a4f-44b1-a02a-5e31584a1bba",
221 |    "metadata": {
222 |     "tags": []
223 |    },
224 |    "outputs": [
225 |     {
226 |      "data": {
227 |       "application/vnd.jupyter.widget-view+json": {
228 |        "model_id": "4041cedd3b3f4f8db3e29ec102f46a3a",
229 |        "version_major": 2,
230 |        "version_minor": 0
231 |       },
232 |       "text/plain": [
233 |        "Downloading readme:   0%|          | 0.00/1.73k [00:00<?, ?B/s]"
234 |       ]
235 |      },
236 |      "metadata": {},
237 |      "output_type": "display_data"
238 |     },
239 |     {
240 |      "data": {
241 |       "text/plain": [
242 |        "Dataset({\n",
243 |        "    features: ['id', 'content', 'score', 'date_utc', 'title', 'flair', 'poster', 'permalink', 'new', 'updated'],\n",
244 |        "    num_rows: 10042\n",
245 |        "})"
246 |       ]
247 |      },
248 |      "execution_count": 7,
249 |      "metadata": {},
250 |      "output_type": "execute_result"
251 |     }
252 |    ],
253 |    "source": [
254 |     "dataset = load_dataset(DATASET_IN)\n",
255 |     "dataset['train']"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": 8,
261 |    "id": "8846087e-4d0d-4c0e-8aeb-ea95d9e97126",
262 |    "metadata": {
263 |     "tags": []
264 |    },
265 |    "outputs": [
266 |     {
267 |      "data": {
268 |       "text/plain": [
269 |        "(100,\n",
270 |        " {'id': '10004zw',\n",
271 |        "  'content': '[removed]',\n",
272 |        "  'score': 1,\n",
273 |        "  'date_utc': Timestamp('2022-12-31 18:16:22'),\n",
274 |        "  'title': 'To All BORU contributors, Thank you :)',\n",
275 |        "  'flair': 'CONCLUDED',\n",
276 |        "  'poster': 'IsItAcOnSeQuEnCe',\n",
277 |        "  'permalink': '/r/BestofRedditorUpdates/comments/10004zw/to_all_boru_contributors_thank_you/',\n",
278 |        "  'new': False,\n",
279 |        "  'updated': False})"
280 |       ]
281 |      },
282 |      "execution_count": 8,
283 |      "metadata": {},
284 |      "output_type": "execute_result"
285 |     }
286 |    ],
287 |    "source": [
288 |     "documents = dataset['train'].to_pandas().to_dict('records')[:ROW_COUNT]\n",
289 |     "len(documents), documents[0]"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "markdown",
294 |    "id": "93096cbc-81c6-4137-a283-6afb0f48fbb9",
295 |    "metadata": {},
296 |    "source": [
297 |     "# 推理端点\n",
298 |     "## 创建推理端点\n",
299 |     "\n",
300 |     "我们将使用 [API](https://huggingface.co/docs/inference-endpoints/api_reference) 来创建一个 [推理端点](https://huggingface.co/inference-endpoints)。主要有以下几个好处：\n",
301 |     "- 方便（无需点击）\n",
302 |     "- 可重复（我们有代码可以轻松运行它）\n",
303 |     "- 更便宜（无需花费时间等待加载，并且可以自动关闭）\n",
304 |     "\n"
305 |    ]
306 |   },
307 |   {
308 |    "cell_type": "code",
309 |    "execution_count": 9,
310 |    "id": "9e59de46-26b7-4bb9-bbad-8bba9931bde7",
311 |    "metadata": {
312 |     "tags": []
313 |    },
314 |    "outputs": [],
315 |    "source": [
316 |     "try:\n",
317 |     "    endpoint = create_inference_endpoint(\n",
318 |     "        ENDPOINT_NAME,\n",
319 |     "        repository=\"jinaai/jina-embeddings-v2-base-en\",\n",
320 |     "        revision=\"7302ac470bed880590f9344bfeee32ff8722d0e5\",\n",
321 |     "        task=\"sentence-embeddings\",\n",
322 |     "        framework=\"pytorch\",\n",
323 |     "        accelerator=\"gpu\",\n",
324 |     "        instance_size=INSTANCE_SIZE,\n",
325 |     "        instance_type=INSTANCE_TYPE,\n",
326 |     "        region=REGION,\n",
327 |     "        vendor=VENDOR,\n",
328 |     "        namespace=namespace,\n",
329 |     "        custom_image={\n",
330 |     "            \"health_route\": \"/health\",\n",
331 |     "            \"env\": {\n",
332 |     "                \"MAX_BATCH_TOKENS\": str(MAX_WORKERS * 2048),\n",
333 |     "                \"MAX_CONCURRENT_REQUESTS\": \"512\",\n",
334 |     "                \"MODEL_ID\": \"/repository\"\n",
335 |     "            },\n",
336 |     "            \"url\": \"ghcr.io/huggingface/text-embeddings-inference:0.5.0\",\n",
337 |     "        },\n",
338 |     "        type=\"protected\",\n",
339 |     "    )\n",
340 |     "except:\n",
341 |     "    endpoint = [ie for ie in list_inference_endpoints(namespace=namespace) if ie.name == ENDPOINT_NAME][0]\n",
342 |     "    print('Loaded endpoint')"
343 |    ]
344 |   },
345 |   {
346 |    "cell_type": "markdown",
347 |    "id": "0f2c97dc-34e8-49e9-b60e-f5b7366294c0",
348 |    "metadata": {},
349 |    "source": [
350 |     "这里有几个设计选择：\n",
351 |     "- 像之前所说，我们使用 `jinaai/jina-embeddings-v2-base-en` 作为我们的模型。\n",
352 |     "    - 为了可复现性，我们将它固定到一个特定的修订版本。\n",
353 |     "- 如果你对更多模型感兴趣，可以查看支持[列表](https://huggingface.co/docs/text-embeddings-inference/supported_models)。\n",
354 |     "    - 请注意，大多数嵌入模型都是基于 BERT 架构的。\n",
355 |     "- `MAX_BATCH_TOKENS` 是根据我们的工作数量和嵌入模型的上下文窗口来选择的。\n",
356 |     "- `type=\"protected\"` 利用的是推理端点详细说明的安全功能。\n",
357 |     "- 我使用 **1x Nvidia A10**，因为 `jina-embeddings-v2` 对内存的需求很大（记住 8k 的上下文长度）。\n",
358 |     "- 如果你有高工作负载的需求，你应该考虑进一步调整 `MAX_BATCH_TOKENS` 和 `MAX_CONCURRENT_REQUESTS`。\n"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "markdown",
363 |    "id": "96d173b2-8980-4554-9039-c62843d3fc7d",
364 |    "metadata": {},
365 |    "source": [
366 |     "## 等待直到它运行起来"
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "code",
371 |    "execution_count": 10,
372 |    "id": "5f3a8bd2-753c-49a8-9452-899578beddc5",
373 |    "metadata": {
374 |     "tags": []
375 |    },
376 |    "outputs": [
377 |     {
378 |      "name": "stdout",
379 |      "output_type": "stream",
380 |      "text": [
381 |       "CPU times: user 48.1 ms, sys: 15.7 ms, total: 63.8 ms\n",
382 |       "Wall time: 52.6 s\n"
383 |      ]
384 |     },
385 |     {
386 |      "data": {
387 |       "text/plain": [
388 |        "InferenceEndpoint(name='boru-jina-embeddings-demo-ie', namespace='HF-test-lab', repository='jinaai/jina-embeddings-v2-base-en', status='running', url='https://k7l1xeok1jwnpbx5.us-east-1.aws.endpoints.huggingface.cloud')"
389 |       ]
390 |      },
391 |      "execution_count": 10,
392 |      "metadata": {},
393 |      "output_type": "execute_result"
394 |     }
395 |    ],
396 |    "source": [
397 |     "%%time\n",
398 |     "endpoint.wait()"
399 |    ]
400 |   },
401 |   {
402 |    "cell_type": "markdown",
403 |    "id": "a906645e-60de-4eb6-b8b6-3ec98a9d9b00",
404 |    "metadata": {},
405 |    "source": [
406 |     "当我们使用 `endpoint.client.post` 时，我们得到一个字节字符串。这有点繁琐，因为我们需要将这个字节字符串转换为一个 `np.array`，但这只是 Python 中的几行快速代码。"
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": 12,
412 |    "id": "e09253d5-70ff-4d0e-8888-0022ce0adf7b",
413 |    "metadata": {
414 |     "tags": []
415 |    },
416 |    "outputs": [
417 |     {
418 |      "data": {
419 |       "text/plain": [
420 |        "array([-0.05630935, -0.03560849,  0.02789049,  0.02792823, -0.02800371,\n",
421 |        "       -0.01530391, -0.01863454, -0.0077982 ,  0.05374297,  0.03672185,\n",
422 |        "       -0.06114018, -0.06880157, -0.0093503 , -0.03174005, -0.03206085,\n",
423 |        "        0.0610647 ,  0.02243694,  0.03217408,  0.04181686,  0.00248854])"
424 |       ]
425 |      },
426 |      "execution_count": 12,
427 |      "metadata": {},
428 |      "output_type": "execute_result"
429 |     }
430 |    ],
431 |    "source": [
432 |     "response = endpoint.client.post(json={\"inputs\": 'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music!', 'truncate': True}, task=\"feature-extraction\")\n",
433 |     "response = np.array(json.loads(response.decode()))\n",
434 |     "response[0][:20]"
435 |    ]
436 |   },
437 |   {
438 |    "cell_type": "markdown",
439 |    "id": "0d024788-6e6e-4a8d-b192-36ee3dacca13",
440 |    "metadata": {},
441 |    "source": [
442 |     "你可能遇到超过上下文长度的输入。在这种情况下，需要你来处理它们。在我的情况下，我更愿意截断而不是出现错误。让我们测试一下这是否有效。\n"
443 |    ]
444 |   },
445 |   {
446 |    "cell_type": "code",
447 |    "execution_count": 13,
448 |    "id": "a4a1cd15-dda3-4cfa-8bda-788d8c1b9e32",
449 |    "metadata": {
450 |     "tags": []
451 |    },
452 |    "outputs": [
453 |     {
454 |      "name": "stdout",
455 |      "output_type": "stream",
456 |      "text": [
457 |       "The length of the embedding_input is: 300000\n"
458 |      ]
459 |     },
460 |     {
461 |      "data": {
462 |       "text/plain": [
463 |        "array([-0.03088215, -0.0351537 ,  0.05749275,  0.00983467,  0.02108356,\n",
464 |        "        0.04539965,  0.06107162, -0.02536954,  0.03887688,  0.01998681,\n",
465 |        "       -0.05391388,  0.01529677, -0.1279156 ,  0.01653782, -0.01940958,\n",
466 |        "        0.0367411 ,  0.0031748 ,  0.04716022, -0.00713609, -0.00155313])"
467 |       ]
468 |      },
469 |      "execution_count": 13,
470 |      "metadata": {},
471 |      "output_type": "execute_result"
472 |     }
473 |    ],
474 |    "source": [
475 |     "embedding_input = 'This input will get multiplied' * 10000\n",
476 |     "print(f'The length of the embedding_input is: {len(embedding_input)}')\n",
477 |     "response = endpoint.client.post(json={\"inputs\": embedding_input, 'truncate': True}, task=\"feature-extraction\")\n",
478 |     "response = np.array(json.loads(response.decode()))\n",
479 |     "response[0][:20]"
480 |    ]
481 |   },
482 |   {
483 |    "cell_type": "markdown",
484 |    "id": "f7186126-ef6a-47d0-b158-112810649cd9",
485 |    "metadata": {},
486 |    "source": [
487 |     "# 获取嵌入"
488 |    ]
489 |   },
490 |   {
491 |    "cell_type": "markdown",
492 |    "id": "1dadfd68-6d46-4ce8-a165-bfeb43b1f114",
493 |    "metadata": {},
494 |    "source": [
495 |     "在这里，我发送一个文档，用嵌入更新它，然后返回它。这是与 `MAX_WORKERS` 并行的发生的。"
496 |    ]
497 |   },
498 |   {
499 |    "cell_type": "code",
500 |    "execution_count": 14,
501 |    "id": "ad3193fb-3def-42a8-968e-c63f2b864ca8",
502 |    "metadata": {
503 |     "tags": []
504 |    },
505 |    "outputs": [],
506 |    "source": [
507 |     "async def request(document, semaphore):\n",
508 |     "    # Semaphore guard\n",
509 |     "    async with semaphore:\n",
510 |     "        result = await endpoint.async_client.post(json={\"inputs\": document['content'], 'truncate': True}, task=\"feature-extraction\")\n",
511 |     "        result = np.array(json.loads(result.decode()))\n",
512 |     "        document['embedding'] = result[0]  # Assuming the API's output can be directly assigned\n",
513 |     "        return document\n",
514 |     "\n",
515 |     "async def main(documents):\n",
516 |     "    # Semaphore to limit concurrent requests. Adjust the number as needed.\n",
517 |     "    semaphore = asyncio.BoundedSemaphore(MAX_WORKERS)\n",
518 |     "\n",
519 |     "    # Creating a list of tasks\n",
520 |     "    tasks = [request(document, semaphore) for document in documents]\n",
521 |     "    \n",
522 |     "    # Using tqdm to show progress. It's been integrated into the async loop.\n",
523 |     "    for f in tqdm(asyncio.as_completed(tasks), total=len(documents)):\n",
524 |     "        await f"
525 |    ]
526 |   },
527 |   {
528 |    "cell_type": "code",
529 |    "execution_count": 15,
530 |    "id": "ec4983af-65eb-4841-808a-3738fb4d682d",
531 |    "metadata": {
532 |     "tags": []
533 |    },
534 |    "outputs": [
535 |     {
536 |      "data": {
537 |       "application/vnd.jupyter.widget-view+json": {
538 |        "model_id": "48a2affdee8d46f3b0c1f691eaac4b89",
539 |        "version_major": 2,
540 |        "version_minor": 0
541 |       },
542 |       "text/plain": [
543 |        "  0%|          | 0/100 [00:00<?, ?it/s]"
544 |       ]
545 |      },
546 |      "metadata": {},
547 |      "output_type": "display_data"
548 |     },
549 |     {
550 |      "name": "stdout",
551 |      "output_type": "stream",
552 |      "text": [
553 |       "Embeddings = 100 documents = 100\n",
554 |       "0 min 21.33 sec\n"
555 |      ]
556 |     }
557 |    ],
558 |    "source": [
559 |     "start = time.perf_counter()\n",
560 |     "\n",
561 |     "# Get embeddings\n",
562 |     "await main(documents)\n",
563 |     "\n",
564 |     "# Make sure we got it all\n",
565 |     "count = 0\n",
566 |     "for document in documents:\n",
567 |     "    if 'embedding' in document.keys() and len(document['embedding']) == 768:\n",
568 |     "        count += 1\n",
569 |     "print(f'Embeddings = {count} documents = {len(documents)}')\n",
570 |     "\n",
571 |     "            \n",
572 |     "# Print elapsed time\n",
573 |     "elapsed_time = time.perf_counter() - start\n",
574 |     "minutes, seconds = divmod(elapsed_time, 60)\n",
575 |     "print(f\"{int(minutes)} min {seconds:.2f} sec\")"
576 |    ]
577 |   },
578 |   {
579 |    "cell_type": "markdown",
580 |    "id": "bab97c7b-7bac-4bf5-9752-b528294dadc7",
581 |    "metadata": {},
582 |    "source": [
583 |     "## 暂停推理端点\n",
584 |     "\n",
585 |     "现在我们已经完成了嵌入，让我们暂停端点，以免产生任何额外费用，同时这也允许我们分析成本。"
586 |    ]
587 |   },
588 |   {
589 |    "cell_type": "code",
590 |    "execution_count": 16,
591 |    "id": "540a0978-7670-4ce3-95c1-3823cc113b85",
592 |    "metadata": {
593 |     "tags": []
594 |    },
595 |    "outputs": [
596 |     {
597 |      "name": "stdout",
598 |      "output_type": "stream",
599 |      "text": [
600 |       "Endpoint Status: paused\n"
601 |      ]
602 |     }
603 |    ],
604 |    "source": [
605 |     "endpoint = endpoint.pause()\n",
606 |     "\n",
607 |     "print(f\"Endpoint Status: {endpoint.status}\")"
608 |    ]
609 |   },
610 |   {
611 |    "cell_type": "markdown",
612 |    "id": "45ad65b7-3da2-4113-9b95-8fb4e21ae793",
613 |    "metadata": {},
614 |    "source": [
615 |     "# 将更新后的数据集推送到 Hub\n",
616 |     "现在我们的文档已经更新了我们想要的嵌入。首先我们需要将其转换回 `Dataset` 格式。我发现从字典列表 -> `pd.DataFrame` -> `Dataset` 这条路径最为简单。\n"
617 |    ]
618 |   },
619 |   {
620 |    "cell_type": "code",
621 |    "execution_count": 17,
622 |    "id": "9bb993f8-d624-4192-9626-8e9ed9888a1b",
623 |    "metadata": {
624 |     "tags": []
625 |    },
626 |    "outputs": [],
627 |    "source": [
628 |     "df = pd.DataFrame(documents)\n",
629 |     "dd = DatasetDict({'train': Dataset.from_pandas(df)})"
630 |    ]
631 |   },
632 |   {
633 |    "cell_type": "markdown",
634 |    "id": "129760c8-cae1-4b1e-8216-f5152df8c536",
635 |    "metadata": {},
636 |    "source": [
637 |     "我默认将其上传到用户的账户（而不是上传到组织），但你可以通过在 `repo_id` 中设置用户或在配置中通过设置 `DATASET_OUT` 来自由推送到任何你想要的地方。\n"
638 |    ]
639 |   },
640 |   {
641 |    "cell_type": "code",
642 |    "execution_count": 18,
643 |    "id": "f48e7c55-d5b7-4ed6-8516-272ae38716b1",
644 |    "metadata": {
645 |     "tags": []
646 |    },
647 |    "outputs": [
648 |     {
649 |      "data": {
650 |       "application/vnd.jupyter.widget-view+json": {
651 |        "model_id": "d3af2e864770481db5adc3968500b5d3",
652 |        "version_major": 2,
653 |        "version_minor": 0
654 |       },
655 |       "text/plain": [
656 |        "Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]"
657 |       ]
658 |      },
659 |      "metadata": {},
660 |      "output_type": "display_data"
661 |     },
662 |     {
663 |      "data": {
664 |       "application/vnd.jupyter.widget-view+json": {
665 |        "model_id": "4e063c42d8f4490c939bc64e626b507a",
666 |        "version_major": 2,
667 |        "version_minor": 0
668 |       },
669 |       "text/plain": [
670 |        "Downloading metadata:   0%|          | 0.00/823 [00:00<?, ?B/s]"
671 |       ]
672 |      },
673 |      "metadata": {},
674 |      "output_type": "display_data"
675 |     }
676 |    ],
677 |    "source": [
678 |     "dd.push_to_hub(repo_id=DATASET_OUT)"
679 |    ]
680 |   },
681 |   {
682 |    "cell_type": "code",
683 |    "execution_count": 19,
684 |    "id": "85ea2244-a4c6-4f04-b187-965a2fc356a8",
685 |    "metadata": {
686 |     "tags": []
687 |    },
688 |    "outputs": [
689 |     {
690 |      "name": "stdout",
691 |      "output_type": "stream",
692 |      "text": [
693 |       "Dataset is at https://huggingface.co/datasets/derek-thomas/processed-subset-bestofredditorupdates\n"
694 |      ]
695 |     }
696 |    ],
697 |    "source": [
698 |     "print(f'Dataset is at https://huggingface.co/datasets/{who[\"name\"]}/{DATASET_OUT}')"
699 |    ]
700 |   },
701 |   {
702 |    "cell_type": "markdown",
703 |    "id": "41abea64-379d-49de-8d9a-355c2f4ce1ac",
704 |    "metadata": {},
705 |    "source": [
706 |     "# 分析使用情况\n",
707 |     "1. 前往下面打印的 `dashboard_url`\n",
708 |     "2. 点击使用与成本 (Usage & Cost) 标签\n",
709 |     "3. 查看你已经花费了多少"
710 |    ]
711 |   },
712 |   {
713 |    "cell_type": "code",
714 |    "execution_count": 20,
715 |    "id": "16815445-3079-43da-b14e-b54176a07a62",
716 |    "metadata": {},
717 |    "outputs": [
718 |     {
719 |      "name": "stdout",
720 |      "output_type": "stream",
721 |      "text": [
722 |       "https://ui.endpoints.huggingface.co/HF-test-lab/endpoints/boru-jina-embeddings-demo-ie\n"
723 |      ]
724 |     }
725 |    ],
726 |    "source": [
727 |     "dashboard_url = f'https://ui.endpoints.huggingface.co/{namespace}/endpoints/{ENDPOINT_NAME}'\n",
728 |     "print(dashboard_url)"
729 |    ]
730 |   },
731 |   {
732 |    "cell_type": "code",
733 |    "execution_count": 21,
734 |    "id": "81096c6f-d12f-4781-84ec-9066cfa465b3",
735 |    "metadata": {},
736 |    "outputs": [
737 |     {
738 |      "name": "stdout",
739 |      "output_type": "stream",
740 |      "text": [
741 |       "Hit enter to continue with the notebook \n"
742 |      ]
743 |     },
744 |     {
745 |      "data": {
746 |       "text/plain": [
747 |        "''"
748 |       ]
749 |      },
750 |      "execution_count": 21,
751 |      "metadata": {},
752 |      "output_type": "execute_result"
753 |     }
754 |    ],
755 |    "source": [
756 |     "input(\"Hit enter to continue with the notebook\")"
757 |    ]
758 |   },
759 |   {
760 |    "cell_type": "markdown",
761 |    "id": "847d524e-9aa6-4a6f-a275-8a552e289818",
762 |    "metadata": {},
763 |    "source": [
764 |     "我们可以看到只花了 `$0.04` !\n"
765 |    ]
766 |   },
767 |   {
768 |    "cell_type": "markdown",
769 |    "id": "b953d5be-2494-4ff8-be42-9daf00c99c41",
770 |    "metadata": {},
771 |    "source": [
772 |     "\n",
773 |     "# 删除端点\n",
774 |     "现在我们已经完成了，不再需要我们的端点了。我们可以以编程方式删除端点。\n",
775 |     "\n",
776 |     "![Cost](https://huggingface.co/datasets/huggingface/cookbook-images/resolve/main/automatic_embedding_tei_inference_endpoints.png)"
777 |    ]
778 |   },
779 |   {
780 |    "cell_type": "code",
781 |    "execution_count": 22,
782 |    "id": "c310c0f3-6f12-4d5c-838b-3a4c1f2e54ad",
783 |    "metadata": {
784 |     "tags": []
785 |    },
786 |    "outputs": [
787 |     {
788 |      "name": "stdout",
789 |      "output_type": "stream",
790 |      "text": [
791 |       "Endpoint deleted successfully\n"
792 |      ]
793 |     }
794 |    ],
795 |    "source": [
796 |     "endpoint = endpoint.delete()\n",
797 |     "\n",
798 |     "if not endpoint:\n",
799 |     "    print('Endpoint deleted successfully')\n",
800 |     "else:\n",
801 |     "    print('Delete Endpoint in manually') "
802 |    ]
803 |   }
804 |  ],
805 |  "metadata": {
806 |   "kernelspec": {
807 |    "display_name": "Python 3 (ipykernel)",
808 |    "language": "python",
809 |    "name": "python3"
810 |   },
811 |   "language_info": {
812 |    "codemirror_mode": {
813 |     "name": "ipython",
814 |     "version": 3
815 |    },
816 |    "file_extension": ".py",
817 |    "mimetype": "text/x-python",
818 |    "name": "python",
819 |    "nbconvert_exporter": "python",
820 |    "pygments_lexer": "ipython3",
821 |    "version": "3.10.8"
822 |   }
823 |  },
824 |  "nbformat": 4,
825 |  "nbformat_minor": 5
826 | }
827 | 


--------------------------------------------------------------------------------
/notebooks/en/tgi_messages_api_demo.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Migrating from OpenAI to Open LLMs Using TGI's Messages API\n",
  8 |     "\n",
  9 |     "_Authored by: [Andrew Reed](https://huggingface.co/andrewrreed)_\n",
 10 |     "\n",
 11 |     "This notebook demonstrates how you can easily transition from OpenAI models to Open LLMs without needing to refactor any existing code.\n",
 12 |     "\n",
 13 |     "[Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference) now offers a [Messages API](https://huggingface.co/blog/tgi-messages-api), making it directly compatible with the OpenAI Chat Completion API. This means that any existing scripts that use OpenAI models (via the OpenAI client library or third-party tools like LangChain or LlamaIndex) can be directly swapped out to use any open LLM running on a TGI endpoint!\n",
 14 |     "\n",
 15 |     "This allows you to quickly test out and benefit from the numerous advantages offered by open models. Things like:\n",
 16 |     "\n",
 17 |     "- Complete control and transparency over models and data\n",
 18 |     "- No more worrying about rate limits\n",
 19 |     "- The ability to fully customize systems according to your specific needs\n",
 20 |     "\n",
 21 |     "In this notebook, we'll show you how to:\n",
 22 |     "\n",
 23 |     "1. [Create Inference Endpoint to Deploy a Model with TGI](#section_1)\n",
 24 |     "2. [Query the Inference Endpoint with OpenAI Client Libraries](#section_2)\n",
 25 |     "3. [Integrate the Endpoint with LangChain and LlamaIndex Workflows](#section_3)\n",
 26 |     "\n",
 27 |     "**Let's dive in!**\n"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Setup\n",
 35 |     "\n",
 36 |     "First we need to install dependencies and set an HF API key.\n"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": null,
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "!pip install --upgrade -q huggingface_hub langchain langchain-community langchainhub langchain-openai llama-index chromadb bs4 sentence_transformers torch"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": 2,
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "import os\n",
 55 |     "import getpass\n",
 56 |     "\n",
 57 |     "# enter API key\n",
 58 |     "os.environ[\"HUGGINGFACEHUB_API_TOKEN\"] = HF_API_KEY = getpass.getpass()"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "<a id=\"section_1\"></a>\n",
 66 |     "\n",
 67 |     "## 1. Create an Inference Endpoint\n",
 68 |     "\n",
 69 |     "To get started, let's deploy [Nous-Hermes-2-Mixtral-8x7B-DPO](https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO), a fine-tuned Mixtral model, to Inference Endpoints using TGI.\n",
 70 |     "\n",
 71 |     "We can deploy the model in just [a few clicks from the UI](https://ui.endpoints.huggingface.co/new?vendor=aws&repository=NousResearch%2FNous-Hermes-2-Mixtral-8x7B-DPO&tgi_max_total_tokens=32000&tgi=true&tgi_max_input_length=1024&task=text-generation&instance_size=2xlarge&tgi_max_batch_prefill_tokens=2048&tgi_max_batch_total_tokens=1024000&no_suggested_compute=true&accelerator=gpu&region=us-east-1), or take advantage of the `huggingface_hub` Python library to programmatically create and manage Inference Endpoints.\n",
 72 |     "\n",
 73 |     "We'll use the Hub library here by specifing an endpoint name and model repository, along with the task of `text-generation`. In this example, we use a `protected` type so access to the deployed model will require a valid Hugging Face token. We also need to configure the hardware requirements like vendor, region, accelerator, instance type, and size. You can check out the list of available resource options [using this API call](https://api.endpoints.huggingface.cloud/#get-/v2/provider), and view recommended configurations for select models in the catalog [here](https://ui.endpoints.huggingface.co/catalog).\n",
 74 |     "\n",
 75 |     "_Note: You may need to request a quota upgrade by sending an email to [api-enterprise@huggingface.co](mailto:api-enterprise@huggingface.co)_\n"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 4,
 81 |    "metadata": {},
 82 |    "outputs": [
 83 |     {
 84 |      "name": "stdout",
 85 |      "output_type": "stream",
 86 |      "text": [
 87 |       "running\n"
 88 |      ]
 89 |     }
 90 |    ],
 91 |    "source": [
 92 |     "from huggingface_hub import create_inference_endpoint\n",
 93 |     "\n",
 94 |     "endpoint = create_inference_endpoint(\n",
 95 |     "    \"nous-hermes-2-mixtral-8x7b-demo\",\n",
 96 |     "    repository=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n",
 97 |     "    framework=\"pytorch\",\n",
 98 |     "    task=\"text-generation\",\n",
 99 |     "    accelerator=\"gpu\",\n",
100 |     "    vendor=\"aws\",\n",
101 |     "    region=\"us-east-1\",\n",
102 |     "    type=\"protected\",\n",
103 |     "    instance_type=\"p4de\",\n",
104 |     "    instance_size=\"2xlarge\",\n",
105 |     "    custom_image={\n",
106 |     "        \"health_route\": \"/health\",\n",
107 |     "        \"env\": {\n",
108 |     "            \"MAX_INPUT_LENGTH\": \"4096\",\n",
109 |     "            \"MAX_BATCH_PREFILL_TOKENS\": \"4096\",\n",
110 |     "            \"MAX_TOTAL_TOKENS\": \"32000\",\n",
111 |     "            \"MAX_BATCH_TOTAL_TOKENS\": \"1024000\",\n",
112 |     "            \"MODEL_ID\": \"/repository\",\n",
113 |     "        },\n",
114 |     "        \"url\": \"ghcr.io/huggingface/text-generation-inference:sha-1734540\",  # must be >= 1.4.0\n",
115 |     "    },\n",
116 |     ")\n",
117 |     "\n",
118 |     "endpoint.wait()\n",
119 |     "print(endpoint.status)"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "markdown",
124 |    "metadata": {},
125 |    "source": [
126 |     "It will take a few minutes for our deployment to spin up. We can use the `.wait()` utility to block the running thread until the endpoint reaches a final \"running\" state. Once running, we can confirm its status and take it for a spin via the UI Playground:\n",
127 |     "\n",
128 |     "![IE UI Overview](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/messages-api/endpoint-overview.png)\n",
129 |     "\n",
130 |     "Great, we now have a working endpoint!\n",
131 |     "\n",
132 |     "_Note: When deploying with `huggingface_hub`, your endpoint will scale-to-zero after 15 minutes of idle time by default to optimize cost during periods of inactivity. Check out [the Hub Python Library documentation](https://huggingface.co/docs/huggingface_hub/guides/inference_endpoints) to see all the functionality available for managing your endpoint lifecycle._\n"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "markdown",
137 |    "metadata": {},
138 |    "source": [
139 |     "<a id=\"section_2\"></a>\n",
140 |     "\n",
141 |     "## 2. Query the Inference Endpoint with OpenAI Client Libraries\n",
142 |     "\n",
143 |     "As mentioned above, since our model is hosted with TGI it now supports a Messages API meaning we can query it directly using the familiar OpenAI client libraries.\n"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "### With the Python client\n",
151 |     "\n",
152 |     "The example below shows how to make this transition using the [OpenAI Python Library](https://github.com/openai/openai-python). Simply replace the `<ENDPOINT_URL>` with your endpoint URL (be sure to include the `v1/` the suffix) and populate the `<HF_API_KEY>` field with a valid Hugging Face user token. The `<ENDPOINT_URL>` can be gathered from Inference Endpoints UI, or from the endpoint object we created above with `endpoint.url`.\n",
153 |     "\n",
154 |     "We can then use the client as usual, passing a list of messages to stream responses from our Inference Endpoint.\n"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": 6,
160 |    "metadata": {},
161 |    "outputs": [
162 |     {
163 |      "name": "stdout",
164 |      "output_type": "stream",
165 |      "text": [
166 |       "Open-source software is important due to a number of reasons, including:\n",
167 |       "\n",
168 |       "1. Collaboration: The collaborative nature of open-source software allows developers from around the world to work together, share their ideas and improve the code. This often results in faster progress and better software.\n",
169 |       "\n",
170 |       "2. Transparency: With open-source software, the code is publicly available, making it easy to see exactly how the software functions, and allowing users to determine if there are any security vulnerabilities.\n",
171 |       "\n",
172 |       "3. Customization: Being able to access the code also allows users to customize the software to better suit their needs. This makes open-source software incredibly versatile, as users can tweak it to suit their specific use case.\n",
173 |       "\n",
174 |       "4. Quality: Open-source software is often developed by large communities of dedicated developers, who work together to improve the software. This results in a higher level of quality than might be found in proprietary software.\n",
175 |       "\n",
176 |       "5. Cost: Open-source software is often provided free of charge, which makes it accessible to a wider range of users. This can be especially important for organizations with limited budgets for software.\n",
177 |       "\n",
178 |       "6. Shared Benefit: By sharing the code of open-source software, everyone can benefit from the hard work of the developers. This contributes to the overall advancement of technology, as users and developers work together to improve and build upon the software.\n",
179 |       "\n",
180 |       "In summary, open-source software provides a collaborative platform that leads to high-quality, customizable, and transparent software, all available at little or no cost, benefiting both individuals and the technology community as a whole.<|im_end|>"
181 |      ]
182 |     }
183 |    ],
184 |    "source": [
185 |     "from openai import OpenAI\n",
186 |     "\n",
187 |     "BASE_URL = endpoint.url\n",
188 |     "\n",
189 |     "# init the client but point it to TGI\n",
190 |     "client = OpenAI(\n",
191 |     "    base_url=os.path.join(BASE_URL, \"v1/\"),\n",
192 |     "    api_key=HF_API_KEY,\n",
193 |     ")\n",
194 |     "chat_completion = client.chat.completions.create(\n",
195 |     "    model=\"tgi\",\n",
196 |     "    messages=[\n",
197 |     "        {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
198 |     "        {\"role\": \"user\", \"content\": \"Why is open-source software important?\"},\n",
199 |     "    ],\n",
200 |     "    stream=True,\n",
201 |     "    max_tokens=500,\n",
202 |     ")\n",
203 |     "\n",
204 |     "# iterate and print stream\n",
205 |     "for message in chat_completion:\n",
206 |     "    print(message.choices[0].delta.content, end=\"\")"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "markdown",
211 |    "metadata": {},
212 |    "source": [
213 |     "Behind the scenes, TGI’s Messages API automatically converts the list of messages into the model’s required instruction format using its [chat template](https://huggingface.co/docs/transformers/chat_templating).\n",
214 |     "\n",
215 |     "_Note: Certain OpenAI features, like function calling, are not compatible with TGI. Currently, the Messages API supports the following chat completion parameters: `stream`, `max_new_tokens`, `frequency_penalty`, `logprobs`, `seed`, `temperature`, and `top_p`._\n"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "markdown",
220 |    "metadata": {},
221 |    "source": [
222 |     "### With the JavaScript client\n",
223 |     "\n",
224 |     "Here’s the same streaming example above, but using the [OpenAI Javascript/Typescript Library](https://github.com/openai/openai-node).\n",
225 |     "\n",
226 |     "```js\n",
227 |     "import OpenAI from \"openai\";\n",
228 |     "\n",
229 |     "const openai = new OpenAI({\n",
230 |     "  baseURL: \"<ENDPOINT_URL>\" + \"/v1/\", // replace with your endpoint url\n",
231 |     "  apiKey: \"<HF_API_TOKEN>\", // replace with your token\n",
232 |     "});\n",
233 |     "\n",
234 |     "async function main() {\n",
235 |     "  const stream = await openai.chat.completions.create({\n",
236 |     "    model: \"tgi\",\n",
237 |     "    messages: [\n",
238 |     "      { role: \"system\", content: \"You are a helpful assistant.\" },\n",
239 |     "      { role: \"user\", content: \"Why is open-source software important?\" },\n",
240 |     "    ],\n",
241 |     "    stream: true,\n",
242 |     "    max_tokens: 500,\n",
243 |     "  });\n",
244 |     "  for await (const chunk of stream) {\n",
245 |     "    process.stdout.write(chunk.choices[0]?.delta?.content || \"\");\n",
246 |     "  }\n",
247 |     "}\n",
248 |     "\n",
249 |     "main();\n",
250 |     "```\n"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "markdown",
255 |    "metadata": {},
256 |    "source": [
257 |     "<a id=\"section_3\"></a>\n",
258 |     "\n",
259 |     "## 3. Integrate with LangChain and LlamaIndex\n",
260 |     "\n",
261 |     "Now, let’s see how to use this newly created endpoint with popular RAG frameworks like LangChain and LlamaIndex.\n"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "markdown",
266 |    "metadata": {},
267 |    "source": [
268 |     "### How to use with LangChain\n",
269 |     "\n",
270 |     "To use it in [LangChain](https://python.langchain.com/docs/get_started/introduction), simply create an instance of `ChatOpenAI` and pass your `<ENDPOINT_URL>` and `<HF_API_TOKEN>` as follows:\n"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": 7,
276 |    "metadata": {},
277 |    "outputs": [
278 |     {
279 |      "data": {
280 |       "text/plain": [
281 |        "AIMessage(content='Open-source software is important for several reasons:\\n\\n1. Transparency: Open-source software allows users to see the underlying code, making it easier to understand how the software works and identify any potential security vulnerabilities or bugs. This transparency fosters trust between users and developers.\\n\\n2. Collaboration: Open-source projects encourage collaboration among developers, allowing them to work together to improve the software, fix issues, and add new features. This collective effort can lead to')"
282 |       ]
283 |      },
284 |      "execution_count": 7,
285 |      "metadata": {},
286 |      "output_type": "execute_result"
287 |     }
288 |    ],
289 |    "source": [
290 |     "from langchain_openai import ChatOpenAI\n",
291 |     "\n",
292 |     "llm = ChatOpenAI(\n",
293 |     "    model_name=\"tgi\",\n",
294 |     "    openai_api_key=HF_API_KEY,\n",
295 |     "    openai_api_base=os.path.join(BASE_URL, \"v1/\"),\n",
296 |     ")\n",
297 |     "llm.invoke(\"Why is open-source software important?\")"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "markdown",
302 |    "metadata": {},
303 |    "source": [
304 |     "We’re able to directly leverage the same `ChatOpenAI` class that we would have used with the OpenAI models. This allows all previous code to work with our endpoint by changing just one line of code.\n",
305 |     "\n",
306 |     "Let’s now use our Mixtral model in a simple RAG pipeline to answer a question over the contents of a HF blog post.\n"
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "code",
311 |    "execution_count": 8,
312 |    "metadata": {},
313 |    "outputs": [
314 |     {
315 |      "data": {
316 |       "text/plain": [
317 |        "{'context': [Document(page_content='To overcome this weakness, amongst other approaches, one can integrate the LLM into a system where it can call tools: such a system is called an LLM agent.\\nIn this post, we explain the inner workings of ReAct agents, then show how to build them using the ChatHuggingFace class recently integrated in LangChain. Finally, we benchmark several open-source LLMs against GPT-3.5 and GPT-4.', metadata={'description': 'We’re on a journey to advance and democratize artificial intelligence through open source and open science.', 'language': 'No language found.', 'source': 'https://huggingface.co/blog/open-source-llms-as-agents', 'title': 'Open-source LLMs as LangChain Agents'}),\n",
318 |        "  Document(page_content='Since the open-source models were not specifically fine-tuned for calling functions in the given output format, they are at a slight disadvantage compared to the OpenAI agents.\\nDespite this, some models perform really well! 💪\\nHere’s an example of Mixtral-8x7B answering the question: “Which city has a larger population, Guiyang or Tacheng?”\\nThought: To answer this question, I need to find the current populations of both Guiyang and Tacheng. I will use the search tool to find this information.\\nAction:\\n{', metadata={'description': 'We’re on a journey to advance and democratize artificial intelligence through open source and open science.', 'language': 'No language found.', 'source': 'https://huggingface.co/blog/open-source-llms-as-agents', 'title': 'Open-source LLMs as LangChain Agents'}),\n",
319 |        "  Document(page_content='Agents Showdown: how do open-source LLMs perform as general purpose reasoning agents?\\n\\t\\n\\nYou can find the code for this benchmark here.\\n\\n\\n\\n\\n\\n\\t\\tEvaluation\\n\\t\\n\\nWe want to measure how open-source LLMs perform as general purpose reasoning agents. Thus we select questions requiring using logic and the use of basic tools: a calculator and access to internet search.\\nThe final dataset is a combination of samples from 3 other datasets:', metadata={'description': 'We’re on a journey to advance and democratize artificial intelligence through open source and open science.', 'language': 'No language found.', 'source': 'https://huggingface.co/blog/open-source-llms-as-agents', 'title': 'Open-source LLMs as LangChain Agents'}),\n",
320 |        "  Document(page_content='Open-source LLMs as LangChain Agents\\n\\t\\n\\nPublished\\n\\t\\t\\t\\tJanuary 24, 2024\\nUpdate on GitHub\\n\\nm-ric\\nAymeric Roucher\\n\\n\\n\\n\\nJofthomas\\nJoffrey THOMAS\\n\\n\\n\\n\\nandrewrreed\\nAndrew Reed\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\t\\tTL;DR\\n\\t\\n\\nOpen-source LLMs have now reached a performance level that makes them suitable reasoning engines for powering agent workflows: Mixtral even surpasses GPT-3.5 on our benchmark, and its performance could easily be further enhanced with fine-tuning.\\n\\n\\n\\n\\n\\n\\t\\tIntroduction', metadata={'description': 'We’re on a journey to advance and democratize artificial intelligence through open source and open science.', 'language': 'No language found.', 'source': 'https://huggingface.co/blog/open-source-llms-as-agents', 'title': 'Open-source LLMs as LangChain Agents'})],\n",
321 |        " 'question': 'According to this article which open-source model is the best for an agent behaviour?',\n",
322 |        " 'answer': 'According to the article, Mixtral-8x7B is an open-source LLM that performs really well as a general-purpose reasoning agent. It even surpasses GPT-3.5 on the benchmark in the article.'}"
323 |       ]
324 |      },
325 |      "execution_count": 8,
326 |      "metadata": {},
327 |      "output_type": "execute_result"
328 |     }
329 |    ],
330 |    "source": [
331 |     "from langchain import hub\n",
332 |     "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
333 |     "from langchain_community.document_loaders import WebBaseLoader\n",
334 |     "from langchain_community.vectorstores import Chroma\n",
335 |     "from langchain_core.output_parsers import StrOutputParser\n",
336 |     "from langchain_core.runnables import RunnablePassthrough\n",
337 |     "from langchain_core.runnables import RunnableParallel\n",
338 |     "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
339 |     "\n",
340 |     "# Load, chunk and index the contents of the blog\n",
341 |     "loader = WebBaseLoader(\n",
342 |     "    web_paths=(\"https://huggingface.co/blog/open-source-llms-as-agents\",),\n",
343 |     ")\n",
344 |     "docs = loader.load()\n",
345 |     "\n",
346 |     "# declare an HF embedding model\n",
347 |     "hf_embeddings = HuggingFaceEmbeddings(model_name=\"BAAI/bge-large-en-v1.5\")\n",
348 |     "\n",
349 |     "text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=200)\n",
350 |     "splits = text_splitter.split_documents(docs)\n",
351 |     "vectorstore = Chroma.from_documents(documents=splits, embedding=hf_embeddings)\n",
352 |     "\n",
353 |     "# Retrieve and generate using the relevant snippets of the blog\n",
354 |     "retriever = vectorstore.as_retriever()\n",
355 |     "prompt = hub.pull(\"rlm/rag-prompt\")\n",
356 |     "\n",
357 |     "\n",
358 |     "def format_docs(docs):\n",
359 |     "    return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
360 |     "\n",
361 |     "\n",
362 |     "rag_chain_from_docs = (\n",
363 |     "    RunnablePassthrough.assign(context=(lambda x: format_docs(x[\"context\"])))\n",
364 |     "    | prompt\n",
365 |     "    | llm\n",
366 |     "    | StrOutputParser()\n",
367 |     ")\n",
368 |     "\n",
369 |     "rag_chain_with_source = RunnableParallel(\n",
370 |     "    {\"context\": retriever, \"question\": RunnablePassthrough()}\n",
371 |     ").assign(answer=rag_chain_from_docs)\n",
372 |     "\n",
373 |     "rag_chain_with_source.invoke(\n",
374 |     "    \"According to this article which open-source model is the best for an agent behaviour?\"\n",
375 |     ")"
376 |    ]
377 |   },
378 |   {
379 |    "cell_type": "markdown",
380 |    "metadata": {},
381 |    "source": [
382 |     "### How to use with LlamaIndex\n",
383 |     "\n",
384 |     "Similarly, you can also use a TGI endpoint in [LlamaIndex](https://www.llamaindex.ai/). We’ll use the `OpenAILike` class, and instantiate it by configuring some additional arguments (i.e. `is_local`, `is_function_calling_model`, `is_chat_model`, `context_window`).\n",
385 |     "\n",
386 |     "_Note: that the context window argument should match the value previously set for `MAX_TOTAL_TOKENS` of your endpoint._\n"
387 |    ]
388 |   },
389 |   {
390 |    "cell_type": "code",
391 |    "execution_count": 9,
392 |    "metadata": {},
393 |    "outputs": [
394 |     {
395 |      "data": {
396 |       "text/plain": [
397 |        "CompletionResponse(text='Open-source software is important for several reasons:\\n\\n1. Transparency: Open-source software allows users to see the source code, which means they can understand how the software works and how it processes data. This transparency helps build trust in the software and its developers.\\n\\n2. Collaboration: Open-source software encourages collaboration among developers, who can contribute to the code, fix bugs, and add new features. This collaborative approach often leads to faster development and', additional_kwargs={}, raw={'id': '', 'choices': [Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='Open-source software is important for several reasons:\\n\\n1. Transparency: Open-source software allows users to see the source code, which means they can understand how the software works and how it processes data. This transparency helps build trust in the software and its developers.\\n\\n2. Collaboration: Open-source software encourages collaboration among developers, who can contribute to the code, fix bugs, and add new features. This collaborative approach often leads to faster development and', role='assistant', function_call=None, tool_calls=None))], 'created': 1707342025, 'model': '/repository', 'object': 'text_completion', 'system_fingerprint': '1.4.0-sha-1734540', 'usage': CompletionUsage(completion_tokens=100, prompt_tokens=18, total_tokens=118)}, delta=None)"
398 |       ]
399 |      },
400 |      "execution_count": 9,
401 |      "metadata": {},
402 |      "output_type": "execute_result"
403 |     }
404 |    ],
405 |    "source": [
406 |     "from llama_index.llms import OpenAILike\n",
407 |     "\n",
408 |     "llm = OpenAILike(\n",
409 |     "    model=\"tgi\",\n",
410 |     "    api_key=HF_API_KEY,\n",
411 |     "    api_base=BASE_URL + \"/v1/\",\n",
412 |     "    is_chat_model=True,\n",
413 |     "    is_local=False,\n",
414 |     "    is_function_calling_model=False,\n",
415 |     "    context_window=4096,\n",
416 |     ")\n",
417 |     "\n",
418 |     "llm.complete(\"Why is open-source software important?\")"
419 |    ]
420 |   },
421 |   {
422 |    "cell_type": "markdown",
423 |    "metadata": {},
424 |    "source": [
425 |     "We can now use it in a similar RAG pipeline. Keep in mind that the previous choice of `MAX_INPUT_LENGTH` in your Inference Endpoint will directly influence the number of retrieved chunk (`similarity_top_k`) the model can process.\n"
426 |    ]
427 |   },
428 |   {
429 |    "cell_type": "code",
430 |    "execution_count": null,
431 |    "metadata": {},
432 |    "outputs": [],
433 |    "source": [
434 |     "from llama_index import (\n",
435 |     "    ServiceContext,\n",
436 |     "    VectorStoreIndex,\n",
437 |     ")\n",
438 |     "from llama_index import download_loader\n",
439 |     "from llama_index.embeddings import HuggingFaceEmbedding\n",
440 |     "from llama_index.query_engine import CitationQueryEngine\n",
441 |     "\n",
442 |     "\n",
443 |     "SimpleWebPageReader = download_loader(\"SimpleWebPageReader\")\n",
444 |     "\n",
445 |     "documents = SimpleWebPageReader(html_to_text=True).load_data(\n",
446 |     "    [\"https://huggingface.co/blog/open-source-llms-as-agents\"]\n",
447 |     ")\n",
448 |     "\n",
449 |     "# Load embedding model\n",
450 |     "embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-large-en-v1.5\")\n",
451 |     "\n",
452 |     "# Pass LLM to pipeline\n",
453 |     "service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)\n",
454 |     "index = VectorStoreIndex.from_documents(\n",
455 |     "    documents, service_context=service_context, show_progress=True\n",
456 |     ")\n",
457 |     "\n",
458 |     "# Query the index\n",
459 |     "query_engine = CitationQueryEngine.from_args(\n",
460 |     "    index,\n",
461 |     "    similarity_top_k=2,\n",
462 |     ")\n",
463 |     "response = query_engine.query(\n",
464 |     "    \"According to this article which open-source model is the best for an agent behaviour?\"\n",
465 |     ")\n",
466 |     "\n",
467 |     "response.response"
468 |    ]
469 |   },
470 |   {
471 |    "cell_type": "markdown",
472 |    "metadata": {},
473 |    "source": [
474 |     "## Wrap up\n",
475 |     "\n",
476 |     "After you are done with your endpoint, you can either pause or delete it. This step can be completed via the UI, or programmatically like follows.\n"
477 |    ]
478 |   },
479 |   {
480 |    "cell_type": "code",
481 |    "execution_count": null,
482 |    "metadata": {},
483 |    "outputs": [],
484 |    "source": [
485 |     "# pause our running endpoint\n",
486 |     "endpoint.pause()\n",
487 |     "\n",
488 |     "# optionally delete\n",
489 |     "# endpoint.delete()"
490 |    ]
491 |   }
492 |  ],
493 |  "metadata": {
494 |   "kernelspec": {
495 |    "display_name": ".venv",
496 |    "language": "python",
497 |    "name": "python3"
498 |   },
499 |   "language_info": {
500 |    "codemirror_mode": {
501 |     "name": "ipython",
502 |     "version": 3
503 |    },
504 |    "file_extension": ".py",
505 |    "mimetype": "text/x-python",
506 |    "name": "python",
507 |    "nbconvert_exporter": "python",
508 |    "pygments_lexer": "ipython3",
509 |    "version": "3.10.11"
510 |   }
511 |  },
512 |  "nbformat": 4,
513 |  "nbformat_minor": 2
514 | }
515 | 


--------------------------------------------------------------------------------
/notebooks/en/automatic_embedding_tei_inference_endpoints.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "5d9aca72-957a-4ee2-862f-e011b9cd3a62",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# How to use Inference Endpoints to Embed Documents\n",
  9 |     "\n",
 10 |     "_Authored by: [Derek Thomas](https://huggingface.co/derek-thomas)_\n",
 11 |     "\n",
 12 |     "## Goal\n",
 13 |     "I have a dataset I want to embed for semantic search (or QA, or RAG), I want the easiest way to do embed this and put it in a new dataset.\n",
 14 |     "\n",
 15 |     "## Approach\n",
 16 |     "I'm using a dataset from my favorite subreddit [r/bestofredditorupdates](https://www.reddit.com/r/bestofredditorupdates/). Because it has long entries, I will use the new [jinaai/jina-embeddings-v2-base-en](https://huggingface.co/jinaai/jina-embeddings-v2-base-en) since it has an 8k context length. I will deploy this using [Inference Endpoint](https://huggingface.co/inference-endpoints) to save time and money. To follow this tutorial, you will need to **have already added a payment method**. If you haven't, you can add one here in [billing](https://huggingface.co/docs/hub/billing#billing). To make it even easier, I'll make this fully API based.\n",
 17 |     "\n",
 18 |     "To make this MUCH faster I will use the [Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference) image. This has many benefits like:\n",
 19 |     "- No model graph compilation step\n",
 20 |     "- Small docker images and fast boot times. Get ready for true serverless!\n",
 21 |     "- Token based dynamic batching\n",
 22 |     "- Optimized transformers code for inference using Flash Attention, Candle and cuBLASLt\n",
 23 |     "- Safetensors weight loading\n",
 24 |     "- Production ready (distributed tracing with Open Telemetry, Prometheus metrics)\n",
 25 |     "\n",
 26 |     "![img](https://media.githubusercontent.com/media/huggingface/text-embeddings-inference/main/assets/bs1-tp.png)"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "markdown",
 31 |    "id": "3c830114-dd88-45a9-81b9-78b0e3da7384",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Requirements"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "id": "35386f72-32cb-49fa-a108-3aa504e20429",
 41 |    "metadata": {
 42 |     "tags": []
 43 |    },
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "!pip install -q aiohttp==3.8.3 datasets==2.14.6 pandas==1.5.3 requests==2.31.0 tqdm==4.66.1 huggingface-hub>=0.20"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "markdown",
 51 |    "id": "b6f72042-173d-4a72-ade1-9304b43b528d",
 52 |    "metadata": {},
 53 |    "source": [
 54 |     "## Imports"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 3,
 60 |    "id": "e2beecdd-d033-4736-bd45-6754ec53b4ac",
 61 |    "metadata": {
 62 |     "tags": []
 63 |    },
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "import asyncio\n",
 67 |     "from getpass import getpass\n",
 68 |     "import json\n",
 69 |     "from pathlib import Path\n",
 70 |     "import time\n",
 71 |     "from typing import Optional\n",
 72 |     "\n",
 73 |     "from aiohttp import ClientSession, ClientTimeout\n",
 74 |     "from datasets import load_dataset, Dataset, DatasetDict\n",
 75 |     "from huggingface_hub import notebook_login, create_inference_endpoint, list_inference_endpoints, whoami\n",
 76 |     "import numpy as np\n",
 77 |     "import pandas as pd\n",
 78 |     "import requests\n",
 79 |     "from tqdm.auto import tqdm"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "markdown",
 84 |    "id": "5eece903-64ce-435d-a2fd-096c0ff650bf",
 85 |    "metadata": {},
 86 |    "source": [
 87 |     "## Config\n",
 88 |     "`DATASET_IN` is where your text data is\n",
 89 |     "`DATASET_OUT` is where your embeddings will be stored\n",
 90 |     "\n",
 91 |     "Note I used 5 for the `MAX_WORKERS` since `jina-embeddings-v2` are quite memory hungry. "
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 4,
 97 |    "id": "df2f79f0-9f28-46e6-9fc7-27e9537ff5be",
 98 |    "metadata": {
 99 |     "tags": []
100 |    },
101 |    "outputs": [],
102 |    "source": [
103 |     "DATASET_IN = 'derek-thomas/dataset-creator-reddit-bestofredditorupdates'\n",
104 |     "DATASET_OUT = \"processed-subset-bestofredditorupdates\"\n",
105 |     "ENDPOINT_NAME = \"boru-jina-embeddings-demo-ie\"\n",
106 |     "\n",
107 |     "MAX_WORKERS = 5  # This is for how many async workers you want. Choose based on the model and hardware \n",
108 |     "ROW_COUNT = 100  # Choose None to use all rows, Im using 100 just for a demo"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "markdown",
113 |    "id": "1e680f3d-4900-46cc-8b49-bb6ba3e27e2b",
114 |    "metadata": {},
115 |    "source": [
116 |     "Hugging Face offers a number of GPUs that you can choose from a number of GPUs that you can choose in Inference Endpoints. Here they are in table form:\n",
117 |     "\n",
118 |     "| GPU                 | instanceType   | instanceSize | vRAM  |\n",
119 |     "|---------------------|----------------|--------------|-------|\n",
120 |     "| 1x Nvidia Tesla T4 | g4dn.xlarge | small | 16GB |\n",
121 |     "| 4x Nvidia Tesla T4 | g4dn.12xlarge | large | 64GB |\n",
122 |     "| 1x Nvidia A10G | g5.2xlarge | medium | 24GB |\n",
123 |     "| 4x Nvidia A10G | g5.12xlarge | xxlarge | 96GB |\n",
124 |     "| 1x Nvidia A100* | p4de | xlarge | 80GB |\n",
125 |     "| 2x Nvidia A100* | p4de | 2xlarge | 160GB |\n",
126 |     "\n",
127 |     "\\*Note that for A100s you might get a note to email us to get access."
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 4,
133 |    "id": "3c2106c1-2e5a-443a-9ea8-a3cd0e9c5a94",
134 |    "metadata": {
135 |     "tags": []
136 |    },
137 |    "outputs": [],
138 |    "source": [
139 |     "# GPU Choice\n",
140 |     "VENDOR=\"aws\"\n",
141 |     "REGION=\"us-east-1\"\n",
142 |     "INSTANCE_SIZE=\"medium\"\n",
143 |     "INSTANCE_TYPE=\"g5.2xlarge\""
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": 5,
149 |    "id": "0ca1140c-3fcc-4b99-9210-6da1505a27b7",
150 |    "metadata": {
151 |     "tags": []
152 |    },
153 |    "outputs": [
154 |     {
155 |      "data": {
156 |       "application/vnd.jupyter.widget-view+json": {
157 |        "model_id": "ee80821056e147fa9cabf30f64dc85a8",
158 |        "version_major": 2,
159 |        "version_minor": 0
160 |       },
161 |       "text/plain": [
162 |        "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
163 |       ]
164 |      },
165 |      "metadata": {},
166 |      "output_type": "display_data"
167 |     }
168 |    ],
169 |    "source": [
170 |     "notebook_login()"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "markdown",
175 |    "id": "5f4ba0a8-0a6c-4705-a73b-7be09b889610",
176 |    "metadata": {},
177 |    "source": [
178 |     "Some users might have payment registered in an organization. This allows you to connect to an organization (that you are a member of) with a payment method.\n",
179 |     "\n",
180 |     "Leave it blank is you want to use your username."
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": 6,
186 |    "id": "88cdbd73-5923-4ae9-9940-b6be935f70fa",
187 |    "metadata": {
188 |     "tags": []
189 |    },
190 |    "outputs": [
191 |     {
192 |      "name": "stdin",
193 |      "output_type": "stream",
194 |      "text": [
195 |       "What is your Hugging Face 🤗 username or organization? (with an added payment method) ········\n"
196 |      ]
197 |     }
198 |    ],
199 |    "source": [
200 |     "who = whoami()\n",
201 |     "organization = getpass(prompt=\"What is your Hugging Face 🤗 username or organization? (with an added payment method)\")\n",
202 |     "\n",
203 |     "namespace = organization or who['name']"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "markdown",
208 |    "id": "b972a719-2aed-4d2e-a24f-fae7776d5fa4",
209 |    "metadata": {},
210 |    "source": [
211 |     "## Get Dataset"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": 7,
217 |    "id": "27835fa4-3a4f-44b1-a02a-5e31584a1bba",
218 |    "metadata": {
219 |     "tags": []
220 |    },
221 |    "outputs": [
222 |     {
223 |      "data": {
224 |       "application/vnd.jupyter.widget-view+json": {
225 |        "model_id": "4041cedd3b3f4f8db3e29ec102f46a3a",
226 |        "version_major": 2,
227 |        "version_minor": 0
228 |       },
229 |       "text/plain": [
230 |        "Downloading readme:   0%|          | 0.00/1.73k [00:00<?, ?B/s]"
231 |       ]
232 |      },
233 |      "metadata": {},
234 |      "output_type": "display_data"
235 |     },
236 |     {
237 |      "data": {
238 |       "text/plain": [
239 |        "Dataset({\n",
240 |        "    features: ['id', 'content', 'score', 'date_utc', 'title', 'flair', 'poster', 'permalink', 'new', 'updated'],\n",
241 |        "    num_rows: 10042\n",
242 |        "})"
243 |       ]
244 |      },
245 |      "execution_count": 7,
246 |      "metadata": {},
247 |      "output_type": "execute_result"
248 |     }
249 |    ],
250 |    "source": [
251 |     "dataset = load_dataset(DATASET_IN)\n",
252 |     "dataset['train']"
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": 8,
258 |    "id": "8846087e-4d0d-4c0e-8aeb-ea95d9e97126",
259 |    "metadata": {
260 |     "tags": []
261 |    },
262 |    "outputs": [
263 |     {
264 |      "data": {
265 |       "text/plain": [
266 |        "(100,\n",
267 |        " {'id': '10004zw',\n",
268 |        "  'content': '[removed]',\n",
269 |        "  'score': 1,\n",
270 |        "  'date_utc': Timestamp('2022-12-31 18:16:22'),\n",
271 |        "  'title': 'To All BORU contributors, Thank you :)',\n",
272 |        "  'flair': 'CONCLUDED',\n",
273 |        "  'poster': 'IsItAcOnSeQuEnCe',\n",
274 |        "  'permalink': '/r/BestofRedditorUpdates/comments/10004zw/to_all_boru_contributors_thank_you/',\n",
275 |        "  'new': False,\n",
276 |        "  'updated': False})"
277 |       ]
278 |      },
279 |      "execution_count": 8,
280 |      "metadata": {},
281 |      "output_type": "execute_result"
282 |     }
283 |    ],
284 |    "source": [
285 |     "documents = dataset['train'].to_pandas().to_dict('records')[:ROW_COUNT]\n",
286 |     "len(documents), documents[0]"
287 |    ]
288 |   },
289 |   {
290 |    "cell_type": "markdown",
291 |    "id": "93096cbc-81c6-4137-a283-6afb0f48fbb9",
292 |    "metadata": {},
293 |    "source": [
294 |     "# Inference Endpoints\n",
295 |     "## Create Inference Endpoint\n",
296 |     "We are going to use the [API](https://huggingface.co/docs/inference-endpoints/api_reference) to create an [Inference Endpoint](https://huggingface.co/inference-endpoints). This should provide a few main benefits:\n",
297 |     "- It's convenient (No clicking)\n",
298 |     "- It's repeatable (We have the code to run it easily)\n",
299 |     "- It's cheaper (No time spent waiting for it to load, and automatically shut it down)\n",
300 |     "\n"
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": 9,
306 |    "id": "9e59de46-26b7-4bb9-bbad-8bba9931bde7",
307 |    "metadata": {
308 |     "tags": []
309 |    },
310 |    "outputs": [],
311 |    "source": [
312 |     "try:\n",
313 |     "    endpoint = create_inference_endpoint(\n",
314 |     "        ENDPOINT_NAME,\n",
315 |     "        repository=\"jinaai/jina-embeddings-v2-base-en\",\n",
316 |     "        revision=\"7302ac470bed880590f9344bfeee32ff8722d0e5\",\n",
317 |     "        task=\"sentence-embeddings\",\n",
318 |     "        framework=\"pytorch\",\n",
319 |     "        accelerator=\"gpu\",\n",
320 |     "        instance_size=INSTANCE_SIZE,\n",
321 |     "        instance_type=INSTANCE_TYPE,\n",
322 |     "        region=REGION,\n",
323 |     "        vendor=VENDOR,\n",
324 |     "        namespace=namespace,\n",
325 |     "        custom_image={\n",
326 |     "            \"health_route\": \"/health\",\n",
327 |     "            \"env\": {\n",
328 |     "                \"MAX_BATCH_TOKENS\": str(MAX_WORKERS * 2048),\n",
329 |     "                \"MAX_CONCURRENT_REQUESTS\": \"512\",\n",
330 |     "                \"MODEL_ID\": \"/repository\"\n",
331 |     "            },\n",
332 |     "            \"url\": \"ghcr.io/huggingface/text-embeddings-inference:0.5.0\",\n",
333 |     "        },\n",
334 |     "        type=\"protected\",\n",
335 |     "    )\n",
336 |     "except:\n",
337 |     "    endpoint = [ie for ie in list_inference_endpoints(namespace=namespace) if ie.name == ENDPOINT_NAME][0]\n",
338 |     "    print('Loaded endpoint')"
339 |    ]
340 |   },
341 |   {
342 |    "cell_type": "markdown",
343 |    "id": "0f2c97dc-34e8-49e9-b60e-f5b7366294c0",
344 |    "metadata": {},
345 |    "source": [
346 |     "There are a few design choices here:\n",
347 |     "- As discussed before we are using `jinaai/jina-embeddings-v2-base-en` as our model. \n",
348 |     "    - For reproducibility we are pinning it to a specific revision.\n",
349 |     "- If you are interested in more models, check out the supported list [here](https://huggingface.co/docs/text-embeddings-inference/supported_models). \n",
350 |     "    - Note that most embedding models are based on the BERT architecture.\n",
351 |     "- `MAX_BATCH_TOKENS` is chosen based on our number of workers and the context window of our embedding model.\n",
352 |     "- `type=\"protected\"` utilized the security from Inference Endpoints detailed here.\n",
353 |     "- I'm using **1x Nvidia A10** since `jina-embeddings-v2` is memory hungry (remember the 8k context length). \n",
354 |     "- You should consider further tuning `MAX_BATCH_TOKENS` and `MAX_CONCURRENT_REQUESTS` if you have high workloads\n"
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "markdown",
359 |    "id": "96d173b2-8980-4554-9039-c62843d3fc7d",
360 |    "metadata": {},
361 |    "source": [
362 |     "## Wait until it's running"
363 |    ]
364 |   },
365 |   {
366 |    "cell_type": "code",
367 |    "execution_count": 10,
368 |    "id": "5f3a8bd2-753c-49a8-9452-899578beddc5",
369 |    "metadata": {
370 |     "tags": []
371 |    },
372 |    "outputs": [
373 |     {
374 |      "name": "stdout",
375 |      "output_type": "stream",
376 |      "text": [
377 |       "CPU times: user 48.1 ms, sys: 15.7 ms, total: 63.8 ms\n",
378 |       "Wall time: 52.6 s\n"
379 |      ]
380 |     },
381 |     {
382 |      "data": {
383 |       "text/plain": [
384 |        "InferenceEndpoint(name='boru-jina-embeddings-demo-ie', namespace='HF-test-lab', repository='jinaai/jina-embeddings-v2-base-en', status='running', url='https://k7l1xeok1jwnpbx5.us-east-1.aws.endpoints.huggingface.cloud')"
385 |       ]
386 |      },
387 |      "execution_count": 10,
388 |      "metadata": {},
389 |      "output_type": "execute_result"
390 |     }
391 |    ],
392 |    "source": [
393 |     "%%time\n",
394 |     "endpoint.wait()"
395 |    ]
396 |   },
397 |   {
398 |    "cell_type": "markdown",
399 |    "id": "a906645e-60de-4eb6-b8b6-3ec98a9d9b00",
400 |    "metadata": {},
401 |    "source": [
402 |     "When we use `endpoint.client.post` we get a bytes string back. This is a little tedious because we need to convert this to an `np.array`, but it's just a couple quick lines in python."
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": 12,
408 |    "id": "e09253d5-70ff-4d0e-8888-0022ce0adf7b",
409 |    "metadata": {
410 |     "tags": []
411 |    },
412 |    "outputs": [
413 |     {
414 |      "data": {
415 |       "text/plain": [
416 |        "array([-0.05630935, -0.03560849,  0.02789049,  0.02792823, -0.02800371,\n",
417 |        "       -0.01530391, -0.01863454, -0.0077982 ,  0.05374297,  0.03672185,\n",
418 |        "       -0.06114018, -0.06880157, -0.0093503 , -0.03174005, -0.03206085,\n",
419 |        "        0.0610647 ,  0.02243694,  0.03217408,  0.04181686,  0.00248854])"
420 |       ]
421 |      },
422 |      "execution_count": 12,
423 |      "metadata": {},
424 |      "output_type": "execute_result"
425 |     }
426 |    ],
427 |    "source": [
428 |     "response = endpoint.client.post(json={\"inputs\": 'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music!', 'truncate': True}, task=\"feature-extraction\")\n",
429 |     "response = np.array(json.loads(response.decode()))\n",
430 |     "response[0][:20]"
431 |    ]
432 |   },
433 |   {
434 |    "cell_type": "markdown",
435 |    "id": "0d024788-6e6e-4a8d-b192-36ee3dacca13",
436 |    "metadata": {},
437 |    "source": [
438 |     "You may have inputs that exceed the context. In such scenarios, it's up to you to handle them. In my case, I'd like to truncate rather than have an error. Let's test that it works."
439 |    ]
440 |   },
441 |   {
442 |    "cell_type": "code",
443 |    "execution_count": 13,
444 |    "id": "a4a1cd15-dda3-4cfa-8bda-788d8c1b9e32",
445 |    "metadata": {
446 |     "tags": []
447 |    },
448 |    "outputs": [
449 |     {
450 |      "name": "stdout",
451 |      "output_type": "stream",
452 |      "text": [
453 |       "The length of the embedding_input is: 300000\n"
454 |      ]
455 |     },
456 |     {
457 |      "data": {
458 |       "text/plain": [
459 |        "array([-0.03088215, -0.0351537 ,  0.05749275,  0.00983467,  0.02108356,\n",
460 |        "        0.04539965,  0.06107162, -0.02536954,  0.03887688,  0.01998681,\n",
461 |        "       -0.05391388,  0.01529677, -0.1279156 ,  0.01653782, -0.01940958,\n",
462 |        "        0.0367411 ,  0.0031748 ,  0.04716022, -0.00713609, -0.00155313])"
463 |       ]
464 |      },
465 |      "execution_count": 13,
466 |      "metadata": {},
467 |      "output_type": "execute_result"
468 |     }
469 |    ],
470 |    "source": [
471 |     "embedding_input = 'This input will get multiplied' * 10000\n",
472 |     "print(f'The length of the embedding_input is: {len(embedding_input)}')\n",
473 |     "response = endpoint.client.post(json={\"inputs\": embedding_input, 'truncate': True}, task=\"feature-extraction\")\n",
474 |     "response = np.array(json.loads(response.decode()))\n",
475 |     "response[0][:20]"
476 |    ]
477 |   },
478 |   {
479 |    "cell_type": "markdown",
480 |    "id": "f7186126-ef6a-47d0-b158-112810649cd9",
481 |    "metadata": {},
482 |    "source": [
483 |     "# Get Embeddings"
484 |    ]
485 |   },
486 |   {
487 |    "cell_type": "markdown",
488 |    "id": "1dadfd68-6d46-4ce8-a165-bfeb43b1f114",
489 |    "metadata": {},
490 |    "source": [
491 |     "Here I send a document, update it with the embedding, and return it. This happens in parallel with `MAX_WORKERS`."
492 |    ]
493 |   },
494 |   {
495 |    "cell_type": "code",
496 |    "execution_count": 14,
497 |    "id": "ad3193fb-3def-42a8-968e-c63f2b864ca8",
498 |    "metadata": {
499 |     "tags": []
500 |    },
501 |    "outputs": [],
502 |    "source": [
503 |     "async def request(document, semaphore):\n",
504 |     "    # Semaphore guard\n",
505 |     "    async with semaphore:\n",
506 |     "        result = await endpoint.async_client.post(json={\"inputs\": document['content'], 'truncate': True}, task=\"feature-extraction\")\n",
507 |     "        result = np.array(json.loads(result.decode()))\n",
508 |     "        document['embedding'] = result[0]  # Assuming the API's output can be directly assigned\n",
509 |     "        return document\n",
510 |     "\n",
511 |     "async def main(documents):\n",
512 |     "    # Semaphore to limit concurrent requests. Adjust the number as needed.\n",
513 |     "    semaphore = asyncio.BoundedSemaphore(MAX_WORKERS)\n",
514 |     "\n",
515 |     "    # Creating a list of tasks\n",
516 |     "    tasks = [request(document, semaphore) for document in documents]\n",
517 |     "    \n",
518 |     "    # Using tqdm to show progress. It's been integrated into the async loop.\n",
519 |     "    for f in tqdm(asyncio.as_completed(tasks), total=len(documents)):\n",
520 |     "        await f"
521 |    ]
522 |   },
523 |   {
524 |    "cell_type": "code",
525 |    "execution_count": 15,
526 |    "id": "ec4983af-65eb-4841-808a-3738fb4d682d",
527 |    "metadata": {
528 |     "tags": []
529 |    },
530 |    "outputs": [
531 |     {
532 |      "data": {
533 |       "application/vnd.jupyter.widget-view+json": {
534 |        "model_id": "48a2affdee8d46f3b0c1f691eaac4b89",
535 |        "version_major": 2,
536 |        "version_minor": 0
537 |       },
538 |       "text/plain": [
539 |        "  0%|          | 0/100 [00:00<?, ?it/s]"
540 |       ]
541 |      },
542 |      "metadata": {},
543 |      "output_type": "display_data"
544 |     },
545 |     {
546 |      "name": "stdout",
547 |      "output_type": "stream",
548 |      "text": [
549 |       "Embeddings = 100 documents = 100\n",
550 |       "0 min 21.33 sec\n"
551 |      ]
552 |     }
553 |    ],
554 |    "source": [
555 |     "start = time.perf_counter()\n",
556 |     "\n",
557 |     "# Get embeddings\n",
558 |     "await main(documents)\n",
559 |     "\n",
560 |     "# Make sure we got it all\n",
561 |     "count = 0\n",
562 |     "for document in documents:\n",
563 |     "    if 'embedding' in document.keys() and len(document['embedding']) == 768:\n",
564 |     "        count += 1\n",
565 |     "print(f'Embeddings = {count} documents = {len(documents)}')\n",
566 |     "\n",
567 |     "            \n",
568 |     "# Print elapsed time\n",
569 |     "elapsed_time = time.perf_counter() - start\n",
570 |     "minutes, seconds = divmod(elapsed_time, 60)\n",
571 |     "print(f\"{int(minutes)} min {seconds:.2f} sec\")"
572 |    ]
573 |   },
574 |   {
575 |    "cell_type": "markdown",
576 |    "id": "bab97c7b-7bac-4bf5-9752-b528294dadc7",
577 |    "metadata": {},
578 |    "source": [
579 |     "## Pause Inference Endpoint\n",
580 |     "Now that we have finished, let's pause the endpoint so we don't incur any extra charges, this will also allow us to analyze the cost."
581 |    ]
582 |   },
583 |   {
584 |    "cell_type": "code",
585 |    "execution_count": 16,
586 |    "id": "540a0978-7670-4ce3-95c1-3823cc113b85",
587 |    "metadata": {
588 |     "tags": []
589 |    },
590 |    "outputs": [
591 |     {
592 |      "name": "stdout",
593 |      "output_type": "stream",
594 |      "text": [
595 |       "Endpoint Status: paused\n"
596 |      ]
597 |     }
598 |    ],
599 |    "source": [
600 |     "endpoint = endpoint.pause()\n",
601 |     "\n",
602 |     "print(f\"Endpoint Status: {endpoint.status}\")"
603 |    ]
604 |   },
605 |   {
606 |    "cell_type": "markdown",
607 |    "id": "45ad65b7-3da2-4113-9b95-8fb4e21ae793",
608 |    "metadata": {},
609 |    "source": [
610 |     "# Push updated dataset to Hub\n",
611 |     "We now have our documents updated with the embeddings we wanted. First we need to convert it back to a `Dataset` format. I find it easiest to go from list of dicts -> `pd.DataFrame` -> `Dataset`"
612 |    ]
613 |   },
614 |   {
615 |    "cell_type": "code",
616 |    "execution_count": 17,
617 |    "id": "9bb993f8-d624-4192-9626-8e9ed9888a1b",
618 |    "metadata": {
619 |     "tags": []
620 |    },
621 |    "outputs": [],
622 |    "source": [
623 |     "df = pd.DataFrame(documents)\n",
624 |     "dd = DatasetDict({'train': Dataset.from_pandas(df)})"
625 |    ]
626 |   },
627 |   {
628 |    "cell_type": "markdown",
629 |    "id": "129760c8-cae1-4b1e-8216-f5152df8c536",
630 |    "metadata": {},
631 |    "source": [
632 |     "I'm uploading it to the user's account by default (as opposed to uploading to an organization) but feel free to push to wherever you want by setting the user in the `repo_id` or in the config by setting `DATASET_OUT`"
633 |    ]
634 |   },
635 |   {
636 |    "cell_type": "code",
637 |    "execution_count": 18,
638 |    "id": "f48e7c55-d5b7-4ed6-8516-272ae38716b1",
639 |    "metadata": {
640 |     "tags": []
641 |    },
642 |    "outputs": [
643 |     {
644 |      "data": {
645 |       "application/vnd.jupyter.widget-view+json": {
646 |        "model_id": "d3af2e864770481db5adc3968500b5d3",
647 |        "version_major": 2,
648 |        "version_minor": 0
649 |       },
650 |       "text/plain": [
651 |        "Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]"
652 |       ]
653 |      },
654 |      "metadata": {},
655 |      "output_type": "display_data"
656 |     },
657 |     {
658 |      "data": {
659 |       "application/vnd.jupyter.widget-view+json": {
660 |        "model_id": "4e063c42d8f4490c939bc64e626b507a",
661 |        "version_major": 2,
662 |        "version_minor": 0
663 |       },
664 |       "text/plain": [
665 |        "Downloading metadata:   0%|          | 0.00/823 [00:00<?, ?B/s]"
666 |       ]
667 |      },
668 |      "metadata": {},
669 |      "output_type": "display_data"
670 |     }
671 |    ],
672 |    "source": [
673 |     "dd.push_to_hub(repo_id=DATASET_OUT)"
674 |    ]
675 |   },
676 |   {
677 |    "cell_type": "code",
678 |    "execution_count": 19,
679 |    "id": "85ea2244-a4c6-4f04-b187-965a2fc356a8",
680 |    "metadata": {
681 |     "tags": []
682 |    },
683 |    "outputs": [
684 |     {
685 |      "name": "stdout",
686 |      "output_type": "stream",
687 |      "text": [
688 |       "Dataset is at https://huggingface.co/datasets/derek-thomas/processed-subset-bestofredditorupdates\n"
689 |      ]
690 |     }
691 |    ],
692 |    "source": [
693 |     "print(f'Dataset is at https://huggingface.co/datasets/{who[\"name\"]}/{DATASET_OUT}')"
694 |    ]
695 |   },
696 |   {
697 |    "cell_type": "markdown",
698 |    "id": "41abea64-379d-49de-8d9a-355c2f4ce1ac",
699 |    "metadata": {},
700 |    "source": [
701 |     "# Analyze Usage\n",
702 |     "1. Go to your `dashboard_url` printed below\n",
703 |     "1. Click on the Usage & Cost tab\n",
704 |     "1. See how much you have spent"
705 |    ]
706 |   },
707 |   {
708 |    "cell_type": "code",
709 |    "execution_count": 20,
710 |    "id": "16815445-3079-43da-b14e-b54176a07a62",
711 |    "metadata": {},
712 |    "outputs": [
713 |     {
714 |      "name": "stdout",
715 |      "output_type": "stream",
716 |      "text": [
717 |       "https://ui.endpoints.huggingface.co/HF-test-lab/endpoints/boru-jina-embeddings-demo-ie\n"
718 |      ]
719 |     }
720 |    ],
721 |    "source": [
722 |     "dashboard_url = f'https://ui.endpoints.huggingface.co/{namespace}/endpoints/{ENDPOINT_NAME}'\n",
723 |     "print(dashboard_url)"
724 |    ]
725 |   },
726 |   {
727 |    "cell_type": "code",
728 |    "execution_count": 21,
729 |    "id": "81096c6f-d12f-4781-84ec-9066cfa465b3",
730 |    "metadata": {},
731 |    "outputs": [
732 |     {
733 |      "name": "stdin",
734 |      "output_type": "stream",
735 |      "text": [
736 |       "Hit enter to continue with the notebook \n"
737 |      ]
738 |     },
739 |     {
740 |      "data": {
741 |       "text/plain": [
742 |        "''"
743 |       ]
744 |      },
745 |      "execution_count": 21,
746 |      "metadata": {},
747 |      "output_type": "execute_result"
748 |     }
749 |    ],
750 |    "source": [
751 |     "input(\"Hit enter to continue with the notebook\")"
752 |    ]
753 |   },
754 |   {
755 |    "cell_type": "markdown",
756 |    "id": "847d524e-9aa6-4a6f-a275-8a552e289818",
757 |    "metadata": {},
758 |    "source": [
759 |     "We can see that it only took `$0.04` to pay for this!\n"
760 |    ]
761 |   },
762 |   {
763 |    "cell_type": "markdown",
764 |    "id": "b953d5be-2494-4ff8-be42-9daf00c99c41",
765 |    "metadata": {},
766 |    "source": [
767 |     "\n",
768 |     "# Delete Endpoint\n",
769 |     "Now that we are done, we don't need our endpoint anymore. We can delete our endpoint programmatically. \n",
770 |     "\n",
771 |     "![Cost](https://huggingface.co/datasets/huggingface/cookbook-images/resolve/main/automatic_embedding_tei_inference_endpoints.png)"
772 |    ]
773 |   },
774 |   {
775 |    "cell_type": "code",
776 |    "execution_count": 22,
777 |    "id": "c310c0f3-6f12-4d5c-838b-3a4c1f2e54ad",
778 |    "metadata": {
779 |     "tags": []
780 |    },
781 |    "outputs": [
782 |     {
783 |      "name": "stdout",
784 |      "output_type": "stream",
785 |      "text": [
786 |       "Endpoint deleted successfully\n"
787 |      ]
788 |     }
789 |    ],
790 |    "source": [
791 |     "endpoint = endpoint.delete()\n",
792 |     "\n",
793 |     "if not endpoint:\n",
794 |     "    print('Endpoint deleted successfully')\n",
795 |     "else:\n",
796 |     "    print('Delete Endpoint in manually') "
797 |    ]
798 |   }
799 |  ],
800 |  "metadata": {
801 |   "kernelspec": {
802 |    "display_name": "Python 3 (ipykernel)",
803 |    "language": "python",
804 |    "name": "python3"
805 |   },
806 |   "language_info": {
807 |    "codemirror_mode": {
808 |     "name": "ipython",
809 |     "version": 3
810 |    },
811 |    "file_extension": ".py",
812 |    "mimetype": "text/x-python",
813 |    "name": "python",
814 |    "nbconvert_exporter": "python",
815 |    "pygments_lexer": "ipython3",
816 |    "version": "3.10.8"
817 |   }
818 |  },
819 |  "nbformat": 4,
820 |  "nbformat_minor": 5
821 | }
822 | 


--------------------------------------------------------------------------------
/notebooks/zh-CN/fine_tuning_code_llm_on_single_gpu.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {
   6 |     "id": "FNdZ-kD0l78P"
   7 |    },
   8 |    "source": [
   9 |     "#  在单个 GPU 上针对自定义代码微调代码 LLM\n",
  10 |     "\n",
  11 |     "_作者: [Maria Khalusova](https://github.com/MKhalusova)_\n",
  12 |     "\n",
  13 |     "公开发布的代码 LLM，如 Codex、StarCoder 和 Code Llama，在生成遵循通用编程原则和语法的代码方面表现出色，但它们可能不符合组织的内部惯例，或者不了解某些特定的库。\n",
  14 |     "\n",
  15 |     "在这个 notebook 中，我们将展示如何微调代码 LLM 来更好的理解你们公司或组织的代码风格和习惯。由于代码 LLM 非常大，按照传统的微调方式可能会消耗大量资源。但不用担心！我们会教你一些技巧，让你只用单个 GPU 就能完成微调工作。\n",
  16 |     "\n",
  17 |     "\n",
  18 |     "## 数据集\n",
  19 |     "\n",
  20 |     "对于这个例子，我们选择了 GitHub 上 Hugging Face 的前 10 个公共仓库。我们已经排除了非代码文件，如图片、音频文件、演示文稿等。对于 Jupyter notebook，我们只保留了包含代码的单元格。生成的代码被存储为一个数据集，你可以在 Hugging Face Hub 上找到，位于 [`smangrul/hf-stack-v1`](https://huggingface.co/datasets/smangrul/hf-stack-v1)。它包含仓库 id、文件路径和文件内容。\n",
  21 |     "\n",
  22 |     "\n",
  23 |     "## 模型\n",
  24 |     "\n",
  25 |     "我们将微调 [`bigcode/starcoderbase-1b`](https://huggingface.co/bigcode/starcoderbase-1b) 模型，这是一个在 80 多种编程语言上训练的 10 亿参数模型。这是一个需要权限的模型，所以如果你计划使用这个确切模型运行这个 notebook，你需要在其模型页面上获得访问权限。登录你的 Hugging Face 帐户以执行此操作：\n"
  26 |    ]
  27 |   },
  28 |   {
  29 |    "cell_type": "code",
  30 |    "execution_count": null,
  31 |    "metadata": {
  32 |     "id": "bPlCJYDK6vrF"
  33 |    },
  34 |    "outputs": [],
  35 |    "source": [
  36 |     "from huggingface_hub import notebook_login\n",
  37 |     "\n",
  38 |     "notebook_login()"
  39 |    ]
  40 |   },
  41 |   {
  42 |    "cell_type": "markdown",
  43 |    "metadata": {
  44 |     "id": "WMVe_c8q43Qo"
  45 |    },
  46 |    "source": [
  47 |     "\n",
  48 |     "\n",
  49 |     "To get started, let's install all the necessary libraries. As you can see, in addition to `transformers` and `datasets`, we'll be using `peft`, `bitsandbytes`, and `flash-attn` to optimize the training.\n",
  50 |     "\n",
  51 |     "By employing parameter-efficient training techniques, we can run this notebook on a single A100 High-RAM GPU."
  52 |    ]
  53 |   },
  54 |   {
  55 |    "cell_type": "code",
  56 |    "execution_count": null,
  57 |    "metadata": {
  58 |     "id": "Fp7i8WMCjKJG"
  59 |    },
  60 |    "outputs": [],
  61 |    "source": [
  62 |     "!pip install -q transformers datasets peft bitsandbytes flash-attn"
  63 |    ]
  64 |   },
  65 |   {
  66 |    "cell_type": "markdown",
  67 |    "metadata": {
  68 |     "id": "16EdABzt3_Ig"
  69 |    },
  70 |    "source": [
  71 |     "现在让我们定义一些变量。请随意调整这些变量。"
  72 |    ]
  73 |   },
  74 |   {
  75 |    "cell_type": "code",
  76 |    "execution_count": null,
  77 |    "metadata": {
  78 |     "id": "hru3G-CLmqis"
  79 |    },
  80 |    "outputs": [],
  81 |    "source": [
  82 |     "MODEL=\"bigcode/starcoderbase-1b\" # Model checkpoint on the Hugging Face Hub\n",
  83 |     "DATASET=\"smangrul/hf-stack-v1\"   # Dataset on the Hugging Face Hub\n",
  84 |     "DATA_COLUMN=\"content\"            # Column name containing the code content\n",
  85 |     "\n",
  86 |     "SEQ_LENGTH=2048                  # Sequence length\n",
  87 |     "\n",
  88 |     "# Training arguments\n",
  89 |     "MAX_STEPS=2000                   # max_steps\n",
  90 |     "BATCH_SIZE=16                    # batch_size\n",
  91 |     "GR_ACC_STEPS=1                   # gradient_accumulation_steps\n",
  92 |     "LR=5e-4                          # learning_rate\n",
  93 |     "LR_SCHEDULER_TYPE=\"cosine\"       # lr_scheduler_type\n",
  94 |     "WEIGHT_DECAY=0.01                # weight_decay\n",
  95 |     "NUM_WARMUP_STEPS=30              # num_warmup_steps\n",
  96 |     "EVAL_FREQ=100                    # eval_freq\n",
  97 |     "SAVE_FREQ=100                    # save_freq\n",
  98 |     "LOG_FREQ=25                      # log_freq\n",
  99 |     "OUTPUT_DIR=\"peft-starcoder-lora-a100\" # output_dir\n",
 100 |     "BF16=True                        # bf16\n",
 101 |     "FP16=False                       # no_fp16\n",
 102 |     "\n",
 103 |     "# FIM trasformations arguments\n",
 104 |     "FIM_RATE=0.5                     # fim_rate\n",
 105 |     "FIM_SPM_RATE=0.5                 # fim_spm_rate\n",
 106 |     "\n",
 107 |     "# LORA\n",
 108 |     "LORA_R=8                         # lora_r\n",
 109 |     "LORA_ALPHA=32                    # lora_alpha\n",
 110 |     "LORA_DROPOUT=0.0                 # lora_dropout\n",
 111 |     "LORA_TARGET_MODULES=\"c_proj,c_attn,q_attn,c_fc,c_proj\"    # lora_target_modules\n",
 112 |     "\n",
 113 |     "# bitsandbytes config\n",
 114 |     "USE_NESTED_QUANT=True            # use_nested_quant\n",
 115 |     "BNB_4BIT_COMPUTE_DTYPE=\"bfloat16\"# bnb_4bit_compute_dtype\n",
 116 |     "\n",
 117 |     "SEED=0"
 118 |    ]
 119 |   },
 120 |   {
 121 |    "cell_type": "code",
 122 |    "execution_count": null,
 123 |    "metadata": {
 124 |     "id": "FyZSXTbJrcnC"
 125 |    },
 126 |    "outputs": [],
 127 |    "source": [
 128 |     "from transformers import (\n",
 129 |     "    AutoModelForCausalLM,\n",
 130 |     "    AutoTokenizer,\n",
 131 |     "    Trainer,\n",
 132 |     "    TrainingArguments,\n",
 133 |     "    logging,\n",
 134 |     "    set_seed,\n",
 135 |     "    BitsAndBytesConfig,\n",
 136 |     ")\n",
 137 |     "\n",
 138 |     "set_seed(SEED)"
 139 |    ]
 140 |   },
 141 |   {
 142 |    "cell_type": "markdown",
 143 |    "metadata": {
 144 |     "id": "pO7F5L5AtKo1"
 145 |    },
 146 |    "source": [
 147 |     "## 准备数据"
 148 |    ]
 149 |   },
 150 |   {
 151 |    "cell_type": "markdown",
 152 |    "metadata": {
 153 |     "id": "1LmrIZqP0oUE"
 154 |    },
 155 |    "source": [
 156 |     "首先加载数据。由于数据集可能相当大，请确保启用流模式。流模式允许我们在遍历数据集时逐步加载数据，而不是一次性下载数据集的整个内容。\n",
 157 |     "\n",
 158 |     "我们将前 4000 个示例作为验证集，其余的全部作为训练数据。\n"
 159 |    ]
 160 |   },
 161 |   {
 162 |    "cell_type": "code",
 163 |    "execution_count": null,
 164 |    "metadata": {
 165 |     "id": "4oJZvZb-1J88"
 166 |    },
 167 |    "outputs": [],
 168 |    "source": [
 169 |     "from datasets import load_dataset\n",
 170 |     "import torch\n",
 171 |     "from tqdm import tqdm\n",
 172 |     "\n",
 173 |     "\n",
 174 |     "dataset = load_dataset(\n",
 175 |     "    DATASET,\n",
 176 |     "    data_dir=\"data\",\n",
 177 |     "    split=\"train\",\n",
 178 |     "    streaming=True,\n",
 179 |     ")\n",
 180 |     "\n",
 181 |     "valid_data = dataset.take(4000)\n",
 182 |     "train_data = dataset.skip(4000)\n",
 183 |     "train_data = train_data.shuffle(buffer_size=5000, seed=SEED)"
 184 |    ]
 185 |   },
 186 |   {
 187 |    "cell_type": "markdown",
 188 |    "metadata": {
 189 |     "id": "sLQ8t0LM2GR6"
 190 |    },
 191 |    "source": [
 192 |     "在这一步，数据集仍然包含任意长度的原始数据。为了训练，我们需要固定长度的输入。让我们创建一个可迭代的数据集，它可以从文本文件流中返回固定长度的 token 块。\n",
 193 |     "\n",
 194 |     "首先，让我们估计数据集中每个 token 的平均字符数，这将帮助我们稍后估计文本缓冲区中的 token 数量。默认情况下，我们只从数据集中取 400 个示例（`nb_examples`）。只使用整个数据集的一个子集可以减少计算成本，同时仍然提供了对整体字符到 token 比的合理估计。\n"
 195 |    ]
 196 |   },
 197 |   {
 198 |    "cell_type": "code",
 199 |    "execution_count": null,
 200 |    "metadata": {
 201 |     "colab": {
 202 |      "base_uri": "https://localhost:8080/"
 203 |     },
 204 |     "id": "KCiAvydztNsu",
 205 |     "outputId": "cabf7fd0-a922-4371-cbc6-60ee99ef7469"
 206 |    },
 207 |    "outputs": [
 208 |     {
 209 |      "name": "stderr",
 210 |      "output_type": "stream",
 211 |      "text": [
 212 |       "100%|██████████| 400/400 [00:10<00:00, 39.87it/s] "
 213 |      ]
 214 |     },
 215 |     {
 216 |      "name": "stdout",
 217 |      "output_type": "stream",
 218 |      "text": [
 219 |       "The character to token ratio of the dataset is: 2.43\n"
 220 |      ]
 221 |     },
 222 |     {
 223 |      "name": "stderr",
 224 |      "output_type": "stream",
 225 |      "text": [
 226 |       "\n"
 227 |      ]
 228 |     }
 229 |    ],
 230 |    "source": [
 231 |     "tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)\n",
 232 |     "\n",
 233 |     "def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):\n",
 234 |     "    \"\"\"\n",
 235 |     "    Estimate the average number of characters per token in the dataset.\n",
 236 |     "    \"\"\"\n",
 237 |     "\n",
 238 |     "    total_characters, total_tokens = 0, 0\n",
 239 |     "    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):\n",
 240 |     "        total_characters += len(example[data_column])\n",
 241 |     "        total_tokens += len(tokenizer(example[data_column]).tokens())\n",
 242 |     "\n",
 243 |     "    return total_characters / total_tokens\n",
 244 |     "\n",
 245 |     "\n",
 246 |     "chars_per_token = chars_token_ratio(train_data, tokenizer, DATA_COLUMN)\n",
 247 |     "print(f\"The character to token ratio of the dataset is: {chars_per_token:.2f}\")"
 248 |    ]
 249 |   },
 250 |   {
 251 |    "cell_type": "markdown",
 252 |    "metadata": {
 253 |     "id": "6F13VGobB3Ma"
 254 |    },
 255 |    "source": [
 256 |     "字符到 token 的比也可以用作文本标记质量的一个指标。例如，字符到 token 的比为 1.0 意味着每个字符都由一个 token 表示，这并没有太多意义。表明标记化做得不好。在标准的英文文本中，一个 token 通常相当于大约四个字符，这意味着字符到 token 的比率大约是 4.0。我们可以预见在代码数据集中的比率会更低，但一般来说，2.0 到 3.5 之间的数字可以认为是足够好的。"
 257 |    ]
 258 |   },
 259 |   {
 260 |    "cell_type": "markdown",
 261 |    "metadata": {
 262 |     "id": "rcwYFRPpwxea"
 263 |    },
 264 |    "source": [
 265 |     "**可选的 FIM 变换**\n",
 266 |     "自回归语言模型通常是从左到右生成序列的。通过应用 FIM 变换，模型也可以学习填充文本。详细信息可以看[\"Efficient Training of Language Models to Fill in the Middle\" 这篇论文](https://arxiv.org/pdf/2207.14255.pdf)了解这种技术。\n",
 267 |     "\n",
 268 |     "我们将在下面定义 FIM 变换，并在创建可迭代数据集时使用它们。然而，如果你想省略变换步骤，请将 `fim_rate` 设置为 0。"
 269 |    ]
 270 |   },
 271 |   {
 272 |    "cell_type": "code",
 273 |    "execution_count": null,
 274 |    "metadata": {
 275 |     "id": "zmejYvEKw1E-"
 276 |    },
 277 |    "outputs": [],
 278 |    "source": [
 279 |     "import functools\n",
 280 |     "import numpy as np\n",
 281 |     "\n",
 282 |     "\n",
 283 |     "# Helper function to get token ids of the special tokens for prefix, suffix and middle for FIM transformations.\n",
 284 |     "@functools.lru_cache(maxsize=None)\n",
 285 |     "def get_fim_token_ids(tokenizer):\n",
 286 |     "    try:\n",
 287 |     "        FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = tokenizer.special_tokens_map[\"additional_special_tokens\"][1:5]\n",
 288 |     "        suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (\n",
 289 |     "            tokenizer.vocab[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]\n",
 290 |     "        )\n",
 291 |     "    except KeyError:\n",
 292 |     "        suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = None, None, None, None\n",
 293 |     "    return suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id\n",
 294 |     "\n",
 295 |     "\n",
 296 |     "## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py\n",
 297 |     "def permute(\n",
 298 |     "    sample,\n",
 299 |     "    np_rng,\n",
 300 |     "    suffix_tok_id,\n",
 301 |     "    prefix_tok_id,\n",
 302 |     "    middle_tok_id,\n",
 303 |     "    pad_tok_id,\n",
 304 |     "    fim_rate=0.5,\n",
 305 |     "    fim_spm_rate=0.5,\n",
 306 |     "    truncate_or_pad=False,\n",
 307 |     "):\n",
 308 |     "    \"\"\"\n",
 309 |     "    Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, using two FIM modes:\n",
 310 |     "    PSM and SPM (with a probability of fim_spm_rate).\n",
 311 |     "    \"\"\"\n",
 312 |     "\n",
 313 |     "    # The if condition will trigger with the probability of fim_rate\n",
 314 |     "    # This means FIM transformations will apply to samples with a probability of fim_rate\n",
 315 |     "    if np_rng.binomial(1, fim_rate):\n",
 316 |     "\n",
 317 |     "        # Split the sample into prefix, middle, and suffix, based on randomly generated indices stored in the boundaries list.\n",
 318 |     "        boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2))\n",
 319 |     "        boundaries.sort()\n",
 320 |     "\n",
 321 |     "        prefix = np.array(sample[: boundaries[0]], dtype=np.int64)\n",
 322 |     "        middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64)\n",
 323 |     "        suffix = np.array(sample[boundaries[1] :], dtype=np.int64)\n",
 324 |     "\n",
 325 |     "        if truncate_or_pad:\n",
 326 |     "            # calculate the new total length of the sample, taking into account tokens indicating prefix, middle, and suffix\n",
 327 |     "            new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3\n",
 328 |     "            diff = new_length - len(sample)\n",
 329 |     "\n",
 330 |     "            # trancate or pad if there's a difference in length between the new length and the original\n",
 331 |     "            if diff > 0:\n",
 332 |     "                if suffix.shape[0] <= diff:\n",
 333 |     "                    return sample, np_rng\n",
 334 |     "                suffix = suffix[: suffix.shape[0] - diff]\n",
 335 |     "            elif diff < 0:\n",
 336 |     "                suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)])\n",
 337 |     "\n",
 338 |     "        # With the probability of fim_spm_rateapply SPM variant of FIM transformations\n",
 339 |     "        # SPM: suffix, prefix, middle\n",
 340 |     "        if np_rng.binomial(1, fim_spm_rate):\n",
 341 |     "            new_sample = np.concatenate(\n",
 342 |     "                [\n",
 343 |     "                    [prefix_tok_id, suffix_tok_id],\n",
 344 |     "                    suffix,\n",
 345 |     "                    [middle_tok_id],\n",
 346 |     "                    prefix,\n",
 347 |     "                    middle,\n",
 348 |     "                ]\n",
 349 |     "            )\n",
 350 |     "        # Otherwise, apply the PSM variant of FIM transformations\n",
 351 |     "        # PSM: prefix, suffix, middle\n",
 352 |     "        else:\n",
 353 |     "\n",
 354 |     "            new_sample = np.concatenate(\n",
 355 |     "                [\n",
 356 |     "                    [prefix_tok_id],\n",
 357 |     "                    prefix,\n",
 358 |     "                    [suffix_tok_id],\n",
 359 |     "                    suffix,\n",
 360 |     "                    [middle_tok_id],\n",
 361 |     "                    middle,\n",
 362 |     "                ]\n",
 363 |     "            )\n",
 364 |     "    else:\n",
 365 |     "        # don't apply FIM transformations\n",
 366 |     "        new_sample = sample\n",
 367 |     "\n",
 368 |     "    return list(new_sample), np_rng\n"
 369 |    ]
 370 |   },
 371 |   {
 372 |    "cell_type": "markdown",
 373 |    "metadata": {
 374 |     "id": "AwW5FviD9xBH"
 375 |    },
 376 |    "source": [
 377 |     "让我们定义 `ConstantLengthDataset`，这是一个可迭代的数据集，它将返回固定长度的 token 块。为此，我们将从原始数据集中读取文本缓冲区，直到达到大小限制，然后应用分词器将原始文本转换为 token 后的输入。可选项，我们可以在一些序列上执行 FIM 变换（受影响的序列比例由 `fim_rate` 控制）。\n",
 378 |     "\n",
 379 |     "定义好后，我们可以从训练和验证数据中创建 `ConstantLengthDataset` 的实例。"
 380 |    ]
 381 |   },
 382 |   {
 383 |    "cell_type": "code",
 384 |    "execution_count": null,
 385 |    "metadata": {
 386 |     "id": "AgDW-692wzOl"
 387 |    },
 388 |    "outputs": [],
 389 |    "source": [
 390 |     "from torch.utils.data import IterableDataset\n",
 391 |     "from torch.utils.data.dataloader import DataLoader\n",
 392 |     "import random\n",
 393 |     "\n",
 394 |     "# Create an Iterable dataset that returns constant-length chunks of tokens from a stream of text files.\n",
 395 |     "\n",
 396 |     "class ConstantLengthDataset(IterableDataset):\n",
 397 |     "    \"\"\"\n",
 398 |     "    Iterable dataset that returns constant length chunks of tokens from stream of text files.\n",
 399 |     "        Args:\n",
 400 |     "            tokenizer (Tokenizer): The processor used for proccessing the data.\n",
 401 |     "            dataset (dataset.Dataset): Dataset with text files.\n",
 402 |     "            infinite (bool): If True the iterator is reset after dataset reaches end else stops.\n",
 403 |     "            seq_length (int): Length of token sequences to return.\n",
 404 |     "            num_of_sequences (int): Number of token sequences to keep in buffer.\n",
 405 |     "            chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer.\n",
 406 |     "            fim_rate (float): Rate (0.0 to 1.0) that sample will be permuted with FIM.\n",
 407 |     "            fim_spm_rate (float): Rate (0.0 to 1.0) of FIM permuations that will use SPM.\n",
 408 |     "            seed (int): Seed for random number generator.\n",
 409 |     "    \"\"\"\n",
 410 |     "\n",
 411 |     "    def __init__(\n",
 412 |     "        self,\n",
 413 |     "        tokenizer,\n",
 414 |     "        dataset,\n",
 415 |     "        infinite=False,\n",
 416 |     "        seq_length=1024,\n",
 417 |     "        num_of_sequences=1024,\n",
 418 |     "        chars_per_token=3.6,\n",
 419 |     "        content_field=\"content\",\n",
 420 |     "        fim_rate=0.5,\n",
 421 |     "        fim_spm_rate=0.5,\n",
 422 |     "        seed=0,\n",
 423 |     "    ):\n",
 424 |     "        self.tokenizer = tokenizer\n",
 425 |     "        self.concat_token_id = tokenizer.eos_token_id\n",
 426 |     "        self.dataset = dataset\n",
 427 |     "        self.seq_length = seq_length\n",
 428 |     "        self.infinite = infinite\n",
 429 |     "        self.current_size = 0\n",
 430 |     "        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences\n",
 431 |     "        self.content_field = content_field\n",
 432 |     "        self.fim_rate = fim_rate\n",
 433 |     "        self.fim_spm_rate = fim_spm_rate\n",
 434 |     "        self.seed = seed\n",
 435 |     "\n",
 436 |     "        (\n",
 437 |     "            self.suffix_tok_id,\n",
 438 |     "            self.prefix_tok_id,\n",
 439 |     "            self.middle_tok_id,\n",
 440 |     "            self.pad_tok_id,\n",
 441 |     "        ) = get_fim_token_ids(self.tokenizer)\n",
 442 |     "        if not self.suffix_tok_id and self.fim_rate > 0:\n",
 443 |     "            print(\"FIM is not supported by tokenizer, disabling FIM\")\n",
 444 |     "            self.fim_rate = 0\n",
 445 |     "\n",
 446 |     "    def __iter__(self):\n",
 447 |     "        iterator = iter(self.dataset)\n",
 448 |     "        more_examples = True\n",
 449 |     "        np_rng = np.random.RandomState(seed=self.seed)\n",
 450 |     "        while more_examples:\n",
 451 |     "            buffer, buffer_len = [], 0\n",
 452 |     "            while True:\n",
 453 |     "                if buffer_len >= self.max_buffer_size:\n",
 454 |     "                    break\n",
 455 |     "                try:\n",
 456 |     "                    buffer.append(next(iterator)[self.content_field])\n",
 457 |     "                    buffer_len += len(buffer[-1])\n",
 458 |     "                except StopIteration:\n",
 459 |     "                    if self.infinite:\n",
 460 |     "                        iterator = iter(self.dataset)\n",
 461 |     "                    else:\n",
 462 |     "                        more_examples = False\n",
 463 |     "                        break\n",
 464 |     "            tokenized_inputs = self.tokenizer(buffer, truncation=False)[\"input_ids\"]\n",
 465 |     "            all_token_ids = []\n",
 466 |     "\n",
 467 |     "            for tokenized_input in tokenized_inputs:\n",
 468 |     "                # optionally do FIM permutations\n",
 469 |     "                if self.fim_rate > 0:\n",
 470 |     "                    tokenized_input, np_rng = permute(\n",
 471 |     "                        tokenized_input,\n",
 472 |     "                        np_rng,\n",
 473 |     "                        self.suffix_tok_id,\n",
 474 |     "                        self.prefix_tok_id,\n",
 475 |     "                        self.middle_tok_id,\n",
 476 |     "                        self.pad_tok_id,\n",
 477 |     "                        fim_rate=self.fim_rate,\n",
 478 |     "                        fim_spm_rate=self.fim_spm_rate,\n",
 479 |     "                        truncate_or_pad=False,\n",
 480 |     "                    )\n",
 481 |     "\n",
 482 |     "                all_token_ids.extend(tokenized_input + [self.concat_token_id])\n",
 483 |     "            examples = []\n",
 484 |     "            for i in range(0, len(all_token_ids), self.seq_length):\n",
 485 |     "                input_ids = all_token_ids[i : i + self.seq_length]\n",
 486 |     "                if len(input_ids) == self.seq_length:\n",
 487 |     "                    examples.append(input_ids)\n",
 488 |     "            random.shuffle(examples)\n",
 489 |     "            for example in examples:\n",
 490 |     "                self.current_size += 1\n",
 491 |     "                yield {\n",
 492 |     "                    \"input_ids\": torch.LongTensor(example),\n",
 493 |     "                    \"labels\": torch.LongTensor(example),\n",
 494 |     "                }\n",
 495 |     "\n",
 496 |     "\n",
 497 |     "train_dataset = ConstantLengthDataset(\n",
 498 |     "        tokenizer,\n",
 499 |     "        train_data,\n",
 500 |     "        infinite=True,\n",
 501 |     "        seq_length=SEQ_LENGTH,\n",
 502 |     "        chars_per_token=chars_per_token,\n",
 503 |     "        content_field=DATA_COLUMN,\n",
 504 |     "        fim_rate=FIM_RATE,\n",
 505 |     "        fim_spm_rate=FIM_SPM_RATE,\n",
 506 |     "        seed=SEED,\n",
 507 |     ")\n",
 508 |     "eval_dataset = ConstantLengthDataset(\n",
 509 |     "        tokenizer,\n",
 510 |     "        valid_data,\n",
 511 |     "        infinite=False,\n",
 512 |     "        seq_length=SEQ_LENGTH,\n",
 513 |     "        chars_per_token=chars_per_token,\n",
 514 |     "        content_field=DATA_COLUMN,\n",
 515 |     "        fim_rate=FIM_RATE,\n",
 516 |     "        fim_spm_rate=FIM_SPM_RATE,\n",
 517 |     "        seed=SEED,\n",
 518 |     ")"
 519 |    ]
 520 |   },
 521 |   {
 522 |    "cell_type": "markdown",
 523 |    "metadata": {
 524 |     "id": "rxev1sk6tRW9"
 525 |    },
 526 |    "source": [
 527 |     "## 准备模型"
 528 |    ]
 529 |   },
 530 |   {
 531 |    "cell_type": "markdown",
 532 |    "metadata": {
 533 |     "id": "UCtWV-U42Eq_"
 534 |    },
 535 |    "source": [
 536 |     "现在数据已经准备好了，是时候加载模型了！我们将加载量化的模型。\n",
 537 |     "\n",
 538 |     "因为量化使用更少的位来表示数据，所以会减少内存使用。我们将使用 `bitsandbytes` 库来量化模型，因为它与 `transformers` 有很好的集成。我们需要做的只是定义一个 `bitsandbytes` 配置，然后在加载模型时使用它。\n",
 539 |     "\n",
 540 |     "4 比特位量化有不同的变体，但通常我们推荐使用 NF4 量化以获得更好的性能（`bnb_4bit_quant_type=\"nf4\"`）。\n",
 541 |     "\n",
 542 |     "`bnb_4bit_use_double_quant` 选项在第一次量化后添加第二次量化，以节省每个参数额外的 0.4 位。\n",
 543 |     "\n",
 544 |     "要了解更多关于量化的信息，请查看 [\"利用 bitsandbytes、4 比特位量化和 QLoRA 让 LLMs 更易于访问\" 的博客](https://huggingface.co/blog/4bit-transformers-bitsandbytes)。\n",
 545 |     "\n",
 546 |     "定义好后，将配置传递给 `from_pretrained` 方法以加载量化的模型。\n"
 547 |    ]
 548 |   },
 549 |   {
 550 |    "cell_type": "code",
 551 |    "execution_count": null,
 552 |    "metadata": {
 553 |     "id": "XuwoX6U2DUvK"
 554 |    },
 555 |    "outputs": [],
 556 |    "source": [
 557 |     "from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training\n",
 558 |     "from peft.tuners.lora import LoraLayer\n",
 559 |     "\n",
 560 |     "load_in_8bit = False\n",
 561 |     "\n",
 562 |     "# 4-bit quantization\n",
 563 |     "compute_dtype = getattr(torch, BNB_4BIT_COMPUTE_DTYPE)\n",
 564 |     "\n",
 565 |     "bnb_config = BitsAndBytesConfig(\n",
 566 |     "    load_in_4bit=True,\n",
 567 |     "    bnb_4bit_quant_type=\"nf4\",\n",
 568 |     "    bnb_4bit_compute_dtype=compute_dtype,\n",
 569 |     "    bnb_4bit_use_double_quant=USE_NESTED_QUANT,\n",
 570 |     ")\n",
 571 |     "\n",
 572 |     "device_map = {\"\": 0}\n",
 573 |     "\n",
 574 |     "model = AutoModelForCausalLM.from_pretrained(\n",
 575 |     "        MODEL,\n",
 576 |     "        load_in_8bit=load_in_8bit,\n",
 577 |     "        quantization_config=bnb_config,\n",
 578 |     "        device_map=device_map,\n",
 579 |     "        use_cache=False,  # We will be using gradient checkpointing\n",
 580 |     "        trust_remote_code=True,\n",
 581 |     "        use_flash_attention_2=True,\n",
 582 |     ")\n"
 583 |    ]
 584 |   },
 585 |   {
 586 |    "cell_type": "markdown",
 587 |    "metadata": {
 588 |     "id": "bO9e2FV8D8ZF"
 589 |    },
 590 |    "source": [
 591 |     "当使用量化模型进行训练时，你需要调用 `prepare_model_for_kbit_training()` 函数来预处理量化模型以进行训练。"
 592 |    ]
 593 |   },
 594 |   {
 595 |    "cell_type": "code",
 596 |    "execution_count": null,
 597 |    "metadata": {
 598 |     "id": "Qb_eB4xzEDBk"
 599 |    },
 600 |    "outputs": [],
 601 |    "source": [
 602 |     "model = prepare_model_for_kbit_training(model)"
 603 |    ]
 604 |   },
 605 |   {
 606 |    "cell_type": "markdown",
 607 |    "metadata": {
 608 |     "id": "lmnLjPZpDVtg"
 609 |    },
 610 |    "source": [
 611 |     "现在量化模型已经准备好了，我们可以设置一个 LoRA 配置。LoRA 通过大幅减少可训练参数的数量，使得微调更加高效。\n",
 612 |     "\n",
 613 |     "要使用 LoRA 技术训练模型，我们需要将基础模型包装为 `PeftModel`。这涉及到使用 `LoraConfig` 定义 LoRA 配置，并使用 `get_peft_model()` 和 `LoraConfig` 包装原始模型。\n",
 614 |     "\n",
 615 |     "要了解更多关于 LoRA 及其参数的信息，请参考 [PEFT 文档](https://huggingface.co/docs/peft/conceptual_guides/lora)。\n"
 616 |    ]
 617 |   },
 618 |   {
 619 |    "cell_type": "code",
 620 |    "execution_count": null,
 621 |    "metadata": {
 622 |     "colab": {
 623 |      "base_uri": "https://localhost:8080/"
 624 |     },
 625 |     "id": "_pAUU2FR2Gey",
 626 |     "outputId": "63328c2b-e693-49b1-ce0a-3ca8722f852a"
 627 |    },
 628 |    "outputs": [
 629 |     {
 630 |      "name": "stdout",
 631 |      "output_type": "stream",
 632 |      "text": [
 633 |       "trainable params: 5,554,176 || all params: 1,142,761,472 || trainable%: 0.4860310866343243\n"
 634 |      ]
 635 |     }
 636 |    ],
 637 |    "source": [
 638 |     "# Set up lora\n",
 639 |     "peft_config = LoraConfig(\n",
 640 |     "    lora_alpha=LORA_ALPHA,\n",
 641 |     "    lora_dropout=LORA_DROPOUT,\n",
 642 |     "    r=LORA_R,\n",
 643 |     "    bias=\"none\",\n",
 644 |     "    task_type=\"CAUSAL_LM\",\n",
 645 |     "    target_modules=LORA_TARGET_MODULES.split(\",\"),\n",
 646 |     ")\n",
 647 |     "\n",
 648 |     "model = get_peft_model(model, peft_config)\n",
 649 |     "model.print_trainable_parameters()"
 650 |    ]
 651 |   },
 652 |   {
 653 |    "cell_type": "markdown",
 654 |    "metadata": {
 655 |     "id": "tHe7AElXzXVV"
 656 |    },
 657 |    "source": [
 658 |     "可以看到，通过应用 LoRA 技术，我们现在只需要训练不到 1% 的参数。"
 659 |    ]
 660 |   },
 661 |   {
 662 |    "cell_type": "markdown",
 663 |    "metadata": {
 664 |     "id": "T_CqVydc40IM"
 665 |    },
 666 |    "source": [
 667 |     "## 训练模型"
 668 |    ]
 669 |   },
 670 |   {
 671 |    "cell_type": "markdown",
 672 |    "metadata": {
 673 |     "id": "Q_iN2khjrbD3"
 674 |    },
 675 |    "source": [
 676 |     "现在我们已经准备好了数据，并且优化了模型，我们可以将所有东西整合在一起开始训练。\n",
 677 |     "\n",
 678 |     "要实例化一个 `Trainer`，你需要定义训练配置。最重要的是 `TrainingArguments`，这是一个包含所有用于配置训练的属性的类。\n",
 679 |     "\n",
 680 |     "这些与你可能运行的任何其他类型的模型训练相似，所以我们这里不会详细说明。\n"
 681 |    ]
 682 |   },
 683 |   {
 684 |    "cell_type": "code",
 685 |    "execution_count": null,
 686 |    "metadata": {
 687 |     "id": "65QHS8l1tKQe"
 688 |    },
 689 |    "outputs": [],
 690 |    "source": [
 691 |     "train_data.start_iteration = 0\n",
 692 |     "\n",
 693 |     "\n",
 694 |     "training_args = TrainingArguments(\n",
 695 |     "    output_dir=f\"Your_HF_username/{OUTPUT_DIR}\",\n",
 696 |     "    dataloader_drop_last=True,\n",
 697 |     "    evaluation_strategy=\"steps\",\n",
 698 |     "    save_strategy=\"steps\",\n",
 699 |     "    max_steps=MAX_STEPS,\n",
 700 |     "    eval_steps=EVAL_FREQ,\n",
 701 |     "    save_steps=SAVE_FREQ,\n",
 702 |     "    logging_steps=LOG_FREQ,\n",
 703 |     "    per_device_train_batch_size=BATCH_SIZE,\n",
 704 |     "    per_device_eval_batch_size=BATCH_SIZE,\n",
 705 |     "    learning_rate=LR,\n",
 706 |     "    lr_scheduler_type=LR_SCHEDULER_TYPE,\n",
 707 |     "    warmup_steps=NUM_WARMUP_STEPS,\n",
 708 |     "    gradient_accumulation_steps=GR_ACC_STEPS,\n",
 709 |     "    gradient_checkpointing=True,\n",
 710 |     "    fp16=FP16,\n",
 711 |     "    bf16=BF16,\n",
 712 |     "    weight_decay=WEIGHT_DECAY,\n",
 713 |     "    push_to_hub=True,\n",
 714 |     "    include_tokens_per_second=True,\n",
 715 |     ")\n"
 716 |    ]
 717 |   },
 718 |   {
 719 |    "cell_type": "markdown",
 720 |    "metadata": {
 721 |     "id": "kB_fLRex09ut"
 722 |    },
 723 |    "source": [
 724 |     "最后一步，实例化 `Trainer` 并调用 `train` 方法。   "
 725 |    ]
 726 |   },
 727 |   {
 728 |    "cell_type": "code",
 729 |    "execution_count": null,
 730 |    "metadata": {
 731 |     "colab": {
 732 |      "base_uri": "https://localhost:8080/",
 733 |      "height": 1000
 734 |     },
 735 |     "id": "rS3nVwhUC69O",
 736 |     "outputId": "61a5bdb2-b7d0-4aed-8290-4bf20c2ccd38"
 737 |    },
 738 |    "outputs": [
 739 |     {
 740 |      "name": "stdout",
 741 |      "output_type": "stream",
 742 |      "text": [
 743 |       "Training...\n"
 744 |      ]
 745 |     },
 746 |     {
 747 |      "data": {
 748 |       "text/html": [
 749 |        "\n",
 750 |        "    <div>\n",
 751 |        "      \n",
 752 |        "      <progress value='2000' max='2000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
 753 |        "      [2000/2000 4:16:10, Epoch 1/9223372036854775807]\n",
 754 |        "    </div>\n",
 755 |        "    <table border=\"1\" class=\"dataframe\">\n",
 756 |        "  <thead>\n",
 757 |        " <tr style=\"text-align: left;\">\n",
 758 |        "      <th>Step</th>\n",
 759 |        "      <th>Training Loss</th>\n",
 760 |        "      <th>Validation Loss</th>\n",
 761 |        "    </tr>\n",
 762 |        "  </thead>\n",
 763 |        "  <tbody>\n",
 764 |        "    <tr>\n",
 765 |        "      <td>100</td>\n",
 766 |        "      <td>5.524600</td>\n",
 767 |        "      <td>7.456872</td>\n",
 768 |        "    </tr>\n",
 769 |        "    <tr>\n",
 770 |        "      <td>200</td>\n",
 771 |        "      <td>5.617800</td>\n",
 772 |        "      <td>7.262190</td>\n",
 773 |        "    </tr>\n",
 774 |        "    <tr>\n",
 775 |        "      <td>300</td>\n",
 776 |        "      <td>5.129100</td>\n",
 777 |        "      <td>6.410039</td>\n",
 778 |        "    </tr>\n",
 779 |        "    <tr>\n",
 780 |        "      <td>400</td>\n",
 781 |        "      <td>5.052200</td>\n",
 782 |        "      <td>6.306774</td>\n",
 783 |        "    </tr>\n",
 784 |        "    <tr>\n",
 785 |        "      <td>500</td>\n",
 786 |        "      <td>5.202900</td>\n",
 787 |        "      <td>6.117062</td>\n",
 788 |        "    </tr>\n",
 789 |        "    <tr>\n",
 790 |        "      <td>600</td>\n",
 791 |        "      <td>4.654100</td>\n",
 792 |        "      <td>6.018349</td>\n",
 793 |        "    </tr>\n",
 794 |        "    <tr>\n",
 795 |        "      <td>700</td>\n",
 796 |        "      <td>5.100200</td>\n",
 797 |        "      <td>6.000355</td>\n",
 798 |        "    </tr>\n",
 799 |        "    <tr>\n",
 800 |        "      <td>800</td>\n",
 801 |        "      <td>5.049800</td>\n",
 802 |        "      <td>5.889457</td>\n",
 803 |        "    </tr>\n",
 804 |        "    <tr>\n",
 805 |        "      <td>900</td>\n",
 806 |        "      <td>4.541200</td>\n",
 807 |        "      <td>5.813823</td>\n",
 808 |        "    </tr>\n",
 809 |        "    <tr>\n",
 810 |        "      <td>1000</td>\n",
 811 |        "      <td>5.000700</td>\n",
 812 |        "      <td>5.834208</td>\n",
 813 |        "    </tr>\n",
 814 |        "    <tr>\n",
 815 |        "      <td>1100</td>\n",
 816 |        "      <td>5.026500</td>\n",
 817 |        "      <td>5.781939</td>\n",
 818 |        "    </tr>\n",
 819 |        "    <tr>\n",
 820 |        "      <td>1200</td>\n",
 821 |        "      <td>4.411800</td>\n",
 822 |        "      <td>5.720596</td>\n",
 823 |        "    </tr>\n",
 824 |        "    <tr>\n",
 825 |        "      <td>1300</td>\n",
 826 |        "      <td>4.782500</td>\n",
 827 |        "      <td>5.736376</td>\n",
 828 |        "    </tr>\n",
 829 |        "    <tr>\n",
 830 |        "      <td>1400</td>\n",
 831 |        "      <td>4.980200</td>\n",
 832 |        "      <td>5.712276</td>\n",
 833 |        "    </tr>\n",
 834 |        "    <tr>\n",
 835 |        "      <td>1500</td>\n",
 836 |        "      <td>4.368700</td>\n",
 837 |        "      <td>5.689637</td>\n",
 838 |        "    </tr>\n",
 839 |        "    <tr>\n",
 840 |        "      <td>1600</td>\n",
 841 |        "      <td>4.884700</td>\n",
 842 |        "      <td>5.675920</td>\n",
 843 |        "    </tr>\n",
 844 |        "    <tr>\n",
 845 |        "      <td>1700</td>\n",
 846 |        "      <td>4.914400</td>\n",
 847 |        "      <td>5.662421</td>\n",
 848 |        "    </tr>\n",
 849 |        "    <tr>\n",
 850 |        "      <td>1800</td>\n",
 851 |        "      <td>4.248700</td>\n",
 852 |        "      <td>5.660122</td>\n",
 853 |        "    </tr>\n",
 854 |        "    <tr>\n",
 855 |        "      <td>1900</td>\n",
 856 |        "      <td>4.798400</td>\n",
 857 |        "      <td>5.664026</td>\n",
 858 |        "    </tr>\n",
 859 |        "    <tr>\n",
 860 |        "      <td>2000</td>\n",
 861 |        "      <td>4.704200</td>\n",
 862 |        "      <td>5.655665</td>\n",
 863 |        "    </tr>\n",
 864 |        "  </tbody>\n",
 865 |        "</table><p>"
 866 |       ],
 867 |       "text/plain": [
 868 |        "<IPython.core.display.HTML object>"
 869 |       ]
 870 |      },
 871 |      "metadata": {},
 872 |      "output_type": "display_data"
 873 |     },
 874 |     {
 875 |      "data": {
 876 |       "text/plain": [
 877 |        "TrainOutput(global_step=2000, training_loss=4.885598585128784, metrics={'train_runtime': 15380.3075, 'train_samples_per_second': 2.081, 'train_steps_per_second': 0.13, 'train_tokens_per_second': 4261.033, 'total_flos': 4.0317260660736e+17, 'train_loss': 4.885598585128784, 'epoch': 1.0})"
 878 |       ]
 879 |      },
 880 |      "execution_count": 19,
 881 |      "metadata": {},
 882 |      "output_type": "execute_result"
 883 |     }
 884 |    ],
 885 |    "source": [
 886 |     "trainer = Trainer(\n",
 887 |     "    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset\n",
 888 |     ")\n",
 889 |     "\n",
 890 |     "print(\"Training...\")\n",
 891 |     "trainer.train()\n"
 892 |    ]
 893 |   },
 894 |   {
 895 |    "cell_type": "markdown",
 896 |    "metadata": {
 897 |     "id": "aAERlCnt1PEW"
 898 |    },
 899 |    "source": [
 900 |     "最后，你可以将微调好的模型推送到你的 Hub 仓库中，并分享给你的团队。"
 901 |    ]
 902 |   },
 903 |   {
 904 |    "cell_type": "code",
 905 |    "execution_count": null,
 906 |    "metadata": {
 907 |     "id": "1h7_AUTTDwE1"
 908 |    },
 909 |    "outputs": [],
 910 |    "source": [
 911 |     "trainer.push_to_hub()"
 912 |    ]
 913 |   },
 914 |   {
 915 |    "cell_type": "markdown",
 916 |    "metadata": {
 917 |     "id": "KBVH7uFOM_UF"
 918 |    },
 919 |    "source": [
 920 |     "## 推理\n",
 921 |     "\n",
 922 |     "一旦模型被上传到 Hub，我们就可以使用它进行推理。为此，我们首先初始化原始的基础模型及其分词器。接下来，我们需要将微调后的权重与基础模型合并。"
 923 |    ]
 924 |   },
 925 |   {
 926 |    "cell_type": "code",
 927 |    "execution_count": null,
 928 |    "metadata": {
 929 |     "id": "jtL37piINBFe"
 930 |    },
 931 |    "outputs": [],
 932 |    "source": [
 933 |     "from peft import PeftModel\n",
 934 |     "import torch\n",
 935 |     "\n",
 936 |     "# load the original model first\n",
 937 |     "tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)\n",
 938 |     "base_model = AutoModelForCausalLM.from_pretrained(\n",
 939 |     "    MODEL,\n",
 940 |     "    quantization_config=None,\n",
 941 |     "    device_map=None,\n",
 942 |     "    trust_remote_code=True,\n",
 943 |     "    torch_dtype=torch.bfloat16,\n",
 944 |     ").cuda()\n",
 945 |     "\n",
 946 |     "# merge fine-tuned weights with the base model\n",
 947 |     "peft_model_id = f\"Your_HF_username/{OUTPUT_DIR}\"\n",
 948 |     "model = PeftModel.from_pretrained(base_model, peft_model_id)\n",
 949 |     "model.merge_and_unload()"
 950 |    ]
 951 |   },
 952 |   {
 953 |    "cell_type": "markdown",
 954 |    "metadata": {
 955 |     "id": "3USQ2suvDi9M"
 956 |    },
 957 |    "source": [
 958 |     "现在我们可以使用合并后的模型进行推理。为了方便起见，我们将定义一个 `get_code_completion` 函数 - 请随意尝试文本生成参数！\n"
 959 |    ]
 960 |   },
 961 |   {
 962 |    "cell_type": "code",
 963 |    "execution_count": null,
 964 |    "metadata": {
 965 |     "id": "RoTGpNbjDeWI"
 966 |    },
 967 |    "outputs": [],
 968 |    "source": [
 969 |     "def get_code_completion(prefix, suffix):\n",
 970 |     "    text = prompt = f\"\"\"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>\"\"\"\n",
 971 |     "    model.eval()\n",
 972 |     "    outputs = model.generate(\n",
 973 |     "        input_ids=tokenizer(text, return_tensors=\"pt\").input_ids.cuda(),\n",
 974 |     "        max_new_tokens=128,\n",
 975 |     "        temperature=0.2,\n",
 976 |     "        top_k=50,\n",
 977 |     "        top_p=0.95,\n",
 978 |     "        do_sample=True,\n",
 979 |     "        repetition_penalty=1.0,\n",
 980 |     "    )\n",
 981 |     "    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]"
 982 |    ]
 983 |   },
 984 |   {
 985 |    "cell_type": "markdown",
 986 |    "metadata": {
 987 |     "id": "0kMJiGDfDrBf"
 988 |    },
 989 |    "source": [
 990 |     "现在，为了获得代码补全，我们只需要调用 `get_code_complete` 函数，并将我们希望补全的前几行作为前缀传递，以及一个空字符串作为后缀。"
 991 |    ]
 992 |   },
 993 |   {
 994 |    "cell_type": "code",
 995 |    "execution_count": null,
 996 |    "metadata": {
 997 |     "colab": {
 998 |      "base_uri": "https://localhost:8080/"
 999 |     },
1000 |     "id": "nXlco2_-YcvM",
1001 |     "outputId": "41c411ad-b7dc-4277-f975-c173888234bb"
1002 |    },
1003 |    "outputs": [
1004 |     {
1005 |      "name": "stdout",
1006 |      "output_type": "stream",
1007 |      "text": [
1008 |       "from peft import LoraConfig, TaskType, get_peft_model\n",
1009 |       "from transformers import AutoModelForCausalLM\n",
1010 |       "peft_config = LoraConfig(\n",
1011 |       "    task_type=TaskType.CAUSAL_LM,\n",
1012 |       "    r=8,\n",
1013 |       "    lora_alpha=32,\n",
1014 |       "    target_modules=[\"q_proj\", \"v_proj\"],\n",
1015 |       "    lora_dropout=0.1,\n",
1016 |       "    bias=\"none\",\n",
1017 |       "    modules_to_save=[\"q_proj\", \"v_proj\"],\n",
1018 |       "    inference_mode=False,\n",
1019 |       ")\n",
1020 |       "model = AutoModelForCausalLM.from_pretrained(\"gpt2\")\n",
1021 |       "model = get_peft_model(model, peft_config)\n",
1022 |       "model.print_trainable_parameters()\n"
1023 |      ]
1024 |     }
1025 |    ],
1026 |    "source": [
1027 |     "prefix = \"\"\"from peft import LoraConfig, TaskType, get_peft_model\n",
1028 |     "from transformers import AutoModelForCausalLM\n",
1029 |     "peft_config = LoraConfig(\n",
1030 |     "\"\"\"\n",
1031 |     "suffix =\"\"\"\"\"\"\n",
1032 |     "\n",
1033 |     "print(get_code_completion(prefix, suffix))"
1034 |    ]
1035 |   },
1036 |   {
1037 |    "cell_type": "markdown",
1038 |    "metadata": {
1039 |     "id": "Ql2563kGlnmu"
1040 |    },
1041 |    "source": [
1042 |     "作为刚刚在这个 notebook 中使用过 PEFT 库的人，你可以看到创建为 `LoraConfig` 函数的生成结果相当不错！\n",
1043 |     "\n",
1044 |     "如果你回到我们为推理实例化模型的单元格，并注释掉我们合并微调权重的行，你可以看到原始模型对于完全相同的前缀会生成什么内容："
1045 |    ]
1046 |   },
1047 |   {
1048 |    "cell_type": "code",
1049 |    "execution_count": null,
1050 |    "metadata": {
1051 |     "colab": {
1052 |      "base_uri": "https://localhost:8080/"
1053 |     },
1054 |     "id": "29xxp1eHTgJ9",
1055 |     "outputId": "c6d597a2-01da-4d25-a32f-3a551212c5b4"
1056 |    },
1057 |    "outputs": [
1058 |     {
1059 |      "name": "stdout",
1060 |      "output_type": "stream",
1061 |      "text": [
1062 |       "from peft import LoraConfig, TaskType, get_peft_model\n",
1063 |       "from transformers import AutoModelForCausalLM\n",
1064 |       "peft_config = LoraConfig(\n",
1065 |       "    model_name_or_path=\"facebook/wav2vec2-base-960h\",\n",
1066 |       "    num_labels=1,\n",
1067 |       "    num_features=1,\n",
1068 |       "    num_hidden_layers=1,\n",
1069 |       "    num_attention_heads=1,\n",
1070 |       "    num_hidden_layers_per_attention_head=1,\n",
1071 |       "    num_attention_heads_per_hidden_layer=1,\n",
1072 |       "    hidden_size=1024,\n",
1073 |       "    hidden_dropout_prob=0.1,\n",
1074 |       "    hidden_act=\"gelu\",\n",
1075 |       "    hidden_act_dropout_prob=0.1,\n",
1076 |       "    hidden\n"
1077 |      ]
1078 |     }
1079 |    ],
1080 |    "source": [
1081 |     "prefix = \"\"\"from peft import LoraConfig, TaskType, get_peft_model\n",
1082 |     "from transformers import AutoModelForCausalLM\n",
1083 |     "peft_config = LoraConfig(\n",
1084 |     "\"\"\"\n",
1085 |     "suffix =\"\"\"\"\"\"\n",
1086 |     "\n",
1087 |     "print(get_code_completion(prefix, suffix))"
1088 |    ]
1089 |   },
1090 |   {
1091 |    "cell_type": "markdown",
1092 |    "metadata": {
1093 |     "id": "Pwy2ZC7U8Ema"
1094 |    },
1095 |    "source": [
1096 |     "尽管这是 Python 语法，但你可以看到原始模型并不理解 `LoraConfig` 应该做什么。\n"
1097 |    ]
1098 |   },
1099 |   {
1100 |    "cell_type": "markdown",
1101 |    "metadata": {
1102 |     "id": "CATYE8pp2drQ"
1103 |    },
1104 |    "source": [
1105 |     "要了解这种高效参数微调与完全微调的比较，以及如何通过推理端点在 VS Code 中使用这样的模型作为你的编程助手(copilot)，或者在本地使用，请查看[\"个人编程助手(copilot)：训练你自己的编码助手\"博客](https://huggingface.co/blog/personal-copilot)。这个 notebook 补充了原始博客内容。\n"
1106 |    ]
1107 |   }
1108 |  ],
1109 |  "metadata": {
1110 |   "accelerator": "GPU",
1111 |   "colab": {
1112 |    "gpuType": "A100",
1113 |    "machine_shape": "hm",
1114 |    "provenance": []
1115 |   },
1116 |   "kernelspec": {
1117 |    "display_name": "Python 3",
1118 |    "name": "python3"
1119 |   },
1120 |   "language_info": {
1121 |    "name": "python"
1122 |   }
1123 |  },
1124 |  "nbformat": 4,
1125 |  "nbformat_minor": 0
1126 | }
1127 | 


--------------------------------------------------------------------------------