├── .gitignore
├── README.md
├── code_editing_examples.md
├── code_explanation_examples.md
├── code_writing_examples.md
├── examples
│ ├── Backtranslation_of_SQL_queries.py
│ ├── Classification_using_embeddings.ipynb
│ ├── Clustering.ipynb
│ ├── Clustering_for_transaction_classification.ipynb
│ ├── Code_search.ipynb
│ ├── Customizing_embeddings.ipynb
│ ├── Embedding_long_inputs.ipynb
│ ├── Fine-tuned_classification.ipynb
│ ├── Get_embeddings.ipynb
│ ├── How_to_count_tokens_with_tiktoken.ipynb
│ ├── How_to_handle_rate_limits.ipynb
│ ├── How_to_stream_completions.ipynb
│ ├── Multiclass_classification_for_transactions.ipynb
│ ├── Obtain_dataset.ipynb
│ ├── Question_answering_using_embeddings.ipynb
│ ├── Recommendation_using_embeddings.ipynb
│ ├── Regression_using_embeddings.ipynb
│ ├── Semantic_text_search_using_embeddings.ipynb
│ ├── Unit_test_writing_using_a_multi-step_prompt.ipynb
│ ├── User_and_product_embeddings.ipynb
│ ├── Visualizing_embeddings_in_2D.ipynb
│ ├── Visualizing_embeddings_in_3D.ipynb
│ ├── Zero-shot_classification_with_embeddings.ipynb
│ ├── api_request_parallel_processor.py
│ ├── azure
│ │ ├── completions.ipynb
│ │ ├── embeddings.ipynb
│ │ └── finetuning.ipynb
│ ├── book_translation
│ │ ├── data
│ │ │ ├── geometry_English.tex
│ │ │ └── geometry_slovenian.tex
│ │ └── translate_latex_book.ipynb
│ ├── dalle
│ │ └── Image_generations_edits_and_variations_with_DALL-E.ipynb
│ ├── data
│ │ ├── 25000_spend_dataset_current.csv
│ │ ├── AG_news_samples.csv
│ │ ├── dbpedia_samples.jsonl
│ │ ├── example_requests_to_parallel_process.jsonl
│ │ ├── fine_food_reviews_1k.csv
│ │ ├── fine_food_reviews_with_embeddings_1k.csv
│ │ ├── labelled_transactions.csv
│ │ ├── library_transactions_with_embeddings_359.csv
│ │ ├── recommendations_embeddings_cache.pkl
│ │ └── snli_1.0_train_2k.csv
│ └── fine-tuned_qa
│   ├── answers_with_ft.py
│   ├── olympics-1-collect-data.ipynb
│   ├── olympics-2-create-qa.ipynb
│   └── olympics-3-train-qa.ipynb
├── how_to_work_with_large_language_models.md
├── images
│ ├── OpenAI_Logo.png
│ ├── chain_of_thought_fig1.png
│ ├── chain_of_thought_fig11.png
│ ├── chain_of_thought_fig3.png
│ ├── chain_of_thought_fig5.png
│ ├── faithful-reasoning_fig1.png
│ ├── faithful-reasoning_fig2.png
│ ├── faithful-reasoning_fig3.png
│ ├── faithful-reasoning_fig4.png
│ ├── faithful-reasoning_fig5.png
│ ├── faithful-reasoning_fig7.png
│ ├── faithful-reasoning_tab2.png
│ ├── faithful-reasoning_tab5.png
│ ├── least-to-most_fig1.png
│ ├── least-to-most_tab11.png
│ ├── least-to-most_tab4.png
│ ├── least-to-most_tab9.png
│ ├── lm_cascades_fig1.png
│ ├── lm_cascades_fig3.png
│ ├── lm_cascades_fig4.png
│ ├── lm_cascades_fig5.png
│ ├── lm_cascades_fig6.png
│ ├── maieutic_fig2.png
│ ├── maieutic_fig6.png
│ ├── maieutic_tab1.png
│ ├── selection-inference_fig1.png
│ ├── selection-inference_fig4.png
│ ├── self-consistency_fig1.png
│ ├── self-consistency_fig3.png
│ ├── star_fig1.png
│ ├── star_tab1.png
│ ├── verifiers_fig3.png
│ ├── verifiers_fig5.png
│ ├── zero-shot_reasoners_fig1.png
│ ├── zero-shot_reasoners_fig2.png
│ ├── zero-shot_reasoners_tab1.png
│ └── zero-shot_reasoners_tab5.png
├── solutions
│ └── web_crawl_Q&A
│   ├── requirements.txt
│   ├── web-qa.ipynb
│   └── web-qa.py
├── techniques_to_improve_reliability.md
├── text_comparison_examples.md
├── text_editing_examples.md
├── text_explanation_examples.md
├── text_writing_examples.md
└── transition_guides_for_deprecated_API_endpoints
├── README.md
├── answers_functionality_example.py
├── classification_functionality_example.py
└── search_functionality_example.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # Data
132 | *transactions*.jsonl
133 | /examples/data/transactions*
134 | *.DS_Store
135 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # OpenAI Cookbook (Chinese Edition)
4 |
5 | 
6 |
7 | The OpenAI Cookbook (Chinese Edition) shares example code for completing common tasks with the [OpenAI API].
8 |
9 | To run these examples, you'll need an OpenAI account and an associated API key ([create a free account][API Signup]).
10 |
11 | Most code examples are written in Python, though the concepts can be applied in any language.
12 |
13 | ## Guides & examples
14 |
15 | * API usage
16 |   * [How to handle rate limits](examples/How_to_handle_rate_limits.ipynb)
17 |   * [Example parallel processing script that avoids hitting rate limits](examples/api_request_parallel_processor.py)
18 |   * [How to count tokens with tiktoken](examples/How_to_count_tokens_with_tiktoken.ipynb)
19 |   * [How to stream completions](examples/How_to_stream_completions.ipynb)
20 | * GPT-3
21 |   * [Guide: How to work with large language models](how_to_work_with_large_language_models.md)
22 |   * [Guide: Techniques to improve reliability](techniques_to_improve_reliability.md)
23 |   * [How to use a multi-step prompt to write unit tests](examples/Unit_test_writing_using_a_multi-step_prompt.ipynb)
24 |   * [Text writing examples](text_writing_examples.md)
25 |   * [Text explanation examples](text_explanation_examples.md)
26 |   * [Text editing examples](text_editing_examples.md)
27 |   * [Code writing examples](code_writing_examples.md)
28 |   * [Code explanation examples](code_explanation_examples.md)
29 |   * [Code editing examples](code_editing_examples.md)
30 | * Embeddings
31 |   * [Text comparison examples](text_comparison_examples.md)
32 |   * [How to get embeddings](examples/Get_embeddings.ipynb)
33 |   * [Question answering using embeddings](examples/Question_answering_using_embeddings.ipynb)
34 |   * [Semantic text search using embeddings](examples/Semantic_text_search_using_embeddings.ipynb)
35 |   * [Recommendations using embeddings](examples/Recommendation_using_embeddings.ipynb)
36 |   * [Clustering embeddings](examples/Clustering.ipynb)
37 |   * [Visualizing embeddings in 2D](examples/Visualizing_embeddings_in_2D.ipynb) or [3D](examples/Visualizing_embeddings_in_3D.ipynb)
38 |   * [Embedding long texts](examples/Embedding_long_inputs.ipynb)
39 | * Fine-tuning GPT-3
40 |   * [Guide: best practices for fine-tuning GPT-3 to classify text](https://docs.google.com/document/d/1rqj7dkuvl7Byd5KQPUJRxc19BJt8wo0yHNwK84KfU3Q/edit)
41 |   * [Fine-tuned classification](examples/Fine-tuned_classification.ipynb)
42 | * DALL-E
43 |   * [How to generate and edit images with DALL-E](examples/dalle/Image_generations_edits_and_variations_with_DALL-E.ipynb)
44 | * Azure OpenAI (OpenAI on Microsoft Azure)
45 |   * [How to get completions from Azure OpenAI](examples/azure/completions.ipynb)
46 |   * [How to get embeddings from Azure OpenAI](examples/azure/embeddings.ipynb)
47 |   * [How to fine-tune GPT-3 with Azure OpenAI](examples/azure/finetuning.ipynb)
48 |
49 | ## Related resources
50 |
51 | Beyond the code examples here, you can learn about the [OpenAI API] from the following resources:
52 |
53 | * Try the API in the [OpenAI Playground]
54 | * Read about the API in the [OpenAI Documentation]
55 | * Discuss the API in the [OpenAI Community Forum]
56 | * Look for help in the [OpenAI Help Center]
57 | * Find more use cases in [OpenAI Examples]
58 | * Or simply try the remarkable [ChatGPT] for yourself
59 | * And keep up with our latest news on the [OpenAI Blog]
60 |
61 | ## Contributions
62 |
63 | If there are examples or guides you'd like to see, feel free to suggest them on the [issues page].
64 |
65 | [ChatGPT]: https://chat.openai.com/
66 | [OpenAI API]: https://openai.com/api/
67 | [API Signup]: https://beta.openai.com/signup
68 | [OpenAI Playground]: https://beta.openai.com/playground
69 | [OpenAI Documentation]: https://beta.openai.com/docs/introduction
70 | [OpenAI Community Forum]: https://community.openai.com/top?period=monthly
71 | [OpenAI Help Center]: https://help.openai.com/en/
72 | [OpenAI Examples]: https://beta.openai.com/examples
73 | [OpenAI Blog]: https://openai.com/blog/
74 | [issues page]: https://github.com/openai/openai-cookbook/issues
75 |
76 | ## Further discussion
77 |
78 | While translating, I also ran into the limits of my own ability. For some terms I could not find an accurate rendering and kept the original English word; others I translated according to my own understanding, but the result still feels imperfect. They are listed below for discussion; everyone is welcome to weigh in via the issues.
79 |
80 | ### Terms that may have better, more accurate translations
81 | 1. token
82 | 2. How_to_stream_completions
83 | 3. Embeddings
--------------------------------------------------------------------------------
/code_editing_examples.md:
--------------------------------------------------------------------------------
1 | # Code editing examples
2 |
3 | OpenAI's [edits](https://openai.com/blog/gpt-3-edit-insert/) endpoint is particularly useful for editing code.
4 |
5 | Unlike completions, edits takes two inputs: the text to edit and an instruction.
6 |
7 | For example, if you wanted to edit a Python function, you could supply the text of the function plus an instruction like "add a docstring".
8 |
9 | Example text input to `code-davinci-edit-001`:
10 |
11 | ```python
12 | def tribonacci(n):
13 |     if n == 0:
14 |         return 0
15 |     elif n == 1:
16 |         return 1
17 |     elif n == 2:
18 |         return 1
19 |     elif n == 3:
20 |         return 2
21 |     else:
22 |         return tribonacci(n-1) + tribonacci(n-2) + tribonacci(n-3)
23 | ```
24 |
25 | Example instruction inputs:
26 |
27 | ```text
28 | add a docstring
29 | ```
30 |
31 | ```text
32 | Add typing, using Python 3.9 conventions
33 | ```
34 |
35 | ```text
36 | improve the runtime
37 | ```
38 |
39 | ```text
40 | Add a test.
41 | ```
42 |
43 | ```text
44 | Translate to JavaScript (or Rust or Lisp or any language you like)
45 | ```
46 |
47 | Example output after improving the runtime and translating to JavaScript:
48 |
49 | ```JavaScript
50 | function tribonacci(n) {
51 |   let a = 0;
52 |   let b = 1;
53 |   let c = 1;
54 |   for (let i = 0; i < n; i++) {
55 |     [a, b, c] = [b, c, a + b + c];
56 |   }
57 |   return a;
58 | }
59 | ```
60 |
61 | As you can see, `code-davinci-edit-001` successfully reduced the function's runtime from exponential to linear, and translated it from Python to JavaScript.
62 |
63 | Experiment with code editing using `code-davinci-edit-001` in the [OpenAI Playground](https://beta.openai.com/playground?mode=edit&model=code-davinci-edit-001).
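64 |
65 | For reference, here is a minimal sketch (not part of the original guide) of requesting an edit like the ones above with the pre-1.0 `openai` Python SDK's legacy `openai.Edit.create` method; the file name is a hypothetical placeholder:
66 |
67 | ```python
68 | import openai  # pre-1.0 SDK exposing the legacy edits endpoint
69 |
70 | # Send the text to edit plus a natural-language instruction.
71 | response = openai.Edit.create(
72 |     model="code-davinci-edit-001",
73 |     input=open("tribonacci.py").read(),  # hypothetical file holding the function above
74 |     instruction="improve the runtime",
75 | )
76 | print(response["choices"][0]["text"])  # the edited code
77 | ```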
--------------------------------------------------------------------------------
/code_explanation_examples.md:
--------------------------------------------------------------------------------
1 | # Code explanation examples
2 |
3 | GPT's understanding of code can be applied to many use cases, e.g.:
4 |
5 | * Generating in-code documentation (e.g., Python docstrings, git commit messages)
6 | * Generating out-of-code documentation (e.g., man pages)
7 | * An interactive code exploration tool
8 | * Communicating program results back to users via a natural language interface
9 |
10 | For example, if you wanted to understand a SQL query, you could give `code-davinci-002` the following example prompt:
11 |
12 | ````text
13 | A SQL query:
14 | ```
15 | SELECT c.customer_id
16 | FROM Customers c
17 | JOIN Streaming s
18 | ON c.customer_id = s.customer_id
19 | WHERE c.signup_date BETWEEN '2020-03-01' AND '2020-03-31'
20 | AND s.watch_date BETWEEN c.signup_date AND DATE_ADD(c.signup_date, INTERVAL 30 DAY)
21 | GROUP BY c.customer_id
22 | HAVING SUM(s.watch_minutes) > 50 * 60
23 | ```
24 |
25 | Questions:
26 | 1. What does the SQL query do?
27 | 2. Why might someone be interested in this time period?
28 | 3. Why might a company be interested in this SQL query?
29 |
30 | Answers:
31 | ````
32 |
33 | [Output](https://beta.openai.com/playground/p/Sv1VQKbJV1TZKmiTK9r6nlj3):
34 |
35 | ```text
36 | 1. The SQL query finds all customers who signed up in March 2020 and watched more than 50 hours of content in the first 30 days after signing up.
37 | 2. The time period is interesting because it is the first month of the COVID-19 pandemic.
38 | 3. A company might be interested in this SQL query because it can help them understand how the pandemic has affected their business.
39 | ```
40 |
41 | Note that `code-davinci-002` is not trained to follow instructions and therefore usually needs examples or other structure to help steer its output, as well as stop sequences to stop generating. For easier prompting, try `text-davinci-003`.
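42 |
43 | For reference, a minimal sketch (not part of the original page) of sending a prompt like the one above with the pre-1.0 `openai` Python SDK; the stop sequence and sampling parameters are illustrative assumptions:
44 |
45 | ```python
46 | import openai  # pre-1.0 SDK exposing the legacy completions endpoint
47 |
48 | # code-davinci-002 is not instruction-tuned, so the prompt supplies
49 | # question/answer scaffolding, and a stop sequence ends the generation.
50 | prompt = (
51 |     "A SQL query:\n```\nSELECT COUNT(*) FROM Customers\n```\n\n"
52 |     "Questions:\n1. What does the SQL query do?\n\nAnswers:\n1."
53 | )
54 |
55 | response = openai.Completion.create(
56 |     model="code-davinci-002",
57 |     prompt=prompt,
58 |     max_tokens=256,
59 |     temperature=0,
60 |     stop=["\n\n"],  # stop once the answer is complete
61 | )
62 | print(response["choices"][0]["text"])
63 | ```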
--------------------------------------------------------------------------------
/code_writing_examples.md:
--------------------------------------------------------------------------------
1 | # Code writing examples
2 |
3 | GPT-3 is able to write code as well as text.
4 |
5 | Here's an example of `code-davinci-002` writing a SQL query:
6 |
7 | ````text
8 | SQL tables (and columns):
9 | * Customers(customer_id, signup_date)
10 | * Streaming(customer_id, video_id, watch_date, watch_minutes)
11 |
12 | A well-written SQL query that lists customers who signed up during March 2020 and watched more than 50 hours of video in their first 30 days:
13 | ```
14 | ````
15 |
16 | [Output](https://beta.openai.com/playground/p/r2mw99cANoa0TJHok725CeaC):
17 |
18 | ```sql
19 | SELECT c.customer_id
20 | FROM Customers c
21 | JOIN Streaming s
22 | ON c.customer_id = s.customer_id
23 | WHERE c.signup_date BETWEEN '2020-03-01' AND '2020-03-31'
24 | AND s.watch_date BETWEEN c.signup_date AND DATE_ADD(c.signup_date, INTERVAL 30 DAY)
25 | GROUP BY c.customer_id
26 | HAVING SUM(s.watch_minutes) > 50 * 60
27 | ```
28 |
29 | Helpfully, `code-davinci-002` is able to make inferences from variable names; for example, it infers that `watch_minutes` has units of minutes and therefore needs to be converted by a factor of 60 before being compared with 50 hours.
30 |
31 | For easier prompting, you can also try `text-davinci-003`.
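32 |
33 | For reference, a minimal sketch (not part of the original page) of sending the prompt above with the pre-1.0 `openai` Python SDK; the sampling parameters are illustrative assumptions:
34 |
35 | ```python
36 | import openai  # pre-1.0 SDK exposing the legacy completions endpoint
37 |
38 | # The prompt ends with an opening code fence; stopping on the closing
39 | # fence keeps only the generated SQL.
40 | prompt = (
41 |     "SQL tables (and columns):\n"
42 |     "* Customers(customer_id, signup_date)\n"
43 |     "* Streaming(customer_id, video_id, watch_date, watch_minutes)\n\n"
44 |     "A well-written SQL query that lists customers who signed up during "
45 |     "March 2020 and watched more than 50 hours of video in their first "
46 |     "30 days:\n```"
47 | )
48 |
49 | response = openai.Completion.create(
50 |     model="code-davinci-002",
51 |     prompt=prompt,
52 |     max_tokens=256,
53 |     temperature=0,
54 |     stop=["```"],  # stop at the closing code fence
55 | )
56 | print(response["choices"][0]["text"])
57 | ```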
--------------------------------------------------------------------------------
/examples/Backtranslation_of_SQL_queries.py:
--------------------------------------------------------------------------------
1 | from typing import List, Tuple, Union
2 |
3 | from smokey import Smokey
4 |
5 | import openai
6 |
7 |
8 | def get_candidates(
9 |     prompt: str,
10 |     stop: List[str],
11 |     temperature: float,
12 |     priming_prefix: str,
13 |     engine: str,
14 |     n: int = 5,
15 | ) -> List[str]:
16 |     """
17 |     Generate N candidate completions based on the prompt, generated with a specific temperature.
18 |
19 |     :param prompt: The prompt to start the conversation with.
20 |     :param stop: A list of tokens that indicate the end of the generation.
21 |     :param temperature: The temperature of the generation.
22 |     :param priming_prefix: The prefix to use for the priming.
23 |     :param engine: The engine to use for the generation.
24 |     :param n: The number of completions to generate.
25 |     :return: A list of completions.
26 |     """
27 |     response = openai.Completion.create(
28 |         engine=engine,
29 |         prompt=prompt,
30 |         temperature=temperature,
31 |         max_tokens=150,
32 |         top_p=1,
33 |         frequency_penalty=0,
34 |         presence_penalty=0,
35 |         stop=stop,
36 |         n=n,
37 |     )
38 |     responses = [priming_prefix + choice.text for choice in response.choices]
39 |     return responses
40 |
41 |
42 | def rindex(lst: List, value: str) -> int:
43 | """
44 | Return the index of the last occurence of a value in a list.
45 |
46 | :param lst: The list to search in.
47 | :param value: The value to search for.
48 | :return: The index of the last occurence of the value.
49 | """
50 | try:
51 | return len(lst) - lst[::-1].index(value) - 1
52 | except ValueError:
53 | raise ValueError(f"Answer start token `{value}` not found in the eval template")
54 |
55 |
56 | def eval_candidate(
57 |     candidate_answer: str,
58 |     original_instruction: str,
59 |     eval_template: str,
60 |     answer_start_token: str,
61 |     engine: str,
62 | ) -> float:
63 |     """
64 |     Evaluate a candidate answer by calculating the average log probability
65 |     of the original instruction, given the candidate answer with a specific
66 |     evaluation template, aimed at reconstructing the original instruction.
67 |
68 |     :param candidate_answer: The candidate answer to evaluate.
69 |     :param original_instruction: The original instruction.
70 |     :param eval_template: The template to use for the evaluation.
71 |     :param answer_start_token: The token to use to indicate the start of the answer.
72 |     :param engine: The engine to use for the evaluation.
73 |     :return: The evaluation of the candidate answer.
74 |     """
75 |     response = openai.Completion.create(
76 |         engine=engine,
77 |         prompt=eval_template.format(candidate_answer, original_instruction),
78 |         temperature=0,
79 |         max_tokens=0,
80 |         top_p=1,
81 |         frequency_penalty=0,
82 |         presence_penalty=0,
83 |         logprobs=1,
84 |         echo=True,
85 |     )
86 |
87 |     answer_start = rindex(
88 |         response["choices"][0]["logprobs"]["tokens"], answer_start_token
89 |     )
90 |     logprobs = response["choices"][0]["logprobs"]["token_logprobs"][answer_start + 1 :]
91 |     return sum(logprobs) / len(logprobs)
92 |
93 |
94 | def backtranslation(
95 |     prompt_template: str,
96 |     additional_info: str,
97 |     instruction: str,
98 |     eval_template: str,
99 |     priming_prefix: str = "SELECT",
100 |     stop1: List[str] = ["#", ";"],
101 |     answer_start_token: str = "--",
102 |     n: int = 5,
103 |     temperature: float = 0.5,
104 |     return_all_results: bool = False,
105 |     engine: str = "davinci-codex",
106 | ) -> Union[str, List[Tuple[str, float]]]:
107 |     """
108 |     Generate a number of SQL queries given a natural language instruction,
109 |     and pick the best one based on the average log probability of explaining the
110 |     candidate SQL query with the exact original instruction, when prompted for
111 |     a natural language explanation of the candidate SQL query.
112 |
113 |     :param prompt_template: The template to use for the prompt to generate SQL.
114 |     :param additional_info: Additional information to include in the prompt
115 |         (SQL Tables, and their properties).
116 |     :param instruction: The instruction in natural language.
117 |     :param eval_template: The template to use for the evaluation.
118 |     :param priming_prefix: The prefix to use for the priming of the SQL query.
119 |     :param stop1: A list of tokens that indicate the end of the generation.
120 |     :param answer_start_token: The token to use to indicate the start of the
121 |         natural answer.
122 |     :param n: The number of candidates to generate.
123 |     :param temperature: The temperature of the generation.
124 |     :param return_all_results: Whether to return all results or just the best one.
125 |     :param engine: The engine to use for the generation and evaluation.
126 |     :return: The best SQL query, or a list of all scored generated SQL queries.
127 |     """
128 |     prompt_template = prompt_template.format(
129 |         additional_info, instruction, priming_prefix
130 |     )
131 |
132 |     candidates = []
133 |     responses = get_candidates(
134 |         prompt_template, stop1, temperature, priming_prefix, engine=engine, n=n
135 |     )
136 |     for i in range(n):
137 |         quality = eval_candidate(
138 |             responses[i],
139 |             instruction,
140 |             eval_template,
141 |             answer_start_token,
142 |             engine=engine,
143 |         )
144 |         candidates.append((responses[i], quality))
145 |
146 |     candidates.sort(key=lambda x: x[1], reverse=True)
147 |     if return_all_results:
148 |         return candidates
149 |     return candidates[0][0]
150 |
151 |
152 | def main(
153 |     nl_query: str = "Return the name of each department that had more than 10 employees in June 2021",
154 |     eval_template: str = "{};\n-- Explanation of the above query in human readable format\n-- {}",
155 |     table_definitions: str = "# Employee(id, name, department_id)\n# Department(id, name, address)\n# Salary_Payments(id, employee_id, amount, date)\n",
156 |     prompt_template: str = "### Postgres SQL tables, with their properties:\n#\n{}#\n### {}\n{}",
157 |     n: int = 3,
158 |     temperature: float = 0.3,
159 |     engine: str = "davinci-codex",
160 | ):
161 |     """
162 |     Generate a number of SQL queries given a natural language instruction,
163 |     and pick the best one based on the highest backtranslation score.
164 |
165 |     :param nl_query: The natural language query.
166 |     :param eval_template: The template to use for the evaluation.
167 |     :param table_definitions: The definitions of the tables used in the query.
168 |     :param prompt_template: The template to use for the prompt to generate SQL.
169 |     :param n: The number of candidates to generate.
170 |     :param temperature: The temperature of the generation.
171 |     :param engine: The engine to use for the generation and evaluation.
172 |     :return: The best SQL query, or a list of all scored generated SQL queries.
173 |     """
174 |
175 |     result = backtranslation(
176 |         prompt_template,
177 |         table_definitions,
178 |         nl_query,
179 |         eval_template,
180 |         priming_prefix="SELECT",
181 |         temperature=temperature,
182 |         n=n,
183 |         engine=engine,
184 |     )
185 |     print(result)
186 |
187 |
188 | if __name__ == "__main__":
189 |     Smokey(main)
190 |
--------------------------------------------------------------------------------
/examples/Code_search.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "## Code search\n",
9 | "\n",
10 | "We index our own [openai-python code repository](https://github.com/openai/openai-python), and show how it can be searched. We implement a simple version of file parsing and extracting of functions from python files."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "name": "stdout",
20 | "output_type": "stream",
21 | "text": [
22 | "Total number of py files: 51\n",
23 | "Total number of functions extracted: 97\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "import os\n",
29 | "from glob import glob\n",
30 | "import pandas as pd\n",
31 | "\n",
32 | "def get_function_name(code):\n",
33 | " \"\"\"\n",
34 | " Extract function name from a line beginning with \"def \"\n",
35 | " \"\"\"\n",
36 | " assert code.startswith(\"def \")\n",
37 | " return code[len(\"def \"): code.index(\"(\")]\n",
38 | "\n",
39 | "def get_until_no_space(all_lines, i) -> str:\n",
40 | " \"\"\"\n",
41 | " Get all lines until a line outside the function definition is found.\n",
42 | " \"\"\"\n",
43 | " ret = [all_lines[i]]\n",
44 | " for j in range(i + 1, i + 10000):\n",
45 | " if j < len(all_lines):\n",
46 | " if len(all_lines[j]) == 0 or all_lines[j][0] in [\" \", \"\\t\", \")\"]:\n",
47 | " ret.append(all_lines[j])\n",
48 | " else:\n",
49 | " break\n",
50 | " return \"\\n\".join(ret)\n",
51 | "\n",
52 | "def get_functions(filepath):\n",
53 | " \"\"\"\n",
54 | " Get all functions in a Python file.\n",
55 | " \"\"\"\n",
56 | " whole_code = open(filepath).read().replace(\"\\r\", \"\\n\")\n",
57 | " all_lines = whole_code.split(\"\\n\")\n",
58 | " for i, l in enumerate(all_lines):\n",
59 | " if l.startswith(\"def \"):\n",
60 | " code = get_until_no_space(all_lines, i)\n",
61 | " function_name = get_function_name(code)\n",
62 | " yield {\"code\": code, \"function_name\": function_name, \"filepath\": filepath}\n",
63 | "\n",
64 | "\n",
65 | "# get user root directory\n",
66 | "root_dir = os.path.expanduser(\"~\")\n",
67 | "# note: for this code to work, the openai-python repo must be downloaded and placed in your root directory\n",
68 | "\n",
69 | "# path to code repository directory\n",
70 | "code_root = root_dir + \"/openai-python\"\n",
71 | "\n",
72 | "code_files = [y for x in os.walk(code_root) for y in glob(os.path.join(x[0], '*.py'))]\n",
73 | "print(\"Total number of py files:\", len(code_files))\n",
74 | "\n",
75 | "if len(code_files) == 0:\n",
76 | " print(\"Double check that you have downloaded the openai-python repo and set the code_root variable correctly.\")\n",
77 | "\n",
78 | "all_funcs = []\n",
79 | "for code_file in code_files:\n",
80 | " funcs = list(get_functions(code_file))\n",
81 | " for func in funcs:\n",
82 | " all_funcs.append(func)\n",
83 | "\n",
84 | "print(\"Total number of functions extracted:\", len(all_funcs))"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 2,
90 | "metadata": {},
91 | "outputs": [
92 | {
93 | "data": {
94 | "text/html": [
95 | "
\n",
96 | "\n",
109 | "
\n",
110 | " \n",
111 | " \n",
112 | " | \n",
113 | " code | \n",
114 | " function_name | \n",
115 | " filepath | \n",
116 | " code_embedding | \n",
117 | "
\n",
118 | " \n",
119 | " \n",
120 | " \n",
121 | " 0 | \n",
122 | " def _console_log_level():\\n if openai.log i... | \n",
123 | " _console_log_level | \n",
124 | " /openai/util.py | \n",
125 | " [0.03389773145318031, -0.004390408284962177, 0... | \n",
126 | "
\n",
127 | " \n",
128 | " 1 | \n",
129 | " def log_debug(message, **params):\\n msg = l... | \n",
130 | " log_debug | \n",
131 | " /openai/util.py | \n",
132 | " [-0.004034275189042091, 0.004895383026450872, ... | \n",
133 | "
\n",
134 | " \n",
135 | " 2 | \n",
136 | " def log_info(message, **params):\\n msg = lo... | \n",
137 | " log_info | \n",
138 | " /openai/util.py | \n",
139 | " [0.004882764536887407, 0.0033515947870910168, ... | \n",
140 | "
\n",
141 | " \n",
142 | " 3 | \n",
143 | " def log_warn(message, **params):\\n msg = lo... | \n",
144 | " log_warn | \n",
145 | " /openai/util.py | \n",
146 | " [0.002535992069169879, -0.010829543694853783, ... | \n",
147 | "
\n",
148 | " \n",
149 | " 4 | \n",
150 | " def logfmt(props):\\n def fmt(key, val):\\n ... | \n",
151 | " logfmt | \n",
152 | " /openai/util.py | \n",
153 | " [0.016732551157474518, 0.017367802560329437, 0... | \n",
154 | "
\n",
155 | " \n",
156 | "
\n",
157 | "
"
158 | ],
159 | "text/plain": [
160 | " code function_name \\\n",
161 | "0 def _console_log_level():\\n if openai.log i... _console_log_level \n",
162 | "1 def log_debug(message, **params):\\n msg = l... log_debug \n",
163 | "2 def log_info(message, **params):\\n msg = lo... log_info \n",
164 | "3 def log_warn(message, **params):\\n msg = lo... log_warn \n",
165 | "4 def logfmt(props):\\n def fmt(key, val):\\n ... logfmt \n",
166 | "\n",
167 | " filepath code_embedding \n",
168 | "0 /openai/util.py [0.03389773145318031, -0.004390408284962177, 0... \n",
169 | "1 /openai/util.py [-0.004034275189042091, 0.004895383026450872, ... \n",
170 | "2 /openai/util.py [0.004882764536887407, 0.0033515947870910168, ... \n",
171 | "3 /openai/util.py [0.002535992069169879, -0.010829543694853783, ... \n",
172 | "4 /openai/util.py [0.016732551157474518, 0.017367802560329437, 0... "
173 | ]
174 | },
175 | "execution_count": 2,
176 | "metadata": {},
177 | "output_type": "execute_result"
178 | }
179 | ],
180 | "source": [
181 | "from openai.embeddings_utils import get_embedding\n",
182 | "\n",
183 | "df = pd.DataFrame(all_funcs)\n",
184 | "df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))\n",
185 | "df['filepath'] = df['filepath'].apply(lambda x: x.replace(code_root, \"\"))\n",
186 | "df.to_csv(\"data/code_search_openai-python.csv\", index=False)\n",
187 | "df.head()"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 3,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "name": "stdout",
197 | "output_type": "stream",
198 | "text": [
199 | "/openai/tests/test_endpoints.py:test_completions score=0.826\n",
200 | "def test_completions():\n",
201 | " result = openai.Completion.create(prompt=\"This was a test\", n=5, engine=\"ada\")\n",
202 | " assert len(result.choices) == 5\n",
203 | "\n",
204 | "\n",
205 | "----------------------------------------------------------------------\n",
206 | "/openai/tests/test_endpoints.py:test_completions_model score=0.811\n",
207 | "def test_completions_model():\n",
208 | " result = openai.Completion.create(prompt=\"This was a test\", n=5, model=\"ada\")\n",
209 | " assert len(result.choices) == 5\n",
210 | " assert result.model.startswith(\"ada\")\n",
211 | "\n",
212 | "\n",
213 | "----------------------------------------------------------------------\n",
214 | "/openai/tests/test_endpoints.py:test_completions_multiple_prompts score=0.808\n",
215 | "def test_completions_multiple_prompts():\n",
216 | " result = openai.Completion.create(\n",
217 | " prompt=[\"This was a test\", \"This was another test\"], n=5, engine=\"ada\"\n",
218 | " )\n",
219 | " assert len(result.choices) == 10\n",
220 | "\n",
221 | "\n",
222 | "----------------------------------------------------------------------\n"
223 | ]
224 | }
225 | ],
226 | "source": [
227 | "from openai.embeddings_utils import cosine_similarity\n",
228 | "\n",
229 | "def search_functions(df, code_query, n=3, pprint=True, n_lines=7):\n",
230 | " embedding = get_embedding(code_query, engine='text-embedding-ada-002')\n",
231 | " df['similarities'] = df.code_embedding.apply(lambda x: cosine_similarity(x, embedding))\n",
232 | "\n",
233 | " res = df.sort_values('similarities', ascending=False).head(n)\n",
234 | " if pprint:\n",
235 | " for r in res.iterrows():\n",
236 | " print(r[1].filepath+\":\"+r[1].function_name + \" score=\" + str(round(r[1].similarities, 3)))\n",
237 | " print(\"\\n\".join(r[1].code.split(\"\\n\")[:n_lines]))\n",
238 | " print('-'*70)\n",
239 | " return res\n",
240 | "\n",
241 | "res = search_functions(df, 'Completions API tests', n=3)"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 4,
247 | "metadata": {},
248 | "outputs": [
249 | {
250 | "name": "stdout",
251 | "output_type": "stream",
252 | "text": [
253 | "/openai/validators.py:format_inferrer_validator score=0.751\n",
254 | "def format_inferrer_validator(df):\n",
255 | " \"\"\"\n",
256 | " This validator will infer the likely fine-tuning format of the data, and display it to the user if it is classification.\n",
257 | " It will also suggest to use ada and explain train/validation split benefits.\n",
258 | " \"\"\"\n",
259 | " ft_type = infer_task_type(df)\n",
260 | " immediate_msg = None\n",
261 | "----------------------------------------------------------------------\n",
262 | "/openai/validators.py:get_validators score=0.748\n",
263 | "def get_validators():\n",
264 | " return [\n",
265 | " num_examples_validator,\n",
266 | " lambda x: necessary_column_validator(x, \"prompt\"),\n",
267 | " lambda x: necessary_column_validator(x, \"completion\"),\n",
268 | " additional_column_validator,\n",
269 | " non_empty_field_validator,\n",
270 | "----------------------------------------------------------------------\n",
271 | "/openai/validators.py:infer_task_type score=0.738\n",
272 | "def infer_task_type(df):\n",
273 | " \"\"\"\n",
274 | " Infer the likely fine-tuning task type from the data\n",
275 | " \"\"\"\n",
276 | " CLASSIFICATION_THRESHOLD = 3 # min_average instances of each class\n",
277 | " if sum(df.prompt.str.len()) == 0:\n",
278 | " return \"open-ended generation\"\n",
279 | "----------------------------------------------------------------------\n"
280 | ]
281 | }
282 | ],
283 | "source": [
284 | "res = search_functions(df, 'fine-tuning input data validation logic', n=3)"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": 5,
290 | "metadata": {},
291 | "outputs": [
292 | {
293 | "name": "stdout",
294 | "output_type": "stream",
295 | "text": [
296 | "/openai/validators.py:get_common_xfix score=0.793\n",
297 | "def get_common_xfix(series, xfix=\"suffix\"):\n",
298 | " \"\"\"\n",
299 | " Finds the longest common suffix or prefix of all the values in a series\n",
300 | " \"\"\"\n",
301 | " common_xfix = \"\"\n",
302 | " while True:\n",
303 | " common_xfixes = (\n",
304 | " series.str[-(len(common_xfix) + 1) :]\n",
305 | " if xfix == \"suffix\"\n",
306 | " else series.str[: len(common_xfix) + 1]\n",
307 | "----------------------------------------------------------------------\n",
308 | "/openai/validators.py:common_completion_suffix_validator score=0.778\n",
309 | "def common_completion_suffix_validator(df):\n",
310 | " \"\"\"\n",
311 | " This validator will suggest to add a common suffix to the completion if one doesn't already exist in case of classification or conditional generation.\n",
312 | " \"\"\"\n",
313 | " error_msg = None\n",
314 | " immediate_msg = None\n",
315 | " optional_msg = None\n",
316 | " optional_fn = None\n",
317 | "\n",
318 | " ft_type = infer_task_type(df)\n",
319 | "----------------------------------------------------------------------\n"
320 | ]
321 | }
322 | ],
323 | "source": [
324 | "res = search_functions(df, 'find common suffix', n=2, n_lines=10)"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 6,
330 | "metadata": {},
331 | "outputs": [
332 | {
333 | "name": "stdout",
334 | "output_type": "stream",
335 | "text": [
336 | "/openai/cli.py:tools_register score=0.773\n",
337 | "def tools_register(parser):\n",
338 | " subparsers = parser.add_subparsers(\n",
339 | " title=\"Tools\", help=\"Convenience client side tools\"\n",
340 | " )\n",
341 | "\n",
342 | " def help(args):\n",
343 | " parser.print_help()\n",
344 | "\n",
345 | " parser.set_defaults(func=help)\n",
346 | "\n",
347 | " sub = subparsers.add_parser(\"fine_tunes.prepare_data\")\n",
348 | " sub.add_argument(\n",
349 | " \"-f\",\n",
350 | " \"--file\",\n",
351 | " required=True,\n",
352 | " help=\"JSONL, JSON, CSV, TSV, TXT or XLSX file containing prompt-completion examples to be analyzed.\"\n",
353 | " \"This should be the local file path.\",\n",
354 | " )\n",
355 | " sub.add_argument(\n",
356 | " \"-q\",\n",
357 | "----------------------------------------------------------------------\n"
358 | ]
359 | }
360 | ],
361 | "source": [
362 | "res = search_functions(df, 'Command line interface for fine-tuning', n=1, n_lines=20)"
363 | ]
364 | }
365 | ],
366 | "metadata": {
367 | "interpreter": {
368 | "hash": "be4b5d5b73a21c599de40d6deb1129796d12dc1cc33a738f7bac13269cfcafe8"
369 | },
370 | "kernelspec": {
371 | "display_name": "openai-cookbook",
372 | "language": "python",
373 | "name": "openai-cookbook"
374 | },
375 | "language_info": {
376 | "codemirror_mode": {
377 | "name": "ipython",
378 | "version": 3
379 | },
380 | "file_extension": ".py",
381 | "mimetype": "text/x-python",
382 | "name": "python",
383 | "nbconvert_exporter": "python",
384 | "pygments_lexer": "ipython3",
385 | "version": "3.9.6"
386 | },
387 | "orig_nbformat": 4
388 | },
389 | "nbformat": 4,
390 | "nbformat_minor": 2
391 | }
392 |
--------------------------------------------------------------------------------
/examples/Embedding_long_inputs.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Embedding texts that are longer than the model's maximum context length\n",
9 | "\n",
10 | "OpenAI's embedding models cannot embed text that exceeds a maximum length. The maximum length varies by model, and is measured by _tokens_, not string length. If you are unfamiliar with tokenization, check out [How to count tokens with tiktoken](How_to_count_tokens_with_tiktoken.ipynb).\n",
11 | "\n",
12 | "This notebook shows how to handle texts that are longer than a model's maximum context length. We'll demonstrate using embeddings from `text-embedding-ada-002`, but the same ideas can be applied to other models and tasks. To learn more about embeddings, check out the [OpenAI Embeddings Guide](https://beta.openai.com/docs/guides/embeddings).\n"
13 | ]
14 | },
15 | {
16 | "attachments": {},
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "## 1. Model context length\n",
21 | "\n",
22 | "First, we select the model and define a function to get embeddings from the API."
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 1,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "import openai\n",
32 | "from tenacity import retry, wait_random_exponential, stop_after_attempt, retry_if_not_exception_type\n",
33 | "\n",
34 | "\n",
35 | "EMBEDDING_MODEL = 'text-embedding-ada-002'\n",
36 | "EMBEDDING_CTX_LENGTH = 8191\n",
37 | "EMBEDDING_ENCODING = 'cl100k_base'\n",
38 | "\n",
39 | "# let's make sure to not retry on an invalid request, because that is what we want to demonstrate\n",
40 | "@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(openai.InvalidRequestError))\n",
41 | "def get_embedding(text_or_tokens, model=EMBEDDING_MODEL):\n",
42 | " return openai.Embedding.create(input=text_or_tokens, model=model)[\"data\"][0][\"embedding\"]"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "The `text-embedding-ada-002` model has a context length of 8191 tokens with the `cl100k_base` encoding, and we can see that going over that limit causes an error."
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 2,
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "name": "stdout",
59 | "output_type": "stream",
60 | "text": [
61 | "This model's maximum context length is 8191 tokens, however you requested 10001 tokens (10001 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "long_text = 'AGI ' * 5000\n",
67 | "try:\n",
68 | " get_embedding(long_text)\n",
69 | "except openai.InvalidRequestError as e:\n",
70 | " print(e)"
71 | ]
72 | },
73 | {
74 | "attachments": {},
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "Clearly we want to avoid these errors, particularly when handling programatically with a large number of embeddings. Yet, we still might be faced with texts that are longer than the maximum context length. Below we describe and provide recipes for the main approaches to handling these longer texts: (1) simply truncating the text to the maximum allowed length, and (2) chunking the text and embedding each chunk individually."
79 | ]
80 | },
81 | {
82 | "attachments": {},
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "## 1. Truncating the input text\n",
87 | "\n",
88 | "The simplest solution is to truncate the input text to the maximum allowed length. Because the context length is measured in tokens, we have to first tokenize the text before truncating it. The API accepts inputs both in the form of text or tokens, so as long as you are careful that you are using the appropriate encoding, there is no need to convert the tokens back into string form. Below is an example of such a truncation function."
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 3,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "import tiktoken\n",
98 | "\n",
99 | "def truncate_text_tokens(text, encoding_name=EMBEDDING_ENCODING, max_tokens=EMBEDDING_CTX_LENGTH):\n",
100 | " \"\"\"Truncate a string to have `max_tokens` according to the given encoding.\"\"\"\n",
101 | " encoding = tiktoken.get_encoding(encoding_name)\n",
102 | " return encoding.encode(text)[:max_tokens]"
103 | ]
104 | },
105 | {
106 | "attachments": {},
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "Our example from before now works without error."
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 4,
116 | "metadata": {},
117 | "outputs": [
118 | {
119 | "data": {
120 | "text/plain": [
121 | "1536"
122 | ]
123 | },
124 | "execution_count": 4,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "truncated = truncate_text_tokens(long_text)\n",
131 | "len(get_embedding(truncated))"
132 | ]
133 | },
134 | {
135 | "attachments": {},
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "## 2. Chunking the input text\n",
140 | "\n",
141 | "Though truncation works, discarding potentially relevant text is a clear drawback. Another approach is to divide the input text into chunks and then embed each chunk individually. Then, we can either use the chunk embeddings separately, or combine them in some way, such as averaging (weighted by the size of each chunk).\n",
142 | "\n",
143 | "We will take a function from [Python's own cookbook](https://docs.python.org/3/library/itertools.html#itertools-recipes) that breaks up a sequence into chunks."
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 5,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "from itertools import islice\n",
153 | "\n",
154 | "def batched(iterable, n):\n",
155 | " \"\"\"Batch data into tuples of length n. The last batch may be shorter.\"\"\"\n",
156 | " # batched('ABCDEFG', 3) --> ABC DEF G\n",
157 | " if n < 1:\n",
158 | " raise ValueError('n must be at least one')\n",
159 | " it = iter(iterable)\n",
160 | " while (batch := tuple(islice(it, n))):\n",
161 | " yield batch"
162 | ]
163 | },
164 | {
165 | "attachments": {},
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "Now we define a function that encodes a string into tokens and then breaks it up into chunks."
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 6,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "def chunked_tokens(text, encoding_name, chunk_length):\n",
179 | " encoding = tiktoken.get_encoding(encoding_name)\n",
180 | " tokens = encoding.encode(text)\n",
181 | " chunks_iterator = batched(tokens, chunk_length)\n",
182 | " yield from chunks_iterator"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "Finally, we can write a function that safely handles embedding requests, even when the input text is longer than the maximum context length, by chunking the input tokens and embedding each chunk individually. The `average` flag can be set to `True` to return the weighted average of the chunk embeddings, or `False` to simply return the unmodified list of chunk embeddings."
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 7,
195 | "metadata": {},
196 | "outputs": [],
197 | "source": [
198 | "import numpy as np\n",
199 | "\n",
200 | "\n",
201 | "def len_safe_get_embedding(text, model=EMBEDDING_MODEL, max_tokens=EMBEDDING_CTX_LENGTH, encoding_name=EMBEDDING_ENCODING, average=True):\n",
202 | " chunk_embeddings = []\n",
203 | " for chunk in chunked_tokens(text, encoding_name=encoding_name, chunk_length=max_tokens):\n",
204 | " chunk_embeddings.append(get_embedding(chunk, model=model))\n",
205 | "\n",
206 | " if average:\n",
207 | " chunk_embeddings = np.average(chunk_embeddings, axis=0, weights=[len(c) for c in chunk_embeddings])\n",
208 | " chunk_embeddings = chunk_embeddings / np.linalg.norm(chunk_embeddings) # normalizes length to 1\n",
209 | " chunk_embeddings = chunk_embeddings.tolist()\n",
210 | " return chunk_embeddings"
211 | ]
212 | },
213 | {
214 | "attachments": {},
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "Once again, we can now handle long input texts."
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 8,
224 | "metadata": {},
225 | "outputs": [
226 | {
227 | "name": "stdout",
228 | "output_type": "stream",
229 | "text": [
230 | "Setting average=True gives us a single 1536-dimensional embedding vector for our long text.\n",
231 | "Setting average=False gives us 2 embedding vectors, one for each of the chunks.\n"
232 | ]
233 | }
234 | ],
235 | "source": [
236 | "average_embedding_vector = len_safe_get_embedding(long_text, average=True)\n",
237 | "chunks_embedding_vectors = len_safe_get_embedding(long_text, average=False)\n",
238 | "\n",
239 | "print(f\"Setting average=True gives us a single {len(average_embedding_vector)}-dimensional embedding vector for our long text.\")\n",
240 | "print(f\"Setting average=False gives us {len(chunks_embedding_vectors)} embedding vectors, one for each of the chunks.\")\n"
241 | ]
242 | },
243 | {
244 | "attachments": {},
245 | "cell_type": "markdown",
246 | "metadata": {},
247 | "source": [
248 | "In some cases, it may make sense to split chunks on paragraph boundaries or sentence boundaries to help preserve the meaning of the text."
249 | ]
250 | }
251 | ],
252 | "metadata": {
253 | "kernelspec": {
254 | "display_name": "Python 3 (ipykernel)",
255 | "language": "python",
256 | "name": "python3"
257 | },
258 | "language_info": {
259 | "codemirror_mode": {
260 | "name": "ipython",
261 | "version": 3
262 | },
263 | "file_extension": ".py",
264 | "mimetype": "text/x-python",
265 | "name": "python",
266 | "nbconvert_exporter": "python",
267 | "pygments_lexer": "ipython3",
268 | "version": "3.9.9"
269 | },
270 | "vscode": {
271 | "interpreter": {
272 | "hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97"
273 | }
274 | }
275 | },
276 | "nbformat": 4,
277 | "nbformat_minor": 2
278 | }
279 |
--------------------------------------------------------------------------------
/examples/Get_embeddings.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Get embeddings\n",
8 | "\n",
9 | "The function `get_embedding` will give us an embedding for an input text."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/plain": [
20 | "1536"
21 | ]
22 | },
23 | "execution_count": 1,
24 | "metadata": {},
25 | "output_type": "execute_result"
26 | }
27 | ],
28 | "source": [
29 | "import openai\n",
30 | "\n",
31 | "embedding = openai.Embedding.create(\n",
32 | " input=\"Your text goes here\", model=\"text-embedding-ada-002\"\n",
33 | ")[\"data\"][0][\"embedding\"]\n",
34 | "len(embedding)\n"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 2,
40 | "metadata": {},
41 | "outputs": [
42 | {
43 | "name": "stdout",
44 | "output_type": "stream",
45 | "text": [
46 | "1536\n"
47 | ]
48 | }
49 | ],
50 | "source": [
51 | "import openai\n",
52 | "from tenacity import retry, wait_random_exponential, stop_after_attempt\n",
53 | "\n",
54 | "\n",
55 | "@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))\n",
56 | "def get_embedding(text: str, model=\"text-embedding-ada-002\") -> list[float]:\n",
57 | " return openai.Embedding.create(input=[text], model=model)[\"data\"][0][\"embedding\"]\n",
58 | "\n",
59 | "\n",
60 | "embedding = get_embedding(\"Your text goes here\", model=\"text-embedding-ada-002\")\n",
61 | "print(len(embedding))\n"
62 | ]
63 | }
64 | ],
65 | "metadata": {
66 | "kernelspec": {
67 | "display_name": "Python 3.9.9 ('openai')",
68 | "language": "python",
69 | "name": "python3"
70 | },
71 | "language_info": {
72 | "codemirror_mode": {
73 | "name": "ipython",
74 | "version": 3
75 | },
76 | "file_extension": ".py",
77 | "mimetype": "text/x-python",
78 | "name": "python",
79 | "nbconvert_exporter": "python",
80 | "pygments_lexer": "ipython3",
81 | "version": "3.9.9"
82 | },
83 | "orig_nbformat": 4,
84 | "vscode": {
85 | "interpreter": {
86 | "hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97"
87 | }
88 | }
89 | },
90 | "nbformat": 4,
91 | "nbformat_minor": 2
92 | }
93 |
--------------------------------------------------------------------------------
/examples/How_to_count_tokens_with_tiktoken.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# How to count tokens with tiktoken\n",
9 | "\n",
10 | "[`tiktoken`](https://github.com/openai/tiktoken/blob/main/README.md) is a fast open-source tokenizer by OpenAI.\n",
11 | "\n",
12 | "Given a text string (e.g., `\"tiktoken is great!\"`) and an encoding (e.g., `\"gpt2\"`), a tokenizer can split the text string into a list of tokens (e.g., `[\"t\", \"ik\", \"token\", \" is\", \" great\", \"!\"]`).\n",
13 | "\n",
14 | "Splitting text strings into tokens is useful because models like GPT-3 see text in the form of tokens. Knowing how many tokens are in a text string can tell you (a) whether the string is too long for a text model to process and (b) how much an OpenAI API call costs (as usage is priced by token). Different models use different encodings.\n",
15 | "\n",
16 | "`tiktoken` supports three encodings used by OpenAI models:\n",
17 | "\n",
18 | "| Encoding name | OpenAI models |\n",
19 | "|-------------------------|-----------------------------------------------------|\n",
20 | "| `gpt2` (or `r50k_base`) | Most GPT-3 models |\n",
21 | "| `p50k_base` | Code models, `text-davinci-002`, `text-davinci-003` |\n",
22 | "| `cl100k_base` | `text-embedding-ada-002` |\n",
23 | "\n",
24 | "`p50k_base` overlaps substantially with `gpt2`, and for non-code applications, they will usually give the same tokens.\n",
25 | "\n",
26 | "## Tokenizer libraries and languages\n",
27 | "\n",
28 | "For `gpt2` encodings, tokenizers are available in many languages.\n",
29 | "- Python: [tiktoken](https://github.com/openai/tiktoken/blob/main/README.md) (or alternatively [GPT2TokenizerFast](https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2TokenizerFast))\n",
30 | "- JavaScript: [gpt-3-encoder](https://www.npmjs.com/package/gpt-3-encoder)\n",
31 | "- .NET / C#: [GPT Tokenizer](https://github.com/dluc/openai-tools)\n",
32 | "- Java: [gpt2-tokenizer-java](https://github.com/hyunwoongko/gpt2-tokenizer-java)\n",
33 | "- PHP: [GPT-3-Encoder-PHP](https://github.com/CodeRevolutionPlugins/GPT-3-Encoder-PHP)\n",
34 | "\n",
35 | "(OpenAI makes no endorsements or guarantees of third-party libraries.)\n",
36 | "\n",
37 | "For `p50k_base` and `cl100k_base` encodings, `tiktoken` is the only tokenizer available as of January 2023.\n",
38 | "- Python: [tiktoken](https://github.com/openai/tiktoken/blob/main/README.md)\n",
39 | "\n",
40 | "## How strings are typically tokenized\n",
41 | "\n",
42 | "In English, tokens commonly range in length from one character to one word (e.g., `\"t\"` or `\" great\"`), though in some languages tokens can be shorter than one character or longer than one word. Spaces are usually grouped with the starts of words (e.g., `\" is\"` instead of `\"is \"` or `\" \"`+`\"is\"`). You can quickly check how a string is tokenized at the [OpenAI Tokenizer](https://beta.openai.com/tokenizer)."
43 | ]
44 | },
45 | {
46 | "attachments": {},
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "## 0. Install `tiktoken`\n",
51 | "\n",
52 | "In your terminal, install `tiktoken` with `pip`:\n",
53 | "\n",
54 | "```bash\n",
55 | "pip install tiktoken\n",
56 | "```"
57 | ]
58 | },
59 | {
60 | "attachments": {},
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "## 1. Import `tiktoken`"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 1,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "import tiktoken\n"
74 | ]
75 | },
76 | {
77 | "attachments": {},
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "## 2. Load an encoding\n",
82 | "\n",
83 | "Use `tiktoken.get_encoding()` to load an encoding by name.\n",
84 | "\n",
85 | "The first time this runs, it will require an internet connection to download. Later runs won't need an internet connection."
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 2,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "encoding = tiktoken.get_encoding(\"gpt2\")\n"
95 | ]
96 | },
97 | {
98 | "attachments": {},
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "## 3. Turn text into tokens with `encoding.encode()`\n",
103 | "\n"
104 | ]
105 | },
106 | {
107 | "attachments": {},
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "The `.encode()` method converts a text string into a list of token integers."
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 3,
117 | "metadata": {},
118 | "outputs": [
119 | {
120 | "data": {
121 | "text/plain": [
122 | "[83, 1134, 30001, 318, 1049, 0]"
123 | ]
124 | },
125 | "execution_count": 3,
126 | "metadata": {},
127 | "output_type": "execute_result"
128 | }
129 | ],
130 | "source": [
131 | "encoding.encode(\"tiktoken is great!\")\n"
132 | ]
133 | },
134 | {
135 | "attachments": {},
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "Count tokens by counting the length of the list returned by `.encode()`."
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 4,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "def num_tokens_from_string(string: str, encoding_name: str) -> int:\n",
149 | " \"\"\"Returns the number of tokens in a text string.\"\"\"\n",
150 | " encoding = tiktoken.get_encoding(encoding_name)\n",
151 | " num_tokens = len(encoding.encode(string))\n",
152 | " return num_tokens\n"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 5,
158 | "metadata": {},
159 | "outputs": [
160 | {
161 | "data": {
162 | "text/plain": [
163 | "6"
164 | ]
165 | },
166 | "execution_count": 5,
167 | "metadata": {},
168 | "output_type": "execute_result"
169 | }
170 | ],
171 | "source": [
172 | "num_tokens_from_string(\"tiktoken is great!\", \"gpt2\")\n"
173 | ]
174 | },
175 | {
176 | "attachments": {},
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "## 4. Turn tokens into text with `encoding.decode()`"
181 | ]
182 | },
183 | {
184 | "attachments": {},
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "`.decode()` converts a list of token integers to a string."
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 6,
194 | "metadata": {},
195 | "outputs": [
196 | {
197 | "data": {
198 | "text/plain": [
199 | "'tiktoken is great!'"
200 | ]
201 | },
202 | "execution_count": 6,
203 | "metadata": {},
204 | "output_type": "execute_result"
205 | }
206 | ],
207 | "source": [
208 | "encoding.decode([83, 1134, 30001, 318, 1049, 0])\n"
209 | ]
210 | },
211 | {
212 | "attachments": {},
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 | "Warning: although `.decode()` can be applied to single tokens, beware that it can be lossy for tokens that aren't on utf-8 boundaries."
217 | ]
218 | },
219 | {
220 | "attachments": {},
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "For single tokens, `.decode_single_token_bytes()` safely converts a single integer token to the bytes it represents."
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 7,
230 | "metadata": {},
231 | "outputs": [
232 | {
233 | "data": {
234 | "text/plain": [
235 | "[b't', b'ik', b'token', b' is', b' great', b'!']"
236 | ]
237 | },
238 | "execution_count": 7,
239 | "metadata": {},
240 | "output_type": "execute_result"
241 | }
242 | ],
243 | "source": [
244 | "[encoding.decode_single_token_bytes(token) for token in [83, 1134, 30001, 318, 1049, 0]]\n"
245 | ]
246 | },
247 | {
248 | "attachments": {},
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "(The `b` in front of the strings indicates that the strings are byte strings.)"
253 | ]
254 | },
255 | {
256 | "attachments": {},
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "## 5. Comparing encodings\n",
261 | "\n",
262 | "Different encodings can vary in how they split words, group spaces, and handle non-English characters. Using the methods above, we can compare different encodings on a few example strings."
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 8,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "def compare_encodings(example_string: str) -> None:\n",
272 | " \"\"\"Prints a comparison of three string encodings.\"\"\"\n",
273 | " # print the example string\n",
274 | " print(f'\\nExample string: \"{example_string}\"')\n",
275 | " # for each encoding, print the # of tokens, the token integers, and the token bytes\n",
276 | " for encoding_name in [\"gpt2\", \"p50k_base\", \"cl100k_base\"]:\n",
277 | " encoding = tiktoken.get_encoding(encoding_name)\n",
278 | " token_integers = encoding.encode(example_string)\n",
279 | " num_tokens = len(token_integers)\n",
280 | " token_bytes = [encoding.decode_single_token_bytes(token) for token in token_integers]\n",
281 | " print()\n",
282 | " print(f\"{encoding_name}: {num_tokens} tokens\")\n",
283 | " print(f\"token integers: {token_integers}\")\n",
284 | " print(f\"token bytes: {token_bytes}\")\n",
285 | " "
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 9,
291 | "metadata": {},
292 | "outputs": [
293 | {
294 | "name": "stdout",
295 | "output_type": "stream",
296 | "text": [
297 | "\n",
298 | "Example string: \"antidisestablishmentarianism\"\n",
299 | "\n",
300 | "gpt2: 5 tokens\n",
301 | "token integers: [415, 29207, 44390, 3699, 1042]\n",
302 | "token bytes: [b'ant', b'idis', b'establishment', b'arian', b'ism']\n",
303 | "\n",
304 | "p50k_base: 5 tokens\n",
305 | "token integers: [415, 29207, 44390, 3699, 1042]\n",
306 | "token bytes: [b'ant', b'idis', b'establishment', b'arian', b'ism']\n",
307 | "\n",
308 | "cl100k_base: 6 tokens\n",
309 | "token integers: [519, 85342, 34500, 479, 8997, 2191]\n",
310 | "token bytes: [b'ant', b'idis', b'establish', b'ment', b'arian', b'ism']\n"
311 | ]
312 | }
313 | ],
314 | "source": [
315 | "compare_encodings(\"antidisestablishmentarianism\")\n"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 10,
321 | "metadata": {},
322 | "outputs": [
323 | {
324 | "name": "stdout",
325 | "output_type": "stream",
326 | "text": [
327 | "\n",
328 | "Example string: \"2 + 2 = 4\"\n",
329 | "\n",
330 | "gpt2: 5 tokens\n",
331 | "token integers: [17, 1343, 362, 796, 604]\n",
332 | "token bytes: [b'2', b' +', b' 2', b' =', b' 4']\n",
333 | "\n",
334 | "p50k_base: 5 tokens\n",
335 | "token integers: [17, 1343, 362, 796, 604]\n",
336 | "token bytes: [b'2', b' +', b' 2', b' =', b' 4']\n",
337 | "\n",
338 | "cl100k_base: 7 tokens\n",
339 | "token integers: [17, 489, 220, 17, 284, 220, 19]\n",
340 | "token bytes: [b'2', b' +', b' ', b'2', b' =', b' ', b'4']\n"
341 | ]
342 | }
343 | ],
344 | "source": [
345 | "compare_encodings(\"2 + 2 = 4\")\n"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": 11,
351 | "metadata": {},
352 | "outputs": [
353 | {
354 | "name": "stdout",
355 | "output_type": "stream",
356 | "text": [
357 | "\n",
358 | "Example string: \"お誕生日おめでとう\"\n",
359 | "\n",
360 | "gpt2: 14 tokens\n",
361 | "token integers: [2515, 232, 45739, 243, 37955, 33768, 98, 2515, 232, 1792, 223, 30640, 30201, 29557]\n",
362 | "token bytes: [b'\\xe3\\x81', b'\\x8a', b'\\xe8\\xaa', b'\\x95', b'\\xe7\\x94\\x9f', b'\\xe6\\x97', b'\\xa5', b'\\xe3\\x81', b'\\x8a', b'\\xe3\\x82', b'\\x81', b'\\xe3\\x81\\xa7', b'\\xe3\\x81\\xa8', b'\\xe3\\x81\\x86']\n",
363 | "\n",
364 | "p50k_base: 14 tokens\n",
365 | "token integers: [2515, 232, 45739, 243, 37955, 33768, 98, 2515, 232, 1792, 223, 30640, 30201, 29557]\n",
366 | "token bytes: [b'\\xe3\\x81', b'\\x8a', b'\\xe8\\xaa', b'\\x95', b'\\xe7\\x94\\x9f', b'\\xe6\\x97', b'\\xa5', b'\\xe3\\x81', b'\\x8a', b'\\xe3\\x82', b'\\x81', b'\\xe3\\x81\\xa7', b'\\xe3\\x81\\xa8', b'\\xe3\\x81\\x86']\n",
367 | "\n",
368 | "cl100k_base: 9 tokens\n",
369 | "token integers: [33334, 45918, 243, 21990, 9080, 33334, 62004, 16556, 78699]\n",
370 | "token bytes: [b'\\xe3\\x81\\x8a', b'\\xe8\\xaa', b'\\x95', b'\\xe7\\x94\\x9f', b'\\xe6\\x97\\xa5', b'\\xe3\\x81\\x8a', b'\\xe3\\x82\\x81', b'\\xe3\\x81\\xa7', b'\\xe3\\x81\\xa8\\xe3\\x81\\x86']\n"
371 | ]
372 | }
373 | ],
374 | "source": [
375 | "compare_encodings(\"お誕生日おめでとう\")\n"
376 | ]
377 | }
378 | ],
379 | "metadata": {
380 | "kernelspec": {
381 | "display_name": "openai",
382 | "language": "python",
383 | "name": "python3"
384 | },
385 | "language_info": {
386 | "codemirror_mode": {
387 | "name": "ipython",
388 | "version": 3
389 | },
390 | "file_extension": ".py",
391 | "mimetype": "text/x-python",
392 | "name": "python",
393 | "nbconvert_exporter": "python",
394 | "pygments_lexer": "ipython3",
395 | "version": "3.9.9"
396 | },
397 | "orig_nbformat": 4,
398 | "vscode": {
399 | "interpreter": {
400 | "hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97"
401 | }
402 | }
403 | },
404 | "nbformat": 4,
405 | "nbformat_minor": 2
406 | }
407 |
--------------------------------------------------------------------------------
/examples/Obtain_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 1. Load the dataset\n",
8 | "\n",
9 | "The dataset used in this example is [fine-food reviews](https://www.kaggle.com/snap/amazon-fine-food-reviews) from Amazon. The dataset contains a total of 568,454 food reviews Amazon users left up to October 2012. We will use a subset of this dataset, consisting of 1,000 most recent reviews for illustration purposes. The reviews are in English and tend to be positive or negative. Each review has a ProductId, UserId, Score, review title (Summary) and review body (Text).\n",
10 | "\n",
11 | "We will combine the review summary and review text into a single combined text. The model will encode this combined text and it will output a single vector embedding."
12 | ]
13 | },
14 | {
15 | "attachments": {},
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "To run this notebook, you will need to install: pandas, openai, transformers, plotly, matplotlib, scikit-learn, torch (transformer dep), torchvision, and scipy."
20 | ]
21 | },
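{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal install sketch, assuming pip is available in the active environment (tiktoken is included because it is imported below):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal install sketch, assuming pip is available; pin versions as needed.\n",
"# tiktoken is included because it is imported in the next cell.\n",
"import subprocess\n",
"import sys\n",
"\n",
"packages = [\"pandas\", \"openai\", \"transformers\", \"plotly\", \"matplotlib\",\n",
"            \"scikit-learn\", \"torch\", \"torchvision\", \"scipy\", \"tiktoken\"]\n",
"subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", *packages])"
]
},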
22 | {
23 | "cell_type": "code",
24 | "execution_count": 6,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "# imports\n",
29 | "import pandas as pd\n",
30 | "import tiktoken\n",
31 | "\n",
32 | "from openai.embeddings_utils import get_embedding\n"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 7,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "# embedding model parameters\n",
42 | "embedding_model = \"text-embedding-ada-002\"\n",
43 | "embedding_encoding = \"cl100k_base\" # this the encoding for text-embedding-ada-002\n",
44 | "max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191\n"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 8,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/html": [
55 | "\n",
56 | "\n",
69 | "
\n",
70 | " \n",
71 | " \n",
72 | " | \n",
73 | " Time | \n",
74 | " ProductId | \n",
75 | " UserId | \n",
76 | " Score | \n",
77 | " Summary | \n",
78 | " Text | \n",
79 | " combined | \n",
80 | "
\n",
81 | " \n",
82 | " \n",
83 | " \n",
84 | " 0 | \n",
85 | " 1351123200 | \n",
86 | " B003XPF9BO | \n",
87 | " A3R7JR3FMEBXQB | \n",
88 | " 5 | \n",
89 | " where does one start...and stop... with a tre... | \n",
90 | " Wanted to save some to bring to my Chicago fam... | \n",
91 | " Title: where does one start...and stop... wit... | \n",
92 | "
\n",
93 | " \n",
94 | " 1 | \n",
95 | " 1351123200 | \n",
96 | " B003JK537S | \n",
97 | " A3JBPC3WFUT5ZP | \n",
98 | " 1 | \n",
99 | " Arrived in pieces | \n",
100 | " Not pleased at all. When I opened the box, mos... | \n",
101 | " Title: Arrived in pieces; Content: Not pleased... | \n",
102 | "
\n",
103 | " \n",
104 | "
\n",
105 | "
"
106 | ],
107 | "text/plain": [
108 | " Time ProductId UserId Score \\\n",
109 | "0 1351123200 B003XPF9BO A3R7JR3FMEBXQB 5 \n",
110 | "1 1351123200 B003JK537S A3JBPC3WFUT5ZP 1 \n",
111 | "\n",
112 | " Summary \\\n",
113 | "0 where does one start...and stop... with a tre... \n",
114 | "1 Arrived in pieces \n",
115 | "\n",
116 | " Text \\\n",
117 | "0 Wanted to save some to bring to my Chicago fam... \n",
118 | "1 Not pleased at all. When I opened the box, mos... \n",
119 | "\n",
120 | " combined \n",
121 | "0 Title: where does one start...and stop... wit... \n",
122 | "1 Title: Arrived in pieces; Content: Not pleased... "
123 | ]
124 | },
125 | "execution_count": 8,
126 | "metadata": {},
127 | "output_type": "execute_result"
128 | }
129 | ],
130 | "source": [
131 | "# load & inspect dataset\n",
132 | "input_datapath = \"data/fine_food_reviews_1k.csv\" # to save space, we provide a pre-filtered dataset\n",
133 | "df = pd.read_csv(input_datapath, index_col=0)\n",
134 | "df = df[[\"Time\", \"ProductId\", \"UserId\", \"Score\", \"Summary\", \"Text\"]]\n",
135 | "df = df.dropna()\n",
136 | "df[\"combined\"] = (\n",
137 | " \"Title: \" + df.Summary.str.strip() + \"; Content: \" + df.Text.str.strip()\n",
138 | ")\n",
139 | "df.head(2)\n"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 9,
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "data": {
149 | "text/plain": [
150 | "1000"
151 | ]
152 | },
153 | "execution_count": 9,
154 | "metadata": {},
155 | "output_type": "execute_result"
156 | }
157 | ],
158 | "source": [
159 | "# subsample to 1k most recent reviews and remove samples that are too long\n",
160 | "top_n = 1000\n",
161 | "df = df.sort_values(\"Time\").tail(top_n * 2) # first cut to first 2k entries, assuming less than half will be filtered out\n",
162 | "df.drop(\"Time\", axis=1, inplace=True)\n",
163 | "\n",
164 | "encoding = tiktoken.get_encoding(embedding_encoding)\n",
165 | "\n",
166 | "# omit reviews that are too long to embed\n",
167 | "df[\"n_tokens\"] = df.combined.apply(lambda x: len(encoding.encode(x)))\n",
168 | "df = df[df.n_tokens <= max_tokens].tail(top_n)\n",
169 | "len(df)\n"
170 | ]
171 | },
172 | {
173 | "attachments": {},
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "## 2. Get embeddings and save them for future reuse"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 10,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage\n",
187 | "\n",
188 | "# This may take a few minutes\n",
189 | "df[\"embedding\"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model))\n",
190 | "df.to_csv(\"data/fine_food_reviews_with_embeddings_1k.csv\")\n"
191 | ]
192 | }
193 | ],
194 | "metadata": {
195 | "kernelspec": {
196 | "display_name": "openai",
197 | "language": "python",
198 | "name": "python3"
199 | },
200 | "language_info": {
201 | "codemirror_mode": {
202 | "name": "ipython",
203 | "version": 3
204 | },
205 | "file_extension": ".py",
206 | "mimetype": "text/x-python",
207 | "name": "python",
208 | "nbconvert_exporter": "python",
209 | "pygments_lexer": "ipython3",
210 | "version": "3.9.9 (main, Dec 7 2021, 18:04:56) \n[Clang 13.0.0 (clang-1300.0.29.3)]"
211 | },
212 | "orig_nbformat": 4,
213 | "vscode": {
214 | "interpreter": {
215 | "hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97"
216 | }
217 | }
218 | },
219 | "nbformat": 4,
220 | "nbformat_minor": 2
221 | }
222 |
--------------------------------------------------------------------------------
/examples/Regression_using_embeddings.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Regression using the embeddings\n",
8 | "\n",
9 | "Regression means predicting a number, rather than one of the categories. We will predict the score based on the embedding of the review's text. We split the dataset into a training and a testing set for all of the following tasks, so we can realistically evaluate performance on unseen data. The dataset is created in the [Obtain_dataset Notebook](Obtain_dataset.ipynb).\n",
10 | "\n",
11 | "We're predicting the score of the review, which is a number between 1 and 5 (1-star being negative and 5-star positive)."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [
19 | {
20 | "name": "stdout",
21 | "output_type": "stream",
22 | "text": [
23 | "ada-002 embedding performance on 1k Amazon reviews: mse=0.62, mae=0.53\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "import pandas as pd\n",
29 | "import numpy as np\n",
30 | "\n",
31 | "from sklearn.ensemble import RandomForestRegressor\n",
32 | "from sklearn.model_selection import train_test_split\n",
33 | "from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
34 | "\n",
35 | "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
36 | "\n",
37 | "df = pd.read_csv(datafile_path)\n",
38 | "df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)\n",
39 | "\n",
40 | "X_train, X_test, y_train, y_test = train_test_split(list(df.embedding.values), df.Score, test_size=0.2, random_state=42)\n",
41 | "\n",
42 | "rfr = RandomForestRegressor(n_estimators=100)\n",
43 | "rfr.fit(X_train, y_train)\n",
44 | "preds = rfr.predict(X_test)\n",
45 | "\n",
46 | "mse = mean_squared_error(y_test, preds)\n",
47 | "mae = mean_absolute_error(y_test, preds)\n",
48 | "\n",
49 | "print(f\"ada-002 embedding performance on 1k Amazon reviews: mse={mse:.2f}, mae={mae:.2f}\")\n"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 2,
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "name": "stdout",
59 | "output_type": "stream",
60 | "text": [
61 | "Dummy mean prediction performance on Amazon reviews: mse=1.73, mae=1.03\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "bmse = mean_squared_error(y_test, np.repeat(y_test.mean(), len(y_test)))\n",
67 | "bmae = mean_absolute_error(y_test, np.repeat(y_test.mean(), len(y_test)))\n",
68 | "print(\n",
69 | " f\"Dummy mean prediction performance on Amazon reviews: mse={bmse:.2f}, mae={bmae:.2f}\"\n",
70 | ")\n"
71 | ]
72 | },
73 | {
74 | "attachments": {},
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "We can see that the embeddings are able to predict the scores with an average error of 0.53 per score prediction. This is roughly equivalent to predicting half of reviews perfectly, and half off by one star."
79 | ]
80 | },
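{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick arithmetic sanity check of that interpretation (a sketch, not an exact decomposition of the errors):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If half the predictions were exact (error 0) and half were off by one star\n",
"# (error 1), the mean absolute error would be close to the observed 0.53\n",
"print(0.5 * 0 + 0.5 * 1)"
]
},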
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "You could also train a classifier to predict the label, or use the embeddings within an existing ML model to encode free text features."
86 | ]
87 | }
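,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch of the first suggestion, reusing the train/test split from above and treating the score as a class label rather than a number:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A sketch: the same embedding features, with Score treated as a class label\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"\n",
"clf = RandomForestClassifier(n_estimators=100)\n",
"clf.fit(X_train, y_train)\n",
"print(f\"accuracy: {accuracy_score(y_test, clf.predict(X_test)):.2f}\")"
]
}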
88 | ],
89 | "metadata": {
90 | "kernelspec": {
91 | "display_name": "openai",
92 | "language": "python",
93 | "name": "python3"
94 | },
95 | "language_info": {
96 | "codemirror_mode": {
97 | "name": "ipython",
98 | "version": 3
99 | },
100 | "file_extension": ".py",
101 | "mimetype": "text/x-python",
102 | "name": "python",
103 | "nbconvert_exporter": "python",
104 | "pygments_lexer": "ipython3",
105 | "version": "3.9.9"
106 | },
107 | "orig_nbformat": 4,
108 | "vscode": {
109 | "interpreter": {
110 | "hash": "365536dcbde60510dc9073d6b991cd35db2d9bac356a11f5b64279a5e6708b97"
111 | }
112 | }
113 | },
114 | "nbformat": 4,
115 | "nbformat_minor": 2
116 | }
117 |
--------------------------------------------------------------------------------
/examples/Semantic_text_search_using_embeddings.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Semantic text search using embeddings\n",
8 | "\n",
9 | "We can search through all our reviews semantically in a very efficient manner and at very low cost, by simply embedding our search query, and then finding the most similar reviews. The dataset is created in the [Obtain_dataset Notebook](Obtain_dataset.ipynb)."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import pandas as pd\n",
19 | "import numpy as np\n",
20 | "\n",
21 | "datafile_path = \"data/fine_food_reviews_with_embeddings_1k.csv\"\n",
22 | "\n",
23 | "df = pd.read_csv(datafile_path)\n",
24 | "df[\"embedding\"] = df.embedding.apply(eval).apply(np.array)\n"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "Remember to use the documents embedding engine for documents (in this case reviews), and query embedding engine for queries. Note that here we just compare the cosine similarity of the embeddings of the query and the documents, and show top_n best matches."
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "name": "stdout",
41 | "output_type": "stream",
42 | "text": [
43 | "Good Buy: I liked the beans. They were vacuum sealed, plump and moist. Would recommend them for any use. I personally split and stuck them in some vodka to make vanilla extract. Yum!\n",
44 | "\n",
45 | "Jamaican Blue beans: Excellent coffee bean for roasting. Our family just purchased another 5 pounds for more roasting. Plenty of flavor and mild on acidity when roasted to a dark brown bean and befor\n",
46 | "\n",
47 | "Delicious!: I enjoy this white beans seasoning, it gives a rich flavor to the beans I just love it, my mother in law didn't know about this Zatarain's brand and now she is traying different seasoning\n",
48 | "\n"
49 | ]
50 | }
51 | ],
52 | "source": [
53 | "from openai.embeddings_utils import get_embedding, cosine_similarity\n",
54 | "\n",
55 | "# search through the reviews for a specific product\n",
56 | "def search_reviews(df, product_description, n=3, pprint=True):\n",
57 | " product_embedding = get_embedding(\n",
58 | " product_description,\n",
59 | " engine=\"text-embedding-ada-002\"\n",
60 | " )\n",
61 | " df[\"similarity\"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))\n",
62 | "\n",
63 | " results = (\n",
64 | " df.sort_values(\"similarity\", ascending=False)\n",
65 | " .head(n)\n",
66 | " .combined.str.replace(\"Title: \", \"\")\n",
67 | " .str.replace(\"; Content:\", \": \")\n",
68 | " )\n",
69 | " if pprint:\n",
70 | " for r in results:\n",
71 | " print(r[:200])\n",
72 | " print()\n",
73 | " return results\n",
74 | "\n",
75 | "\n",
76 | "results = search_reviews(df, \"delicious beans\", n=3)\n"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 3,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "name": "stdout",
86 | "output_type": "stream",
87 | "text": [
88 | "Tasty and Quick Pasta: Barilla Whole Grain Fusilli with Vegetable Marinara is tasty and has an excellent chunky vegetable marinara. I just wish there was more of it. If you aren't starving or on a \n",
89 | "\n",
90 | "sooo good: tastes so good. Worth the money. My boyfriend hates wheat pasta and LOVES this. cooks fast tastes great.I love this brand and started buying more of their pastas. Bulk is best.\n",
91 | "\n",
92 | "Handy: Love the idea of ready in a minute pasta and for that alone this product gets praise. The pasta is whole grain so that's a big plus and it actually comes out al dente. The vegetable marinara\n",
93 | "\n"
94 | ]
95 | }
96 | ],
97 | "source": [
98 | "results = search_reviews(df, \"whole wheat pasta\", n=3)"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "We can search through these reviews easily. To speed up computation, we can use a special algorithm, aimed at faster search through embeddings."
106 | ]
107 | },
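{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch of the batched (exact) option, vectorized with numpy; this is not the notebook's original approach, and approximate nearest-neighbor libraries such as faiss or annoy scale further:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A sketch: compute all cosine similarities with one matrix-vector product\n",
"# instead of a per-row apply; pre-normalized rows make dot product == cosine\n",
"import numpy as np\n",
"\n",
"matrix = np.vstack(df.embedding.values)\n",
"matrix = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)\n",
"\n",
"def fast_search(query_embedding, n=3):\n",
"    q = np.asarray(query_embedding)\n",
"    q = q / np.linalg.norm(q)\n",
"    sims = matrix @ q  # similarity against every review at once\n",
"    return df.iloc[np.argsort(-sims)[:n]]\n",
"\n",
"# e.g. fast_search(get_embedding(\"bad delivery\", engine=\"text-embedding-ada-002\"), n=1)"
]
},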
108 | {
109 | "cell_type": "code",
110 | "execution_count": 4,
111 | "metadata": {},
112 | "outputs": [
113 | {
114 | "name": "stdout",
115 | "output_type": "stream",
116 | "text": [
117 | "great product, poor delivery: The coffee is excellent and I am a repeat buyer. Problem this time was with the UPS delivery. They left the box in front of my garage door in the middle of the drivewa\n",
118 | "\n"
119 | ]
120 | }
121 | ],
122 | "source": [
123 | "results = search_reviews(df, \"bad delivery\", n=1)"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "As we can see, this can immediately deliver a lot of value. In this example we show being able to quickly find the examples of delivery failures."
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 5,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "name": "stdout",
140 | "output_type": "stream",
141 | "text": [
142 | "Extremely dissapointed: Hi,
I am very disappointed with the past shipment I received of the ONE coconut water. 3 of the boxes were leaking and the coconut water was spoiled.
Thanks."
117 | ]
118 | },
119 | "metadata": {
120 | "needs_background": "light"
121 | },
122 | "output_type": "display_data"
123 | }
124 | ],
125 | "source": [
126 | "import matplotlib.pyplot as plt\n",
127 | "import statsmodels.api as sm\n",
128 | "\n",
129 | "\n",
130 | "correlation = X_test[['percentile_cosine_similarity', 'Score']].corr().values[0,1]\n",
131 | "print('Correlation between user & vector similarity percentile metric and review number of stars (score): %.2f%%' % (100*correlation))\n",
132 | "\n",
133 | "# boxplot of cosine similarity for each score\n",
134 | "X_test.boxplot(column='percentile_cosine_similarity', by='Score')\n",
135 | "plt.title('')\n",
136 | "plt.show()\n",
137 | "plt.close()"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "We can observe a weak trend, showing that the higher the similarity score between the user and the product embedding, the higher the review score. Therefore, the user and product embeddings can weakly predict the review score - even before the user receives the product!\n",
145 | "\n",
146 | "Because this signal works in a different way than the more commonly used collaborative filtering, it can act as an additional feature to slightly improve the performance on existing problems."
147 | ]
148 | }
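,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal, hypothetical sketch of that idea: append the similarity percentile as one extra column next to whatever features an existing model already uses (the stand-in features here are random and purely illustrative):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch: X_existing stands in for the features an existing model\n",
"# already uses; the similarity percentile is appended as one extra column\n",
"import numpy as np\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"\n",
"X_existing = np.random.rand(len(X_test), 5)  # random stand-in features, illustrative only\n",
"X_augmented = np.column_stack([X_existing, X_test[\"percentile_cosine_similarity\"].values])\n",
"\n",
"model = RandomForestRegressor(n_estimators=50)\n",
"model.fit(X_augmented, X_test[\"Score\"])"
]
}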
149 | ],
150 | "metadata": {
151 | "interpreter": {
152 | "hash": "be4b5d5b73a21c599de40d6deb1129796d12dc1cc33a738f7bac13269cfcafe8"
153 | },
154 | "kernelspec": {
155 | "display_name": "Python 3.7.3 64-bit ('base': conda)",
156 | "name": "python3"
157 | },
158 | "language_info": {
159 | "codemirror_mode": {
160 | "name": "ipython",
161 | "version": 3
162 | },
163 | "file_extension": ".py",
164 | "mimetype": "text/x-python",
165 | "name": "python",
166 | "nbconvert_exporter": "python",
167 | "pygments_lexer": "ipython3",
168 | "version": "3.7.3"
169 | },
170 | "orig_nbformat": 4
171 | },
172 | "nbformat": 4,
173 | "nbformat_minor": 2
174 | }
175 |
--------------------------------------------------------------------------------
/examples/azure/completions.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Azure completions example\n",
8 | "In this example we'll try to go over all operations needed to get completions working using the Azure endpoints. \\\n",
9 | "This example focuses on completions but also touches on some other operations that are also available using the API. This example is meant to be a quick way of showing simple operations and is not meant as a tutorial."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import openai\n",
19 | "from openai import cli"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Setup\n",
27 | "For the following sections to work properly we first have to setup some things. Let's start with the `api_base` and `api_version`. To find your `api_base` go to https://portal.azure.com, find your resource and then under \"Resource Management\" -> \"Keys and Endpoints\" look for the \"Endpoint\" value."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "openai.api_version = '2022-12-01'\n",
37 | "openai.api_base = '' # Please add your endpoint here"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "We next have to setup the `api_type` and `api_key`. We can either get the key from the portal or we can get it through Microsoft Active Directory Authentication. Depending on this the `api_type` is either `azure` or `azure_ad`."
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "### Setup: Portal\n",
52 | "Let's first look at getting the key from the portal. Go to https://portal.azure.com, find your resource and then under \"Resource Management\" -> \"Keys and Endpoints\" look for one of the \"Keys\" values."
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "openai.api_type = 'azure'\n",
62 | "openai.api_key = '' # Please add your api key here"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "### (Optional) Setup: Microsoft Active Directory Authentication\n",
70 | "Let's now see how we can get a key via Microsoft Active Directory Authentication. Uncomment the following code if you want to use Active Directory Authentication instead of keys from the portal."
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "# from azure.identity import DefaultAzureCredential\n",
80 | "\n",
81 | "# default_credential = DefaultAzureCredential()\n",
82 | "# token = default_credential.get_token(\"https://cognitiveservices.azure.com/.default\")\n",
83 | "\n",
84 | "# openai.api_type = 'azure_ad'\n",
85 | "# openai.api_key = token.token"
86 | ]
87 | },
88 | {
89 | "attachments": {},
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "## Deployments\n",
94 | "In this section we are going to create a deployment using the `text-davinci-002` model that we can then use to create completions."
95 | ]
96 | },
97 | {
98 | "attachments": {},
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "### Deployments: Create manually\n",
103 | "Create a new deployment by going to your Resource in your portal under \"Resource Management\" -> \"Model deployments\". Select `text-davinci-002` as the model."
104 | ]
105 | },
106 | {
107 | "attachments": {},
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "### (Optional) Deployments: Create programatically\n",
112 | "We can also create a deployment using code:"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "model = \"text-davinci-002\"\n",
122 | "\n",
123 | "# Now let's create the deployment\n",
124 | "print(f'Creating a new deployment with model: {model}')\n",
125 | "result = openai.Deployment.create(model=model, scale_settings={\"scale_type\":\"standard\"})\n",
126 | "deployment_id = result[\"id\"]\n",
127 | "print(f'Successfully created deployment with id: {deployment_id}')"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | "### (Optional) Deployments: Wait for deployment to succeed\n",
135 | "Now let's check the status of the newly created deployment and wait till it is succeeded."
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "print(f'Checking for deployment status.')\n",
145 | "resp = openai.Deployment.retrieve(id=deployment_id)\n",
146 | "status = resp[\"status\"]\n",
147 | "print(f'Deployment {deployment_id} has status: {status}')\n",
148 | "while status not in [\"succeeded\", \"failed\"]:\n",
149 | " resp = openai.Deployment.retrieve(id=deployment_id)\n",
150 | " status = resp[\"status\"]\n",
151 | " print(f'Deployment {deployment_id} has status: {status}')"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "### Completions\n",
159 | "Now let's send a sample completion to the deployment."
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {},
166 | "outputs": [],
167 | "source": [
168 | "prompt = \"The food was delicious and the waiter\"\n",
169 | "completion = openai.Completion.create(deployment_id=deployment_id,\n",
170 | " prompt=prompt, stop=\".\", temperature=0)\n",
171 | " \n",
172 | "print(f\"{prompt}{completion['choices'][0]['text']}.\")"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "### (Optional) Deployments: Delete\n",
180 | "Finally let's delete the deployment"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "print(f'Deleting deployment: {deployment_id}')\n",
190 | "openai.Deployment.delete(sid=deployment_id)"
191 | ]
192 |   }
200 | ],
201 | "metadata": {
202 | "kernelspec": {
203 | "display_name": "Python 3 (ipykernel)",
204 | "language": "python",
205 | "name": "python3"
206 | },
207 | "language_info": {
208 | "codemirror_mode": {
209 | "name": "ipython",
210 | "version": 3
211 | },
212 | "file_extension": ".py",
213 | "mimetype": "text/x-python",
214 | "name": "python",
215 | "nbconvert_exporter": "python",
216 | "pygments_lexer": "ipython3",
217 | "version": "3.10.8"
218 | },
219 | "vscode": {
220 | "interpreter": {
221 | "hash": "3a5103089ab7e7c666b279eeded403fcec76de49a40685dbdfe9f9c78ad97c17"
222 | }
223 | }
224 | },
225 | "nbformat": 4,
226 | "nbformat_minor": 2
227 | }
228 |
--------------------------------------------------------------------------------
/examples/azure/embeddings.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Azure embeddings example\n",
9 | "In this example we'll try to go over all operations for embeddings that can be done using the Azure endpoints. \\\n",
10 | "This example focuses on embeddings but also touches some other operations that are also available using the API. This example is meant to be a quick way of showing simple operations and is not meant as a tutorial."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "import openai\n",
20 | "from openai import cli"
21 | ]
22 | },
23 | {
24 | "attachments": {},
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## Setup\n",
29 | "For the following sections to work properly we first have to setup some things. Let's start with the `api_base` and `api_version`. To find your `api_base` go to https://portal.azure.com, find your resource and then under \"Resource Management\" -> \"Keys and Endpoints\" look for the \"Endpoint\" value."
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "openai.api_version = '2022-12-01'\n",
39 | "openai.api_base = '' # Please add your endpoint here"
40 | ]
41 | },
42 | {
43 | "attachments": {},
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "We next have to setup the `api_type` and `api_key`. We can either get the key from the portal or we can get it through Microsoft Active Directory Authentication. Depending on this the `api_type` is either `azure` or `azure_ad`."
48 | ]
49 | },
50 | {
51 | "attachments": {},
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 | "### Setup: Portal\n",
56 | "Let's first look at getting the key from the portal. Go to https://portal.azure.com, find your resource and then under \"Resource Management\" -> \"Keys and Endpoints\" look for one of the \"Keys\" values."
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "openai.api_type = 'azure'\n",
66 | "openai.api_key = '' # Please add your api key here"
67 | ]
68 | },
69 | {
70 | "attachments": {},
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "### (Optional) Setup: Microsoft Active Directory Authentication\n",
75 | "Let's now see how we can get a key via Microsoft Active Directory Authentication. Uncomment the following code if you want to use Active Directory Authentication instead of keys from the portal."
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "# from azure.identity import DefaultAzureCredential\n",
85 | "\n",
86 | "# default_credential = DefaultAzureCredential()\n",
87 | "# token = default_credential.get_token(\"https://cognitiveservices.azure.com/.default\")\n",
88 | "\n",
89 | "# openai.api_type = 'azure_ad'\n",
90 | "# openai.api_key = token.token"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "## Deployments\n",
98 | "In this section we are going to create a deployment that we can use to create embeddings."
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "### Deployments: Create manually\n",
106 | "Let's create a deployment using the `text-similarity-curie-001` model. Create a new deployment by going to your Resource in your portal under \"Resource Management\" -> \"Model deployments\"."
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "### (Optional) Deployments: Create programatically\n",
114 | "We can also create a deployment using code:"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "model = \"text-similarity-curie-001\"\n",
124 | "\n",
125 | "# Now let's create the deployment\n",
126 | "print(f'Creating a new deployment with model: {model}')\n",
127 | "result = openai.Deployment.create(model=model, scale_settings={\"scale_type\":\"standard\"})\n",
128 | "deployment_id = result[\"id\"]"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "### (Optional) Deployments: Retrieving\n",
136 | "Now let's check the status of the newly created deployment"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "print(f'Checking for deployment status.')\n",
146 | "resp = openai.Deployment.retrieve(id=deployment_id)\n",
147 | "status = resp[\"status\"]\n",
148 | "print(f'Deployment {deployment_id} is with status: {status}')"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "### Deployments: Listing\n",
156 | "Now because creating a new deployment takes a long time, let's look in the subscription for an already finished deployment that succeeded."
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {},
163 | "outputs": [],
164 | "source": [
165 | "print('While deployment running, selecting a completed one that supports embeddings.')\n",
166 | "deployment_id = None\n",
167 | "result = openai.Deployment.list()\n",
168 | "for deployment in result.data:\n",
169 | " if deployment[\"status\"] != \"succeeded\":\n",
170 | " continue\n",
171 | " \n",
172 | " model = openai.Model.retrieve(deployment[\"model\"])\n",
173 | " if model[\"capabilities\"][\"embeddings\"] != True:\n",
174 | " continue\n",
175 | " \n",
176 | " deployment_id = deployment[\"id\"]\n",
177 | " break\n",
178 | "\n",
179 | "if not deployment_id:\n",
180 | " print('No deployment with status: succeeded found.')\n",
181 | "else:\n",
182 | " print(f'Found a succeeded deployment that supports embeddings with id: {deployment_id}.')"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "### Embeddings\n",
190 | "Now let's send a sample embedding to the deployment."
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": null,
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "embeddings = openai.Embedding.create(deployment_id=deployment_id,\n",
200 | " input=\"The food was delicious and the waiter...\")\n",
201 | " \n",
202 | "print(embeddings)"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "### (Optional) Deployments: Delete\n",
210 | "Finally let's delete the deployment"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {},
217 | "outputs": [],
218 | "source": [
219 | "print(f'Deleting deployment: {deployment_id}')\n",
220 | "openai.Deployment.delete(sid=deployment_id)"
221 | ]
222 | }
223 | ],
224 | "metadata": {
225 | "kernelspec": {
226 | "display_name": "Python 3",
227 | "language": "python",
228 | "name": "python3"
229 | },
230 | "language_info": {
231 | "codemirror_mode": {
232 | "name": "ipython",
233 | "version": 3
234 | },
235 | "file_extension": ".py",
236 | "mimetype": "text/x-python",
237 | "name": "python",
238 | "nbconvert_exporter": "python",
239 | "pygments_lexer": "ipython3",
240 | "version": "3.10.8"
241 | },
242 | "vscode": {
243 | "interpreter": {
244 | "hash": "3a5103089ab7e7c666b279eeded403fcec76de49a40685dbdfe9f9c78ad97c17"
245 | }
246 | }
247 | },
248 | "nbformat": 4,
249 | "nbformat_minor": 2
250 | }
251 |
--------------------------------------------------------------------------------
/examples/azure/finetuning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Azure Fine tuning example\n",
8 | "In this example we'll try to go over all operations that can be done using the Azure endpoints and their differences with the openAi endpoints (if any).
\n",
9 | "This example focuses on finetuning but also touches on the majority of operations that are available using the API. This example is meant to be a quick way of showing simple operations and is not meant as a finetune model adaptation tutorial.\n"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import openai\n",
19 | "from openai import cli"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Setup\n",
27 | "For the following sections to work properly we first have to setup some things. Let's start with the `api_base` and `api_version`. To find your `api_base` go to https://portal.azure.com, find your resource and then under \"Resource Management\" -> \"Keys and Endpoints\" look for the \"Endpoint\" value."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "openai.api_version = '2022-12-01'\n",
37 | "openai.api_base = '' # Please add your endpoint here"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "We next have to setup the `api_type` and `api_key`. We can either get the key from the portal or we can get it through Microsoft Active Directory Authentication. Depending on this the `api_type` is either `azure` or `azure_ad`."
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "### Setup: Portal\n",
52 | "Let's first look at getting the key from the portal. Go to https://portal.azure.com, find your resource and then under \"Resource Management\" -> \"Keys and Endpoints\" look for one of the \"Keys\" values."
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "openai.api_type = 'azure'\n",
62 | "openai.api_key = '' # Please add your api key here"
63 | ]
64 | },
65 | {
66 | "attachments": {},
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "### (Optional) Setup: Microsoft Active Directory Authentication\n",
71 | "Let's now see how we can get a key via Microsoft Active Directory Authentication. Uncomment the following code if you want to use Active Directory Authentication instead of keys from the portal."
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "# from azure.identity import DefaultAzureCredential\n",
81 | "\n",
82 | "# default_credential = DefaultAzureCredential()\n",
83 | "# token = default_credential.get_token(\"https://cognitiveservices.azure.com/.default\")\n",
84 | "\n",
85 | "# openai.api_type = 'azure_ad'\n",
86 | "# openai.api_key = token.token"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "## Files\n",
94 | "In the next section we will focus on the files operations: importing, listing, retrieving, deleting. For this we need to create 2 temporary files with some sample data. For the sake of simplicity, we will use the same data for training and validation."
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "import shutil\n",
104 | "import json\n",
105 | "\n",
106 | "training_file_name = 'training.jsonl'\n",
107 | "validation_file_name = 'validation.jsonl'\n",
108 | "\n",
109 | "sample_data = [{\"prompt\": \"When I go to the store, I want an\", \"completion\": \"apple.\"},\n",
110 | " {\"prompt\": \"When I go to work, I want a\", \"completion\": \"coffee.\"},\n",
111 | " {\"prompt\": \"When I go home, I want a\", \"completion\": \"soda.\"}]\n",
112 | "\n",
113 | "print(f'Generating the training file: {training_file_name}')\n",
114 | "with open(training_file_name, 'w') as training_file:\n",
115 | " for entry in sample_data:\n",
116 | " json.dump(entry, training_file)\n",
117 | " training_file.write('\\n')\n",
118 | "\n",
119 | "print(f'Copying the training file to the validation file')\n",
120 | "shutil.copy(training_file_name, validation_file_name)"
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "metadata": {},
126 | "source": [
127 | "### Files: Listing\n",
128 | "List all of the uploaded files and check for the ones that are named \"training.jsonl\" or \"validation.jsonl\""
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "print('Checking for existing uploaded files.')\n",
138 | "results = []\n",
139 | "files = openai.File.list().data\n",
140 | "print(f'Found {len(files)} total uploaded files in the subscription.')\n",
141 | "for item in files:\n",
142 | " if item[\"filename\"] in [training_file_name, validation_file_name]:\n",
143 | " results.append(item[\"id\"])\n",
144 | "print(f'Found {len(results)} already uploaded files that match our names.')\n"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "### Files: Deleting\n",
152 | "Let's now delete those found files (if any) since we're going to be re-uploading them next."
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "print(f'Deleting already uploaded files...')\n",
162 | "for id in results:\n",
163 | " openai.File.delete(sid = id)\n"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "### Files: Importing & Retrieving\n",
171 | "Now, let's import our two files ('training.jsonl' and 'validation.jsonl') and keep those IDs since we're going to use them later for finetuning.
\n",
172 | "For this operation we are going to use the cli wrapper which does a bit more checks before uploading and also gives us progress. In addition, after uploading we're going to check the status our import until it has succeeded (or failed if something goes wrong)"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "import time\n",
182 | "\n",
183 | "def check_status(training_id, validation_id):\n",
184 | " train_status = openai.File.retrieve(training_id)[\"status\"]\n",
185 | " valid_status = openai.File.retrieve(validation_id)[\"status\"]\n",
186 | " print(f'Status (training_file | validation_file): {train_status} | {valid_status}')\n",
187 | " return (train_status, valid_status)\n",
188 | "\n",
189 | "#importing our two files\n",
190 | "training_id = cli.FineTune._get_or_upload(training_file_name, True)\n",
191 | "validation_id = cli.FineTune._get_or_upload(validation_file_name, True)\n",
192 | "\n",
193 | "#checking the status of the imports\n",
194 | "(train_status, valid_status) = check_status(training_id, validation_id)\n",
195 | "\n",
196 | "while train_status not in [\"succeeded\", \"failed\"] or valid_status not in [\"succeeded\", \"failed\"]:\n",
197 | " time.sleep(1)\n",
198 | " (train_status, valid_status) = check_status(training_id, validation_id)\n"
199 | ]
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "metadata": {},
204 | "source": [
205 | "### Files: Downloading\n",
206 | "Now let's download one of the files, the training file for example, to check that everything was in order during importing and all bits are there."
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "print(f'Downloading training file: {training_id}')\n",
216 | "result = openai.File.download(training_id)\n",
217 | "print(result.decode('utf-8'))"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "## Finetune\n",
225 | "In this section we are going to use the two training and validation files that we imported in the previous section, to train a finetune model."
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "### Finetune: Adapt\n",
233 | "First let's create the finetune adaptation job."
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "create_args = {\n",
243 | " \"training_file\": training_id,\n",
244 | " \"validation_file\": validation_id,\n",
245 | " \"model\": \"babbage\",\n",
246 | " \"compute_classification_metrics\": True,\n",
247 | " \"classification_n_classes\": 3,\n",
248 | " \"n_epochs\": 20,\n",
249 | " \"batch_size\": 3,\n",
250 | " \"learning_rate_multiplier\": 0.3\n",
251 | "}\n",
252 | "resp = openai.FineTune.create(**create_args)\n",
253 | "job_id = resp[\"id\"]\n",
254 | "status = resp[\"status\"]\n",
255 | "\n",
256 | "print(f'Fine-tunning model with jobID: {job_id}.')"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {},
262 | "source": [
263 | "### Finetune: Streaming\n",
264 | "While the job runs, we can subscribe to the streaming events to check the progress of the operation."
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "import signal\n",
274 | "import datetime\n",
275 | "\n",
276 | "def signal_handler(sig, frame):\n",
277 | " status = openai.FineTune.retrieve(job_id).status\n",
278 | " print(f\"Stream interrupted. Job is still {status}.\")\n",
279 | " return\n",
280 | "\n",
281 | "print(f'Streaming events for the fine-tuning job: {job_id}')\n",
282 | "signal.signal(signal.SIGINT, signal_handler)\n",
283 | "\n",
284 | "events = openai.FineTune.stream_events(job_id)\n",
285 | "try:\n",
286 | " for event in events:\n",
287 | " print(f'{datetime.datetime.fromtimestamp(event[\"created_at\"])} {event[\"message\"]}')\n",
288 | "\n",
289 | "except Exception:\n",
290 | " print(\"Stream interrupted (client disconnected).\")"
291 | ]
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | "### Finetune: Listing and Retrieving\n",
298 | "Now let's check that our operation was successful and in addition we can look at all of the finetuning operations using a list operation."
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "status = openai.FineTune.retrieve(id=job_id)[\"status\"]\n",
308 | "if status not in [\"succeeded\", \"failed\"]:\n",
309 | " print(f'Job not in terminal status: {status}. Waiting.')\n",
310 | " while status not in [\"succeeded\", \"failed\"]:\n",
311 | " time.sleep(2)\n",
312 | " status = openai.FineTune.retrieve(id=job_id)[\"status\"]\n",
313 | " print(f'Status: {status}')\n",
314 | "else:\n",
315 | " print(f'Finetune job {job_id} finished with status: {status}')\n",
316 | "\n",
317 | "print('Checking other finetune jobs in the subscription.')\n",
318 | "result = openai.FineTune.list()\n",
319 | "print(f'Found {len(result.data)} finetune jobs.')"
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "metadata": {},
325 | "source": [
326 | "### Finetune: Deleting\n",
327 | "Finally we can delete our finetune job.
\n",
328 | "WARNING: Please skip this step if you want to continue with the next section as the finetune model is needed. (The delete code is commented out by default)"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": null,
334 | "metadata": {},
335 | "outputs": [],
336 | "source": [
337 | "# openai.FineTune.delete(sid=job_id)"
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {},
343 | "source": [
344 | "## Deployments\n",
345 | "In this section we are going to create a deployment using the finetune model that we just adapted and then used the deployment to create a simple completion operation."
346 | ]
347 | },
348 | {
349 | "cell_type": "markdown",
350 | "metadata": {},
351 | "source": [
352 | "### Deployments: Create\n",
353 | "Let's create a deployment using the fine-tune model."
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "metadata": {},
360 | "outputs": [],
361 | "source": [
362 | "#Fist let's get the model of the previous job:\n",
363 | "result = openai.FineTune.retrieve(id=job_id)\n",
364 | "if result[\"status\"] == 'succeeded':\n",
365 | " model = result[\"fine_tuned_model\"]\n",
366 | "\n",
367 | "# Now let's create the deployment\n",
368 | "print(f'Creating a new deployment with model: {model}')\n",
369 | "result = openai.Deployment.create(model=model, scale_settings={\"scale_type\":\"standard\"})\n",
370 | "deployment_id = result[\"id\"]\n"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "### Deployments: Retrieving\n",
378 | "Now let's check the status of the newly created deployment"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "metadata": {},
385 | "outputs": [],
386 | "source": [
387 | "print(f'Checking for deployment status.')\n",
388 | "resp = openai.Deployment.retrieve(id=deployment_id)\n",
389 | "status = resp[\"status\"]\n",
390 | "print(f'Deployment {deployment_id} is with status: {status}')\n"
391 | ]
392 | },
393 | {
394 | "cell_type": "markdown",
395 | "metadata": {},
396 | "source": [
397 | "### Deployments: Listing\n",
398 | "Now because creating a new deployment takes a long time, let's look in the subscription for an already finished deployment that succeeded."
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": null,
404 | "metadata": {},
405 | "outputs": [],
406 | "source": [
407 | "print('While deployment running, selecting a completed one.')\n",
408 | "deployment_id = None\n",
409 | "result = openai.Deployment.list()\n",
410 | "for deployment in result.data:\n",
411 | " if deployment[\"status\"] == \"succeeded\":\n",
412 | " deployment_id = deployment[\"id\"]\n",
413 | " break\n",
414 | "\n",
415 | "if not deployment_id:\n",
416 | " print('No deployment with status: succeeded found.')\n",
417 | "else:\n",
418 | " print(f'Found a successful deployment with id: {deployment_id}.')\n"
419 | ]
420 | },
421 | {
422 | "cell_type": "markdown",
423 | "metadata": {},
424 | "source": [
425 | "### Completions\n",
426 | "Now let's send a sample completion to the deployment."
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": null,
432 | "metadata": {},
433 | "outputs": [],
434 | "source": [
435 | "print('Sending a test completion job')\n",
436 | "start_phrase = 'When I go home, I want a'\n",
437 | "response = openai.Completion.create(deployment_id=deployment_id, prompt=start_phrase, temperature=0, stop=\".\")\n",
438 | "text = response['choices'][0]['text'].replace('\\n', '').replace(' .', '.').strip()\n",
439 | "print(f'\"{start_phrase} {text}.\"')"
440 | ]
441 | },
442 | {
443 | "cell_type": "markdown",
444 | "metadata": {},
445 | "source": [
446 | "### Deployments: Delete\n",
447 | "Finally let's delete the deployment"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {},
454 | "outputs": [],
455 | "source": [
456 | "print(f'Deleting deployment: {deployment_id}')\n",
457 | "openai.Deployment.delete(sid=deployment_id)"
458 | ]
459 | },
460 | {
461 | "cell_type": "markdown",
462 | "metadata": {},
463 | "source": [
464 | "Thank you"
465 | ]
466 | }
467 | ],
468 | "metadata": {
469 | "kernelspec": {
470 | "display_name": "Python 3",
471 | "language": "python",
472 | "name": "python3"
473 | },
474 | "language_info": {
475 | "codemirror_mode": {
476 | "name": "ipython",
477 | "version": 3
478 | },
479 | "file_extension": ".py",
480 | "mimetype": "text/x-python",
481 | "name": "python",
482 | "nbconvert_exporter": "python",
483 | "pygments_lexer": "ipython3",
484 | "version": "3.10.8"
485 | },
486 | "vscode": {
487 | "interpreter": {
488 | "hash": "3a5103089ab7e7c666b279eeded403fcec76de49a40685dbdfe9f9c78ad97c17"
489 | }
490 | }
491 | },
492 | "nbformat": 4,
493 | "nbformat_minor": 2
494 | }
495 |
--------------------------------------------------------------------------------
/examples/data/25000_spend_dataset_current.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/examples/data/25000_spend_dataset_current.csv
--------------------------------------------------------------------------------
/examples/data/labelled_transactions.csv:
--------------------------------------------------------------------------------
1 | Date,Supplier,Description,Transaction value (£),Classification
2 | 15/08/2016,Creative Video Productions Ltd,Kelvin Hall,26866,Other
3 | 29/05/2017,John Graham Construction Ltd,Causewayside Refurbishment,74806,Building Improvement
4 | 29/05/2017,Morris & Spottiswood Ltd,George IV Bridge Work,56448,Building Improvement
5 | 31/05/2017,John Graham Construction Ltd,Causewayside Refurbishment,164691,Building Improvement
6 | 24/07/2017,John Graham Construction Ltd,Causewayside Refurbishment,27926,Building Improvement
7 | 24/07/2017,John Graham Construction Ltd,Causewayside Refurbishment,212690,Building Improvement
8 | 16/08/2017,John Graham Construction Ltd,Causewayside Refurbishment,59021,Building Improvement
9 | 16/08/2017,John Graham Construction Ltd,Causewayside Refurbishment,136379,Building Improvement
10 | 23/08/2017,Culture And Sport Glasgow,Kelvin Hall,60503,Building Improvement
11 | 23/08/2017,XMA Scotland Ltd,Kelvin Hall,31830,Building Improvement
12 | 31/08/2017,John Graham Construction Ltd,Causewayside Refurbishment,36313,Building Improvement
13 | 31/08/2017,Insight Direct (UK) Ltd,Causewayside Refurbishment,68222,Building Improvement
14 | 31/08/2017,Mark Finn Laboratory,George IV Bridge Work,53884,Building Improvement
15 | 11/09/2017,John Graham Construction Ltd,Causewayside Refurbishment,189483,Building Improvement
16 | 23/10/2017,John Graham Construction Ltd,Causewayside Refurbishment,151659,Building Improvement
17 | 23/10/2017,City Building LLP,Causewayside Refurbishment,53147,Building Improvement
18 | 07/02/2017,John Graham Construction Ltd,Causewayside Refurbishment,52404,Building Improvement
19 | 13/02/2017,John Graham Construction Ltd,Causewayside Refurbishment,272390,Building Improvement
20 | 06/03/2017,John Graham Construction Ltd,Causewayside Refurbishment,31781,Building Improvement
21 | 06/03/2017,John Graham Construction Ltd,Causewayside Refurbishment,198048,Building Improvement
22 | 31/03/2017,Nicholson Bros(Electrical Contractors) Ltd,Causewayside Refurbishment,33666,Building Improvement
23 | 31/03/2017,John Graham Construction Ltd,Causewayside Refurbishment,222090,Building Improvement
24 | 31/03/2017,John Graham Construction Ltd,Causewayside Refurbishment,63971,Building Improvement
25 | 24/04/2017,Scottish Historic Buildings Trust,Lawnmarket Work,50057,Building Improvement
26 | 30/04/2017,Morris & Spottiswood Ltd,George IV Bridge Work,63716,Building Improvement
27 | 15/05/2017,John Graham Construction Ltd,Causewayside Refurbishment,245381,Building Improvement
28 | 12/09/2016,Flexiform,Kelvin Hall,42623,Building Improvement
29 | 12/09/2016,John Graham Construction Ltd,Causewayside Refurbishment,228689,Building Improvement
30 | 26/09/2016,Senator International,Kelvin Hall,35706,Building Improvement
31 | 26/09/2016,John Graham Construction Ltd,Causewayside Refurbishment,28378,Building Improvement
32 | 30/09/2016,A McGillivray,Causewayside Refurbishment,44392,Building Improvement
33 | 10/10/2016,John Graham Construction Ltd,Causewayside Refurbishment,303999,Building Improvement
34 | 31/10/2016,John Graham Construction Ltd,Causewayside Refurbishment,74245,Building Improvement
35 | 07/11/2016,CBRE,Kelvin Hall,83736,Building Improvement
36 | 14/11/2016,University Of Glasgow,Kelvin Hall,188682,Building Improvement
37 | 14/11/2016,John Graham Construction Ltd,Causewayside Refurbishment,362326,Building Improvement
38 | 12/12/2016,John Graham Construction Ltd,Causewayside Refurbishment,385310,Building Improvement
39 | 30/12/2016,John Graham Construction Ltd,Causewayside Refurbishment,253618,Building Improvement
40 | 30/12/2016,John Graham Construction Ltd,Causewayside Refurbishment,45127,Building Improvement
41 | 21/04/2016,M & J Ballantyne Ltd,George IV Bridge Work,35098,Building Improvement
42 | 09/05/2016,John Graham Construction Ltd,Causewayside Refurbishment,64361,Building Improvement
43 | 09/05/2016,A McGillivray,Causewayside Refurbishment,53690,Building Improvement
44 | 16/05/2016,John Graham Construction Ltd,Causewayside Refurbishment,365344,Building Improvement
45 | 10/06/2016,Wavetek Ltd,Kelvin Hall,87589,Building Improvement
46 | 10/06/2016,John Graham Construction Ltd,Causewayside Refurbishment,381803,Building Improvement
47 | 30/06/2016,Glasgow City Council,Kelvin Hall,1700000,Building Improvement
48 | 11/07/2016,Wavetek Ltd,Kelvin Hall,65692,Building Improvement
49 | 11/07/2016,John Graham Construction Ltd,Causewayside Refurbishment,139845,Building Improvement
50 | 25/07/2016,A McGillivray,Causewayside Refurbishment,30113,Building Improvement
51 | 15/08/2016,John Graham Construction Ltd,Causewayside Refurbishment,196807,Building Improvement
52 | 06/11/2017,John Graham Construction Ltd,Causewayside Refurbishment,134208,Building Improvement
53 | 31/03/2017,NLS Foundation,Grant Payment,177500,Other
54 | 09/10/2017,Frost And Sullivan Ltd,Literary & Archival Items,28125,Literature & Archive
55 | 09/10/2017,JISC Services Ltd ,Literary & Archival Items,43481,Literature & Archive
56 | 27/02/2017,Cengage Learning (Emea )Ltd,Literary & Archival Items,43302,Literature & Archive
57 | 06/03/2017,Private Sale,Literary & Archival Items,72500,Literature & Archive
58 | 31/03/2017,Private Sale,Literary & Archival Items,3422500,Literature & Archive
59 | 24/04/2017,Cengage Learning (Emea )Ltd,Literary & Archival Items,43302,Literature & Archive
60 | 22/05/2017,ALDL,Legal Deposit Services,27067,Literature & Archive
61 | 19/09/2016,Jisc Services Ltd Subscription Account,Literary & Archival Items,42629,Literature & Archive
62 | 10/10/2016,Cengage Learning (Emea )Ltd,Literary & Archival Items,86604,Literature & Archive
63 | 24/10/2016,ALDL,ALDL Charges,32317,Literature & Archive
64 | 26/04/2016,Private Sale,Literary & Archival Items,30000,Literature & Archive
65 | 30/05/2016,ALDL,ALDL Charges,32317,Literature & Archive
66 | 15/07/2016,Sotheby'S,Literary & Archival Items,28500,Literature & Archive
67 | 18/07/2016,Christies,Literary & Archival Items,33800,Literature & Archive
68 | 31/07/2016,ALDL,ALDL Charges,32317,Literature & Archive
69 | 08/12/2016,Sothebys,Literary & Archival Items,166000,Literature & Archive
70 | 08/12/2016,Private Sale,Literary & Archival Items,87500,Literature & Archive
71 | 26/06/2017,ECG Facilities Service,Facilities Management Charge,33386,Utility Bills
72 | 26/06/2017,British Library,Legal Deposit Services,50056,Other
73 | 24/07/2017,ALDL,Legal Deposit Services,27067,Other
74 | 16/08/2017,ECG Facilities Service,Facilities Management Charge,33386,Utility Bills
75 | 23/08/2017,ECG Facilities Service,Facilities Management Charge,33386,Utility Bills
76 | 07/02/2017,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills
77 | 27/02/2017,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills
78 | 27/03/2017,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills
79 | 22/05/2017,ECG Facilities Service,Facilities Management Charge,33386,Utility Bills
80 | 26/09/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills
81 | 24/10/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills
82 | 08/12/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills
83 | 30/12/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills
84 | 23/05/2016,ECG Facilities Service,Facilities Management Charge,32777,Utility Bills
85 | 23/05/2016,ECG Facilities Service,Facilities Management Charge,32777,Utility Bills
86 | 28/06/2016,ECG Facilities Service,Facilities Management Charge,32832,Utility Bills
87 | 08/08/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills
88 | 24/08/2016,ECG Facilities Service,Facilities Management Charge,32795,Utility Bills
89 | 30/10/2017,ECG Facilities Service,Facilities Management Charge,35758,Utility Bills
90 | 16/08/2017,Ex Libris,IT equipment,76610,Software/IT
91 | 31/03/2017,XMA Scotland Ltd,IT equipment,33450,Software/IT
92 | 31/03/2017,XMA Scotland Ltd,IT equipment,84524,Software/IT
93 | 24/04/2017,Insight Direct (UK) Ltd,IT equipment,56768,Software/IT
94 | 09/05/2016,Computacenter Uk,Kelvin Hall,72835,Software/IT
95 | 23/05/2016,Computacenter Uk,Kelvin Hall,26506,Software/IT
96 | 15/09/2017,City Of Edinburgh Council,Non Domestic Rates ,57662,Utility Bills
97 | 15/09/2017,City Of Edinburgh Council,Non Domestic Rates ,142680,Utility Bills
98 | 08/05/2017,Anglian Water Business,Water,26832,Utility Bills
99 | 30/04/2016,City Of Edinburgh Council,Non Domestic Rates ,40800,Utility Bills
100 | 12/09/2016,City Of Edinburgh Council,Non Domestic Rates ,144330,Utility Bills
101 | 12/09/2016,City Of Edinburgh Council,Non Domestic Rates ,49827,Utility Bills
102 | 24/07/2017,AM Phillip,Vehicle Purchase,26604,Other
--------------------------------------------------------------------------------
/examples/data/recommendations_embeddings_cache.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/examples/data/recommendations_embeddings_cache.pkl
--------------------------------------------------------------------------------
/examples/fine-tuned_qa/answers_with_ft.py:
--------------------------------------------------------------------------------
1 | """
2 | Note: To answer questions based on text documents, we recommend the procedure in
3 | [Question Answering using Embeddings](https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb).
4 | Some of the code below may rely on [deprecated API endpoints](https://github.com/openai/openai-cookbook/tree/main/transition_guides_for_deprecated_API_endpoints).
5 | """
6 |
7 | import argparse
8 |
9 | import openai
10 |
11 |
12 | def create_context(
13 | question, search_file_id, max_len=1800, search_model="ada", max_rerank=10
14 | ):
15 | """
16 | Create a context for a question by finding the most similar context from the search file.
17 | :param question: The question
18 | :param search_file_id: The file id of the search file
19 | :param max_len: The maximum length of the returned context (in tokens)
20 | :param search_model: The search model to use
21 | :param max_rerank: The maximum number of documents to rerank
22 | :return: The context
23 | """
24 | results = openai.Engine(search_model).search(
25 | search_model=search_model,
26 | query=question,
27 | max_rerank=max_rerank,
28 | file=search_file_id,
29 | return_metadata=True,
30 | )
31 | returns = []
32 | cur_len = 0
33 | for result in results["data"]:
34 | cur_len += int(result["metadata"]) + 4
35 | if cur_len > max_len:
36 | break
37 | returns.append(result["text"])
38 | return "\n\n###\n\n".join(returns)
39 |
40 |
41 | def answer_question(
42 | search_file_id="",
43 | fine_tuned_qa_model="",
44 | question="Which country won the European Football championship in 2021?",
45 | max_len=1800,
46 | search_model="ada",
47 | max_rerank=10,
48 | debug=False,
49 | stop_sequence=["\n", "."],
50 | max_tokens=100,
51 | ):
52 | """
53 | Answer a question based on the most similar context from the search file, using your fine-tuned model.
54 | :param question: The question
55 | :param fine_tuned_qa_model: The fine tuned QA model
56 | :param search_file_id: The file id of the search file
57 | :param max_len: The maximum length of the returned context (in tokens)
58 | :param search_model: The search model to use
59 | :param max_rerank: The maximum number of documents to rerank
60 | :param debug: Whether to output debug information
61 | :param stop_sequence: The stop sequence for Q&A model
62 | :param max_tokens: The maximum number of tokens to return
63 | :return: The answer
64 | """
65 | context = create_context(
66 | question,
67 | search_file_id,
68 | max_len=max_len,
69 | search_model=search_model,
70 | max_rerank=max_rerank,
71 | )
72 | if debug:
73 | print("Context:\n" + context)
74 | print("\n\n")
75 | try:
76 | # fine-tuned models require the model parameter, whereas other models require the engine parameter
77 | model_param = (
78 | {"model": fine_tuned_qa_model}
79 | if ":" in fine_tuned_qa_model
80 | and fine_tuned_qa_model.split(":")[1].startswith("ft")
81 | else {"engine": fine_tuned_qa_model}
82 | )
83 | response = openai.Completion.create(
84 | prompt=f"Answer the question based on the context below\n\nText: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
85 | temperature=0,
86 | max_tokens=max_tokens,
87 | top_p=1,
88 | frequency_penalty=0,
89 | presence_penalty=0,
90 | stop=stop_sequence,
91 | **model_param,
92 | )
93 | return response["choices"][0]["text"]
94 | except Exception as e:
95 | print(e)
96 | return ""
97 |
98 |
99 | if __name__ == "__main__":
100 | parser = argparse.ArgumentParser(
101 | description="Rudimentary functionality of the answers endpoint with a fine-tuned Q&A model.",
102 | formatter_class=argparse.ArgumentDefaultsHelpFormatter,
103 | )
104 | parser.add_argument(
105 | "--search_file_id", help="Search file id", required=True, type=str
106 | )
107 | parser.add_argument(
108 | "--fine_tuned_qa_model", help="Fine-tuned QA model id", required=True, type=str
109 | )
110 | parser.add_argument(
111 | "--question", help="Question to answer", required=True, type=str
112 | )
113 | parser.add_argument(
114 | "--max_len",
115 | help="Maximum length of the returned context (in tokens)",
116 | default=1800,
117 | type=int,
118 | )
119 | parser.add_argument(
120 | "--search_model", help="Search model to use", default="ada", type=str
121 | )
122 | parser.add_argument(
123 | "--max_rerank",
124 | help="Maximum number of reranking for the search",
125 | default=10,
126 | type=int,
127 | )
128 | parser.add_argument(
129 | "--debug", help="Print debug information (context used)", action="store_true"
130 | )
131 | parser.add_argument(
132 | "--stop_sequence",
133 | help="Stop sequences for the Q&A model",
134 | default=["\n", "."],
135 | nargs="+",
136 | type=str,
137 | )
138 | parser.add_argument(
139 | "--max_tokens",
140 | help="Maximum number of tokens to return",
141 | default=100,
142 | type=int,
143 | )
144 | args = parser.parse_args()
145 | response = answer_question(
146 | search_file_id=args.search_file_id,
147 | fine_tuned_qa_model=args.fine_tuned_qa_model,
148 | question=args.question,
149 | max_len=args.max_len,
150 | search_model=args.search_model,
151 | max_rerank=args.max_rerank,
152 | debug=args.debug,
153 | stop_sequence=args.stop_sequence,
154 | max_tokens=args.max_tokens,
155 | )
156 | print(f"Answer:{response}")
157 |
--------------------------------------------------------------------------------
/how_to_work_with_large_language_models.md:
--------------------------------------------------------------------------------
1 | # How to work with large language models
2 |
3 | ## How large language models work
4 |
5 | [Large language models][Large language models Blog Post] are functions that map text to text. Given an input string of text, a large language model predicts the text that should come next.
6 |
7 | The magic of large language models is that by being trained to minimize this prediction error over vast quantities of text, the models end up learning concepts useful for these predictions. For example, they learn:
8 |
9 | * how to spell
10 | * how grammar works
11 | * how to paraphrase
12 | * how to answer questions
13 | * how to hold a conversation
14 | * how to write in many languages
15 | * how to code
16 | * etc.
17 |
18 | None of these capabilities are explicitly programmed in—they all emerge as a result of training.
19 |
20 | GPT-3 powers [hundreds of software products][GPT3 Apps Blog Post], including productivity apps, education apps, games, and more.
21 |
22 | ## How to control a large language model
23 |
24 | Of all the inputs to a large language model, by far the most influential is the text prompt.
25 |
26 | Large language models can be prompted to produce output in a few ways:
27 |
28 | * **Instruction**: Tell the model what you want
29 | * **Completion**: Induce the model to complete the beginning of what you want
30 | * **Demonstration**: Show the model what you want, with either:
31 | * A few examples in the prompt
32 | * Many hundreds or thousands of examples in a fine-tuning training dataset
33 |
34 | An example of each is shown below.
35 |
36 | ### Instruction prompts
37 |
38 | Instruction-following models (e.g., `text-davinci-003` or any model beginning with `text-`) are specially designed to follow instructions. Write your instruction at the top of the prompt (or at the bottom, or both), and the model will do its best to follow the instruction and then stop. Instructions can be detailed, so don't be afraid to write a paragraph explicitly detailing the output you want.
39 |
40 | Example instruction prompt:
41 |
42 | ```text
43 | Extract the name of the author from the quotation below.
44 |
45 | “Some humans theorize that intelligent species go extinct before they can expand into outer space. If they're correct, then the hush of the night sky is the silence of the graveyard.”
46 | ― Ted Chiang, Exhalation
47 | ```
48 |
49 | Output:
50 |
51 | ```text
52 | Ted Chiang
53 | ```
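
For reference, here is a minimal sketch of sending an instruction prompt like the one above to the completions endpoint with the `openai` Python package (reading the API key from an environment variable is an assumption of this sketch):

```python
import os

import openai

openai.api_key = os.environ["OPENAI_API_KEY"]  # assumes the key is set in your environment

prompt = """Extract the name of the author from the quotation below.

“Some humans theorize that intelligent species go extinct before they can expand into outer space. If they're correct, then the hush of the night sky is the silence of the graveyard.”
― Ted Chiang, Exhalation"""

response = openai.Completion.create(
    model="text-davinci-003",  # an instruction-following model
    prompt=prompt,
    temperature=0,  # deterministic output suits extraction tasks
    max_tokens=20,
)
print(response["choices"][0]["text"].strip())  # Ted Chiang
```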
54 |
55 | ### Completion prompt example
56 |
57 | Completion-style prompts take advantage of how large language models try to write the text they think is most likely to come next. To steer the model, begin a pattern or sentence that will be completed by the output you want to see. Relative to direct instructions, this mode of steering can take more care and experimentation. In addition, the models won't necessarily know where to stop, so you will often need stop sequences or post-processing to cut off text generated beyond the desired output.
58 |
59 | Example completion prompt:
60 |
61 | ```text
62 | “Some humans theorize that intelligent species go extinct before they can expand into outer space. If they're correct, then the hush of the night sky is the silence of the graveyard.”
63 | ― Ted Chiang, Exhalation
64 |
65 | The author of this quote is
66 | ```
67 |
68 | Output:
69 |
70 | ```text
71 | Ted Chiang
72 | ```
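
The `stop` parameter of the completions endpoint is one way to implement the cut-off mentioned above. A minimal sketch (the newline stop sequence is an illustrative choice):

```python
import openai

prompt = (
    "“Some humans theorize that intelligent species go extinct before they can expand "
    "into outer space. If they're correct, then the hush of the night sky is the silence "
    "of the graveyard.”\n"
    "― Ted Chiang, Exhalation\n"
    "\n"
    "The author of this quote is"
)

response = openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    temperature=0,
    max_tokens=20,
    stop=["\n"],  # cut the completion at the first newline so it doesn't run on
)
print(response["choices"][0]["text"].strip())  # Ted Chiang
```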
73 |
74 | ### Demonstration prompt example (few-shot learning)
75 |
76 | Similar to completion-style prompts, demonstrations can show the model what you want it to do. This approach is sometimes called few-shot learning, as the model learns from a few examples provided in the prompt.
77 |
78 | Example demonstration prompt:
79 |
80 | ```text
81 | Quote:
82 | “When the reasoning mind is forced to confront the impossible again and again, it has no choice but to adapt.”
83 | ― N.K. Jemisin, The Fifth Season
84 | Author: N.K. Jemisin
85 |
86 | Quote:
87 | “Some humans theorize that intelligent species go extinct before they can expand into outer space. If they're correct, then the hush of the night sky is the silence of the graveyard.”
88 | ― Ted Chiang, Exhalation
89 | Author:
90 | ```
91 |
92 | Output:
93 |
94 | ```text
95 | Ted Chiang
96 | ```
97 |
98 | ### Fine-tuned prompt example
99 |
100 | With enough training examples, you can [fine-tune][Fine Tuning Docs] a custom model. In this case, instructions become unnecessary, as the model can learn the task from the training data provided. However, it can be helpful to include separator sequences (e.g., `->` or `###` or any string that doesn't commonly appear in your inputs) to tell the model when the prompt has ended and the output should begin. Without separator sequences, there is a risk that the model continues elaborating on the input text rather than starting on the answer you want to see.
101 |
102 | Example fine-tuned prompt (for a model that has been custom trained on similar prompt-completion pairs):
103 |
104 | ```text
105 | “Some humans theorize that intelligent species go extinct before they can expand into outer space. If they're correct, then the hush of the night sky is the silence of the graveyard.”
106 | ― Ted Chiang, Exhalation
107 |
108 | ###
109 |
110 |
111 | ```
112 |
113 | Output:
114 |
115 | ```text
116 | Ted Chiang
117 | ```
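
As an illustration, here is a sketch of writing prompt-completion pairs with a separator sequence in the JSONL format used by the fine-tuning tooling (the single example shown is placeholder training data):

```python
import json

# End every prompt with the same separator so the fine-tuned model learns
# where the input stops and its completion should begin.
SEPARATOR = "\n\n###\n\n"

examples = [
    {
        "prompt": "“When the reasoning mind is forced to confront the impossible "
        "again and again, it has no choice but to adapt.”\n"
        "― N.K. Jemisin, The Fifth Season" + SEPARATOR,
        "completion": " N.K. Jemisin",  # a leading space helps tokenization
    },
]

with open("training_data.jsonl", "w") as f:
    for example in examples:
        f.write(json.dumps(example) + "\n")
```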
118 |
119 | ## Code Capabilities
120 |
121 | Large language models aren't only great at text - they can be great at code too. OpenAI's specialized code model is called [Codex].
122 |
123 | Codex powers [more than 70 products][Codex Apps Blog Post], including:
124 |
125 | * [GitHub Copilot] (autocompletes code in VS Code and other IDEs)
126 | * [Pygma](https://pygma.app/) (turns Figma designs into code)
127 | * [Replit](https://replit.com/) (has an 'Explain code' button and other features)
128 | * [Warp](https://www.warp.dev/) (a smart terminal with AI command search)
129 | * [Machinet](https://machinet.net/) (writes Java unit test templates)
130 |
131 | Note that unlike instruction-following text models (e.g., `text-davinci-002`), Codex is *not* trained to follow instructions. As a result, designing good prompts can take more care.
132 |
133 | ### More prompt advice
134 |
135 | For more prompt examples, visit [OpenAI Examples][OpenAI Examples].
136 |
137 | In general, the input prompt is the best lever for improving model outputs. You can try tricks like:
138 |
139 | * **Give more explicit instructions.** E.g., if you want the output to be a comma-separated list, ask it to return a comma-separated list. If you want it to say "I don't know" when it doesn't know the answer, tell it 'Say "I don't know" if you do not know the answer.'
140 | * **Supply better examples.** If you're demonstrating examples in your prompt, make sure that your examples are diverse and high quality.
141 | * **Ask the model to answer as if it were an expert.** Explicitly asking the model to produce high-quality output, or output as if it were written by an expert, can induce the model to give the higher-quality answers it thinks an expert would write. E.g., "The following answer is correct, high-quality, and written by an expert."
142 | * **Prompt the model to write down the series of steps explaining its reasoning.** E.g., prepend your answer with something like "[Let's think step by step](https://arxiv.org/pdf/2205.11916v1.pdf)." Prompting the model to give an explanation of its reasoning before its final answer can increase the likelihood that its final answer is consistent and correct.
143 |
144 |
145 |
146 | [Fine Tuning Docs]: https://beta.openai.com/docs/guides/fine-tuning
147 | [Codex Apps Blog Post]: https://openai.com/blog/codex-apps/
148 | [Large language models Blog Post]: https://openai.com/blog/better-language-models/
149 | [GitHub Copilot]: https://copilot.github.com/
150 | [Codex]: https://openai.com/blog/openai-codex/
151 | [GPT3 Apps Blog Post]: https://openai.com/blog/gpt-3-apps/
152 | [OpenAI Examples]: https://beta.openai.com/examples
153 |
--------------------------------------------------------------------------------
/images/OpenAI_Logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/OpenAI_Logo.png
--------------------------------------------------------------------------------
/images/chain_of_thought_fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/chain_of_thought_fig1.png
--------------------------------------------------------------------------------
/images/chain_of_thought_fig11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/chain_of_thought_fig11.png
--------------------------------------------------------------------------------
/images/chain_of_thought_fig3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/chain_of_thought_fig3.png
--------------------------------------------------------------------------------
/images/chain_of_thought_fig5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/chain_of_thought_fig5.png
--------------------------------------------------------------------------------
/images/faithful-reasoning_fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_fig1.png
--------------------------------------------------------------------------------
/images/faithful-reasoning_fig2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_fig2.png
--------------------------------------------------------------------------------
/images/faithful-reasoning_fig3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_fig3.png
--------------------------------------------------------------------------------
/images/faithful-reasoning_fig4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_fig4.png
--------------------------------------------------------------------------------
/images/faithful-reasoning_fig5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_fig5.png
--------------------------------------------------------------------------------
/images/faithful-reasoning_fig7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_fig7.png
--------------------------------------------------------------------------------
/images/faithful-reasoning_tab2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_tab2.png
--------------------------------------------------------------------------------
/images/faithful-reasoning_tab5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/faithful-reasoning_tab5.png
--------------------------------------------------------------------------------
/images/least-to-most_fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/least-to-most_fig1.png
--------------------------------------------------------------------------------
/images/least-to-most_tab11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/least-to-most_tab11.png
--------------------------------------------------------------------------------
/images/least-to-most_tab4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/least-to-most_tab4.png
--------------------------------------------------------------------------------
/images/least-to-most_tab9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/least-to-most_tab9.png
--------------------------------------------------------------------------------
/images/lm_cascades_fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/lm_cascades_fig1.png
--------------------------------------------------------------------------------
/images/lm_cascades_fig3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/lm_cascades_fig3.png
--------------------------------------------------------------------------------
/images/lm_cascades_fig4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/lm_cascades_fig4.png
--------------------------------------------------------------------------------
/images/lm_cascades_fig5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/lm_cascades_fig5.png
--------------------------------------------------------------------------------
/images/lm_cascades_fig6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/lm_cascades_fig6.png
--------------------------------------------------------------------------------
/images/maieutic_fig2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/maieutic_fig2.png
--------------------------------------------------------------------------------
/images/maieutic_fig6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/maieutic_fig6.png
--------------------------------------------------------------------------------
/images/maieutic_tab1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/maieutic_tab1.png
--------------------------------------------------------------------------------
/images/selection-inference_fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/selection-inference_fig1.png
--------------------------------------------------------------------------------
/images/selection-inference_fig4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/selection-inference_fig4.png
--------------------------------------------------------------------------------
/images/self-consistency_fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/self-consistency_fig1.png
--------------------------------------------------------------------------------
/images/self-consistency_fig3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/self-consistency_fig3.png
--------------------------------------------------------------------------------
/images/star_fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/star_fig1.png
--------------------------------------------------------------------------------
/images/star_tab1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/star_tab1.png
--------------------------------------------------------------------------------
/images/verifiers_fig3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/verifiers_fig3.png
--------------------------------------------------------------------------------
/images/verifiers_fig5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/verifiers_fig5.png
--------------------------------------------------------------------------------
/images/zero-shot_reasoners_fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/zero-shot_reasoners_fig1.png
--------------------------------------------------------------------------------
/images/zero-shot_reasoners_fig2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/zero-shot_reasoners_fig2.png
--------------------------------------------------------------------------------
/images/zero-shot_reasoners_tab1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/zero-shot_reasoners_tab1.png
--------------------------------------------------------------------------------
/images/zero-shot_reasoners_tab5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dfkai/openai-cookbook-zh-cn/75a53d879f28bb2404802fc765c778254489a834/images/zero-shot_reasoners_tab5.png
--------------------------------------------------------------------------------
/solutions/web_crawl_Q&A/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.8.3
2 | aiosignal==1.3.1
3 | appnope==0.1.3
4 | asttokens==2.2.1
5 | async-timeout==4.0.2
6 | attrs==22.2.0
7 | backcall==0.2.0
8 | beautifulsoup4==4.11.1
9 | blobfile==2.0.1
10 | bs4==0.0.1
11 | certifi==2022.12.7
12 | charset-normalizer==2.1.1
13 | comm==0.1.2
14 | contourpy==1.0.7
15 | cycler==0.11.0
16 | debugpy==1.6.5
17 | decorator==5.1.1
18 | docopt==0.6.2
19 | entrypoints==0.4
20 | executing==1.2.0
21 | filelock==3.9.0
22 | fonttools==4.38.0
23 | frozenlist==1.3.3
24 | html==1.13
25 | huggingface-hub==0.11.1
26 | idna==3.4
27 | ipykernel==6.20.1
28 | ipython==8.8.0
29 | jedi==0.18.2
30 | joblib==1.2.0
31 | jupyter_client==7.4.8
32 | jupyter_core==5.1.3
33 | kiwisolver==1.4.4
34 | lxml==4.9.2
35 | matplotlib==3.6.3
36 | matplotlib-inline==0.1.6
37 | multidict==6.0.4
38 | nest-asyncio==1.5.6
39 | numpy==1.24.1
40 | openai==0.26.1
41 | packaging==23.0
42 | pandas==1.5.2
43 | parso==0.8.3
44 | pexpect==4.8.0
45 | pickleshare==0.7.5
46 | Pillow==9.4.0
47 | pipreqs==0.4.11
48 | platformdirs==2.6.2
49 | plotly==5.12.0
50 | prompt-toolkit==3.0.36
51 | psutil==5.9.4
52 | ptyprocess==0.7.0
53 | pure-eval==0.2.2
54 | pycryptodomex==3.17
55 | Pygments==2.14.0
56 | pyparsing==3.0.9
57 | python-dateutil==2.8.2
58 | pytz==2022.7.1
59 | PyYAML==6.0
60 | pyzmq==24.0.1
61 | regex==2022.10.31
62 | requests==2.28.1
63 | scikit-learn==1.2.0
64 | scipy==1.10.0
65 | six==1.16.0
66 | soupsieve==2.3.2.post1
67 | stack-data==0.6.2
68 | tenacity==8.1.0
69 | threadpoolctl==3.1.0
70 | tiktoken==0.1.2
71 | tokenizers==0.13.2
72 | tornado==6.2
73 | tqdm==4.64.1
74 | traitlets==5.8.1
75 | transformers==4.25.1
76 | typing_extensions==4.4.0
77 | urllib3==1.26.13
78 | wcwidth==0.2.5
79 | yarg==0.1.9
80 | yarl==1.8.2
--------------------------------------------------------------------------------
/solutions/web_crawl_Q&A/web-qa.py:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | ### Step 1
3 | ################################################################################
4 |
5 | import requests
6 | import re
7 | import urllib.request
8 | from bs4 import BeautifulSoup
9 | from collections import deque
10 | from html.parser import HTMLParser
11 | from urllib.parse import urlparse
12 | import os
13 | import pandas as pd
14 | import numpy as np
15 | import tiktoken
16 | import openai
17 | from openai.embeddings_utils import distances_from_embeddings
20 |
21 | # Regex pattern to match a URL
22 | HTTP_URL_PATTERN = r'^http[s]*://.+'
23 |
24 | # Define root domain to crawl
25 | domain = "openai.com"
26 | full_url = "https://openai.com/"
27 |
28 | # Create a class to parse the HTML and get the hyperlinks
29 | class HyperlinkParser(HTMLParser):
30 | def __init__(self):
31 | super().__init__()
32 | # Create a list to store the hyperlinks
33 | self.hyperlinks = []
34 |
35 | # Override the HTMLParser's handle_starttag method to get the hyperlinks
36 | def handle_starttag(self, tag, attrs):
37 | attrs = dict(attrs)
38 |
39 | # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks
40 | if tag == "a" and "href" in attrs:
41 | self.hyperlinks.append(attrs["href"])
42 |
43 | ################################################################################
44 | ### Step 2
45 | ################################################################################
46 |
47 | # Function to get the hyperlinks from a URL
48 | def get_hyperlinks(url):
49 |
50 | # Try to open the URL and read the HTML
51 | try:
52 | # Open the URL and read the HTML
53 | with urllib.request.urlopen(url) as response:
54 |
55 | # If the response is not HTML, return an empty list
56 | if not response.info().get('Content-Type').startswith("text/html"):
57 | return []
58 |
59 | # Decode the HTML
60 | html = response.read().decode('utf-8')
61 | except Exception as e:
62 | print(e)
63 | return []
64 |
65 | # Create the HTML Parser and then Parse the HTML to get hyperlinks
66 | parser = HyperlinkParser()
67 | parser.feed(html)
68 |
69 | return parser.hyperlinks
70 |
71 | ################################################################################
72 | ### Step 3
73 | ################################################################################
74 |
75 | # Function to get the hyperlinks from a URL that are within the same domain
76 | def get_domain_hyperlinks(local_domain, url):
77 | clean_links = []
78 | for link in set(get_hyperlinks(url)):
79 | clean_link = None
80 |
81 | # If the link is a URL, check if it is within the same domain
82 | if re.search(HTTP_URL_PATTERN, link):
83 | # Parse the URL and check if the domain is the same
84 | url_obj = urlparse(link)
85 | if url_obj.netloc == local_domain:
86 | clean_link = link
87 |
88 | # If the link is not a URL, check if it is a relative link
89 | else:
90 | if link.startswith("/"):
91 | link = link[1:]
92 | elif link.startswith("#") or link.startswith("mailto:"):
93 | continue
94 | clean_link = "https://" + local_domain + "/" + link
95 |
96 | if clean_link is not None:
97 | if clean_link.endswith("/"):
98 | clean_link = clean_link[:-1]
99 | clean_links.append(clean_link)
100 |
101 | # Return the list of hyperlinks that are within the same domain
102 | return list(set(clean_links))
103 |
104 |
105 | ################################################################################
106 | ### Step 4
107 | ################################################################################
108 |
109 | def crawl(url):
110 | # Parse the URL and get the domain
111 | local_domain = urlparse(url).netloc
112 |
113 | # Create a queue to store the URLs to crawl
114 | queue = deque([url])
115 |
116 | # Create a set to store the URLs that have already been seen (no duplicates)
117 | seen = set([url])
118 |
119 | # Create a directory to store the text files
120 | if not os.path.exists("text/"):
121 | os.mkdir("text/")
122 |
123 | if not os.path.exists("text/"+local_domain+"/"):
124 | os.mkdir("text/" + local_domain + "/")
125 |
126 | # Create a directory to store the csv files
127 | if not os.path.exists("processed"):
128 | os.mkdir("processed")
129 |
130 | # While the queue is not empty, continue crawling
131 | while queue:
132 |
133 | # Get the next URL from the queue
134 | url = queue.pop()
135 | print(url) # for debugging and to see the progress
136 |
137 | # Save text from the url to a .txt file
138 | with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w") as f:
139 |
140 | # Get the text from the URL using BeautifulSoup
141 | soup = BeautifulSoup(requests.get(url).text, "html.parser")
142 |
143 | # Get the text but remove the tags
144 | text = soup.get_text()
145 |
146 | # If the page requires JavaScript to render, flag it (the raw text is still saved below)
147 | if ("You need to enable JavaScript to run this app." in text):
148 | print("Unable to parse page " + url + " due to JavaScript being required")
149 |
150 | # Write the text to the file in the text directory
151 | f.write(text)
152 |
153 | # Get the hyperlinks from the URL and add them to the queue
154 | for link in get_domain_hyperlinks(local_domain, url):
155 | if link not in seen:
156 | queue.append(link)
157 | seen.add(link)
158 |
159 | crawl(full_url)
160 |
161 | ################################################################################
162 | ### Step 5
163 | ################################################################################
164 |
165 | def remove_newlines(serie):
166 | serie = serie.str.replace('\n', ' ')
167 | serie = serie.str.replace('\\n', ' ')
168 | serie = serie.str.replace(' ', ' ')
169 | serie = serie.str.replace(' ', ' ')
170 | return serie
171 |
172 |
173 | ################################################################################
174 | ### Step 6
175 | ################################################################################
176 |
177 | # Create a list to store the text files
178 | texts=[]
179 |
180 | # Get all the text files in the text directory
181 | for file in os.listdir("text/" + domain + "/"):
182 |
183 | # Open the file and read the text
184 | with open("text/" + domain + "/" + file, "r") as f:
185 | text = f.read()
186 |
187 | # Omit the first 11 characters (the domain prefix) and the last 4 characters (the ".txt" extension), then replace - and _ with spaces and strip #update.
188 | texts.append((file[11:-4].replace('-',' ').replace('_', ' ').replace('#update',''), text))
189 |
190 | # Create a dataframe from the list of texts
191 | df = pd.DataFrame(texts, columns = ['fname', 'text'])
192 |
193 | # Set the text column to be the raw text with the newlines removed
194 | df['text'] = df.fname + ". " + remove_newlines(df.text)
195 | df.to_csv('processed/scraped.csv')
196 | df.head()
197 |
198 | ################################################################################
199 | ### Step 7
200 | ################################################################################
201 |
202 | # Load the cl100k_base tokenizer which is designed to work with the ada-002 model
203 | tokenizer = tiktoken.get_encoding("cl100k_base")
204 |
205 | df = pd.read_csv('processed/scraped.csv', index_col=0)
206 | df.columns = ['title', 'text']
207 |
208 | # Tokenize the text and save the number of tokens to a new column
209 | df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
210 |
211 | # Visualize the distribution of the number of tokens per row using a histogram
212 | df.n_tokens.hist()
213 |
214 | ################################################################################
215 | ### Step 8
216 | ################################################################################
217 |
218 | max_tokens = 500
219 |
220 | # Function to split the text into chunks of a maximum number of tokens
221 | def split_into_many(text, max_tokens = max_tokens):
222 |
223 | # Split the text into sentences
224 | sentences = text.split('. ')
225 |
226 | # Get the number of tokens for each sentence
227 | n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]
228 |
229 | chunks = []
230 | tokens_so_far = 0
231 | chunk = []
232 |
233 | # Loop through the sentences and tokens joined together in a tuple
234 | for sentence, token in zip(sentences, n_tokens):
235 |
236 | # If the number of tokens so far plus the number of tokens in the current sentence is greater
237 | # than the max number of tokens, then add the chunk to the list of chunks and reset
238 | # the chunk and tokens so far
239 | if tokens_so_far + token > max_tokens:
240 | chunks.append(". ".join(chunk) + ".")
241 | chunk = []
242 | tokens_so_far = 0
243 |
244 | # If the number of tokens in the current sentence is greater than the max number of
245 | # tokens, go to the next sentence
246 | if token > max_tokens:
247 | continue
248 |
249 | # Otherwise, add the sentence to the chunk and add the number of tokens to the total
250 | chunk.append(sentence)
251 | tokens_so_far += token + 1
252 |
253 | return chunks
254 |
255 |
256 | shortened = []
257 |
258 | # Loop through the dataframe
259 | for row in df.iterrows():
260 |
261 | # If the text is None, go to the next row
262 | if row[1]['text'] is None:
263 | continue
264 |
265 | # If the number of tokens is greater than the max number of tokens, split the text into chunks
266 | if row[1]['n_tokens'] > max_tokens:
267 | shortened += split_into_many(row[1]['text'])
268 |
269 | # Otherwise, add the text to the list of shortened texts
270 | else:
271 | shortened.append( row[1]['text'] )
272 |
273 | ################################################################################
274 | ### Step 9
275 | ################################################################################
276 |
277 | df = pd.DataFrame(shortened, columns = ['text'])
278 | df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
279 | df.n_tokens.hist()
280 |
281 | ################################################################################
282 | ### Step 10
283 | ################################################################################
284 |
285 | df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])
286 | df.to_csv('processed/embeddings.csv')
287 | df.head()
288 |
289 | ################################################################################
290 | ### Step 11
291 | ################################################################################
292 |
293 | df=pd.read_csv('processed/embeddings.csv', index_col=0)
294 | df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)
295 |
296 | df.head()
297 |
298 | ################################################################################
299 | ### Step 12
300 | ################################################################################
301 |
302 | def create_context(
303 | question, df, max_len=1800, size="ada"
304 | ):
305 | """
306 | Create a context for a question by finding the most similar context from the dataframe
307 | """
308 |
309 | # Get the embeddings for the question
310 | q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']
311 |
312 | # Get the distances from the embeddings
313 | df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')
314 |
315 |
316 | returns = []
317 | cur_len = 0
318 |
319 | # Sort by distance and add the text to the context until the context is too long
320 | for i, row in df.sort_values('distances', ascending=True).iterrows():
321 |
322 | # Add the length of the text to the current length
323 | cur_len += row['n_tokens'] + 4
324 |
325 | # If the context is too long, break
326 | if cur_len > max_len:
327 | break
328 |
329 | # Else add it to the text that is being returned
330 | returns.append(row["text"])
331 |
332 | # Return the context
333 | return "\n\n###\n\n".join(returns)
334 |
335 | def answer_question(
336 | df,
337 | model="text-davinci-003",
338 | question="Am I allowed to publish model outputs to Twitter, without a human review?",
339 | max_len=1800,
340 | size="ada",
341 | debug=False,
342 | max_tokens=150,
343 | stop_sequence=None
344 | ):
345 | """
346 | Answer a question based on the most similar context from the dataframe texts
347 | """
348 | context = create_context(
349 | question,
350 | df,
351 | max_len=max_len,
352 | size=size,
353 | )
354 | # If debug, print the raw model response
355 | if debug:
356 | print("Context:\n" + context)
357 | print("\n\n")
358 |
359 | try:
360 | # Create a completion using the question and context
361 | response = openai.Completion.create(
362 | prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
363 | temperature=0,
364 | max_tokens=max_tokens,
365 | top_p=1,
366 | frequency_penalty=0,
367 | presence_penalty=0,
368 | stop=stop_sequence,
369 | model=model,
370 | )
371 | return response["choices"][0]["text"].strip()
372 | except Exception as e:
373 | print(e)
374 | return ""
375 |
376 | ################################################################################
377 | ### Step 13
378 | ################################################################################
379 |
380 | print(answer_question(df, question="What day is it?", debug=False))
381 |
382 | print(answer_question(df, question="What is our newest embeddings model?"))
--------------------------------------------------------------------------------
/text_comparison_examples.md:
--------------------------------------------------------------------------------
1 | # Text comparison examples
2 |
3 | The [OpenAI API embeddings endpoint](https://beta.openai.com/docs/guides/embeddings) can be used to measure relatedness or similarity between pieces of text.
4 |
5 | By leveraging GPT-3's understanding of text, these embeddings [achieved state-of-the-art results](https://arxiv.org/abs/2201.10005) on benchmarks in unsupervised learning and transfer learning settings.
6 |
7 | Embeddings can be used for semantic search, recommendations, cluster analysis, near-duplicate detection, and more.
8 |
9 | For more information, read OpenAI's blog post announcements:
10 |
11 | * [Introducing Text and Code Embeddings (Jan 2022)](https://openai.com/blog/introducing-text-and-code-embeddings/)
12 | * [New and Improved Embedding Model (Dec 2022)](https://openai.com/blog/new-and-improved-embedding-model/)
13 |
14 | ## Semantic search
15 |
16 | Embeddings can be used for search either by themselves or as a feature in a larger system.
17 |
18 | The simplest way to use embeddings for search is as follows:
19 |
20 | * Before the search (precompute):
21 | * Split your text corpus into chunks smaller than the token limit (8,191 tokens for `text-embedding-ada-002`)
22 | * Embed each chunk of text
23 | * Store those embeddings in your own database or in a vector search provider like [Pinecone](https://www.pinecone.io) or [Weaviate](https://weaviate.io)
24 | * At the time of the search (live compute):
25 | * Embed the search query
26 | * Find the closest embeddings in your database
27 | * Return the top results
28 |
29 | An example of how to use embeddings for search is shown in [Semantic_text_search_using_embeddings.ipynb](examples/Semantic_text_search_using_embeddings.ipynb).
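
To make the mechanics concrete, here is a minimal in-memory sketch of the precompute and live-compute steps above, using numpy in place of a real vector database (the two-document corpus is a stand-in):

```python
import numpy as np
import openai

def embed(text):
    result = openai.Embedding.create(input=text, engine="text-embedding-ada-002")
    return np.array(result["data"][0]["embedding"])

# Precompute: embed each chunk of the corpus and store the vectors
corpus = ["Embeddings measure the relatedness of texts.", "The Eiffel Tower is in Paris."]
corpus_embeddings = np.array([embed(chunk) for chunk in corpus])

# Live compute: embed the query, then rank chunks by cosine similarity
query_embedding = embed("How can I compare two pieces of text?")
scores = corpus_embeddings @ query_embedding / (
    np.linalg.norm(corpus_embeddings, axis=1) * np.linalg.norm(query_embedding)
)
for i in np.argsort(-scores):  # highest similarity first
    print(f"{scores[i]:.3f}  {corpus[i]}")
```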
30 |
31 | In more advanced search systems, the cosine similarity of embeddings can be used as one feature among many in ranking search results.
32 |
33 | ## Question answering
34 |
35 | The best way to get reliably honest answers from GPT-3 is to give it source documents in which it can locate correct answers. Using the semantic search procedure above, you can cheaply search a corpus of documents for relevant information and then give that information to GPT-3, via the prompt, to answer a question. We demonstrate this in [Question_answering_using_embeddings.ipynb](examples/Question_answering_using_embeddings.ipynb).
36 |
37 | ## Recommendations
38 |
39 | Recommendations are quite similar to search, except that instead of a free-form text query, the inputs are items in a set.
40 |
41 | An example of how to use embeddings for recommendations is shown in [Recommendation_using_embeddings.ipynb](examples/Recommendation_using_embeddings.ipynb).
42 |
43 | Similar to search, these cosine similarity scores can either be used on their own to rank items or as features in larger ranking algorithms.
44 |
45 | ## Customizing Embeddings
46 |
47 | Although OpenAI's embedding model weights cannot be fine-tuned, you can nevertheless use training data to customize embeddings to your application.
48 |
49 | In [Customizing_embeddings.ipynb](examples/Customizing_embeddings.ipynb), we provide an example method for customizing your embeddings using training data. The idea of the method is to train a custom matrix to multiply embedding vectors by in order to get new customized embeddings. With good training data, this custom matrix will help emphasize the features relevant to your training labels. You can equivalently consider the matrix multiplication as (a) a modification of the embeddings or (b) a modification of the distance function used to measure the distances between embeddings.
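
As a sketch of the core idea (not the notebook's exact training loop), such a matrix can be learned with gradient descent so that the cosine similarity of transformed embedding pairs matches their labels; the toy tensors below are placeholders:

```python
import torch
import torch.nn.functional as F

# Placeholder training data: pairs of embeddings with labels
# (1 = similar, -1 = dissimilar).
dim = 8
e1, e2 = torch.randn(64, dim), torch.randn(64, dim)
labels = torch.randint(0, 2, (64,)).float() * 2 - 1

# Start from the identity matrix, i.e., the unmodified embeddings.
W = torch.eye(dim, requires_grad=True)
optimizer = torch.optim.Adam([W], lr=0.01)

for step in range(200):
    sim = F.cosine_similarity(e1 @ W, e2 @ W)
    loss = ((sim - labels) ** 2).mean()  # pull similar pairs together, push dissimilar apart
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Customized embeddings are then simply `embedding @ W`.
```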
50 |
--------------------------------------------------------------------------------
/text_editing_examples.md:
--------------------------------------------------------------------------------
1 | # Text editing examples
2 |
3 | In addition to the [completions API endpoint][Completion API Docs], OpenAI offers an [edits API endpoint][Edit API Docs]. Read more at:
4 |
5 | * [Blog post announcement (Mar 2022)][GPT3 Edit Blog Post]
6 | * [Edit API documentation][Edit API Docs]
7 |
8 | In contrast to completions, which only take a single text input, edits take two text inputs: the instruction and the text to be modified. For example:
9 |
10 | Instruction input:
11 |
12 | ```text
13 | Fix the OCR errors
14 | ```
15 |
16 | Text input:
17 |
18 | ```text
19 | Therewassomehostilityntheenergybehindthe researchreportedinPerceptrons....Part of ourdrivecame,aswequiteplainlyacknoweldgednourbook,fromhe facthatfundingndresearchnergywerebeingdissipatedon. . .misleadingttemptsouseconnectionistmethodsnpracticalappli-cations.
20 | ```
21 |
22 | [Output](https://beta.openai.com/playground/p/5W5W6HHlHrGsLu1cpx0VF4qu):
23 |
24 | ```text
25 | There was some hostility in the energy behind the research reported in Perceptrons....Part of our drive came, as we quite plainly acknowledged in our book, from the fact that funding and research energy were being dissipated on...misleading attempts to use connectionist methods in practical applications.
26 | ```
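
In the Python package, this pair of inputs maps onto an `openai.Edit.create` call; a minimal sketch using the text editing model:

```python
import openai

response = openai.Edit.create(
    model="text-davinci-edit-001",  # the text editing model
    instruction="Fix the OCR errors",
    input="Therewassomehostilityntheenergybehindthe researchreportedinPerceptrons....",
)
print(response["choices"][0]["text"])
```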
27 |
28 | In general, instructions can be imperative, present tense, or past tense. Experiment to see what works best for your use case.
29 |
30 | ## Translation
31 |
32 | One application of the edit API is translation.
33 |
34 | Large language models are excellent at translating across common languages. In 2021, [GPT-3 set](https://arxiv.org/abs/2110.05448) a new state-of-the-art record in unsupervised translation on the WMT14 English-French benchmark.
35 |
36 | Here's an example of how to translate text using the edits endpoint:
37 |
38 | Instruction input:
39 |
40 | ```text
41 | translation into French
42 | ```
43 |
44 | Text input:
45 |
46 | ```text
47 | That's life.
48 | ```
49 |
50 | [Output](https://beta.openai.com/playground/p/6JWAH8a4ZbEafSDyRsSVdgKr):
51 |
52 | ```text
53 | C'est la vie.
54 | ```
55 |
56 | Of course, many tasks that can be accomplished with the edits endpoint can also be done with the completions endpoint. For example, you can request a translation by prepending an instruction as follows:
57 |
58 | ```text
59 | Translate the following text from English to French.
60 |
61 | English: That's life.
62 | French:
63 | ```
64 |
65 | [Output](https://beta.openai.com/playground/p/UgaPfgjBNTRRPeNcMSNtGzcu):
66 |
67 | ```text
68 | C'est la vie.
69 | ```
70 |
71 | Tips for translation:
72 |
73 | * Performance is best on the most common languages
74 | * We've seen better performance when the instruction is given in the final language (so if translating into French, give the instruction `Traduire le texte de l'anglais au français.` rather than `Translate the following text from English to French.`)
75 | * Backtranslation (as described [here](https://arxiv.org/abs/2110.05448)) can also increase performance
76 | * Text with colons and heavy punctuation can trip up the instruction-following models, especially if the instruction uses colons (e.g., `English: {english text} French:`)
77 | * The edits endpoint sometimes repeats the original text input alongside the translation, which can be monitored and filtered
78 |
79 | When it comes to translation, large language models particularly shine at combining other instructions alongside translation. For example, you can ask GPT-3 to translate Slovenian to English but keep all LaTeX typesetting commands unchanged. The following notebook details how we translated a Slovenian math book into English:
80 |
81 | [Translation of a Slovenian math book into English](examples/book_translation/translate_latex_book.ipynb)
82 |
83 |
84 | [Edit API Docs]: https://beta.openai.com/docs/api-reference/edits
85 | [Completion API Docs]: https://beta.openai.com/docs/api-reference/completions
86 | [GPT3 Edit Blog Post]: https://openai.com/blog/gpt-3-edit-insert/
--------------------------------------------------------------------------------
/text_explanation_examples.md:
--------------------------------------------------------------------------------
1 | # Text explanation examples
2 |
3 | Large language models are useful for distilling information from long texts. Applications include:
4 |
5 | * Answering questions about a piece of text, e.g.:
6 |     * Querying a knowledge base to help people look up things they don't know
7 | * Querying an unfamiliar document to understand what it contains
8 | * Querying a document with structured questions in order to extract tags, classes, entities, etc.
9 | * Summarizing text, e.g.:
10 | * Summarizing long documents
11 | * Summarizing back-and-forth emails or message threads
12 | * Summarizing detailed meeting notes with key points and next steps
13 | * Classifying text, e.g.:
14 | * Classifying customer feedback messages by topic or type
15 | * Classifying documents by topic or type
16 | * Classifying the tone or sentiment of text
17 | * Extracting entities, e.g.:
18 | * Extracting contact information from a customer message
19 | * Extracting names of people or companies or products from a document
20 | * Extracting things mentioned in customer reviews or feedback
21 |
22 | Below are some simple examples of each.
23 |
24 | ## Answering questions about a piece of text
25 |
26 | Here's an example prompt for answering questions about a piece of text:
27 |
28 | ```text
29 | Using the following text, answer the following question. If the answer is not contained within the text, say "I don't know."
30 |
31 | Text:
32 | """
33 | Oklo Mine (sometimes Oklo Reactor or Oklo Mines), located in Oklo, Gabon on the west coast of Central Africa, is believed to be the only natural nuclear fission reactor. Oklo consists of 16 sites at which self-sustaining nuclear fission reactions are thought to have taken place approximately 1.7 billion years ago, and ran for hundreds of thousands of years. It is estimated to have averaged under 100 kW of thermal power during that time.
34 | """
35 |
36 | Question: How many natural fission reactors have ever been discovered?
37 |
38 | Answer:
39 | ```
40 |
41 | [Output](https://beta.openai.com/playground/p/c8ZL7ioqKK7zxrMT2T9Md3gJ):
42 |
43 | ```text
44 | One. Oklo Mine is believed to be the only natural nuclear fission reactor.
45 | ```
46 |
47 | If the text you wish to ask about is longer than the token limit (~4,000 tokens for `text-davinci-002`/`-003` and ~2,000 tokens for earlier models), you can split the text into smaller pieces, rank them by relevance, and then ask your question only using the most-relevant-looking pieces. This is demonstrated in [Question_answering_using_embeddings.ipynb](examples/Question_answering_using_embeddings.ipynb).
48 |
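As a minimal sketch of that split-rank-ask pattern (the helper names here are ours, and `text-embedding-ada-002` is just one reasonable model choice; see the linked notebook for the full recipe):

```python
import numpy as np

import openai  # assumes OPENAI_API_KEY is set


def embed(text):
    """Embed a piece of text with the OpenAI embeddings endpoint."""
    resp = openai.Embedding.create(input=[text], model="text-embedding-ada-002")
    return np.array(resp["data"][0]["embedding"])


def rank_chunks(question, chunks, top_n=3):
    """Return the chunks most similar to the question, by cosine similarity."""
    q = embed(question)

    def cosine(a, b):
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    return sorted(chunks, key=lambda c: cosine(q, embed(c)), reverse=True)[:top_n]
```

The top-ranked chunks can then be pasted into a question-answering prompt like the one above.
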
49 | In the same way that students do better on tests when allowed to access notes, GPT-3 does better at answering questions when it's given text containing the answer.
50 | Without notes, GPT-3 has to rely on its own long-term memory (i.e., its internal weights), which is more likely to produce confabulated or hallucinated answers.
51 |
52 | ## Summarization
53 |
54 | Here's a simple example prompt to summarize a piece of text:
55 |
56 | ```text
57 | Summarize the following text.
58 |
59 | Text:
60 | """
61 | Two independent experiments reported their results this morning at CERN, Europe's high-energy physics laboratory near Geneva in Switzerland. Both show convincing evidence of a new boson particle weighing around 125 gigaelectronvolts, which so far fits predictions of the Higgs previously made by theoretical physicists.
62 |
63 | "As a layman I would say: 'I think we have it'. Would you agree?" Rolf-Dieter Heuer, CERN's director-general, asked the packed auditorium. The physicists assembled there burst into applause.
64 | """
65 |
66 | Summary:
67 | ```
68 |
69 | [Output](https://beta.openai.com/playground/p/pew7DNB908TkUYiF0ZOdaIGc):
70 |
71 | ```text
72 | CERN's director-general asked a packed auditorium if they agreed that two independent experiments had found convincing evidence of a new boson particle that fits predictions of the Higgs, to which the physicists assembled there responded with applause.
73 | ```
74 |
75 | The triple quotation marks `"""` used in these example prompts aren't special; GPT-3 can recognize most delimiters, including `<>`, `{}`, or `###`. For long pieces of text, we recommend using some kind of delimiter to help disambiguate where one section of text ends and the next begins.
76 |
77 | ## Classification
78 |
79 | If you want to classify the text, the best approach depends on whether the classes are known in advance.
80 |
81 | If your classes _are_ known in advance, classification is often best done with a fine-tuned model, as demonstrated in [Fine-tuned_classification.ipynb](examples/Fine-tuned_classification.ipynb).
82 |
83 | If your classes are not known in advance (e.g., they are set by a user or generated on the fly), you can try zero-shot classification, either by giving an instruction containing the classes or by using embeddings to see which class label (or other classified texts) is most similar to the text (as demonstrated in [Zero-shot_classification.ipynb](examples/Zero-shot_classification_with_embeddings.ipynb)).
84 |
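For the embeddings route, a minimal sketch might look like this (helper names are ours; the model choice is illustrative):

```python
import numpy as np

import openai  # assumes OPENAI_API_KEY is set


def embed(text):
    resp = openai.Embedding.create(input=[text], model="text-embedding-ada-002")
    return np.array(resp["data"][0]["embedding"])


def zero_shot_classify(text, labels):
    """Pick the label whose embedding is closest (by cosine similarity) to the text's."""
    t = embed(text)

    def cosine(a, b):
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    return max(labels, key=lambda label: cosine(t, embed(label)))
```

Descriptive label strings (e.g., `a positive product review`) tend to work better than bare tags, since the comparison is purely semantic.
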
85 | ## Entity extraction
86 |
87 | Here's an example prompt for entity extraction:
88 |
89 | ```text
90 | From the text below, extract the following entities in the following format:
91 | Companies:
92 | People & titles:
93 |
94 | Text:
95 | """
96 | In March 1981, United States v. AT&T came to trial under Assistant Attorney General William Baxter. AT&T chairman Charles L. Brown thought the company would be gutted. He realized that AT&T would lose and, in December 1981, resumed negotiations with the Justice Department. Reaching an agreement less than a month later, Brown agreed to divestiture—the best and only realistic alternative. AT&T's decision allowed it to retain its research and manufacturing arms. The decree, titled the Modification of Final Judgment, was an adjustment of the Consent Decree of 14 January 1956. Judge Harold H. Greene was given the authority over the modified decree....
97 |
98 | In 1982, the U.S. government announced that AT&T would cease to exist as a monopolistic entity. On 1 January 1984, it was split into seven smaller regional companies, Bell South, Bell Atlantic, NYNEX, American Information Technologies, Southwestern Bell, US West, and Pacific Telesis, to handle regional phone services in the U.S. AT&T retains control of its long distance services, but was no longer protected from competition.
99 | """
100 | ```
101 |
102 | [Output](https://beta.openai.com/playground/p/of47T7N5CtHF4RlvwFkTu3pN):
103 |
104 | ```text
105 |
106 | Companies: AT&T, Bell South, Bell Atlantic, NYNEX, American Information Technologies, Southwestern Bell, US West, Pacific Telesis
107 | People & titles: William Baxter (Assistant Attorney General), Charles L. Brown (AT&T chairman), Harold H. Greene (Judge)
108 | ```
--------------------------------------------------------------------------------
/text_writing_examples.md:
--------------------------------------------------------------------------------
1 | # Text writing examples
2 |
3 | Large language models are excellent at writing. They can assist with all sorts of tasks:
4 |
5 | * Blog posts
6 | * Email copy
7 | * Ad copy
8 | * Website copy
9 | * Product descriptions
10 | * Memos
11 | * Storytelling
12 | * Brainstorming
13 | * Question generation
14 | * etc.
15 |
16 | The easiest way to prompt GPT-3 is to tell it what you'd like. For example, if you want it to write an email, you could use a prompt like this:
17 |
18 | ```text
19 | Write an email to a colleague named Jill congratulating her on her promotion. The tone should be warm yet professional. Mention how you admire the work she's been putting in. Include a joke about how her pet lizard Max enjoys eating grasshoppers. Mention how you're looking forward to the team off-site next week.
20 | ```
21 |
22 | [Output](https://beta.openai.com/playground/p/fUeh5saVLI2dhU0WU3nZtIiU):
23 |
24 | ```text
25 | Subject: Congratulations on Your Promotion, Jill!
26 |
27 | Dear Jill,
28 |
29 | Congratulations on your promotion! I'm so proud of you and all the hard work you've been putting in. It's been inspiring to watch you grow in your career.
30 |
31 | I'm sure Max the lizard is also celebrating with you. I hear he loves eating grasshoppers, so you should treat him to a special snack!
32 |
33 | I'm looking forward to seeing you at the team off-site next week. I'm sure it will be a great opportunity to celebrate your success.
34 |
35 | All the best,
36 |
37 | [Your Name]
38 | ```
39 |
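Here's a rough sketch of the same request via the Python library (model and parameters are illustrative; a higher `temperature` generally yields more varied drafts):

```python
import openai  # assumes OPENAI_API_KEY is set

prompt = (
    "Write an email to a colleague named Jill congratulating her on her promotion. "
    "The tone should be warm yet professional. Mention how you admire the work she's "
    "been putting in. Include a joke about how her pet lizard Max enjoys eating "
    "grasshoppers. Mention how you're looking forward to the team off-site next week."
)

# Higher temperature -> more varied, creative drafts; lower -> more predictable ones.
response = openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    temperature=0.7,
    max_tokens=300,
)
print(response["choices"][0]["text"].strip())
```
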
40 | In general, writing can work with any style of prompt, whether it's an instruction or a series of examples to follow. Experiment to see what works best for your use case.
41 |
42 | Writing also works with any type of model, though they each have strengths and weaknesses.
43 |
44 | |                                                          | Advantages                                                                     | Disadvantages                                                                        |
45 | | -------------------------------------------------------- | ------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------ |
46 | | Instruction-following models (e.g., `text-davinci-003`)  | Easiest to use                                                                  | Less diverse; less creative; sometimes harder to steer tone, style, etc.              |
47 | | Base models (e.g., `davinci`)                             | Potentially more creative and diverse                                           | Harder to prompt well; more expensive (as examples in the prompt cost extra tokens)   |
48 | | Fine-tuned models                                         | Can train off of many examples; cheaper than including examples in the prompt   | Hard to gather training data; training makes iteration slower and more expensive      |
49 |
--------------------------------------------------------------------------------
/transition_guides_for_deprecated_API_endpoints/README.md:
--------------------------------------------------------------------------------
1 | # Deprecation of Answers, Classification, and Search
2 |
3 | In 2021, OpenAI released specialized endpoints in beta for Answers, Classification, and Search.
4 |
5 | While these specialized endpoints were convenient, they had two drawbacks:
6 |
7 | 1. These specialized endpoints were eclipsed by techniques that achieved better results.
8 | 2. These specialized endpoints were more difficult to customize and optimize for individual use cases.
9 |
10 | As a result, **the Answers, Classifications, and Search endpoints are being deprecated.**
11 |
12 | ## Timeline of deprecation
13 |
14 | For those who have not used these endpoints, nothing will change except that access will no longer be available.
15 |
16 | **For existing users of these endpoints, access will continue until December 3, 2022.** Before that date, we strongly encourage developers to switch over to newer techniques which produce better results.
17 |
18 | ## How to transition
19 |
20 | We've written guides and code examples for transitioning from the deprecated API endpoints to better methods.
21 |
22 | ### Answers
23 |
24 | [Guide: How to transition off the Answers endpoint](https://help.openai.com/en/articles/6233728-answers-transition-guide)
25 |
26 | * Option 1: transition to embeddings-based search **(recommended)**
27 | * Example code: [Semantic_text_search_using_embeddings.ipynb](../examples/Semantic_text_search_using_embeddings.ipynb)
28 |
29 | * Option 2: reimplement Answers endpoint functionality
30 | * Example code: [answers_functionality_example.py](answers_functionality_example.py)
31 |
32 | ### Classification
33 |
34 | [Guide: How to transition off the Classifications endpoint](https://help.openai.com/en/articles/6272941-classifications-transition-guide)
35 |
36 | * Option 1: transition to fine-tuning **(recommended)**
37 | * Example code: [Fine-tuned_classification.ipynb](../examples/Fine-tuned_classification.ipynb)
38 | * Option 2: transition to embeddings
39 | * Example code: [Semantic_text_search_using_embeddings.ipynb](../examples/Semantic_text_search_using_embeddings.ipynb)
40 | * Option 3: reimplement Classifications endpoint functionality
41 | * Example code: [classification_functionality_example.py](classification_functionality_example.py)
42 |
43 | ### Search
44 |
45 | [Guide: How to transition off the Search endpoint](https://help.openai.com/en/articles/6272952-search-transition-guide)
46 |
47 | * Option 1: transition to embeddings-based search **(recommended)**
48 | * Example code: [Semantic_text_search_using_embeddings.ipynb](../examples/Semantic_text_search_using_embeddings.ipynb)
49 | * Option 2: reimplement Search endpoint functionality
50 | * Example code: [search_functionality_example.py](search_functionality_example.py)
51 |
--------------------------------------------------------------------------------
/transition_guides_for_deprecated_API_endpoints/answers_functionality_example.py:
--------------------------------------------------------------------------------
1 | from transformers import GPT2TokenizerFast
2 |
3 | import openai
4 |
5 | tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
6 |
7 | MAX_TOKENS_LIMIT = 2048
8 | ANSWERS_INSTRUCTION = "Please answer the question according to the above context.\n"
9 | CONTEXT_TEMPLATE = "===\nContext: {context}\n===\n"
10 |
11 |
12 | def extract_instruction(instruction):
13 | """
14 | Extract `instruction` parameter and format it properly.
15 |     If it is None, return an empty string.
16 | """
17 | if instruction is None:
18 | return ""
19 |
20 | return f"{instruction.strip()}\n\n"
21 |
22 |
23 | def semantic_search(
24 | search_model, query_for_search, file_id=None, max_documents=None, examples=None
25 | ):
26 | """
27 | :param examples: A list of {"text":...} or {"text": ..., "label": ...}.
28 | :return:
29 |     a list of semantic search result dicts, one per document, sorted by "score":
30 | [
31 | {
32 | "document": ...,
33 | "object": "search_result",
34 | "score": ...,
35 | "text": ...,
36 | },
37 | ...
38 | ]
39 | """
40 | assert (examples is None) ^ (file_id is None) # xor
41 |
42 | if file_id is not None:
43 | # This is where you'd do an elastic search call. Since there isn't an example of this
44 | # we can query, we'll raise an error.
45 | # The return value from this would be a list of examples
46 | raise NotImplementedError()
47 |
48 | # This isn't quite accurate since Search is also being deprecated. See our search guide for more
49 | # information.
50 |
51 | search_result = openai.Search.create(
52 | model=search_model,
53 | documents=[x["text"] for x in examples],
54 | query=query_for_search,
55 | )
56 |
57 | info_dict = {d["document"]: d for d in search_result["data"]}
58 | sorted_doc_ids = sorted(
59 | info_dict.keys(), key=lambda x: info_dict[x]["score"], reverse=True
60 | )
61 | if max_documents:
62 | sorted_doc_ids = sorted_doc_ids[:max_documents]
63 | return [info_dict[i] for i in sorted_doc_ids]
64 |
65 |
66 | def select_by_length(
67 | sorted_doc_infos,
68 | max_token_len,
69 | lambda_fn=None,
70 | ):
71 | """
72 |     Given a list of (document ID, document content) pairs, select as many
73 |     documents as possible as long as the total length does not exceed `max_token_len`.
74 |
75 | :param sorted_doc_infos: A list of semantic search result dict of documents sorted by "score".
76 | :param max_token_len: The maximum token length for selected documents.
77 |     :param lambda_fn: A function that takes in a search result dict and outputs a formatted
78 | example for context stuffing.
79 | :return: A tuple of (
80 | A concatenation of selected documents used as context,
81 | A list of selected document IDs
82 | )
83 | """
84 | if not sorted_doc_infos:
85 | return "", []
86 |
87 | selected_indices = []
88 | total_doc_tokens = 0
89 | doc_dict = {}
90 | for i, doc_info in enumerate(sorted_doc_infos):
91 | doc = lambda_fn(doc_info) if lambda_fn else doc_info["text"]
92 | n_doc_tokens = len(tokenizer.encode(doc))
93 | if total_doc_tokens + n_doc_tokens < max_token_len:
94 | total_doc_tokens += n_doc_tokens
95 | selected_indices.append(i)
96 | doc_dict[i] = doc
97 |
98 | # The top ranked documents should go at the end.
99 | selected_indices = selected_indices[::-1]
100 |
101 | context = "".join([doc_dict[i] for i in selected_indices])
102 | selected_doc_infos = [sorted_doc_infos[i] for i in selected_indices]
103 | return context, selected_doc_infos
104 |
105 |
106 | def answers(
107 | examples,
108 | question,
109 | model,
110 | examples_context,
111 | file_id=None,
112 | documents=None,
113 | logit_bias=None,
114 | max_rerank=200,
115 | max_tokens=16,
116 | alternative_question=None,
117 | search_model="ada",
118 | temperature=0.0,
119 | logprobs=0,
120 | stop=None,
121 | n=1,
122 | ):
123 | """
124 | Given a prompt, a question, a list of (question, answer) pairs as examples, and
125 | a list of documents for context, it tries to include all the QA examples and top
126 | relevant context documents.
127 |
128 | The constructed prompt for the final completion call:
129 | ```
130 | Please answer the question according to the above context.
131 |
132 | ===
133 | Context: {{ the context for example QA pairs. }}
134 | ===
135 | Q: example 1 question
136 | A: example 1 answer
137 | ---
138 | Q: example 2 question
139 | A: example 2 answer
140 | ===
141 | Context: {{ a list of relevant documents sorted via search(question, documents) }}
142 | ===
143 | Q: question
144 | A:
145 | ```
146 |
147 | The returned object has a structure like:
148 | {
149 | "answers": [
150 | "Beijing",
151 | "Beijing, China"
152 | ],
153 | "completion_id": "xxx-xxx",
154 | "object": "answer",
155 | "selected_documents": [
156 | {
157 | "document": ..., # document index, same as in search/ results.
158 | "object": "search_result",
159 | "text": ...,
160 | },
161 | ...
162 | ],
163 | }
164 | """
165 |
166 | examples = examples if examples else []
167 |
168 | example_prompts = [f"Q: {x}\nA: {y}" for x, y in examples]
169 | prompt = f"Q: {question}\nA:"
170 |
171 | # Append all the QA examples into the prompt.
172 | if examples_context:
173 | examples_context = CONTEXT_TEMPLATE.format(context=examples_context)
174 | instruction = (
175 | ANSWERS_INSTRUCTION + examples_context + "\n---\n".join(example_prompts) + "\n"
176 | )
177 |
178 | logit_bias = logit_bias if logit_bias is not None else {}
179 |
180 | if file_id is None and documents is None:
181 |         raise Exception("Please submit at least one of `documents` or `file_id`.")
182 | if file_id is not None and documents is not None:
183 |         raise Exception("Please submit only one of `documents` or `file_id`.")
184 |
185 | instruction = extract_instruction(instruction)
186 |
187 | n_instruction_tokens = len(tokenizer.encode(instruction))
188 | n_prompt_tokens = len(tokenizer.encode(prompt))
189 | n_query_tokens = len(tokenizer.encode(question))
190 | n_context_tokens = len(tokenizer.encode(CONTEXT_TEMPLATE.format(context="")))
191 |
192 | if documents is not None:
193 | documents = [doc.strip() + " " for doc in documents]
194 | n_docs_tokens = [len(tokenizer.encode(doc)) for doc in documents]
195 |
196 |     # After accounting for all the required content, how many tokens are left for context stuffing.
197 | leftover_token_len = MAX_TOKENS_LIMIT - (
198 | n_instruction_tokens + n_context_tokens + n_prompt_tokens + max_tokens
199 | )
200 | sorted_doc_infos = []
201 |
202 | question_for_search = (
203 | alternative_question if alternative_question is not None else question
204 | )
205 | if file_id is not None:
206 |         sorted_doc_infos = semantic_search(
207 | search_model,
208 | question_for_search,
209 | file_id=file_id,
210 | max_documents=max_rerank,
211 | )
212 |
213 | elif len(documents) == 0:
214 | # If no context document is provided, do nothing.
215 | pass
216 |
217 | elif min(n_docs_tokens) >= leftover_token_len:
218 | # If there is no room for adding any context doc.
219 | pass
220 |
221 | elif (max_rerank is None or max_rerank >= len(documents)) and sum(
222 | n_docs_tokens
223 | ) < leftover_token_len:
224 | # If the total length of docs is short enough to be added all.
225 | selected_indices = list(range(len(documents)))
226 |
227 | sorted_doc_infos = [
228 | {"document": i, "text": documents[i]} for i in selected_indices
229 | ]
230 |
231 | elif n_query_tokens + max(n_docs_tokens) >= MAX_TOKENS_LIMIT:
232 | # If the prompt and the longest document together go above the limit.
233 | total_tokens = n_query_tokens + max(n_docs_tokens)
234 | raise Exception(
235 | f"The longest document and prompt pair together contains {total_tokens} "
236 | f"tokens, above the limit {MAX_TOKENS_LIMIT} for semantic search. Please consider "
237 | f"shortening the prompt or the longest document."
238 | )
239 |
240 | else:
241 | # If we can add some context documents but not all of them, we should
242 | # query search endpoint to rank docs by score.
243 | sorted_doc_infos = semantic_search(
244 | search_model,
245 | question_for_search,
246 | examples=[{"text": doc} for doc in documents],
247 | max_documents=max_rerank,
248 | )
249 |
250 | # Select documents w.r.t. the context length limitation.
251 | context, sorted_doc_infos = select_by_length(
252 | sorted_doc_infos,
253 | leftover_token_len,
254 | lambda_fn=lambda x: x["text"].strip() + " ",
255 | )
256 |
257 | # Add instruction before the context and the prompt after the context.
258 | if context:
259 | context = CONTEXT_TEMPLATE.format(context=context.strip())
260 | full_prompt = instruction + context + prompt
261 |
262 | completion_result = openai.Completion.create(
263 | engine=model,
264 | prompt=full_prompt,
265 | logit_bias=logit_bias,
266 | temperature=temperature,
267 | n=n,
268 | max_tokens=max_tokens,
269 | stop=stop,
270 | logprobs=logprobs,
271 | )
272 |
273 | completion_result["selected_documents"] = sorted_doc_infos
274 |
275 | result = dict(
276 | object="answer",
277 | selected_documents=completion_result.pop("selected_documents"),
278 | completion=completion_result["id"],
279 | )
280 |
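    # Strip the prompt scaffolding from each completion: drop any echoed "A:" and
    # keep only the text before any follow-on "Q:" the model may have generated.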
281 | result["answers"] = [
282 | item["text"].replace("A:", "").split("Q:")[0].strip()
283 | for item in completion_result["choices"]
284 | ]
285 |
286 | return result
287 |
288 |
289 | print(
290 | answers(
291 | examples=[
292 | ["What is the capital of Washington", "Olympia"],
293 | ["What is the capital of Oregon", "Salem"],
294 | ],
295 | question="What is the capital of China?",
296 | examples_context="I am a bot that names country capitals",
297 | documents=["I am a bot that names country capitals"],
298 | model="davinci",
299 | search_model="ada",
300 | alternative_question="different test",
301 | max_tokens=16,
302 | stop=["\n\n"],
303 | )
304 | )
305 |
--------------------------------------------------------------------------------
/transition_guides_for_deprecated_API_endpoints/classification_functionality_example.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | from collections import defaultdict
3 |
4 | from transformers import GPT2TokenizerFast
5 |
6 | import openai
7 |
8 | tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
9 |
10 | MAX_TOKENS_LIMIT = 2048
11 |
12 |
13 | def create_instruction(labels) -> str:
14 | """
15 | Construct an instruction for a classification task.
16 | """
17 | instruction = f"Please classify a piece of text into the following categories: {', '.join(labels)}."
18 |
19 | return f"{instruction.strip()}\n\n"
20 |
21 |
22 | def semantic_search(
23 | search_model, query_for_search, file_id=None, max_documents=None, examples=None
24 | ):
25 | """
26 | :param examples: A list of {"text":...} or {"text": ..., "label": ...}.
27 | :return:
28 |     a list of semantic search result dicts, one per document, sorted by "score":
29 | [
30 | {
31 | "document": ...,
32 | "object": "search_result",
33 | "score": ...,
34 | "text": ...,
35 | },
36 | ...
37 | ]
38 |
39 | """
40 | assert (examples is None) ^ (file_id is None) # xor
41 |
42 | if file_id is not None:
43 | # This is where you'd do an elastic search call. Since there isn't an example of this
44 | # we can query, we'll raise an error.
45 | # The return value from this would be a list of examples
46 | raise NotImplementedError()
47 |
48 | # This isn't quite accurate since Search is also being deprecated. See our search guide for more
49 | # information.
50 |
51 | search_result = openai.Search.create(
52 | model=search_model,
53 | documents=[x["text"] for x in examples],
54 | query=query_for_search,
55 | )
56 |
57 | info_dict = {d["document"]: d for d in search_result["data"]}
58 | sorted_doc_ids = sorted(
59 | info_dict.keys(), key=lambda x: info_dict[x]["score"], reverse=True
60 | )
61 | if max_documents:
62 | sorted_doc_ids = sorted_doc_ids[:max_documents]
63 | return [info_dict[i] for i in sorted_doc_ids]
64 |
65 |
66 | def select_by_length(
67 | sorted_doc_infos,
68 | max_token_len,
69 | lambda_fn=None,
70 | ):
71 | """
72 |     Given a list of (document ID, document content) pairs, select as many
73 |     documents as possible as long as the total length does not exceed `max_token_len`.
74 |
75 | :param sorted_doc_infos: A list of semantic search result dict of documents sorted by "score".
76 | :param max_token_len: The maximum token length for selected documents.
77 |     :param lambda_fn: A function that takes in a search result dict and outputs a formatted
78 | example for context stuffing.
79 | :return: A tuple of (
80 | A concatenation of selected documents used as context,
81 | A list of selected document IDs
82 | )
83 | """
84 | if not sorted_doc_infos:
85 | return "", []
86 |
87 | selected_indices = []
88 | total_doc_tokens = 0
89 | doc_dict = {}
90 | for i, doc_info in enumerate(sorted_doc_infos):
91 | doc = lambda_fn(doc_info) if lambda_fn else doc_info["text"]
92 | n_doc_tokens = len(tokenizer.encode(doc))
93 | if total_doc_tokens + n_doc_tokens < max_token_len:
94 | total_doc_tokens += n_doc_tokens
95 | selected_indices.append(i)
96 | doc_dict[i] = doc
97 |
98 | # The top ranked documents should go at the end.
99 | selected_indices = selected_indices[::-1]
100 |
101 | context = "".join([doc_dict[i] for i in selected_indices])
102 | selected_doc_infos = [sorted_doc_infos[i] for i in selected_indices]
103 | return context, selected_doc_infos
104 |
105 |
106 | def format_example_fn(x: dict) -> str:
107 | return "Text: {text}\nCategory: {label}\n---\n".format(
108 | text=x["text"].replace("\n", " ").strip(),
109 | label=x["label"].replace("\n", " ").strip(),
110 | )
111 |
112 |
113 | def classifications(
114 | query,
115 | model,
116 | search_model="ada",
117 | examples=None,
118 | file=None,
119 | labels=None,
120 | temperature=0.0,
121 | logprobs=None,
122 | max_examples=200,
123 | logit_bias=None,
124 | alternative_query=None,
125 | max_tokens=16,
126 | ) -> dict:
127 | """
128 |     Given a query and a list of examples containing (text, label) pairs,
129 |     it selects the most relevant examples to construct a prompt for few-shot classification.
130 |
131 | The constructed prompt for the final completion call:
132 | ```
133 | {{ an optional instruction }}
134 |
135 | Text: example 1 text
136 | Category: example 1 label
137 | ---
138 |     Text: example 2 text
139 | Category: example 2 label
140 | ---
141 | Text: question
142 | Category:
143 | ```
144 |
145 | The returned object has a structure like:
146 | {
147 | "label": "Happy",
148 | "model": "ada",
149 | "object": "classification",
150 | "selected_examples": [
151 | {
152 | "document": ..., # document index, same as in search/ results.
153 | "text": ...,
154 | "label": ...,
155 | },
156 | ...
157 | ],
158 | }
159 | """
160 |
161 | query = query.replace("\n", " ").strip()
162 | logit_bias = logit_bias if logit_bias else {}
163 | labels = labels if labels else []
164 |
165 | if file is None and examples is None:
166 | raise Exception("Please submit at least one of `examples` or `file`.")
167 | if file is not None and examples is not None:
168 | raise Exception("Please submit only one of `examples` or `file`.")
169 |
170 | instruction = create_instruction(labels)
171 |
172 | query_for_search = alternative_query if alternative_query is not None else query
173 |
174 | # Extract examples and example labels first.
175 | if file is not None:
176 | sorted_doc_infos = semantic_search(
177 | search_model,
178 | query_for_search,
179 | file_id=file,
180 | max_documents=max_examples,
181 | )
182 |
183 | else:
184 | example_prompts = [
185 | format_example_fn(dict(text=x, label=y)) for x, y in examples
186 | ]
187 | n_examples_tokens = [len(tokenizer.encode(x)) for x in example_prompts]
188 |
189 | query_prompt = f"Text: {query}\nCategory:"
190 | n_instruction_tokens = len(tokenizer.encode(instruction))
191 | n_query_tokens = len(tokenizer.encode(query_prompt))
192 |
193 |         # After accounting for all the required content, how many tokens are left for context stuffing.
194 | leftover_token_len = MAX_TOKENS_LIMIT - (
195 | n_instruction_tokens + n_query_tokens + max_tokens
196 | )
197 |
198 | # Process when `examples` are provided but no `file` is provided.
199 | if examples:
200 | if (max_examples is None or max_examples >= len(examples)) and sum(
201 | n_examples_tokens
202 | ) < leftover_token_len:
203 | # If the total length of docs is short enough that we can add all examples, no search call.
204 | selected_indices = list(range(len(examples)))
205 |
206 | sorted_doc_infos = [
207 | {"document": i, "text": examples[i][0], "label": examples[i][1]}
208 | for i in selected_indices
209 | ]
210 |
211 | elif max(n_examples_tokens) + n_query_tokens >= MAX_TOKENS_LIMIT:
212 | # If the prompt and the longest example together go above the limit:
213 | total_tokens = max(n_examples_tokens) + n_query_tokens
214 | raise Exception(
215 |                     f"The longest classification example, query and prompt together contain "
216 | f"{total_tokens} tokens, above the limit {MAX_TOKENS_LIMIT} for semantic search. "
217 | f"Please consider shortening your instruction, query or the longest example."
218 | )
219 |
220 | else:
221 | # If we can add some context documents but not all of them, we should
222 | # query search endpoint to rank docs by score.
223 | sorted_doc_infos = semantic_search(
224 | search_model,
225 | query_for_search,
226 | examples=[{"text": x, "label": y} for x, y in examples],
227 | max_documents=max_examples,
228 | )
229 |
230 | # Per label, we have a list of doc id sorted by its relevancy to the query.
231 | label_to_indices = defaultdict(list)
232 | for idx, d in enumerate(sorted_doc_infos):
233 | label_to_indices[d["label"]].append(idx)
234 |
235 | # Do a round robin for each of the different labels, taking the best match for each label.
236 | label_indices = [label_to_indices[label] for label in labels]
237 | mixed_indices = [
238 | i for x in itertools.zip_longest(*label_indices) for i in x if i is not None
239 | ]
240 | sorted_doc_infos = [sorted_doc_infos[i] for i in mixed_indices]
241 |
242 | # Try to select as many examples as needed to fit into the context
243 | context, sorted_doc_infos = select_by_length(
244 | sorted_doc_infos,
245 | leftover_token_len,
246 | lambda_fn=format_example_fn,
247 | )
248 |
249 | prompt = instruction + context + query_prompt
250 |
251 | completion_params = {
252 | "engine": model,
253 | "prompt": prompt,
254 | "temperature": temperature,
255 | "logprobs": logprobs,
256 | "logit_bias": logit_bias,
257 | "max_tokens": max_tokens,
258 | "stop": "\n",
259 | "n": 1,
260 | }
261 |
262 | completion_resp = openai.Completion.create(
263 | **completion_params,
264 | )
265 |
266 | label = completion_resp["choices"][0]["text"]
267 | label = label.split("\n")[0].strip().lower().capitalize()
268 | if label not in labels:
269 | label = "Unknown"
270 |
271 | result = dict(
272 | # TODO: Add id for object persistence.
273 | object="classification",
274 | model=completion_resp["model"],
275 | label=label,
276 | completion=completion_resp["id"],
277 | )
278 |
279 | result["selected_examples"] = sorted_doc_infos
280 |
281 | return result
282 |
283 |
284 | print(
285 | classifications(
286 | query="this is my test",
287 | model="davinci",
288 | search_model="ada",
289 | examples=[
290 | ["this is my test", "davinci"],
291 | ["this is other test", "blahblah"],
292 | ],
293 | file=None,
294 | labels=["davinci", "blahblah"],
295 | temperature=0.1,
296 | logprobs=0,
297 | max_examples=200,
298 | logit_bias=None,
299 | alternative_query="different test",
300 | max_tokens=16,
301 | )
302 | )
303 |
--------------------------------------------------------------------------------
/transition_guides_for_deprecated_API_endpoints/search_functionality_example.py:
--------------------------------------------------------------------------------
1 | from transformers import GPT2TokenizerFast
2 |
3 | import openai
4 |
5 | tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
6 |
7 | docs = ["test1", "asdklgjnasdv", "banana", "lord lollipop"]
8 | query = "apple orang asdansbdausd"
9 |
10 | print(openai.Search.create(model="davinci", query=query, documents=docs))
11 |
12 |
13 | def construct_context(query, document):
14 | return "<|endoftext|>{document}\n\n---\n\nThe above passage is related to: {query}".format(
15 | document=document, query=query
16 | )
17 |
18 |
19 | def get_score(context, query, log_probs, text_offsets) -> float:
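    """
    Score how strongly the document predicts the query: average the per-token log
    probabilities of the query portion of the prompt (walking backwards from the
    last token until the offsets cross into the document text), then scale by
    SCORE_MULTIPLIER.
    """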
20 | SCORE_MULTIPLIER = 100.0
21 |
22 | log_prob = 0
23 | count = 0
24 | cutoff = len(context) - len(query)
25 |
26 | for i in range(len(text_offsets) - 1, 0, -1):
27 | log_prob += log_probs[i]
28 | count += 1
29 |
30 | if text_offsets[i] <= cutoff and text_offsets[i] != text_offsets[i - 1]:
31 | break
32 |
33 | return log_prob / float(count) * SCORE_MULTIPLIER
34 |
35 |
36 | def search(query, documents, engine):
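    """
    Re-rank `documents` by how strongly each one predicts `query`: score an echoed
    completion for each (document, query) prompt, then calibrate by subtracting the
    score of an empty document.
    """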
37 |
38 | prompts = [construct_context(query, doc) for doc in [""] + documents]
39 |
40 | resps = openai.Completion.create(
41 | model=engine,
42 | prompt=prompts,
43 | temperature=1.0,
44 | top_p=1.0,
45 | max_tokens=0,
46 | logprobs=0,
47 | n=1,
48 | echo=True,
49 | )
50 |
51 | resps_by_index = {choice["index"]: choice for choice in resps["choices"]}
52 |
53 | scores = [
54 | get_score(
55 | prompts[i],
56 | query,
57 | resps_by_index[i]["logprobs"]["token_logprobs"],
58 | resps_by_index[i]["logprobs"]["text_offset"],
59 | )
60 | for i in range(len(prompts))
61 | ]
62 |
63 |     # Calibrate by subtracting the baseline score of the empty document, then drop it.
64 | scores = [score - scores[0] for score in scores][1:]
65 |
66 | return [
67 | {
68 | "object": "search_result",
69 | "document": document_idx,
70 | "score": round(score, 3),
71 | }
72 | for document_idx, score in enumerate(scores)
73 | ]
74 |
75 |
76 | print(search(query=query, documents=docs, engine="davinci"))
77 |
--------------------------------------------------------------------------------